In [None]:
# ==========================================================
#  Project — Supermarket Sales Analysis
# ==========================================================
# This notebook performs Exploratory Data Analysis (EDA)
# on the "SuperMarket Analysis.csv" dataset.
# ==========================================================

# ==========================================================
# 1️- NOTEBOOK SETUP
# ==========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

plt.style.use("seaborn-v0_8")

# ----------------------------------------------------------
# LOAD THE DATASET
# ----------------------------------------------------------
dataset_path = "/kaggle/input/supermarket-sales-dataset/SuperMarket Analysis.csv"
df = pd.read_csv(dataset_path)

# Normalise every header: trim surrounding whitespace, Title-Case it, and
# swap spaces for underscores, so the later lookups such as "Customer_Type"
# and "Unit_Price" have a predictable spelling to match against.
df.columns = [header.strip().title().replace(" ", "_") for header in df.columns]

print("🧾 Columns in dataset:")
print(list(df.columns))

# Preview the first ten records
print("\n📄 First 10 rows of dataset:")
display(df.head(n=10))

# Column dtypes, non-null counts and memory usage
print("\n📊 Dataset Info:")
df.info()

# ----------------------------------------------------------
# 2️- BASIC EXPLORATORY ANALYSIS
# ----------------------------------------------------------
frame_shape = df.shape
print("\n🔹 Dataset shape:", frame_shape)

# Descriptive statistics, using describe() with its default settings
summary_stats = df.describe()
display(summary_stats)

# Number of distinct values in each object-dtype column
print("\n🔸 Unique value counts:")
distinct_counts = df.select_dtypes(include='object').nunique()
for col, n_distinct in distinct_counts.items():
    print(f"{col}: {n_distinct}")

# ----------------------------------------------------------
# Visualizations
# ----------------------------------------------------------
def safe_countplot(column, title):
    """Draw a count plot of one column of the module-level ``df``.

    Parameters
    ----------
    column : str
        Name of the column whose values are counted on the x-axis.
    title : str
        Title placed above the plot.

    If ``column`` is not present in ``df.columns`` nothing is plotted and a
    warning line is printed instead.
    """
    # Guard clause: report the missing column and bail out early.
    if column not in df.columns:
        print(f"⚠️ Column '{column}' not found in dataset.")
        return

    plt.figure(figsize=(5, 4))
    sns.countplot(data=df, x=column)
    plt.title(title)
    plt.show()

# One count plot per categorical column, drawn in this order:
# gender split, customer type, sales per branch, sales per city.
countplot_specs = (
    ("Gender", "Gender Distribution"),
    ("Customer_Type", "Customer Type Counts"),
    ("Branch", "Number of Sales per Branch"),
    ("City", "Number of Sales per City"),
)
for column_name, plot_title in countplot_specs:
    safe_countplot(column_name, plot_title)

# ----------------------------------------------------------
# 3️- BIVARIATE ANALYSIS
# ----------------------------------------------------------

# Snapshot of the column names, so each section below can check that the
# columns it needs are present before aggregating and plotting.
available_columns = set(df.columns)

# Total revenue per branch, largest first.
if {"Branch", "Total"} <= available_columns:
    branch_sales = (
        df.groupby("Branch")["Total"]
        .sum()
        .sort_values(ascending=False)
    )
    print("\n🏬 Total Sales by Branch:")
    display(branch_sales)
    branch_sales.plot.bar(color='skyblue', figsize=(6, 4), title="Total Sales by Branch")
    plt.show()

# Number of rows (transactions) recorded for each city.
if "City" in available_columns:
    city_transactions = df["City"].value_counts()
    print("\n🏙️ Transactions per City:")
    display(city_transactions)
    city_transactions.plot.bar(color='orange', figsize=(6, 4), title="Transactions per City")
    plt.show()

# Total revenue per product line, largest first.
if {"Product_Line", "Total"} <= available_columns:
    product_revenue = (
        df.groupby("Product_Line")["Total"]
        .sum()
        .sort_values(ascending=False)
    )
    print("\n🛒 Revenue by Product Line:")
    display(product_revenue)
    product_revenue.plot.bar(color='teal', figsize=(7, 4), title="Revenue by Product Line")
    plt.show()

# Mean transaction value for each customer type.
if {"Customer_Type", "Total"} <= available_columns:
    spending_by_type = df.groupby("Customer_Type")["Total"].mean()
    print("\n💳 Average Spending by Customer Type:")
    display(spending_by_type)
    spending_by_type.plot.bar(color='green', figsize=(5, 4), title="Average Spending by Customer Type")
    plt.show()

# ----------------------------------------------------------
# 4️- CORRELATIONS & RELATIONSHIPS
# ----------------------------------------------------------
# Pairwise correlations between the numeric columns only.
correlation_matrix = df.corr(numeric_only=True)

# A constant column has zero variance, so every coefficient involving it
# (including its own diagonal entry) is NaN. Drop such all-NaN rows and
# columns rather than drawing blank bands across the heatmap.
correlation_matrix = (
    correlation_matrix
    .dropna(axis=0, how="all")
    .dropna(axis=1, how="all")
)

if correlation_matrix.empty:
    # Same spirit as the column guards on the other plots: with nothing to
    # correlate, skip the figure instead of handing heatmap an empty matrix.
    print("⚠️ No numeric columns with defined correlations — skipping heatmap.")
else:
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()

scatter_columns = set(df.columns)

# Quantity against Total, with points coloured by Gender.
if scatter_columns >= {"Quantity", "Total", "Gender"}:
    quantity_fig, quantity_ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x="Quantity", y="Total", hue="Gender", ax=quantity_ax)
    quantity_ax.set_title("Quantity vs Total")
    plt.show()

# Unit price against Total.
if scatter_columns >= {"Unit_Price", "Total"}:
    price_fig, price_ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x="Unit_Price", y="Total", ax=price_ax)
    price_ax.set_title("Unit Price vs Total")
    plt.show()

# ----------------------------------------------------------
# 5️- INSIGHTS & CONCLUSION
# ----------------------------------------------------------
# 👉 Use Markdown cells in Kaggle to write:
# - Branch with highest total sales and why
# - Which city has the most transactions
# - Top product line by revenue
# - Customer type that spends more
# - Correlation observations from heatmap

# ----------------------------------------------------------
# 6️- BONUS — DASHBOARD KPIs
# ----------------------------------------------------------
kpi_columns = set(df.columns)

# Revenue KPIs fall back to NaN when there is no "Total" column.
if "Total" not in kpi_columns:
    total_sales = avg_basket = np.nan
else:
    transaction_totals = df["Total"]
    total_sales = transaction_totals.sum()
    avg_basket = transaction_totals.mean()

# Each row of the frame is counted as one transaction.
num_transactions = len(df)

# The rating KPI likewise falls back to NaN when "Rating" is missing.
avg_rating = np.nan
if "Rating" in kpi_columns:
    avg_rating = df["Rating"].mean()

print("\n📊 Quick Dashboard KPIs:")
print(f"Total Sales: ${total_sales:,.2f}")
print(f"Average Basket Size: ${avg_basket:,.2f}")
print(f"Number of Transactions: {num_transactions}")
print(f"Average Rating: {avg_rating:.2f}")

print("\n✅ Analysis Complete — Proceed to Markdown cells for insights.")


https://github.com/MohamedAhmed20375844/Assiment1