"""
Sales Analysis - Full Python Script
Author: Monalika Kapoor
Purpose: 
 - Performs data wrangling (missing values, parsing dates)
 - Normalizes numeric columns (Min-Max)
 - Performs descriptive stats for Sales and Unit
 - Aggregates by State and Group and by time (daily/weekly/monthly/quarterly)
 - Produces and saves visualizations and CSV outputs
"""

In [1]:
# ----------------------------------------------
# IMPORT LIBRARIES
# ----------------------------------------------
# Library               Purpose
# pandas                Load & manipulate CSV data
# numpy                 Numerical computation
# pathlib               Clean file path handling
# MinMaxScaler          Normalization of numeric columns
# matplotlib/seaborn    Data visualization & charts
# DateFormatter         Format time-based axis in plots
# warnings              Suppress irrelevant warnings
# Data handling
import pandas as pd
import numpy as np

# File paths
from pathlib import Path

# Normalization
from sklearn.preprocessing import MinMaxScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import DateFormatter

# Ignore warnings for cleaner outputs
import warnings
warnings.filterwarnings("ignore")


In [4]:
# ----------------------------------------------
# LOAD DATA
# ----------------------------------------------
# Source CSV, resolved relative to the current working directory.
DATA_PATH = Path("AusApparalSales4thQrt2020.csv")

# Parse the CSV exactly once; every later cell works on this DataFrame.
df = pd.read_csv(DATA_PATH)

# Folder that receives every CSV, PNG and Markdown file written below.
# exist_ok=True lets the cell be re-run without raising if the folder exists.
OUTPUT_DIR = Path("sales_analysis_output")
OUTPUT_DIR.mkdir(exist_ok=True)

print("Data Loaded Successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

Data Loaded Successfully!
Shape: (7560, 6)
Columns: ['Date', 'Time', 'State', 'Group', 'Unit', 'Sales']


In [5]:
# ----------------------------------------------
# DETECT COLUMNS AUTOMATICALLY
# ----------------------------------------------
def detect_columns(df):
    """Guess which columns hold the date, sales, unit, state and group data.

    Column names are compared case-insensitively, in the order they appear
    in ``df.columns``. The sales column must be named exactly "sales"; each
    of the other roles takes the first column whose name contains its
    keyword ("date", "unit", "state", "group").

    Args:
        df: Object exposing a ``columns`` iterable of string names.

    Returns:
        A 5-tuple ``(date_col, sales_col, unit_col, state_col, group_col)``.
        An entry is ``None`` when no column matches that role.
    """

    def first_match(accepts):
        # Scan lazily and stop at the first hit, so later names are never
        # lower-cased once a match has been found.
        for name in df.columns:
            if accepts(name.lower()):
                return name
        return None

    return (
        first_match(lambda lowered: "date" in lowered),
        first_match(lambda lowered: lowered == "sales"),
        first_match(lambda lowered: "unit" in lowered),
        first_match(lambda lowered: "state" in lowered),
        first_match(lambda lowered: "group" in lowered),
    )

DATE_COL, SALES_COL, UNIT_COL, STATE_COL, GROUP_COL = detect_columns(df)

# Echo the detected mapping, one role per line, so a wrong guess (or a
# None for an undetected role) is visible before any analysis runs.
print("\nDetected Columns:")
for _label, _detected in (
    ("Date:", DATE_COL),
    ("Sales:", SALES_COL),
    ("Units:", UNIT_COL),
    ("State:", STATE_COL),
    ("Group:", GROUP_COL),
):
    print(_label, _detected)



Detected Columns:
Date: Date
Sales: Sales
Units: Unit
State: State
Group: Group


In [6]:

# ----------------------------------------------
# DATA WRANGLING
# ----------------------------------------------

# Parse dates; errors="coerce" turns unparseable values into NaT instead of
# raising, so they show up in the missing-value summary below.
df["ParsedDate"] = pd.to_datetime(df[DATE_COL], errors="coerce")

# Missing value check
print("\nMissing Values Summary:")
print(df.isna().sum())

# Fill numeric missing with median.
# The result is assigned back to the column: calling
# df[col].fillna(..., inplace=True) operates on a temporary Series, which
# pandas warns about and which leaves df untouched under Copy-on-Write.
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical missing with 'Unknown'
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].fillna("Unknown")



Missing Values Summary:
Date          0
Time          0
State         0
Group         0
Unit          0
Sales         0
ParsedDate    0
dtype: int64


In [7]:
# ----------------------------------------------
# NORMALIZATION (Min–Max Scaling)
# ----------------------------------------------
# Every numeric column currently in df is rescaled to the [0, 1] range.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

scaler = MinMaxScaler()
normalized = scaler.fit_transform(df[numeric_cols])

# fit_transform returns a bare array, so the row labels must be re-attached.
# Reusing df.index keeps the axis=1 concat below aligned row-for-row even if
# df does not carry the default 0..n-1 index.
normalized_df = pd.DataFrame(
    normalized,
    columns=[f"{c}_norm" for c in numeric_cols],
    index=df.index,
)

# Append the "<column>_norm" columns next to the original values.
df = pd.concat([df, normalized_df], axis=1)

# Only the first 20 rows are exported, as a quick sample for inspection.
normalized_df.head(20).to_csv(OUTPUT_DIR/"normalized_sample.csv", index=False)

print("\nNormalization Complete!")


Normalization Complete!


In [8]:
# ----------------------------------------------
# DESCRIPTIVE STATISTICS
# ----------------------------------------------
# count / mean / std / min / quartiles / max for the two measures.
print("\nDescriptive Statistics – Sales:", df[SALES_COL].describe(), sep="\n")
print("\nDescriptive Statistics – Units:", df[UNIT_COL].describe(), sep="\n")


# ----------------------------------------------
# HIGHEST & LOWEST SALES (STATE + GROUP)
# ----------------------------------------------
def _ranked_sales_by(key):
    """Return total sales per distinct value of *key*, largest total first."""
    totals = df.groupby(key)[SALES_COL].sum()
    return totals.sort_values(ascending=False)


state_sales = _ranked_sales_by(STATE_COL)
group_sales = _ranked_sales_by(GROUP_COL)

# head()/tail() show at most five rows each, so the "top" and "bottom"
# listings overlap when there are fewer than ten distinct values.
print("\nTop States by Sales:\n", state_sales.head())
print("\nBottom States by Sales:\n", state_sales.tail())
print("\nTop Groups by Sales:\n", group_sales.head())
print("\nBottom Groups by Sales:\n", group_sales.tail())

# Persist the complete rankings, not just the rows printed above.
state_sales.to_csv(OUTPUT_DIR/"state_total_sales.csv")
group_sales.to_csv(OUTPUT_DIR/"group_total_sales.csv")



Descriptive Statistics – Sales:
count      7560.000000
mean      45013.558201
std       32253.506944
min        5000.000000
25%       20000.000000
50%       35000.000000
75%       65000.000000
max      162500.000000
Name: Sales, dtype: float64

Descriptive Statistics – Units:
count    7560.000000
mean       18.005423
std        12.901403
min         2.000000
25%         8.000000
50%        14.000000
75%        26.000000
max        65.000000
Name: Unit, dtype: float64

Top States by Sales:
 State
VIC    105565000
NSW     74970000
SA      58857500
QLD     33417500
TAS     22760000
Name: Sales, dtype: int64

Bottom States by Sales:
 State
SA     58857500
QLD    33417500
TAS    22760000
NT     22580000
WA     22152500
Name: Sales, dtype: int64

Top Groups by Sales:
 Group
Men        85750000
Women      85442500
Kids       85072500
Seniors    84037500
Name: Sales, dtype: int64

Bottom Groups by Sales:
 Group
Men        85750000
Women      85442500
Kids       85072500
Seniors    84037500
Name: Sales, dtype: int64

In [9]:
# ----------------------------------------------
# TIME-BASED REPORTING
# ----------------------------------------------
# Resampling needs a datetime index, so ParsedDate moves from column to index.
df = df.set_index("ParsedDate")


def _resample_total(series, rule, legacy_rule=None):
    """Sum *series* over each *rule* period.

    If pandas rejects *rule* as an invalid frequency (ValueError) and a
    *legacy_rule* is given, the legacy alias is used instead. Without a
    *legacy_rule* the ValueError propagates.
    """
    try:
        return series.resample(rule).sum()
    except ValueError:
        if legacy_rule is None:
            raise
        return series.resample(legacy_rule).sum()


daily = _resample_total(df[SALES_COL], "D")
weekly = _resample_total(df[SALES_COL], "W")
# pandas 2.2 introduced "ME"/"QE" for month-end/quarter-end and deprecated
# the old "M"/"Q" spellings; older releases only understand "M"/"Q".
monthly = _resample_total(df[SALES_COL], "ME", legacy_rule="M")
quarterly = _resample_total(df[SALES_COL], "QE", legacy_rule="Q")

daily.to_csv(OUTPUT_DIR/"daily_sales.csv")
weekly.to_csv(OUTPUT_DIR/"weekly_sales.csv")
monthly.to_csv(OUTPUT_DIR/"monthly_sales.csv")
quarterly.to_csv(OUTPUT_DIR/"quarterly_sales.csv")

print("\nTime-Based Reports Generated!")


Time-Based Reports Generated!


In [10]:
# ----------------------------------------------
# VISUALIZATION
# ----------------------------------------------
sns.set(style="whitegrid")


def _save_and_close(fig, filename):
    """Tighten the layout, write *fig* into OUTPUT_DIR, then release it."""
    fig.tight_layout()
    fig.savefig(OUTPUT_DIR/filename)
    plt.close(fig)


# Every chart draws on an explicitly created Axes. DataFrame.plot() opens a
# new figure of its own when no `ax` is passed, which would ignore the
# requested figsize and leave an empty, never-closed figure behind.

# 1) State-wise sales by group
pivot_table = df.pivot_table(values=SALES_COL, index=STATE_COL, columns=GROUP_COL, aggfunc="sum", fill_value=0)

fig, ax = plt.subplots(figsize=(12, 6))
pivot_table.plot(kind="bar", ax=ax)
ax.set_title("State-wise Sales by Group")
ax.set_ylabel("Sales")
_save_and_close(fig, "state_group_sales.png")

# 2) Total sales by group
fig, ax = plt.subplots(figsize=(8, 5))
group_sales.plot(kind="bar", ax=ax)
ax.set_title("Total Sales by Group")
ax.set_ylabel("Sales")
_save_and_close(fig, "group_sales.png")

# 3) Daily Sales Trend
fig, ax = plt.subplots(figsize=(10, 5))
daily.plot(ax=ax)
ax.set_title("Daily Sales Trend")
ax.set_ylabel("Sales")
_save_and_close(fig, "daily_sales_plot.png")

# 4) Boxplot (Sales & Units)
# Both measures share one y-axis here, on their raw (unscaled) values.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df[[SALES_COL, UNIT_COL]], ax=ax)
ax.set_title("Boxplot - Sales & Units")
_save_and_close(fig, "boxplot_sales_units.png")

print("\nVisualizations Saved!")


Visualizations Saved!


<Figure size 1200x600 with 0 Axes>

In [12]:
# ----------------------------------------------
# REPORT GENERATION (MARKDOWN)
# ----------------------------------------------
# Render the tables before the file is opened: to_markdown() needs the
# optional `tabulate` package, and failing here must not leave a truncated,
# half-written report on disk.
top_states_md = state_sales.head().to_markdown()
top_groups_md = group_sales.head().to_markdown()

report_parts = [
    "# Sales Analysis Report\n",
    # Taken from DATA_PATH so the report always names the file actually read.
    f"Dataset: {DATA_PATH.name}\n\n",
    "## Detected Columns\n",
    f"- Date Column: {DATE_COL}\n",
    f"- Sales Column: {SALES_COL}\n",
    f"- Units Column: {UNIT_COL}\n",
    f"- State Column: {STATE_COL}\n",
    f"- Group Column: {GROUP_COL}\n\n",
    "## Top States by Sales\n",
    top_states_md + "\n\n",
    "## Top Groups by Sales\n",
    top_groups_md + "\n\n",
]

# Explicit UTF-8 keeps the output identical across platforms and safe for
# non-ASCII state or group labels.
with open(OUTPUT_DIR/"summary_report.md", "w", encoding="utf-8") as f:
    f.writelines(report_parts)

print("\nMarkdown Report Generated Successfully!")
print("\nAll outputs saved to:", OUTPUT_DIR)


Markdown Report Generated Successfully!

All outputs saved to: sales_analysis_output
