In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("AusApparalSales4thQrt2020.csv")

# Inspect the data
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() would add a stray "None" line.
df.info()

# Check for missing values
print("Missing values:\n", df.isna().sum())

# Convert Date to datetime format
# dayfirst=True makes any ambiguous numeric date resolve as day/month rather
# than month/day.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Normalize 'Sales' and 'Unit' columns using Min-Max Normalization
def normalize(col):
    """Rescale ``col`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    col : pandas.Series, numpy.ndarray or pandas.DataFrame
        Numeric data. A DataFrame is scaled column by column, because its
        ``min``/``max`` are computed per column.

    Returns
    -------
    Same container type as ``col``
        ``(col - min) / (max - min)``. Where the range is zero (all values
        equal) the result is 0.0 instead of NaN.
    """
    lowest = col.min()
    span = col.max() - lowest

    # A zero range would turn the division into 0/0 and fill the result with
    # NaN. Dividing by 1 instead maps constant data to 0.0. Missing ranges
    # (NaN/NA, e.g. empty input) are left alone so they propagate as before.
    if np.ndim(span) == 0:
        if pd.notna(span) and span == 0:
            span = 1
    else:
        # DataFrame input: one range per column, patch only the zero ones.
        span = span.where(span != 0, 1)

    return (col - lowest) / span

# Store the min-max scaled versions next to the raw Sales and Unit columns.
df['Sales_Norm'], df['Unit_Norm'] = map(normalize, (df['Sales'], df['Unit']))


         Date        Time State     Group  Unit  Sales
0  1-Oct-2020     Morning    WA      Kids     8  20000
1  1-Oct-2020     Morning    WA       Men     8  20000
2  1-Oct-2020     Morning    WA     Women     4  10000
3  1-Oct-2020     Morning    WA   Seniors    15  37500
4  1-Oct-2020   Afternoon    WA      Kids     3   7500
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7560 entries, 0 to 7559
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    7560 non-null   object
 1   Time    7560 non-null   object
 2   State   7560 non-null   object
 3   Group   7560 non-null   object
 4   Unit    7560 non-null   int64 
 5   Sales   7560 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 236.3+ KB
None
Missing values:
 Date     0
Time     0
State    0
Group    0
Unit     0
Sales    0
dtype: int64


In [2]:
# Descriptive statistics
# Heading and summary table go out in a single call, one per line.
print(
    "Descriptive stats for Sales and Unit:",
    df.loc[:, ['Sales', 'Unit']].describe(),
    sep="\n",
)

Descriptive stats for Sales and Unit:
               Sales         Unit
count    7560.000000  7560.000000
mean    45013.558201    18.005423
std     32253.506944    12.901403
min      5000.000000     2.000000
25%     20000.000000     8.000000
50%     35000.000000    14.000000
75%     65000.000000    26.000000
max    162500.000000    65.000000


In [3]:
# Total sales per customer group and per state, each ordered largest first.
group_sales = df['Sales'].groupby(df['Group']).sum().sort_values(ascending=False)
state_sales = df['Sales'].groupby(df['State']).sum().sort_values(ascending=False)

# Group-wise sales
print("Sales by Group:\n", group_sales)

# State-wise sales
print("Sales by State:\n", state_sales)

Sales by Group:
 Group
Men        85750000
Women      85442500
Kids       85072500
Seniors    84037500
Name: Sales, dtype: int64
Sales by State:
 State
VIC    105565000
NSW     74970000
SA      58857500
QLD     33417500
TAS     22760000
NT      22580000
WA      22152500
Name: Sales, dtype: int64


In [4]:
# Each period gets its calendar column on df, then its sales total, then its
# printout; the order (week, month, quarter) keeps the printed output unchanged.

# Weekly: ISO week number of each date
df['Week'] = df['Date'].dt.isocalendar()['week']
weekly_report = df['Sales'].groupby(df['Week']).sum()
print("Weekly Sales:\n", weekly_report)

# Monthly: calendar month number
df['Month'] = df['Date'].dt.month
monthly_report = df['Sales'].groupby(df['Month']).sum()
print("Monthly Sales:\n", monthly_report)

# Quarterly: calendar quarter number
df['Quarter'] = df['Date'].dt.quarter
quarterly_report = df['Sales'].groupby(df['Quarter']).sum()
print("Quarterly Sales:\n", quarterly_report)

Weekly Sales:
 Week
40    15045000
41    27002500
42    26640000
43    26815000
44    21807500
45    20865000
46    21172500
47    21112500
48    21477500
49    29622500
50    31525000
51    31655000
52    31770000
53    13792500
Name: Sales, dtype: int64
Monthly Sales:
 Month
10    114290000
11     90682500
12    135330000
Name: Sales, dtype: int64
Quarterly Sales:
 Quarter
4    340302500
Name: Sales, dtype: int64


In [None]:
## Data visualization
# Plotting libraries used by the chart cells that follow.
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "whitegrid" style to the figures drawn after this point.
sns.set(style="whitegrid")

In [None]:
# State-wise
# Sum once up front. Handing the raw rows to barplot with estimator=sum makes
# seaborn bootstrap an interval around every total, which is slow and draws
# error bars that carry no meaning for a plain sum. With one pre-computed row
# per bar the heights are the same totals and there is nothing to resample.
# sort=False keeps states and groups in first-appearance order, matching what
# plotting the raw rows produces.
state_group_totals = df.groupby(['State', 'Group'], sort=False, as_index=False)['Sales'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=state_group_totals, x='State', y='Sales', hue='Group', ax=ax)
ax.set_title('State-wise Sales by Group')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Group-wise
# Sum once up front instead of passing raw rows with estimator=sum: that path
# has seaborn bootstrap an interval around each total, which is slow and
# produces error bars that are meaningless for a sum. sort=False preserves the
# first-appearance order of groups and states.
group_state_totals = df.groupby(['Group', 'State'], sort=False, as_index=False)['Sales'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=group_state_totals, x='Group', y='Sales', hue='State', ax=ax)
ax.set_title('Group-wise Sales by State')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Spread of per-record sales for each time-of-day slot.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Time', y='Sales', data=df, ax=ax)
ax.set_title('Sales Distribution by Time of Day')
plt.show()

In [None]:
# Total sales per calendar day; Date stays a regular column for plotting.
daily_sales = df.groupby('Date', as_index=False)['Sales'].sum()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily_sales['Date'], daily_sales['Sales'], marker='o')
ax.set_title('Daily Sales Trend')
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts: weekly totals on the left, monthly on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
weekly_ax, monthly_ax = axes

sns.barplot(x=weekly_report.index, y=weekly_report.values, ax=weekly_ax)
weekly_ax.set(title='Weekly Sales', xlabel='Week', ylabel='Sales')

sns.barplot(x=monthly_report.index, y=monthly_report.values, ax=monthly_ax)
monthly_ax.set(title='Monthly Sales', xlabel='Month', ylabel='Sales')

fig.tight_layout()
plt.show()

In [None]:
# Boxplot for descriptive stats
# Sales runs into the hundreds of thousands while Unit stays in the tens, so
# on a shared y-axis the Unit box is squashed into a flat line at the bottom.
# Giving each column its own axis keeps both distributions readable.
fig, axes = plt.subplots(1, 2, figsize=(6, 4))
for ax, column in zip(axes, ['Sales', 'Unit']):
    sns.boxplot(data=df, y=column, ax=ax)
    ax.set_title(column)
fig.suptitle('Boxplot for Sales and Units')
fig.tight_layout()
plt.show()

# Distribution plot for Sales
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['Sales'], kde=True, ax=ax)
ax.set_title('Sales Distribution')
plt.show()

In [7]:
# Step 2: Data Analysis
# NOTE(review): this cell recomputes the statistics, calendar columns and
# period reports from df in one place and echoes them together.

# Summary statistics for the two numeric columns
descriptive_stats = df.loc[:, ['Sales', 'Unit']].describe()

# Sales totals per customer group and per state, largest first
group_sales = df['Sales'].groupby(df['Group']).sum().sort_values(ascending=False)
state_sales = df['Sales'].groupby(df['State']).sum().sort_values(ascending=False)

# Calendar features taken from the Date column
df['Week'] = df['Date'].dt.isocalendar()['week']
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter

# Sales totals per week, month and quarter
weekly_report = df['Sales'].groupby(df['Week']).sum()
monthly_report = df['Sales'].groupby(df['Month']).sum()
quarterly_report = df['Sales'].groupby(df['Quarter']).sum()

# A bare tuple as the final expression makes the notebook echo all six results.
(
    descriptive_stats,
    group_sales,
    state_sales,
    weekly_report,
    monthly_report,
    quarterly_report,
)


(               Sales         Unit
 count    7560.000000  7560.000000
 mean    45013.558201    18.005423
 std     32253.506944    12.901403
 min      5000.000000     2.000000
 25%     20000.000000     8.000000
 50%     35000.000000    14.000000
 75%     65000.000000    26.000000
 max    162500.000000    65.000000,
 Group
 Men        85750000
 Women      85442500
 Kids       85072500
 Seniors    84037500
 Name: Sales, dtype: int64,
 State
 VIC    105565000
 NSW     74970000
 SA      58857500
 QLD     33417500
 TAS     22760000
 NT      22580000
 WA      22152500
 Name: Sales, dtype: int64,
 Week
 40    15045000
 41    27002500
 42    26640000
 43    26815000
 44    21807500
 45    20865000
 46    21172500
 47    21112500
 48    21477500
 49    29622500
 50    31525000
 51    31655000
 52    31770000
 53    13792500
 Name: Sales, dtype: int64,
 Month
 10    114290000
 11     90682500
 12    135330000
 Name: Sales, dtype: int64,
 Quarter
 4    340302500
 Name: Sales, dtype: int64)