In [None]:
'''
Project Overview[Switch 2 Pre-sales Demand Forecasting]:
This project seeks to understand how different demographics influence pre-sale volumes across regions. 

Goal:To analyze regional pre-order patterns and customer segmentation

Leveraged historical pre-sale transaction data to extract meaningful insights that can guide marketing strategies.

'''

'\nProject Overview[User Interaction Insights]:\nThis project supports Google by analyzing user engagement with search result pages\n\nGoal: To understand how different numbers of search results impact user interaction time\n\nThis will help optimize the current search results presentation strategy.\n\n'

In [52]:
# Importing necessary libraries for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Getting the Dataset
pre_sale_data = pd.read_csv(r'D:\Data Journey\Python-Summer-Party\DataSets\pre_sale_data.csv')

# Display the first few rows to understand the data
print(pre_sale_data.head(10))  # Shows the first 5 rows with columns
print(list(pre_sale_data.columns))
print('Number of rows and columns is:', pre_sale_data.shape)

          region customer_id pre_order_date demographic_group  \
0  North America        C001       7/2/2024             Gamer   
1         Europe        C002       7/3/2024            Casual   
2           Asia        C003       7/4/2024   Tech Enthusiast   
3  Latin America        C004       7/5/2024            Family   
4        Oceania        C005       7/6/2024           Student   
5  North America        C006       7/7/2024             Gamer   
6         Europe        C007       7/8/2024               NaN   
7            NaN        C008       7/9/2024            Casual   
8           Asia        C009      7/10/2024            Family   
9  North America        C010      7/11/2024             Gamer   

   pre_order_quantity  
0                   1  
1                   2  
2                   1  
3                   3  
4                   2  
5                   5  
6                   2  
7                   1  
8                   4  
9                   1  
['region', 'customer

In [4]:
# Exploratory Data Analysis
# Missing column data
pre_sale_data.isna().sum()

region                1
customer_id           0
pre_order_date        0
demographic_group     3
pre_order_quantity    0
dtype: int64

In [14]:
# Question One:
# What percentage of records have missing values in at least one column?

# 0. Always drop duplicates first
duplicates = pre_sale_data.duplicated().sum()
print(duplicates)
pre_sale_data = pre_sale_data.drop_duplicates()

# 1. Boolean Series: True if a row has any missing value
rows_with_missing = pre_sale_data.isnull().any(axis=1)

# 2. Count how many rows have missing values
num_missing_rows = rows_with_missing.sum()

# 3. Total number of rows
total_rows = pre_sale_data.shape[0]

# 4. Calculate percentage
percent_missing_rows = (num_missing_rows / total_rows) * 100

print(f"Rows with missing values: {num_missing_rows}")
print(f"Total rows: {total_rows}")
print(f"Percentage of rows with missing values: {percent_missing_rows:.2f}%")

# Handling of missing data
pre_sale_data['region'] = pre_sale_data['region'].fillna('Unknown')
pre_sale_data['demographic_group'] = pre_sale_data['demographic_group'].fillna('Unknown')
pre_sale_data


0
Rows with missing values: 0
Total rows: 55
Percentage of rows with missing values: 0.00%


Unnamed: 0,region,customer_id,pre_order_date,demographic_group,pre_order_quantity
0,North America,C001,7/2/2024,Gamer,1
1,Europe,C002,7/3/2024,Casual,2
2,Asia,C003,7/4/2024,Tech Enthusiast,1
3,Latin America,C004,7/5/2024,Family,3
4,Oceania,C005,7/6/2024,Student,2
5,North America,C006,7/7/2024,Gamer,5
6,Europe,C007,7/8/2024,Unknown,2
7,Unknown,C008,7/9/2024,Casual,1
8,Asia,C009,7/10/2024,Family,4
9,North America,C010,7/11/2024,Gamer,1


In [31]:
# Question 2:
# Calculating the total pre-sale orders per month for each region and demographic group.

# 0. Calculating total pre-sale orders
total_orders = pre_sale_data['pre_order_quantity'].sum()
print(f'Total pre-sale orders are: {total_orders}')

# 1. Extracting month
pre_sale_data['month'] = pre_sale_data['pre_order_date'].dt.to_period('M')    #dt.month to extract month number  # dt.strftime('%B') to extract month name

# 2. Grouping by month, region, demographic group
total_orders_per_group = (
    pre_sale_data.groupby(['month', 'region', 'demographic_group'])['pre_order_quantity']
    .sum()
    .reset_index(name='total_orders') # Resetting series to a dataframe with columns for readability.
    .sort_values(by=['month','region','demographic_group'])
)

total_orders_per_group

Total pre-sale orders are: 135


Unnamed: 0,month,region,demographic_group,total_orders
0,2024-07,Asia,Casual,4
1,2024-07,Asia,Family,4
2,2024-07,Asia,Gamer,2
3,2024-07,Asia,Student,3
4,2024-07,Asia,Tech Enthusiast,1
5,2024-07,Europe,Casual,2
6,2024-07,Europe,Family,4
7,2024-07,Europe,Gamer,2
8,2024-07,Europe,Student,7
9,2024-07,Europe,Unknown,2


In [53]:
# Question Three:
# Predicting the total pre-sales quantity for each region for September 2024
# Assuming that the growth rate from August to September is the same as the growth rate from July to August in each region

# 0. Filter orders for July and August 2024
July_orders = pre_sale_data[(pre_sale_data['pre_order_date'].dt.year == 2024) & (pre_sale_data['pre_order_date'].dt.month == 7)]
August_orders = pre_sale_data[(pre_sale_data['pre_order_date'].dt.year == 2024) & (pre_sale_data['pre_order_date'].dt.month == 8)]


# 1. Calculate total pre-sale orders for July and August
July_total_orders = July_orders['pre_order_quantity'].sum()
August_total_orders = August_orders['pre_order_quantity'].sum()
print('Total pre-sale orders in July 2024:', July_total_orders)
print('Total pre-sale orders in August 2024:', August_total_orders)

# 2. Calculate overall month-over-month growth rate
growth_rate = (August_total_orders - July_total_orders) / July_total_orders
print(f'Overall month-over-month growth rate: {growth_rate:.2%}')

# 3. Calculate total orders per region for July and August
July_region = July_orders.groupby('region')['pre_order_quantity'].sum()
August_region = August_orders.groupby('region')['pre_order_quantity'].sum()

# 4. Calculate growth rate per region
growth_rate_region = (August_region - July_region) / July_region.replace(0, np.nan) # Handling any zero values for July to avoid zero division
growth_rate_region = growth_rate_region.fillna(0)


# 5. Predict September 2024 totals per region
September_region_pred = August_region * (1 + growth_rate_region)
# Convert Series to DataFrame for easier readability
September_region_pred = September_region_pred.reset_index(name='predicted_total_orders')
# Display predicted totals per region
September_region_pred


Total pre-sale orders in July 2024: 69
Total pre-sale orders in August 2024: 66
Overall month-over-month growth rate: -4.35%


Unnamed: 0,region,predicted_total_orders
0,Asia,14.0
1,Europe,5.882353
2,Latin America,7.692308
3,North America,30.083333
4,Oceania,14.083333
5,Unknown,


In [50]:

# ---------------------------------------------------
July_orders = pre_sale_data[
    (pre_sale_data['pre_order_date'].dt.year == 2024) &
    (pre_sale_data['pre_order_date'].dt.month == 7)
]

August_orders = pre_sale_data[
    (pre_sale_data['pre_order_date'].dt.year == 2024) &
    (pre_sale_data['pre_order_date'].dt.month == 8)
]

# ---------------------------------------------------
# 1. Calculate total pre-sale orders for July and August
# ---------------------------------------------------
July_total_orders = July_orders['pre_order_quantity'].sum()
August_total_orders = August_orders['pre_order_quantity'].sum()

print('Total pre-sale orders in July 2024:', July_total_orders)
print('Total pre-sale orders in August 2024:', August_total_orders)

# ---------------------------------------------------
# 2. Calculate overall month-over-month growth rate
# ---------------------------------------------------
# Avoid division by zero if July total is 0
if July_total_orders == 0:
    overall_growth_rate = 0
else:
    overall_growth_rate = (August_total_orders - July_total_orders) / July_total_orders

print(f'Overall month-over-month growth rate: {overall_growth_rate:.2%}')

# ---------------------------------------------------
# 3. Calculate total orders per region for July and August
# ---------------------------------------------------
July_region = July_orders.groupby('region')['pre_order_quantity'].sum()
August_region = August_orders.groupby('region')['pre_order_quantity'].sum()

# ---------------------------------------------------
# 4. Calculate growth rate per region safely
# ---------------------------------------------------
# Replace zero orders in July with NaN to avoid division by zero
growth_rate_region = (August_region - July_region) / July_region.replace(0, np.nan)
# For regions where July orders were zero, assume growth = 0 (prediction = August orders)
growth_rate_region = growth_rate_region.fillna(0)

# ---------------------------------------------------
# 5. Predict September 2024 totals per region
# ---------------------------------------------------
September_region_pred = August_region * (1 + growth_rate_region)

# Convert Series to DataFrame for easier readability
September_region_pred = September_region_pred.reset_index(name='predicted_total_orders')

# Display predicted totals per region
September_region_pred


Total pre-sale orders in July 2024: 69
Total pre-sale orders in August 2024: 66
Overall month-over-month growth rate: -4.35%


NameError: name 'np' is not defined