In [None]:
'''
Project Overview[Switch 2 Pre-sales Demand Forecasting]:
This project seeks to understand how different demographics influence pre-sale volumes across regions. 

Goal:To analyze regional pre-order patterns and customer segmentation

Leveraged historical pre-sale transaction data to extract meaningful insights that can guide marketing strategies.

'''

'\nProject Overview[User Interaction Insights]:\nThis project supports Google by analyzing user engagement with search result pages\n\nGoal: To understand how different numbers of search results impact user interaction time\n\nThis will help optimize the current search results presentation strategy.\n\n'

In [2]:
# Importing necessary libraries for analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Getting the Dataset
pre_sale_data = pd.read_csv(r'D:\Data Journey\Python-Summer-Party\DataSets\pre_sale_data.csv')

# Display the first few rows to understand the data
print(pre_sale_data.head(10))  # Shows the first 5 rows with columns
print(list(pre_sale_data.columns))
print('Number of rows and columns is:', pre_sale_data.shape)

          region customer_id pre_order_date demographic_group  \
0  North America        C001       7/2/2024             Gamer   
1         Europe        C002       7/3/2024            Casual   
2           Asia        C003       7/4/2024   Tech Enthusiast   
3  Latin America        C004       7/5/2024            Family   
4        Oceania        C005       7/6/2024           Student   
5  North America        C006       7/7/2024             Gamer   
6         Europe        C007       7/8/2024               NaN   
7            NaN        C008       7/9/2024            Casual   
8           Asia        C009      7/10/2024            Family   
9  North America        C010      7/11/2024             Gamer   

   pre_order_quantity  
0                   1  
1                   2  
2                   1  
3                   3  
4                   2  
5                   5  
6                   2  
7                   1  
8                   4  
9                   1  
['region', 'customer

In [4]:
# Exploratory Data Analysis
# Missing column data
pre_sale_data.isna().sum()

region                1
customer_id           0
pre_order_date        0
demographic_group     3
pre_order_quantity    0
dtype: int64

In [14]:
# Question One:
# What percentage of records have missing values in at least one column?

# 0. Always drop duplicates first
duplicates = pre_sale_data.duplicated().sum()
print(duplicates)
pre_sale_data = pre_sale_data.drop_duplicates()

# 1. Boolean Series: True if a row has any missing value
rows_with_missing = pre_sale_data.isnull().any(axis=1)

# 2. Count how many rows have missing values
num_missing_rows = rows_with_missing.sum()

# 3. Total number of rows
total_rows = pre_sale_data.shape[0]

# 4. Calculate percentage
percent_missing_rows = (num_missing_rows / total_rows) * 100

print(f"Rows with missing values: {num_missing_rows}")
print(f"Total rows: {total_rows}")
print(f"Percentage of rows with missing values: {percent_missing_rows:.2f}%")

# Handling of missing data
pre_sale_data['region'] = pre_sale_data['region'].fillna('Unknown')
pre_sale_data['demographic_group'] = pre_sale_data['demographic_group'].fillna('Unknown')
pre_sale_data


0
Rows with missing values: 0
Total rows: 55
Percentage of rows with missing values: 0.00%


Unnamed: 0,region,customer_id,pre_order_date,demographic_group,pre_order_quantity
0,North America,C001,7/2/2024,Gamer,1
1,Europe,C002,7/3/2024,Casual,2
2,Asia,C003,7/4/2024,Tech Enthusiast,1
3,Latin America,C004,7/5/2024,Family,3
4,Oceania,C005,7/6/2024,Student,2
5,North America,C006,7/7/2024,Gamer,5
6,Europe,C007,7/8/2024,Unknown,2
7,Unknown,C008,7/9/2024,Casual,1
8,Asia,C009,7/10/2024,Family,4
9,North America,C010,7/11/2024,Gamer,1


In [29]:
# Question 2:
# Calculating the total pre-sale orders per month for each region and demographic group.

# 0. Calculating total pre-sale orders
total_orders = pre_sale_data['pre_order_quantity'].sum()
print(f'Total pre-sale orders are: {total_orders}')

# 1. Extracting month
pre_sale_data['month'] = pre_sale_data['pre_order_date'].dt.to_period('M')    #dt.month to extract month number  # dt.strftime('%B') to extract month name

# 2. Grouping by month, region, demographic group
total_orders_per_group = (
    pre_sale_data.groupby(['month', 'region', 'demographic_group'])['pre_order_quantity']
    .sum()
    .reset_index(name='total_orders') # Resetting series to a dataframe with columns for readability.
    .sort_values(by=['month','region','demographic_group'])
)

total_orders_per_group

Total pre-sale orders are: 135


Unnamed: 0,month,region,demographic_group,total_orders
0,2024-07,Asia,Casual,4
1,2024-07,Asia,Family,4
2,2024-07,Asia,Gamer,2
3,2024-07,Asia,Student,3
4,2024-07,Asia,Tech Enthusiast,1
5,2024-07,Europe,Casual,2
6,2024-07,Europe,Family,4
7,2024-07,Europe,Gamer,2
8,2024-07,Europe,Student,7
9,2024-07,Europe,Unknown,2


In [27]:

# Convert date to datetime
pre_sale_data['pre_order_date'] = pd.to_datetime(pre_sale_data['pre_order_date'])

# Extract month
pre_sale_data['month'] = pre_sale_data['pre_order_date'].dt.month # dt.strftime('%B') to extract month name

# Group by month, region, demographic group
total_orders_per_group = (
    pre_sale_data.groupby(['month', 'region', 'demographic_group'])['pre_order_quantity']
    .sum()
    .reset_index(name='total_orders')
    .sort_values(by=['month','region','demographic_group'])
)

total_orders_per_group


Unnamed: 0,month,region,demographic_group,total_orders
0,7,Asia,Casual,4
1,7,Asia,Family,4
2,7,Asia,Gamer,2
3,7,Asia,Student,3
4,7,Asia,Tech Enthusiast,1
5,7,Europe,Casual,2
6,7,Europe,Family,4
7,7,Europe,Gamer,2
8,7,Europe,Student,7
9,7,Europe,Unknown,2
