In [None]:
import pandas as pd
import numpy as np
import matplotlib as pyplot
import seaborn as sns
%matplotlib inline

In [None]:
# BlackFriday dataset is used

### Problem Statement
A retail company, "ABC Private Limited", wants to understand customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared a purchase summary of various customers for selected high-volume products from last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category), and Total_purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of customers against various products, which will help them create personalised offers for customers against different products.

In [None]:
# Load the training split; the relative path means the file must sit in the notebook's working directory.
dft=pd.read_csv("train.csv")

In [None]:
# (rows, columns) of the training frame
dft.shape

(550068, 12)

In [None]:
# Preview the first few training rows
dft.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [None]:
# When separate train and test files are given, combine them first so that the same preprocessing is applied to both

In [None]:
# Load the test split and show its (rows, columns).
# It has one column fewer than train: the Purchase target is absent.
dfe = pd.read_csv('test.csv')
dfe.shape

(233599, 11)

In [None]:
# Combine train and test into a single frame
# Which should we use: merge, concat, or append?
# DataFrame.append is out: it was deprecated and then removed in pandas 2.0
# merge joins on a key or id column, while concat stacks one frame on top of the other

In [None]:
# Stack the train rows on top of the test rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both halves keep their own 0-based labels, so every test-row label duplicates
# a train-row label and label-based operations (.loc, joins, alignment) become
# ambiguous.
# The test split has no Purchase column, so Purchase is NaN for its rows.
df = pd.concat([dft, dfe], ignore_index=True)

In [None]:
# Preview the combined frame
df.head()


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969.0


In [None]:
## Basic overview: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 783667 entries, 0 to 233598
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   City_Category               783667 non-null  object 
 6   Stay_In_Current_City_Years  783667 non-null  object 
 7   Marital_Status              783667 non-null  int64  
 8   Product_Category_1          783667 non-null  int64  
 9   Product_Category_2          537685 non-null  float64
 10  Product_Category_3          237858 non-null  float64
 11  Purchase                    550068 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 77.7+ MB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()


Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,783667.0,783667.0,783667.0,783667.0,537685.0,237858.0,550068.0
mean,1003029.0,8.0793,0.409777,5.366196,9.844506,12.668605,9263.968713
std,1727.267,6.522206,0.491793,3.87816,5.089093,4.12551,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001519.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003075.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [None]:
# User_ID is not needed for this particular problem statement, so remove it.
# columns=[...] is the explicit spelling of axis=1
# (axis=0 addresses row labels, axis=1 addresses column labels).
df.drop(columns=['User_ID'], inplace=True)

In [None]:
# Gender is a categorical feature (as is Age, handled further below), so we'll convert it from categorical to numerical

In [None]:
# get_dummies demo (the result is only displayed here, not stored):
#  - takes a categorical column (or columns) with discrete values (like "red", "blue", "green")
#  - creates a new indicator column for each unique category
#  - flags each row in the column that matches its category (shown as True/False in the output)
# drop_first=True drops the first category's column, leaving a single "M" column for Gender.
pd.get_dummies(df['Gender'], drop_first=True)

Unnamed: 0,M
0,False
1,False
2,False
3,False
4,True
...,...
233594,False
233595,False
233596,False
233597,False


In [None]:
# get_dummies can be used, but the process is a little lengthier,
# so we'll use an alternative (.map); either approach works here

In [None]:
# .map() replaces every category with the value assigned to it in the dict,
# so the output is a single column of transformed values (F -> 0, M -> 1).
df['Gender'] = df['Gender'].map({
    'F': 0,
    'M': 1,
})

| Scenario                         | Use `.map()`             | Use `pd.get_dummies()`            |
| -------------------------------- | ------------------------ | --------------------------------- |
| Simple mapping to numeric labels | ✔️                       | ❌                                 |
| One-hot encoding for ML models   | ❌                        | ✔️                                |
| Handling many categories easily  | ❌ (need manual mapping)  | ✔️ (automatic)                    |
| Implies order or not?            | Yes (if numbers ordered) | No (all binary, no order implied) |


In [None]:
# Check the result: Gender should now hold 0/1 instead of F/M
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,0-17,10,A,2,0,3,,,8370.0
1,P00248942,0,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,P00087842,0,0-17,10,A,2,0,12,,,1422.0
3,P00085442,0,0-17,10,A,2,0,12,14.0,,1057.0
4,P00285442,1,55+,16,C,4+,0,8,,,7969.0


In [None]:
# For this case either get_dummies() or .map({}) can be used
# df['Gender']=pd.get_dummies(df['Gender'], drop_first=1) or
# df['Gender']=df['Gender'].map({'F':0,'M':1})

In [None]:
# So far we have handled the categorical feature Gender

In [None]:
# Handle the categorical feature Age: start by listing its distinct values
df['Age'].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [None]:
# pd.get_dummies(df['Age'], drop_first=True)
# Target Guiding or Tagret ordinal encoding
df['Age'] = df['Age'].map({
    '0-17': 1, '18-25': 2, '26-35': 3,
    '36-45': 4, '46-50': 5, '51-55': 6, '55+': 7
})
# pd.get_dummies wasn't making sense since, these are range values and dummies provided one-head encoding

In [None]:
df['Age'].head()
df['Age'].unique()

array([1, 7, 3, 5, 6, 4, 2])

In [None]:
# Label Encoding can also be done, How?

In [None]:
# from sklearn import preprocessing

# label_encoder = preprocessing.LabelEncoder()
# df['Age']=label_encoder.fit_transform(df['Age'])

# df['Age'].unique()


In [None]:
# Check the result: Age should now hold the ordinal codes instead of range strings
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,P00069042,0,1,10,A,2,0,3,,,8370.0
1,P00248942,0,1,10,A,2,0,1,6.0,14.0,15200.0
2,P00087842,0,1,10,A,2,0,12,,,1422.0
3,P00085442,0,1,10,A,2,0,12,14.0,,1057.0
4,P00285442,1,7,16,C,4+,0,8,,,7969.0


In [None]:
# Fix the categorical City_Category feature with one-hot encoding.
# drop_first=True leaves out the first category's column. Benefits:
#  - Prevents multicollinearity: dropping one dummy variable removes the perfect
#    linear dependency between the dummies (the "dummy variable trap").
#  - Reduces redundancy: all the information is kept with fewer columns; if all
#    k-1 dummy variables are 0/False, the row belongs to the dropped category.
#  - Improves model stability and interpretability (for certain models): for
#    models sensitive to multicollinearity (like linear models) the coefficient
#    estimates are more stable and a variable's impact is easier to interpret.
df_city = pd.get_dummies(data=df["City_Category"], drop_first=True)

In [None]:
# Inspect the dummy columns B and C (a row with both False is city category A)
df_city

Unnamed: 0,B,C
0,False,False
1,False,False
2,False,False
3,False,False
4,False,True
...,...,...
233594,True,False
233595,True,False
233596,True,False
233597,False,True


In [None]:
# Attach the city dummy columns to the main frame side by side, then preview.
df = pd.concat([df, df_city], axis="columns")
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,B,C
0,P00069042,0,1,10,A,2,0,3,,,8370.0,False,False
1,P00248942,0,1,10,A,2,0,1,6.0,14.0,15200.0,False,False
2,P00087842,0,1,10,A,2,0,12,,,1422.0,False,False
3,P00085442,0,1,10,A,2,0,12,14.0,,1057.0,False,False
4,P00285442,1,7,16,C,4+,0,8,,,7969.0,False,True


In [None]:
# City_Category is now represented by the one-hot columns B and C,
# so the original column is redundant and can be removed.
df.drop(columns="City_Category", inplace=True)

In [None]:
# Count the missing values in every column
df.isna().sum()

Unnamed: 0,0
Product_ID,0
Gender,0
Age,0
Occupation,0
Stay_In_Current_City_Years,0
Marital_Status,0
Product_Category_1,0
Product_Category_2,245982
Product_Category_3,545809
Purchase,233599


In [None]:
## Focus on replacing missing values
# Some data exploration first: distinct values of Product_Category_2 (nan marks the missing entries)
df['Product_Category_2'].unique()

array([nan,  6., 14.,  2.,  8., 15., 16., 11.,  5.,  3.,  4., 12.,  9.,
       10., 17., 13.,  7., 18.])

In [None]:

# How often each Product_Category_2 value occurs, most frequent first
df['Product_Category_2'].value_counts()

Unnamed: 0_level_0,count
Product_Category_2,Unnamed: 1_level_1
8.0,91317
14.0,78834
2.0,70498
16.0,61687
15.0,54114
5.0,37165
4.0,36705
6.0,23575
11.0,20230
17.0,19104


In [None]:
# Hmm, what should be done to replace the missing values?
# Replace the missing values with the mode (the most frequent value)

In [None]:
# Impute the missing Product_Category_2 values with the most frequent category.
# Series.mode has to be *called*: written without parentheses, fillna receives
# the method object itself rather than the mode value. mode() returns a Series
# (there can be ties), so take its first entry to get a scalar fill value.
pc2_mode = df['Product_Category_2'].mode().iloc[0]
df['Product_Category_2'] = df['Product_Category_2'].fillna(pc2_mode)