In [1]:
## Importing the required Packages
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime, date, time
import joblib
# Show every column when a DataFrame is rendered in the notebook
# (None lifts the column-count limit). Goes through the public
# pd.set_option entry point instead of the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
from pylab import rcParams

#### loading pickled train dataset

In [3]:
# Restore the training frame from its joblib pickle and report its dimensions.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
train_eda_pickle = "df_train_eda.pkl"
df_train_eda = joblib.load(train_eda_pickle)
df_train_eda.shape

(550068, 22)

In [4]:
# List the column labels of the loaded training frame
df_train_eda.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase', 'Product_ID_n',
       'Product_ID_s', 'Product_ID_Rare', 'Product_Category_1_Rare',
       'Product_Category_2_Rare', 'Product_Category_3_Rare', 'Gender_le',
       'ohe_city_A', 'ohe_city_B', 'ohe_city_C'],
      dtype='object')

### Feature Engineering

#### 1. product id overview

In [5]:
# Overall purchase amount; only referenced by the (currently disabled)
# percent-of-sales feature, kept so that feature can be re-enabled.
total_purchase_amt = df_train_eda['Purchase'].sum()

# Purchase statistics per Product_ID_Rare, computed in a single groupby pass:
# sum, mean, median, min, max and (sample) standard deviation.
# NOTE(review): these statistics come from Purchase on the same rows they are
# joined to — confirm this is acceptable if Purchase is the prediction target.
df_product = (
    df_train_eda
    .groupby(['Product_ID_Rare'])['Purchase']
    .agg(
        f_product_tot_sale_amt='sum',
        f_product_id_m='mean',
        f_product_id_median='median',
        f_product_id_min='min',
        f_product_id_max='max',
        f_product_id_std='std',
    )
    .reset_index()
)
print(df_product.shape)

# Left-join the per-product statistics back onto every training row
df_train_eda = df_train_eda.merge(df_product, how='left', on=['Product_ID_Rare'])
df_train_eda.shape

(2006, 7)


(550068, 28)

#### 2. f_gender_unique_users_cnt

In [6]:
# Number of distinct User_ID values observed for each Gender
df_gender_unique_users_cnt = (
    df_train_eda
    .groupby(['Gender'])['User_ID']
    .nunique()
    .reset_index(name='f_gender_unique_users_cnt')
)
print(df_gender_unique_users_cnt.shape)

# Left-join the per-gender count onto every training row
df_train_eda = df_train_eda.merge(df_gender_unique_users_cnt, how='left', on=['Gender'])
print(df_train_eda.shape)

(2, 2)
(550068, 29)


#### 3. user id overview

In [7]:
# Per-user overview: number of rows for the user plus Purchase
# sum, mean, median, min, max and (sample) standard deviation.
df_userid = (
    df_train_eda
    .groupby(['User_ID'])['Purchase']
    .agg(
        f_userid_cnt='size',
        f_userid_tot_purchase_amt='sum',
        f_userid_m='mean',
        f_userid_median='median',
        f_userid_min='min',
        f_userid_max='max',
        f_userid_std='std',
    )
    .reset_index()
)
print(df_userid.shape)

# The join of the per-user statistics into df_train_eda is currently disabled,
# so the frame's shape is unchanged by this cell; df_userid itself is still
# built (and pickled at the end of the notebook).
#df_train_eda = pd.merge(df_train_eda, df_userid, how = 'left', on= ['User_ID'] )
df_train_eda.shape

(5891, 8)


(550068, 29)

#### 4. Age Bin - Min & Max

In [8]:
# Purchase mean, median and (sample) standard deviation per Age bucket
df_age = (
    df_train_eda
    .groupby(['Age'])['Purchase']
    .agg(
        f_age_m='mean',
        f_age_median='median',
        f_age_std='std',
    )
    .reset_index()
)
df_age.shape

(7, 4)

In [9]:
# Numeric (lower, upper) bounds for each known Age bucket label.
age_bounds_by_label = {
    '0-17': (0, 17),
    '18-25': (18, 25),
    '26-35': (26, 35),
    '36-45': (36, 45),
    '46-50': (46, 50),
    '51-55': (51, 55),
}
# Any label not listed above is treated as the catch-all bucket 56-100.
fallback_bounds = (56, 100)

# Walk the Age labels in row order and collect their bounds
age_min = []
age_max = []
for age_label in df_age['Age']:
    lower, upper = age_bounds_by_label.get(age_label, fallback_bounds)
    age_min.append(lower)
    age_max.append(upper)

# Attach the bounds as two new columns of the per-age table
df_age['f_Age_Min'] = age_min
df_age['f_Age_Max'] = age_max
df_age.shape

(7, 6)

In [10]:
# Left-join the per-age statistics and bucket bounds onto every training row
df_train_eda = df_train_eda.merge(df_age, how='left', on=['Age'])
df_train_eda.shape

(550068, 34)

#### 5. f_occupation_m

In [11]:
# Purchase mean and total per Occupation, computed in one groupby pass
df_occupation = (
    df_train_eda
    .groupby(['Occupation'])['Purchase']
    .agg(
        f_occupation_m='mean',
        f_occupation_tot_amt='sum',
    )
    .reset_index()
)
print(df_occupation.shape)

# Left-join the per-occupation statistics onto every training row
df_train_eda = df_train_eda.merge(df_occupation, how='left', on=['Occupation'])
df_train_eda.shape

(21, 3)


(550068, 36)

#### 6. f_current_city_m 

In [12]:
# Purchase mean, median, min, max and (sample) standard deviation
# per Stay_In_Current_City_Years value
df_city_period = (
    df_train_eda
    .groupby(['Stay_In_Current_City_Years'])['Purchase']
    .agg(
        f_current_city_m='mean',
        f_current_city_median='median',
        f_current_city_min='min',
        f_current_city_max='max',
        f_current_city_std='std',
    )
    .reset_index()
)
print(df_city_period.shape)

# Left-join the statistics onto every training row
df_train_eda = df_train_eda.merge(df_city_period, how='left', on=['Stay_In_Current_City_Years'])
df_train_eda.shape

(5, 6)


(550068, 41)

#### 7. f_prod_category_1_m

In [13]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# per Product_Category_1_Rare value
df_prod_category_1 = (
    df_train_eda
    .groupby(['Product_Category_1_Rare'])['Purchase']
    .agg(
        f_prod_category_1_m='mean',
        f_prod_category_1_tot_amt='sum',
        f_prod_category_1_median='median',
        f_prod_category_1_min='min',
        f_prod_category_1_max='max',
        f_prod_category_1_std='std',
    )
    .reset_index()
)
print(df_prod_category_1.shape)

# Left-join the statistics onto every training row
df_train_eda = df_train_eda.merge(df_prod_category_1, how='left', on=['Product_Category_1_Rare'])
df_train_eda.shape

(12, 7)


(550068, 47)

#### 8. f_prod_category_2_m

In [14]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# per Product_Category_2_Rare value
df_prod_category_2 = (
    df_train_eda
    .groupby(['Product_Category_2_Rare'])['Purchase']
    .agg(
        f_prod_category_2_m='mean',
        f_prod_category_2_tot_amt='sum',
        f_prod_category_2_median='median',
        f_prod_category_2_min='min',
        f_prod_category_2_max='max',
        f_prod_category_2_std='std',
    )
    .reset_index()
)
print(df_prod_category_2.shape)

# Left-join the statistics onto every training row
df_train_eda = df_train_eda.merge(df_prod_category_2, how='left', on=['Product_Category_2_Rare'])
df_train_eda.shape

(14, 7)


(550068, 53)

#### 9. f_prod_category_3_m

In [15]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# per Product_Category_3_Rare value
df_prod_category_3 = (
    df_train_eda
    .groupby(['Product_Category_3_Rare'])['Purchase']
    .agg(
        f_prod_category_3_m='mean',
        f_prod_category_3_tot_amt='sum',
        f_prod_category_3_median='median',
        f_prod_category_3_min='min',
        f_prod_category_3_max='max',
        f_prod_category_3_std='std',
    )
    .reset_index()
)
print(df_prod_category_3.shape)

# Left-join the statistics onto every training row
df_train_eda = df_train_eda.merge(df_prod_category_3, how='left', on=['Product_Category_3_Rare'])
df_train_eda.shape

(9, 7)


(550068, 59)

#### 10. f_age_occupation_m

In [16]:
# Mean Purchase for every (Age, Occupation) combination
df_age_occupation_m = (
    df_train_eda
    .groupby(['Age', 'Occupation'])['Purchase']
    .mean()
    .reset_index(name='f_age_occupation_m')
)
print(df_age_occupation_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_age_occupation_m, how='left', on=['Age', 'Occupation'])
df_train_eda.shape

(134, 3)


(550068, 60)

#### 11. f_age_city_m

In [17]:
# Mean Purchase for every (Age, City_Category) combination
df_age_city_m = (
    df_train_eda
    .groupby(['Age', 'City_Category'])['Purchase']
    .mean()
    .reset_index(name='f_age_city_m')
)
print(df_age_city_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_age_city_m, how='left', on=['Age', 'City_Category'])
df_train_eda.shape

(21, 3)


(550068, 61)

#### 12. f_age_current_city_m

In [18]:
# Mean Purchase for every (Age, Stay_In_Current_City_Years) combination
df_age_current_city_m = (
    df_train_eda
    .groupby(['Age', 'Stay_In_Current_City_Years'])['Purchase']
    .mean()
    .reset_index(name='f_age_current_city_m')
)
print(df_age_current_city_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_age_current_city_m, how='left', on=['Age', 'Stay_In_Current_City_Years'])
df_train_eda.shape

(35, 3)


(550068, 62)

#### 13. f_age_prod_category_1_m

In [19]:
# Mean Purchase for every (Age, Product_Category_1_Rare) combination
df_age_prod_category_1_m = (
    df_train_eda
    .groupby(['Age', 'Product_Category_1_Rare'])['Purchase']
    .mean()
    .reset_index(name='f_age_prod_category_1_m')
)
print(df_age_prod_category_1_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_age_prod_category_1_m, how='left', on=['Age', 'Product_Category_1_Rare'])
df_train_eda.shape

(84, 3)


(550068, 63)

#### 14. f_age_prod_category_2_m

In [20]:
# Mean Purchase for every (Age, Product_Category_2_Rare) combination
df_age_prod_category_2_m = (
    df_train_eda
    .groupby(['Age', 'Product_Category_2_Rare'])['Purchase']
    .mean()
    .reset_index(name='f_age_prod_category_2_m')
)
print(df_age_prod_category_2_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_age_prod_category_2_m, how='left', on=['Age', 'Product_Category_2_Rare'])
df_train_eda.shape

(98, 3)


(550068, 64)

#### 15. f_age_prod_category_3_m

In [21]:
# Mean Purchase for every (Age, Product_Category_3_Rare) combination
df_age_prod_category_3_m = (
    df_train_eda
    .groupby(['Age', 'Product_Category_3_Rare'])['Purchase']
    .mean()
    .reset_index(name='f_age_prod_category_3_m')
)
print(df_age_prod_category_3_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_age_prod_category_3_m, how='left', on=['Age', 'Product_Category_3_Rare'])
df_train_eda.shape

(63, 3)


(550068, 65)

#### 16. f_city_current_stay_m

In [22]:
# Mean Purchase for every (City_Category, Stay_In_Current_City_Years) combination
df_city_stay_m = (
    df_train_eda
    .groupby(['City_Category', 'Stay_In_Current_City_Years'])['Purchase']
    .mean()
    .reset_index(name='f_city_current_stay_m')
)
print(df_city_stay_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_city_stay_m, how='left', on=['City_Category', 'Stay_In_Current_City_Years'])
df_train_eda.shape

(15, 3)


(550068, 66)

#### 17. f_occupation_city_m

In [23]:
# Mean Purchase for every (Occupation, City_Category) combination
df_occupation_city_m = (
    df_train_eda
    .groupby(['Occupation', 'City_Category'])['Purchase']
    .mean()
    .reset_index(name='f_occupation_city_m')
)
print(df_occupation_city_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_occupation_city_m, how='left', on=['Occupation', 'City_Category'])
df_train_eda.shape

(63, 3)


(550068, 67)

#### 18. f_occupation_current_city_m

In [24]:
# Mean Purchase for every (Occupation, Stay_In_Current_City_Years) combination
df_occupation_current_city_m = (
    df_train_eda
    .groupby(['Occupation', 'Stay_In_Current_City_Years'])['Purchase']
    .mean()
    .reset_index(name='f_occupation_current_city_m')
)
print(df_occupation_current_city_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_occupation_current_city_m, how='left', on=['Occupation', 'Stay_In_Current_City_Years'])
df_train_eda.shape

(104, 3)


(550068, 68)

#### 19. f_product_category_1_2_m

In [25]:
# Mean Purchase for every (Product_Category_1_Rare, Product_Category_2_Rare) combination
df_product_category_1_2_m = (
    df_train_eda
    .groupby(['Product_Category_1_Rare', 'Product_Category_2_Rare'])['Purchase']
    .mean()
    .reset_index(name='f_product_category_1_2_m')
)
print(df_product_category_1_2_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_product_category_1_2_m, how='left', on=['Product_Category_1_Rare', 'Product_Category_2_Rare'])
df_train_eda.shape

(83, 3)


(550068, 69)

#### 20. f_product_category_1_3_m

In [26]:
# Mean Purchase for every (Product_Category_1_Rare, Product_Category_3_Rare) combination
df_product_category_1_3_m = (
    df_train_eda
    .groupby(['Product_Category_1_Rare', 'Product_Category_3_Rare'])['Purchase']
    .mean()
    .reset_index(name='f_product_category_1_3_m')
)
print(df_product_category_1_3_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_product_category_1_3_m, how='left', on=['Product_Category_1_Rare', 'Product_Category_3_Rare'])
df_train_eda.shape

(55, 3)


(550068, 70)

#### 21. f_product_category_2_3_m

In [27]:
# Mean Purchase for every (Product_Category_2_Rare, Product_Category_3_Rare) combination
df_product_category_2_3_m = (
    df_train_eda
    .groupby(['Product_Category_2_Rare', 'Product_Category_3_Rare'])['Purchase']
    .mean()
    .reset_index(name='f_product_category_2_3_m')
)
print(df_product_category_2_3_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_product_category_2_3_m, how='left', on=['Product_Category_2_Rare', 'Product_Category_3_Rare'])
df_train_eda.shape

(64, 3)


(550068, 71)

#### 22. f_occupation_gender_m

In [28]:
# Mean Purchase for every (Occupation, Gender) combination
df_occ_gender_m = (
    df_train_eda
    .groupby(['Occupation', 'Gender'])['Purchase']
    .mean()
    .reset_index(name='f_occupation_gender_m')
)
print(df_occ_gender_m.shape)

# Left-join the pairwise mean onto every training row
df_train_eda = df_train_eda.merge(df_occ_gender_m, how='left', on=['Occupation', 'Gender'])
df_train_eda.shape

(42, 3)


(550068, 72)

#### 23. City_Category Overview

In [29]:
# Purchase mean, median, min, max and (sample) standard deviation
# per City_Category, computed in one groupby pass
df_city_category = (
    df_train_eda
    .groupby(['City_Category'])['Purchase']
    .agg(
        f_city_m='mean',
        f_city_median='median',
        f_city_min='min',
        f_city_max='max',
        f_city_std='std',
    )
    .reset_index()
)
print(df_city_category.shape)

# Left-join the per-city statistics onto every training row
df_train_eda = df_train_eda.merge(df_city_category, how='left', on=['City_Category'])
df_train_eda.shape

(3, 6)


(550068, 77)

#### 24. City_Category_Product_Category1

In [30]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# for every (City_Category, Product_Category_1_Rare) combination
df_city_product_category_1 = (
    df_train_eda
    .groupby(['City_Category', 'Product_Category_1_Rare'])['Purchase']
    .agg(
        f_city_product_category_1_m='mean',
        f_city_product_category_1_tot_amt='sum',
        f_city_product_category_1_median='median',
        f_city_product_category_1_min='min',
        f_city_product_category_1_max='max',
        f_city_product_category_1_std='std',
    )
    .reset_index()
)
print(df_city_product_category_1.shape)

# Left-join the pairwise statistics onto every training row
df_train_eda = df_train_eda.merge(df_city_product_category_1, how='left', on=['City_Category', 'Product_Category_1_Rare'])
df_train_eda.shape


(36, 8)


(550068, 83)

#### 25. City_Category_Product_Category2

In [31]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# for every (City_Category, Product_Category_2_Rare) combination
df_city_product_category_2 = (
    df_train_eda
    .groupby(['City_Category', 'Product_Category_2_Rare'])['Purchase']
    .agg(
        f_city_product_category_2_m='mean',
        f_city_product_category_2_tot_amt='sum',
        f_city_product_category_2_median='median',
        f_city_product_category_2_min='min',
        f_city_product_category_2_max='max',
        f_city_product_category_2_std='std',
    )
    .reset_index()
)
print(df_city_product_category_2.shape)

# Left-join the pairwise statistics onto every training row
df_train_eda = df_train_eda.merge(df_city_product_category_2, how='left', on=['City_Category', 'Product_Category_2_Rare'])
df_train_eda.shape

(42, 8)


(550068, 89)

#### 26. City_Category_Product_Category3

In [32]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# for every (City_Category, Product_Category_3_Rare) combination
df_city_product_category_3 = (
    df_train_eda
    .groupby(['City_Category', 'Product_Category_3_Rare'])['Purchase']
    .agg(
        f_city_product_category_3_m='mean',
        f_city_product_category_3_tot_amt='sum',
        f_city_product_category_3_median='median',
        f_city_product_category_3_min='min',
        f_city_product_category_3_max='max',
        f_city_product_category_3_std='std',
    )
    .reset_index()
)
print(df_city_product_category_3.shape)

# Left-join the pairwise statistics onto every training row
df_train_eda = df_train_eda.merge(df_city_product_category_3, how='left', on=['City_Category', 'Product_Category_3_Rare'])
df_train_eda.shape

(27, 8)


(550068, 95)

#### 27. Product_Id_age_category

In [33]:
# Purchase mean, total, median, min and max for every
# (Product_ID_Rare, Age) combination, computed in one groupby pass.
# The standard deviation is not computed for this pair.
# NOTE(review): these statistics come from Purchase on the same rows they are
# joined to — confirm this is acceptable if Purchase is the prediction target.
df_productid_age = (
    df_train_eda
    .groupby(['Product_ID_Rare', 'Age'])['Purchase']
    .agg(
        f_productid_age_m='mean',
        f_productid_age_tot_amt='sum',
        f_productid_age_median='median',
        f_productid_age_min='min',
        f_productid_age_max='max',
    )
    .reset_index()
)
print(df_productid_age.shape)

# Left-join the pairwise statistics onto every training row
df_train_eda = pd.merge(df_train_eda, df_productid_age, how='left', on=['Product_ID_Rare', 'Age'])
# Printed explicitly: a bare `.shape` expression is only displayed when it is
# the last statement of the cell, which it is not here.
print(df_train_eda.shape)

# Smallest per-group minimum, kept to handle NA values of f_productid_age_min
v_productid_age_min_min = df_productid_age['f_productid_age_min'].min()


(13657, 7)


#### 28. Product_Id_Occupation

In [34]:
# Purchase mean, total, median, min and max for every
# (Product_ID_Rare, Occupation) combination, computed in one groupby pass.
# The standard deviation is not computed for this pair.
# NOTE(review): these statistics come from Purchase on the same rows they are
# joined to — confirm this is acceptable if Purchase is the prediction target.
df_productid_occupation = (
    df_train_eda
    .groupby(['Product_ID_Rare', 'Occupation'])['Purchase']
    .agg(
        f_productid_occupation_m='mean',
        f_productid_occupation_tot_amt='sum',
        f_productid_occupation_median='median',
        f_productid_occupation_min='min',
        f_productid_occupation_max='max',
    )
    .reset_index()
)
print(df_productid_occupation.shape)

# Left-join the pairwise statistics onto every training row
df_train_eda = pd.merge(df_train_eda, df_productid_occupation, how='left', on=['Product_ID_Rare', 'Occupation'])
# Printed explicitly: a bare `.shape` expression is only displayed when it is
# the last statement of the cell, which it is not here.
print(df_train_eda.shape)

# Smallest per-group minimum, kept to handle NA values of f_productid_occupation_min
v_productid_occupation_min_min = df_productid_occupation['f_productid_occupation_min'].min()


(38665, 7)


#### 29. Product_Id_City_Category

In [35]:
# Purchase mean, total, median, min, max and (sample) standard deviation
# for every (Product_ID_Rare, City_Category) combination
df_productid_city_category = (
    df_train_eda
    .groupby(['Product_ID_Rare', 'City_Category'])['Purchase']
    .agg(
        f_productid_city_cat_m='mean',
        f_productid_city_cat_tot_amt='sum',
        f_productid_city_cat_median='median',
        f_productid_city_cat_min='min',
        f_productid_city_cat_max='max',
        f_productid_city_cat_std='std',
    )
    .reset_index()
)
print(df_productid_city_category.shape)

# Left-join the pairwise statistics onto every training row
df_train_eda = df_train_eda.merge(df_productid_city_category, how='left', on=['Product_ID_Rare', 'City_Category'])
df_train_eda.shape

(6018, 8)


(550068, 111)

In [39]:
# Show the column index of df_train_eda after the feature-engineering merges above
print(df_train_eda.columns)

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2',
       ...
       'f_productid_occupation_tot_amt', 'f_productid_occupation_median',
       'f_productid_occupation_min', 'f_productid_occupation_max',
       'f_productid_city_cat_m', 'f_productid_city_cat_tot_amt',
       'f_productid_city_cat_median', 'f_productid_city_cat_min',
       'f_productid_city_cat_max', 'f_productid_city_cat_std'],
      dtype='object', length=111)


#### Pickling Dataset

In [37]:
# Persist the feature-engineered training frame for further processing.
# The shape is printed first, then the list of files joblib.dump reports writing.
print(df_train_eda.shape)
train_fe_files = joblib.dump(df_train_eda, "df_train_fe.pkl")
print(train_fe_files)

(550068, 111)
['df_train_fe.pkl']


In [38]:
# Persist the lookup tables built above as four joblib pickles. Each pickle
# holds one tuple; the order of the elements inside a tuple is preserved, so
# a reader must unpack them in the same order.

# Part 1: single-key tables and the first Age-pair tables
print(joblib.dump(
    (
        df_product,
        df_gender_unique_users_cnt,
        df_userid,
        df_age,
        df_occupation,
        df_city_period,
        df_prod_category_1,
        df_prod_category_2,
        df_prod_category_3,
        df_age_occupation_m,
        df_age_city_m,
        df_age_current_city_m,
        df_age_prod_category_1_m,
    ),
    "feature_engineering1.pkl",
))

# Part 2: remaining pairwise mean tables and the per-city table
print(joblib.dump(
    (
        df_age_prod_category_2_m,
        df_age_prod_category_3_m,
        df_city_stay_m,
        df_occupation_city_m,
        df_occupation_current_city_m,
        df_product_category_1_2_m,
        df_product_category_1_3_m,
        df_product_category_2_3_m,
        df_occ_gender_m,
        df_city_category,
    ),
    "feature_engineering2.pkl",
))

# Part 3: city x product-category tables and the product-id pair tables
print(joblib.dump(
    (
        df_city_product_category_1,
        df_city_product_category_2,
        df_city_product_category_3,
        df_productid_age,
        df_productid_occupation,
        df_productid_city_category,
    ),
    "feature_engineering3.pkl",
))

# Part 4: scalar minima kept for NA handling
print(joblib.dump(
    (v_productid_age_min_min, v_productid_occupation_min_min),
    "feature_engineering4.pkl",
))

['feature_engineering1.pkl']
['feature_engineering2.pkl']
['feature_engineering3.pkl']
['feature_engineering4.pkl']


### Comments are appreciated to improve model performance and learning !!!