# Black Friday - Prediction

### 1.0 Importing required libraries and Data Exploration

In [3]:
# Basic packages
import os
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot settings
plt.style.use('ggplot') # URL for style sheets (https://matplotlib.org/gallery/style_sheets/style_sheets_reference.html)
plt.rcParams['axes.titlesize'] = 16
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'k'
%matplotlib inline

# warning settings
import warnings
warnings.filterwarnings("ignore")

NameError: name 'matplotlib' is not defined

In [None]:
# Importing data into python from the given csv file
train = pd.read_csv('../data/BlackFriday/train.csv')
test = pd.read_csv('../data/BlackFriday/test.csv')

In [None]:
# Including new column named 'source' and giving the value as 'train' / 'test' to identify from where exactly the data was taken
train['source']='train'
test['source']='test'

In [None]:
# Combining the 2 dataframes (train and Test) into a single dataframe 'dataset'
dataset = pd.concat([train, test],ignore_index=True, sort=False)
print (train.shape, test.shape, dataset.shape)

In [None]:
dataset.head()

In [None]:
# Checking for null values in the columns
dataset.isnull().sum()

__Observation__:
There are significant missing values in 3 columns (Product_Category_2, Product_Category_3 and Purchase)

In [None]:
dataset.dtypes

In [None]:
# Checking details from the columns with 'numeric' datatype
dataset.describe()

We can see that most of the columns are with categorical data, either expressed in numericals or strings

In [None]:
# Obtaining number of unique values in each column
dataset.apply(lambda x: len(x.unique()))

In [None]:
#Filter categorical variables
categorical_columns = [x for x in dataset.dtypes.index if dataset.dtypes[x]=='object']
# Adding other categorical variables indicated as numerics
categorical_columns.extend(['Occupation'])
categorical_columns

In [None]:
# Droping Identifier values from the list
categorical_columns = [x for x in categorical_columns if x not in ['Product_ID','source']]
categorical_columns

In [None]:
# Print frequency of categories
for col in categorical_columns:
    print ('Frequency (Count) of Categories for varible : ', col)
    print (dataset[col].value_counts())
    print('---------------------------------------------------')

### 1.1 Hypothesis

On observations of the column headers and the values the following hypothesis can be carried out

__User_ID__ : <br>
Identification values which links to the demographics of the person like (Gender, Age, Marital Status, Occupation, City Category and years in city) 
<br>

__Product_ID__ : <br>
Identification values which links other sub categories Product_Category_1, Product_Category_2 and Product_Category_3
<br>

__Gender__ : (Male / Female) <br>
Generally females shop more than male. However the User_ID maybe linked with the male who would pay for the purchase
<br>

__Age__ : (Range between 0 to 55+) <br>
Teenagers and middle aged group people tend to make use of the BlackFriday sale
<br>

__Marital Status__ : (Married / Un-married) <br>
Un-married people spend more during the sale season compared to those married
<br>

__Occupation__ : (Range between 0 to 20) <br>
People in Managerial and above position Purchase more compared to mid-level and associate positions
<br>

__City Category__ : (Code A, B, C) <br>
People living in metros spend more during sale compared to less populated cities

__Stay in City__ : (Range between 0 to 4+) <br>
As people stay more in a particular city and based on experience on various aspects in the mart, people who fancy few of their favorite marts may visit them during the sale season.

__Product Category__ : (1, 2, & 3) <br>
Links to Sub categories of Product_ID


### 2. EDA
> Imputing missing values and Treating outliers

In [None]:
# Checking whether Product_ID is related to other Product Category columns
dataset.groupby(['Product_ID'])['Product_Category_1','Product_Category_2','Product_Category_3'].mean().head(20)

__Observation__: <br>
From the above, we can make out that Product_Category2 and Product_Category3 are __not related__ to the Product_ID

In [None]:
# Obtaining list of booleans specifying Product_Category_2 and Product_Category_3 missing  values
miss_Prod2_rows = dataset['Product_Category_2'].isnull()  
miss_Prod3_rows = dataset['Product_Category_3'].isnull()

In [None]:
# Impute 0s in place of NaN in columns Product_Category_2 and Product_Category_3
dataset.loc[miss_Prod2_rows,'Product_Category_2'] = 0
dataset.loc[miss_Prod3_rows,'Product_Category_3'] = 0

In [None]:
dataset[['Product_ID','Product_Category_1','Product_Category_2','Product_Category_3']].head(10)

In [None]:
# Combining 3 Product category columns into 1 maseter category column
dataset['Master_Product_Category']=dataset['Product_Category_1'].astype(str)+dataset['Product_Category_2'].astype(int).astype(str)+dataset['Product_Category_3'].astype(int).astype(str)


In [None]:
dataset[['Product_ID','Product_Category_1','Product_Category_2','Product_Category_3','Master_Product_Category']].head(10)

In [None]:
dataset.isnull().sum()

__Purchase column__ : <br>
>There are quire a few missing values in Purchase (dependent varaible) column. <br>
It is advisable to impute as many values in the dependent variable column to obtain a good model.<br>
Hence it is advisable to find whether there is *correlation between the Product_ID or newly created Master Product Category column* and Purchase column.

In [None]:
# Check Purchase values from the grouping by the newly created Master_Product_Category column
categ_mean = dataset.groupby(['Master_Product_Category'])['Purchase'].mean()

In [None]:
categ_mean.isnull().sum()

In [None]:
# Check Purchase values from the grouping by the Product_ID column
purch_mean_prodID = dataset.groupby(['Product_ID'])['Purchase'].mean()

In [None]:
purch_mean_prodID.isnull().sum()

As the missing values in Purchase column has corresponding Master category values, <br>
incorporating the mean value into the Purchase column based on the Master Category column is advisable.

In [None]:
# Obtain missing Purchase values
miss_Purch_rows = dataset['Purchase'].isnull()

In [None]:
# Impute mean Purchase value based on Master_Product_Category column
dataset.loc[miss_Purch_rows,'Purchase'] = dataset.loc[miss_Purch_rows,'Master_Product_Category'].apply(lambda x: categ_mean.loc[x])

In [None]:
print ('Null values in Purchase column before imputation : ' , miss_Purch_rows.sum())
print ('Results after imputation : ', dataset['Purchase'].isnull().sum())

In [None]:
dataset.isnull().sum()

In [None]:
list(dataset.columns.values)

In [None]:
# Rearranging the columns
dataset = dataset[['User_ID','Product_ID', 'Gender','Age','Occupation', 'City_Category','Stay_In_Current_City_Years',
                   'Marital_Status',  'Master_Product_Category', 'Purchase', 'source']]

In [None]:
list(dataset.columns.values)

In [None]:
dataset.head(10)

In [None]:
dataset.dtypes

In [None]:
# Converting Master_Product_Category datatype object to integer
dataset['Master_Product_Category'] = dataset['Master_Product_Category'].astype(str).astype(int)

In [None]:
dataset.Gender.unique()

In [None]:
#Rename Gender to more intuitive categories:
dataset['Gender'] = dataset['Gender'].map({'F':'Female','M':'Male'})

In [None]:
dataset.Marital_Status.unique()

In [None]:
#Rename them to more intuitive categories:
dataset['Marital_Status'] = dataset['Marital_Status'].map({0:'Un-Married', 1:'Married'})

In [None]:
# Re-look at the unique values of other 'demographical' columns pertaining to the customer
print('Age :' ,sorted(dataset.Age.unique()), sep='\n')
print(sep='\n\n')
print('City_Category :', sorted(dataset.City_Category.unique()), sep='\n')
print(sep='\n\n')
print('Stay_In_Current_City_Years :', sorted(dataset.Stay_In_Current_City_Years.unique()), sep='\n')
print(sep='\n\n')

In [None]:
# Replacing '4+' years of with numerical number 4 
dataset['Stay_In_Current_City_Years'] = dataset['Stay_In_Current_City_Years'].replace('4+', 4)

In [None]:
# Converting Stay_In_Current_City_Years data type to integer
dataset['Stay_In_Current_City_Years'] = dataset['Stay_In_Current_City_Years'].astype(str).astype(int)

In [None]:
print('Occupation :', sorted(dataset.Occupation.unique()))

__Point to ponder__:

We can observe that there are __(21) categories__ of Occupation of customers. <br>
Based on the country that the data has been taken, there might be standard coding system for categorising occupation, which is been indicated in the dataset.

However, based on International Standard Classification of Occupations, there are (10) main categories
1. Managers
2. Professional
3. Technicians and associate professionals
4. Clerical support workers
5. Service and sales workers
6. Skilled agricultural, forestry and fishery workers
7. Craft and related trades workers
8. Plant and machine operators, and assemblers
9. Elementary occupations
10. Armed forces occupations

(Source : Wikipedia)

### 3. Visualization

Visualization of categorical variables  - __BY COUNT__

In [None]:
plt.figure(figsize=(15,10))

# Pie chart for gender distribution
plt.subplot(2,2,1)
gender_count = [dataset.Gender[dataset['Gender']=='Female'].count(),
                dataset.Gender[dataset['Gender']=='Male'].count()]
gender_lab = dataset.Gender.unique()
expl = (0.1,0)
plt.pie(gender_count, labels=gender_lab, explode=expl, shadow=True , autopct='%1.1f%%');

# Bar chart for Age
plt.subplot(2,2,2)
ordr =dataset.groupby(["Age"]).count().sort_values(by='Purchase',ascending=False).index
sns.countplot(dataset['Age'], label=True, order=ordr)

# Bar chart for Occupation
plt.subplot(2,2,3)
ordr1 =dataset.groupby(["Occupation"]).count().sort_values(by='Purchase',ascending=False).index
sns.countplot(y=dataset['Occupation'], label=True, order=ordr1)

# Donut chart for City Category
plt.subplot(2,2,4)
city_group = dataset.groupby(["City_Category"])
city_count = city_group[['Purchase']].count().values.tolist()
city_lab = dataset.groupby(["City_Category"]).count().index.values
my_circle = plt.Circle( (0,0), 0.4, color='white')
expl1 = (0,0.1,0)
plt.pie(city_count, labels=city_lab,explode=expl1, shadow=True, autopct='%1.1f%%')
plt.gcf().gca().add_artist(my_circle)


plt.show()

In [None]:
plt.figure(figsize=(14,9))

plt.subplot(1,2,1)
ordr2 =dataset.groupby(["Stay_In_Current_City_Years"]).count().sort_values(by='Purchase',ascending=False).index
sns.countplot(dataset['Stay_In_Current_City_Years'], label=True, order=ordr2)

plt.subplot(1,2,2)
ms_count = [dataset.Marital_Status[dataset['Marital_Status']=='Un-Married'].count(),
                dataset.Marital_Status[dataset['Marital_Status']=='Married'].count()]
ms_lab = dataset.Marital_Status.unique()
expl = (0.1,0)
plt.pie(ms_count, labels=ms_lab, explode=expl, shadow=True , autopct='%1.1f%%');

plt.show()

More Insight from Gender and Age

In [None]:
#Creating new column in the dataset 
dataset['Gender_MaritalStatus'] = dataset.apply(lambda x:'%s_%s' % (x['Gender'],x['Marital_Status']),axis=1)

In [None]:
dataset.Gender_MaritalStatus.unique()

In [None]:
plt.figure(figsize=(15,10))

plt.subplot(1,2,1)
group_g_ms = dataset.groupby(["Gender_MaritalStatus"])
count_ms = group_g_ms[['Purchase']].count().values.tolist()
lab1 = dataset.groupby(["Gender_MaritalStatus"]).count().index.values
expl2 = (0,0,0.1,0.1)

plt.pie(count_ms, labels=lab1,explode=expl2, shadow=True, autopct='%1.1f%%')

plt.subplot(1,2,2)
sns.countplot(dataset['Age'],hue=dataset['Gender_MaritalStatus'])

plt.show()

__Observations__ : 
1. __Un-married males__ between the age group of __26-35__ are major customers of the store.
2. Majority customers are from the __City category B (42%) and C (31%)__
4. Customers with __Occupation__ code __4 (13%), 0 (12.7%) and 7(10.8%)__ are more compared to others
5. It can be observed that, as customers __who are new__ in the the current city, tend to purchase more from the store.

Visualization of categorical variables  - __BY AVERAGE PURCHASE__

In [None]:
plt.figure(figsize=(8,6))


sns.catplot(x='Gender', y='Purchase', data=dataset, kind='boxen')

ordr_age =dataset.groupby(["Age"]).mean().sort_values(by='Purchase',ascending=False).index
sns.catplot(x='Age', y='Purchase', order=ordr_age, data=dataset, kind='bar')

plt.show()

In [None]:
plt.figure(figsize=(8,6))

sns.catplot(x='City_Category', y='Purchase', data=dataset, kind='boxen')

ordr_occ =dataset.groupby(["Occupation"]).mean().sort_values(by='Purchase',ascending=False).index
sns.catplot(x='Occupation', y='Purchase', order=ordr_occ, data=dataset, kind='bar')

plt.show()

__Observations__ : 
1. __Males__ spend more than females
2. Age factor : Customers who are __above 35 years__ of age spend more. 
3. __Occupation codes 17, 12 and 15__ appears to be earning more and accordingly spend more 
4. People living in __Category C__ city appears to spend more on items in the store 


 __Encoding categorical variable for - Correlation Visualation__


In [None]:
# Feature Encoding categorical columns to visualize correlation of parameters with Purchase column
# Importing required package
from sklearn.preprocessing import LabelEncoder
encode_x = LabelEncoder()

In [None]:
# Creating copy of the dataset for this activity
dataset_cat = dataset.copy()

In [None]:
dataset_cat.dtypes

In [None]:
# Encoding categorical columns 
dataset_cat['Product_ID'] = encode_x.fit_transform(dataset_cat['Product_ID'])
dataset_cat['Gender'] = encode_x.fit_transform(dataset_cat['Gender'])
dataset_cat['Age'] = encode_x.fit_transform(dataset_cat['Age'])
dataset_cat['City_Category'] = encode_x.fit_transform(dataset_cat['City_Category'])
dataset_cat['Marital_Status'] = encode_x.fit_transform(dataset_cat['Marital_Status'])

In [None]:
dataset_cat.head()

In [None]:
dataset_cat[dataset_cat.columns[0:]].corr()['Purchase'].sort_values(ascending=False)

In [None]:
corrmat = dataset_cat.corr()

In [None]:
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, cmap="YlGnBu", square=True,linewidths=.5, annot=True)
plt.show()

In [None]:
# Obtaining top 5 parameters columns which affects the Purchase the most
k= 5
corrmat.nlargest(k, 'Purchase')

In [None]:
# Replotting the heatmap with the above data
cols = corrmat.nlargest(k, 'Purchase')['Purchase'].index
cm = np.corrcoef(dataset_cat[cols].values.T)
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(cm, cmap="YlGnBu", cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

__Modifiying categorical variable__ <br>
Combining cateogrical variables to obtain good number of counts of categorical varaibles.

In [None]:
dataset.dtypes

In [None]:
# Building list of categorical variables to obtain details
categ_columns = list(dataset.columns.values)
categ_columns

In [None]:
# Droping Identifier values, target values and others which are necessary from the list
categ_columns = [x for x in categ_columns if x not in ['User_ID','Product_ID','Purchase','source']]

In [None]:
# Print frequency of categories
for col in categ_columns:
    print ('Frequency (Count) of Categories for varible : ', col)
    print (dataset[col].value_counts())
    print('---------------------------------------------------')

__Combining Age ranges__

In [None]:
dataset['Modified_Age'] = dataset[['Age']]

In [None]:
dataset['Modified_Age'].value_counts()

In [None]:
# Modify Age Range to obtain obtain good number of counts in each range
dataset['Modified_Age'] =dataset['Modified_Age']. map({'0-17':'0-25','18-25':'0-25',
                                                       '46-50':'46-55+','51-55':'46-55+','55+':'46-55+',
                                                      '26-35':'26-35','36-45':'36-45'})

In [None]:
dataset['Modified_Age'].value_counts()

In [None]:
dataset['Occupation'].value_counts()

__One Hot encoding of variables__

In [None]:
dataset.columns

In [None]:
# Categorical variables
dataset[['Occupation','City_Category','Stay_In_Current_City_Years','Gender_MaritalStatus','Modified_Age']].head()

In [None]:
# Feature Encoding categorical columns to visualize correlation of parameters with Purchase column
# Importing required package
from sklearn.preprocessing import LabelEncoder
encode_x = LabelEncoder()

In [None]:
# Encoding categorical columns 
dataset['City_Category'] = encode_x.fit_transform(dataset['City_Category'])
dataset['Gender_MaritalStatus'] = encode_x.fit_transform(dataset['Gender_MaritalStatus'])
dataset['Modified_Age'] = encode_x.fit_transform(dataset['Modified_Age'])

In [None]:
dataset[['Occupation','City_Category','Stay_In_Current_City_Years','Gender_MaritalStatus','Modified_Age']].head()

In [None]:
# Creating dummy variables, one for each category of a categorical variable
dataset = pd.get_dummies(dataset, columns=['Occupation', 'City_Category','Stay_In_Current_City_Years', 
                                           'Gender_MaritalStatus','Modified_Age'])

In [None]:
dataset.dtypes

#### Exporting the data

In [None]:
# Drop the columns which have been converted to different types:
dataset.drop(['Age','Gender', 'Marital_Status'],axis=1,inplace=True)

In [None]:
dataset.dtypes

In [None]:
# Splitting the dataset back into test and train datasets for model building:
train = dataset.loc[dataset['source']=="train"]
test = dataset.loc[dataset['source']=="test"]

In [None]:
# Dropping unnecessary columns:
test = test.drop(['Purchase','source'],axis=1) # dropping dependent variable 'Purchase' from the test dataset
train = train.drop(['source'],axis=1)

In [None]:
test.columns

In [None]:
train.columns

In [None]:
# Exporting csv files as modified versions:
train.to_csv("./BlackFriday_Outputs/train_modified.csv",index=False)
test.to_csv("./BlackFriday_Outputs/test_modified.csv",index=False)

### 4. Building the model

In [None]:
# Mean based:
mean_purhcase = train['Purchase'].mean()
mean_purhcase

In [None]:
# Defining a dataframe with IDs for submission:
base1 = test[['User_ID','Product_ID']]

In [None]:
# Implying the mean sales value to the Item_Outlet_Sales column
base1.loc[:,'Purchase'] = mean_purhcase

In [None]:
# Exporting the submission file
base1.to_csv("./BlackFriday_Outputs/algor0.csv",index=False)

Function to makes the model, performs cross-validation and generates submission.

In [None]:
# Make the model and perform cross validation
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def modelfit_test(algorthm, dftrain, dftest, predictors, target, IDcol):

    #Fit the algorthmorithm on the data
    algorthm.fit(dftrain[predictors], dftrain[target]) 
        
    #Predict training set:
    dftrain_predictions = algorthm.predict(dftrain[predictors])

    #Perform cross-validation:
    cv_score = cross_val_score(algorthm, dftrain[predictors], dftrain[target], cv=20, n_jobs=-1,scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))
    
    #Print model report:
    print ("\n------Model Report----\n")
    print ("RMSE : " , np.sqrt(metrics.mean_squared_error(dftrain[target].values, dftrain_predictions)))
    print ("CV Score Mean : %.4g" %(np.mean(cv_score)))
    print ("CV Score Std : %.4g" %(np.std(cv_score)))
    print ("CV Score Min : %.4g" %(np.min(cv_score)))
    print ("CV Score Max : %.4g" %(np.max(cv_score)))

In [None]:
# Make the model, perform cross validation and export submission file.

from sklearn.model_selection import cross_val_score
from sklearn import metrics
def modelfit(algorthm, dftrain, dftest, predictors, target, IDcol, filename):

    #Fit the algorthmorithm on the data
    algorthm.fit(dftrain[predictors], dftrain[target]) # similar to the base dataframe created above with the predictor & target columns
        
    #Predict training set:
    dftrain_predictions = algorthm.predict(dftrain[predictors]) # Predicting using the predictors

    #Perform cross-validation:
    cv_score = cross_val_score(algorthm, dftrain[predictors], dftrain[target], cv=20, n_jobs=-1, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))
    
    #Print model report:
    print ("\n------Model Report----\n")
    print ("RMSE : " , np.sqrt(metrics.mean_squared_error(dftrain[target].values, dftrain_predictions)))
    print ("CV Score Mean : %.4g" %(np.mean(cv_score)))
    print ("CV Score Std : %.4g" %(np.std(cv_score)))
    print ("CV Score Min : %.4g" %(np.min(cv_score)))
    print ("CV Score Max : %.4g" %(np.max(cv_score)))
    
    #Predict on testing data:
    dftest[target] = algorthm.predict(dftest[predictors])
    
    #Export submission file:
    IDcol.append(target)
    submission = pd.DataFrame({ x: dftest[x] for x in IDcol})
    submission.to_csv(filename, index=False)

## Machine Learning - Build Model

In [None]:
train.head()

__Assigning Targets, ID Columns and predictors__

In [None]:
# Assigning columns for model building
target = 'Purchase'
IDcol = ['User_ID','Product_ID']

# Excluding Identifiers (used for submission) and dependent variable column
predictors = [x for x in train.columns if x not in [target]+IDcol]


### Linear Regression Model

In [None]:
# Building a linear regression model 
from sklearn.linear_model import LinearRegression
alg1 = LinearRegression(normalize=True)

In [None]:
modelfit_test(alg1, train, test, predictors, target, IDcol)

In [None]:
# Assigning columns for model building
target = 'Purchase'
IDcol = ['User_ID','Product_ID']

# Excluding Identifiers (used for submission) and dependent variable column
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg1.fit(train[predictors], train[target])

In [None]:
# Using the FUNCTION which takes the algorithm and data as input and makes the model 
modelfit(alg1, train, test, predictors, target, IDcol, 'algor1.csv')

In [None]:
coef1 = pd.Series(alg1.coef_, predictors).sort_values(ascending=True)
coef1

In [None]:
plt.figure(figsize=(8,6))
coef1.plot(kind='bar', title='Model Coefficients')
plt.show()

### Ridge Regression Model:

In [None]:
# Building a list of independent varaibles to build the model
from sklearn.linear_model import Ridge

In [None]:
# Building a Ridge regression model 
alg2 = Ridge(alpha=0.05,normalize=True)

In [None]:
modelfit_test(alg2, train, test, predictors, target, IDcol)

### Decision Tree Model

In [None]:
# Building a Decision Tree regression model 
from sklearn.tree import DecisionTreeRegressor
alg3 = DecisionTreeRegressor(max_depth=25, min_samples_leaf=100)

In [None]:
modelfit_test(alg3, train, test, predictors, target, IDcol)

__Evaluating best parameters by Grid Search__

In [None]:
# Importing required package
from sklearn.model_selection import GridSearchCV

In [None]:
alg3.get_params().keys()

In [None]:
import sklearn.metrics
sklearn.metrics.SCORERS.keys()

In [None]:
#  DecisionTreeRegressor(max_depth=15, min_samples_leaf=100)
params = [{'max_depth':[25,50,75,100],'min_samples_leaf' :[50,100,150,200]}]

In [None]:
grid_search = GridSearchCV(estimator = alg3, 
                           param_grid = params,
                           scoring = 'neg_mean_squared_error',
                           cv = 20,
                           n_jobs= -1)

In [None]:
grid_search.fit(train[predictors], train[target])

In [None]:
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

In [None]:
best_accuracy

In [None]:
best_parameters

In [None]:
# Building a Decision Tree regression model with the above values
from sklearn.tree import DecisionTreeRegressor
alg3_1 = DecisionTreeRegressor(max_depth=50, min_samples_leaf=50)

In [None]:
modelfit(alg3, train, test, predictors, target, IDcol,'./BlackFriday_Outputs/algor3_29_12.csv')

In [None]:
coef3

The RMSE value is 3424 and the cross validation mean score is 3525.
This indicates that there is __over-fitting__ of the model

### Random Forest Model

In [None]:
target = 'Purchase'
IDcol = ['User_ID','Product_ID']
predictors = [x for x in train.columns if x not in [target]+IDcol]

In [None]:
from sklearn.ensemble import RandomForestRegressor
alg5 = RandomForestRegressor(n_estimators=50,max_depth=25, min_samples_leaf=50,n_jobs=1)

In [None]:
modelfit_test(alg5, train, test, predictors, target, IDcol)

In [None]:
target = 'Purchase'
IDcol = ['User_ID','Product_ID']
# Making a decision tree with just **top 2** variables based on the above coeffecient values
predictors = ['Master_Product_Category','City_Category_2']

In [None]:
modelfit_test(alg5, train, test, predictors, target, IDcol)