In [37]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
sns.set(style='ticks', color_codes=True)

In [2]:
#This is a second take on the BlackFriday analysis focusing on using SKLearn for the predictions
#MAIN QUESTION: Predict the purchase amount
#first load the testing and training data set

testBF = pd.read_csv(r'test.csv')
trainBF = pd.read_csv(r'train.csv')

In [3]:
#explore both data sets. Check dimensions, statistics, as well as null values
trainBF.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [4]:
trainBF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
trainBF.shape

(550068, 12)

In [6]:
#trainBF.columns
list(trainBF)

['User_ID',
 'Product_ID',
 'Gender',
 'Age',
 'Occupation',
 'City_Category',
 'Stay_In_Current_City_Years',
 'Marital_Status',
 'Product_Category_1',
 'Product_Category_2',
 'Product_Category_3',
 'Purchase']

In [7]:
#Checking for null values in training set
trainBF.isnull().any()


User_ID                       False
Product_ID                    False
Gender                        False
Age                           False
Occupation                    False
City_Category                 False
Stay_In_Current_City_Years    False
Marital_Status                False
Product_Category_1            False
Product_Category_2             True
Product_Category_3             True
Purchase                      False
dtype: bool

In [8]:
trainBF.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [9]:
#check for null values in testing set, and chekc testing set shape
testBF.shape

(233599, 11)

In [10]:
#we'll notice that there is one less column in the testing set of data
#testBF.columns
list(testBF)

['User_ID',
 'Product_ID',
 'Gender',
 'Age',
 'Occupation',
 'City_Category',
 'Stay_In_Current_City_Years',
 'Marital_Status',
 'Product_Category_1',
 'Product_Category_2',
 'Product_Category_3']

In [11]:
#the testing data set does not have the "Purchase" column. 
#Therefore we will want to use the testing set as our X vlaues (dependant variable) is it contains all the things outsid the "purchase" column
#the Training dataset will be our Y vairable


In [12]:
#Now we want to handle the NAN or Null values in both data sets
#I'm unsure what value to replace NAN or Null values, so let's just drop those values as to not uninentionally skew data by filling in inaapropriate values
trainBF = trainBF.dropna()
testBF = testBF.dropna()


In [13]:
#confirm no more NAN or NUll values in data sets
trainBF.isna().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [14]:
testBF.isna().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
dtype: int64

In [15]:
#No more missing values
#Now we can begin training model
#We may want to implement several models and see which produces most accurate results
#Models we can use: Linear Regression, Decision Tree, Random Forest, 


In [16]:
#BEFORE we assign out x and y values, we will want to turn Categorical data into numerical.
#this will help the model learn better
#we can use the pandas to_numeric method, or hard code it
#first find which columns are categorical. These will not have an int dtype.
#they WILL be of dtype object
trainBF.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [17]:
#Make copy of data sets
trainCopy = trainBF.copy()
testCopy = testBF.copy()


In [18]:
#We will start with Decision tree
#First thing to do is to define our x and y training and testing data
#we need an x_train, x_test, y_train, y_test for our Decsion tree model and we will split up BOTH the train and test data sets
#THE Y_TEST IS WHAT WE ARE LOOKING FOR! THIS IS THE UNKNOWN!
#the x values will NOT have the "Purchase" column in it. VERY IMPORTANT!
#we do this by using the pd.drop() fucntion.Remeber the you must specify the axis inwhich you want to split the data
#axis = 0 means the index, axis = 1 measn the columns
#Y will have the only the 'Purchase' column


#X_test = testBF.drop(['Purchase'], axis=1)
x_train = trainCopy.drop(['Purchase'], axis=1)
#y_test = testBF['Purchase']
y_train = trainCopy['Purchase']

In [19]:
#We may need to use the Label Encoder on the x_train data. Double check when are propper times to use Label encoder
LE = LabelEncoder()


In [22]:
x_train = x_train.apply(LE.fit_transform)

In [24]:
#the product_ID, Gender, Age, City_Category, Stay_In_current_city_years columns are all categorical
#now let's make them into numeric
#Note: I think we only need to convert categorical into Numeric for the Training (X) values

x_train['Product_Category_1'] = pd.to_numeric(x_train['Product_Category_1'])
x_train['Product_Category_2'] = pd.to_numeric(x_train['Product_Category_2'])
x_train['Product_Category_3'] = pd.to_numeric(x_train['Product_Category_3'])
x_train['Gender'] = pd.to_numeric(x_train['Gender'])
x_train['Age'] = pd.to_numeric(x_train['Age'])
x_train['City_Category'] = pd.to_numeric(x_train['City_Category'])
x_train['Stay_In_Current_City_Years'] = pd.to_numeric(x_train['Stay_In_Current_City_Years'])


In [45]:
#now we can use the DecesionTreeRegrssor and fit our model with it

dtr = DecisionTreeRegressor()

dtr_test = dtr.fit(x_train, y_train)
dtr_test

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [47]:
#Linear Regressoin
LR = LinearRegression()
LR_test = LR.fit(x_train, y_train)
LR_test

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
#GradientBoostingRegressor
GBR = GradientBoostingRegressor()
GBR_test = GBR.fit(x_train, y_train)
GBR_test

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
#RandomForestRegresoion
RFR =RandomForestRegressor()
RFR_test =RFR.fit(x_train, y_train)
RFR_test.score(x_train, y_train)

  warn("The default value of n_estimators will change from "


0.9182603569913936

In [50]:
#Since the model has been trained, we can now call the vairous regressors and see how they compare with one another
print(f'DecisionTreeRegression score: {dtr_test.score(x_train, y_train)}')
print(f'LinearRegression score: {LR_test.score(x_train, y_train)}')
print(f'GradientBoostingRegressor score:{GBR_test.score(x_train, y_train)}')
print(f'RandomForestRegressor score: {RFR_test.score(x_train, y_train)}')

DecisionTreeRegression score: 1.0
LinearRegression score: 0.19410146061613398
GradientBoostingRegressor score:0.5453611103072871
RandomForestRegressor score: 0.9182603569913936


1.0

NameError: name 'y_test' is not defined