### Source of the data
Kaggle: https://www.kaggle.com/mehdidag/black-friday

In [1]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
bkFriday_df = pd.read_csv('D:/ProgramData/Data Resources/BlackFriday/BlackFriday.csv', sep=',')

In [3]:
bkFriday_df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
bkFriday_df.shape

(537577, 12)

In [5]:
bkFriday_df.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [6]:
bkFriday_df.describe(include = 'all')

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,537577.0,537577,537577,537577,537577.0,537577,537577.0,537577.0,537577.0,370591.0,164278.0,537577.0
unique,,3623,2,7,,3,5.0,,,,,
top,,P00265242,M,26-35,,B,1.0,,,,,
freq,,1858,405380,214690,,226493,189192.0,,,,,
mean,1002992.0,,,,8.08271,,,0.408797,5.295546,9.842144,12.66984,9333.859853
std,1714.393,,,,6.52412,,,0.491612,3.750701,5.087259,4.124341,4981.022133
min,1000001.0,,,,0.0,,,0.0,1.0,2.0,3.0,185.0
25%,1001495.0,,,,2.0,,,0.0,1.0,5.0,9.0,5866.0
50%,1003031.0,,,,7.0,,,0.0,5.0,9.0,14.0,8062.0
75%,1004417.0,,,,14.0,,,1.0,8.0,15.0,16.0,12073.0


### Columns that I will use to build the model
Some of them

In [7]:
toAdd_df = pd.get_dummies(bkFriday_df[['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']])

In [8]:
toAdd_df.shape

(537577, 17)

### Predictors DataFrame
All of them

In [9]:
pred_df = bkFriday_df[['Occupation', 'Marital_Status']].join(toAdd_df)

In [10]:
pred_df.shape

(537577, 19)

In [11]:
pred_df.head()

Unnamed: 0,Occupation,Marital_Status,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_A,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,10,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,10,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,10,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3,10,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
4,16,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1


In [12]:
pred_df.isnull().any()

Occupation                       False
Marital_Status                   False
Gender_F                         False
Gender_M                         False
Age_0-17                         False
Age_18-25                        False
Age_26-35                        False
Age_36-45                        False
Age_46-50                        False
Age_51-55                        False
Age_55+                          False
City_Category_A                  False
City_Category_B                  False
City_Category_C                  False
Stay_In_Current_City_Years_0     False
Stay_In_Current_City_Years_1     False
Stay_In_Current_City_Years_2     False
Stay_In_Current_City_Years_3     False
Stay_In_Current_City_Years_4+    False
dtype: bool

#### Normalizing `Occupation` column

In [13]:
pred_df['Occupation'] = pred_df['Occupation']/pred_df['Occupation'].max()

#### Normalizing `Purchase` (target) column

In [14]:
targ_df = bkFriday_df['Purchase']/bkFriday_df['Purchase'].max()

In [15]:
targ_df.head()

0    0.349318
1    0.634364
2    0.059346
3    0.044113
4    0.332582
Name: Purchase, dtype: float64

In [16]:
targ_df.shape

(537577,)

In [17]:
targ_df.isnull().any()

False

### Define classification model

In [18]:
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(50, activation='relu', input_shape=(19,)))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1))
    
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [19]:
# build the model
model = regression_model()

### Using a normalized target

In [25]:
model.fit(pred_df, targ_df, validation_split=0.3, epochs=10, verbose=2)

Train on 376303 samples, validate on 161274 samples
Epoch 1/10
 - 8s - loss: 597019.1829 - val_loss: 2.6506
Epoch 2/10
 - 8s - loss: 0.6978 - val_loss: 0.0456
Epoch 3/10
 - 8s - loss: 0.0436 - val_loss: 0.0434
Epoch 4/10
 - 8s - loss: 0.0433 - val_loss: 0.0431
Epoch 5/10
 - 7s - loss: 0.0430 - val_loss: 0.0430
Epoch 6/10
 - 8s - loss: 0.0429 - val_loss: 0.0431
Epoch 7/10
 - 8s - loss: 0.0428 - val_loss: 0.0430
Epoch 8/10
 - 8s - loss: 0.0427 - val_loss: 0.0428
Epoch 9/10
 - 8s - loss: 0.0426 - val_loss: 0.0431
Epoch 10/10
 - 8s - loss: 0.0426 - val_loss: 0.0428


<keras.callbacks.History at 0x2d64e63d4e0>

### Using a target that is not normalized

In [20]:
targ_df1 = bkFriday_df['Purchase']

In [21]:
model.fit(pred_df, targ_df1, validation_split=0.3, epochs=10, verbose=2)

Train on 376303 samples, validate on 161274 samples
Epoch 1/10
 - 10s - loss: 27449712.1709 - val_loss: 24691241.0429
Epoch 2/10
 - 9s - loss: 24545862.6251 - val_loss: 24694663.1769
Epoch 3/10
 - 9s - loss: 24547027.7592 - val_loss: 24691479.8413
Epoch 4/10
 - 9s - loss: 24549062.7868 - val_loss: 24693882.1764
Epoch 5/10
 - 9s - loss: 24546786.5014 - val_loss: 24694371.9963
Epoch 6/10
 - 9s - loss: 24548953.5945 - val_loss: 24693229.1358
Epoch 7/10
 - 9s - loss: 24546170.8858 - val_loss: 24692611.7169
Epoch 8/10
 - 9s - loss: 24547501.0794 - val_loss: 24700711.9410
Epoch 9/10
 - 9s - loss: 24548278.9858 - val_loss: 24690487.4462
Epoch 10/10
 - 9s - loss: 24548030.6103 - val_loss: 24702105.0564


<keras.callbacks.History at 0x2d64c66ff60>

In [22]:
model.fit(pred_df, targ_df1, validation_split=0.15, epochs=10, verbose=2)

Train on 456940 samples, validate on 80637 samples
Epoch 1/10
 - 10s - loss: 24586327.9632 - val_loss: 24653801.4718
Epoch 2/10
 - 9s - loss: 24585566.4708 - val_loss: 24627707.5077
Epoch 3/10
 - 9s - loss: 24584066.6777 - val_loss: 24643639.1527
Epoch 4/10
 - 9s - loss: 24586553.6928 - val_loss: 24641940.0411
Epoch 5/10
 - 10s - loss: 24585490.4367 - val_loss: 24631302.9432
Epoch 6/10
 - 9s - loss: 24587013.1371 - val_loss: 24632934.3933
Epoch 7/10
 - 9s - loss: 24585025.4388 - val_loss: 24631243.6892
Epoch 8/10
 - 10s - loss: 24585659.6494 - val_loss: 24627567.3796
Epoch 9/10
 - 10s - loss: 24584968.9057 - val_loss: 24630597.8898
Epoch 10/10
 - 10s - loss: 24585098.6254 - val_loss: 24635982.3599


<keras.callbacks.History at 0x2d64c66f908>

In [24]:
targ_df1.dtype

dtype('int64')