<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Data" data-toc-modified-id="Import-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Data</a></span></li><li><span><a href="#Explore-Data" data-toc-modified-id="Explore-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Explore Data</a></span></li><li><span><a href="#Tidy-Data" data-toc-modified-id="Tidy-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Tidy Data</a></span><ul class="toc-item"><li><span><a href="#Dummy-Variables" data-toc-modified-id="Dummy-Variables-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Dummy Variables</a></span></li><li><span><a href="#Scaling-and-Spliting-Training-and-Testing-Dataset" data-toc-modified-id="Scaling-and-Spliting-Training-and-Testing-Dataset-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Scaling and Spliting Training and Testing Dataset</a></span></li></ul></li><li><span><a href="#Explore-Different-Models" data-toc-modified-id="Explore-Different-Models-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Explore Different Models</a></span><ul class="toc-item"><li><span><a href="#Functions-for-Automation-" data-toc-modified-id="Functions-for-Automation--4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Functions for Automation <a class="anchor" id="41" rel="nofollow"></a></a></span></li><li><span><a href="#Baseline:-Naive-Model" data-toc-modified-id="Baseline:-Naive-Model-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Baseline: Naive Model</a></span></li><li><span><a href="#Linear-Regression-Model" data-toc-modified-id="Linear-Regression-Model-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Linear Regression Model</a></span></li><li><span><a href="#Super-Vector-Machine" data-toc-modified-id="Super-Vector-Machine-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Super Vector Machine</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-4.5"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>Random Forest</a></span></li><li><span><a href="#Artificial-Neural-Networks" data-toc-modified-id="Artificial-Neural-Networks-4.6"><span class="toc-item-num">4.6&nbsp;&nbsp;</span>Artificial Neural Networks</a></span></li><li><span><a href="#Simple-Recurrent-Neuron-Networks" data-toc-modified-id="Simple-Recurrent-Neuron-Networks-4.7"><span class="toc-item-num">4.7&nbsp;&nbsp;</span>Simple Recurrent Neuron Networks</a></span></li><li><span><a href="#Comparison-and-Model-Selection" data-toc-modified-id="Comparison-and-Model-Selection-4.8"><span class="toc-item-num">4.8&nbsp;&nbsp;</span>Comparison and Model Selection</a></span></li><li><span><a href="#Intepretation" data-toc-modified-id="Intepretation-4.9"><span class="toc-item-num">4.9&nbsp;&nbsp;</span>Intepretation</a></span></li><li><span><a href="#For-other-customer-group" data-toc-modified-id="For-other-customer-group-4.10"><span class="toc-item-num">4.10&nbsp;&nbsp;</span>For other customer group</a></span></li><li><span><a href="#Dummy-Variables" data-toc-modified-id="Dummy-Variables-4.11"><span class="toc-item-num">4.11&nbsp;&nbsp;</span>Dummy Variables</a></span></li><li><span><a href="#Scaling-and-Spliting-Training-and-Testing-Dataset" data-toc-modified-id="Scaling-and-Spliting-Training-and-Testing-Dataset-4.12"><span class="toc-item-num">4.12&nbsp;&nbsp;</span>Scaling and Spliting Training and Testing Dataset</a></span></li></ul></li></ul></div>

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

# for visualization
import seaborn as sn
import matplotlib.pyplot as plt
import seaborn as sns

# for model training
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from keras import Input # for instantiating a keras tensor
from keras.layers import SimpleRNN

# small tools
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import statistics
from statistics import stdev as sd
from statistics import mean

# Import Data

In [3]:
df = pd.read_csv('full_customer_video_ratings.csv')

In [4]:
df.shape

(26922, 245)

In [5]:
df.head(2)

Unnamed: 0,Customer Type,Age,House Type,Children at Home,House Tenure,Income,Gender,Accessories,Adult,Advertisement,...,Water,Weapon,Wedding,Wine,Wine Glass,Woman,Wood,content_lengths,video_id,rating
0,Affluent Achievers,79,Detached house,0,Owned outright,40,Male,0,21,0,...,0,0,21,0,0,21,0,53280,0,6.36
1,Affluent Achievers,57,Detached house,0,Owned outright,39,Male,0,21,0,...,0,0,21,0,0,21,0,53280,0,6.21


# Explore Data

# Tidy Data

In [6]:
#
df.drop(['video_id'], axis=1, inplace=True)

In [7]:
customer_types = df['Customer Type'].unique()

In [8]:
customer_types

array(['Affluent Achievers', 'Rising Prosperity',
       'Comfortable Communities', 'Financially Stretched',
       'Urban Adversity'], dtype=object)

In [9]:
ct1 = df.loc[df['Customer Type'] == customer_types[0],:]
ct1.drop(['Customer Type'], axis=1, inplace=True)

## Dummy Variables

In [10]:
categorical_varaibles = ct1.dtypes[ct1.dtypes == 'object'].index

In [11]:
ct1 = pd.get_dummies(ct1,
                    columns = categorical_varaibles,
                    drop_first=True)

## Scaling and Spliting Training and Testing Dataset

In [12]:
scale= StandardScaler()

In [13]:
scaled_ct1 = pd.DataFrame(scale.fit_transform(ct1), index=ct1.index, columns=ct1.columns)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(scaled_ct1.drop(columns=['rating']), scaled_ct1['rating'],
                                                   random_state=2222)

In [15]:
x_train.shape

(4599, 240)

In [16]:
x_test.shape

(1533, 240)

# Explore Different Models

## Functions for Automation <a class='anchor' id='41'></a>

In [17]:
def rmse(pred, actual):
    return (sum((pred - actual)**2)/len(actual))**0.5

In [18]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

## Baseline: Naive Model

In [19]:
y_pred_train_naive = y_train.mean()

In [20]:
train_rmse_naive = rmse(y_pred_train_naive, y_train)
train_rmse_naive

1.0007345782533932

In [21]:
test_rmse_naive = rmse(y_pred_train_naive, y_test)
test_rmse_naive

0.9978539766269112

## Linear Regression Model

In [22]:
mod = sm.OLS(y_train,x_train)
fii = mod.fit()
p_values = fii.summary2().tables[1]['P>|t|']

In [23]:
y_pred_train_lm = fii.predict(x_train)
train_rmse_lm = rmse(y_pred_train_lm, y_train)
train_rmse_lm

0.9282150717718488

In [24]:
y_pred_test_lm = fii.predict(x_test)
test_rmse_lm = rmse(y_pred_test_lm, y_test)
test_rmse_lm

0.930761637277972

## Super Vector Machine

In [25]:
regressor = SVR(kernel = 'rbf')
regressor.fit(x_train, y_train);

In [26]:
y_pred_train_svm = regressor.predict(x_train)
train_rmse_svm = rmse(y_pred_train_svm, y_train)
train_rmse_svm

0.9291694169941624

In [27]:
y_pred_test_svm = regressor.predict(x_test)
test_rmse_svm =  rmse(y_pred_test_svm, y_test)
test_rmse_svm

0.9361804762585965

## Random Forest

In [28]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)# Train the model on training data
rf.fit(x_train, y_train)

In [29]:
y_pred_train_rf = rf.predict(x_train)
train_rmse_rf = rmse(y_pred_train_rf, y_train)
train_rmse_rf

0.44344811770159537

In [30]:
y_pred_test_rf = rf.predict(x_test)
test_rmse_rf = rmse(y_pred_test_rf, y_test)
test_rmse_rf

1.0307471480222177

In [31]:
# Get numerical feature importances
importances = list(rf.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(x_train, importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances 
['Variable: {:20} Importance: {}'.format(*pair) for pair in feature_importances][0:20]

['Variable: Age                  Importance: 0.33',
 'Variable: Income               Importance: 0.28',
 'Variable: Gender_Male          Importance: 0.06',
 'Variable: content_lengths      Importance: 0.04',
 'Variable: Advertisement        Importance: 0.02',
 'Variable: Poster               Importance: 0.02',
 'Variable: Adult                Importance: 0.01',
 'Variable: Body Part            Importance: 0.01',
 'Variable: Cleaning             Importance: 0.01',
 'Variable: Page                 Importance: 0.01',
 'Variable: Person               Importance: 0.01',
 'Variable: Plant                Importance: 0.01',
 'Variable: Text                 Importance: 0.01',
 'Variable: Children at Home     Importance: 0.0',
 'Variable: Accessories          Importance: 0.0',
 'Variable: Airport              Importance: 0.0',
 'Variable: Alcohol              Importance: 0.0',
 'Variable: Alien                Importance: 0.0',
 'Variable: Aluminium            Importance: 0.0',
 'Variable: Animal

In [32]:
scores = cross_val_score(rf, x_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [33]:
display_scores(rmse_scores)

Scores: [0.99033473 1.00822362 1.0298299  1.01415719 1.03429761 1.07067924
 1.06672592 1.06581286 1.02911363 1.01819035]
Mean: 1.0327365057208433
Standard deviation: 0.02582355788589316


## Artificial Neural Networks

In [34]:
# create ANN model
model = Sequential()
 
# Defining the Input layer and FIRST hidden layer, both are same!
model.add(Dense(units=60, input_dim=240, kernel_initializer='normal', activation='relu'))
 
# after the first layer we don't have to specify input_dim as keras configure it automatically
model.add(Dense(units=50, kernel_initializer='normal', activation='tanh'))
model.add(Dense(units=20, kernel_initializer='normal', activation='tanh'))
model.add(Dense(units=10, kernel_initializer='normal', activation='tanh'))
# The output neuron is a single fully connected node 
# Since we will be predicting a single number
model.add(Dense(1, kernel_initializer='normal'))
 
# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')
 
# Fitting the ANN to the Training set
model.fit(x_train, y_train ,batch_size = 50, epochs = 10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x26841e8bdc0>

In [35]:
y_pred_train_ann = model.predict(x_train).flatten()
train_rmse_ann = rmse(y_train, y_pred_train_ann)
train_rmse_ann



0.9306275655100446

In [36]:
y_pred_test_ann = model.predict(x_test).flatten()
test_rmse_ann = rmse(y_test, y_pred_test_ann)
test_rmse_ann



0.9343078376488643

In [37]:
## Deep Neural Network

In [38]:
normalizer = tf.keras.layers.Normalization(axis=-1)

In [39]:
normalizer.adapt(np.array(x_train))

In [40]:
def build_and_compile_model(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    
    model.compile(loss='mean_absolute_error',
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model

In [41]:
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, 240)              481       
 n)                                                              
                                                                 
 dense_5 (Dense)             (None, 64)                15424     
                                                                 
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 20,130
Trainable params: 19,649
Non-trainable params: 481
_________________________________________________________________


In [42]:
%%time
history = dnn_model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    verbose=0, epochs=100)

CPU times: total: 33.7 s
Wall time: 22.1 s


In [43]:
dnn_model.evaluate(x_test, y_test, verbose=0)

0.7398906350135803

In [44]:
y_pred_train_dnn = dnn_model.predict(x_train).flatten()



In [45]:
train_rmse_dnn = rmse(y_train, y_pred_train_dnn)
train_rmse_dnn

0.9148778024830827

In [46]:
y_pred_test_dnn = dnn_model.predict(x_test).flatten()



In [47]:
test_rmse_dnn = rmse(y_pred_test_dnn, y_test)
test_rmse_dnn

0.970256043620877

## Simple Recurrent Neuron Networks

In [48]:
model_rnn = Sequential()
model_rnn.add(Input(shape=(240,1), name='Input-Layer'))
model_rnn.add(SimpleRNN(units=50, activation='tanh', name='Hidden-Recurrent-Layer')) # Hidden Recurrent Layer
model_rnn.add(Dense(units=50, activation='tanh'))
model_rnn.add(Dense(units=25, activation='tanh'))
model_rnn.add(Dense(units=10, activation='tanh'))
model_rnn.add(Dense(units=1, activation='linear', name='Output-Layer'))


##### Step 5 - Compile keras model
model_rnn.compile(optimizer='adam', # default='rmsprop', an algorithm to be used in backpropagation
              loss='mean_squared_error', # Loss function to be optimized. A string (name of loss function), or a tf.keras.losses.Loss instance.
              metrics='MeanSquaredError', # List of metrics to be evaluated by the model during training and testing. Each of this can be a string (name of a built-in function), function or a tf.keras.metrics.Metric instance. 
              loss_weights=None, # default=None, Optional list or dictionary specifying scalar coefficients (Python floats) to weight the loss contributions of different model outputs.
              weighted_metrics=None, # default=None, List of metrics to be evaluated and weighted by sample_weight or class_weight during training and testing.
              run_eagerly=None, # Defaults to False. If True, this Model's logic will not be wrapped in a tf.function. Recommended to leave this as None unless your Model cannot be run inside a tf.function.
              steps_per_execution=None # Defaults to 1. The number of batches to run during each tf.function call. Running multiple batches inside a single tf.function call can greatly improve performance on TPUs or small models with a large Python overhead.
             )


# Fit keras model on the dataset
model_rnn.fit(x_train, y_train,
          batch_size=20, 
          epochs=5,
          verbose='auto')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x268425592e0>

In [49]:
y_pred_train_rnn = model.predict(x_train).flatten()
train_rmse_rnn = rmse(y_pred_train_rnn, y_train)
train_rmse_rnn



0.9306275655100446

In [50]:
y_pred_test_rnn = model.predict(x_test).flatten()
test_rmse_rnn = rmse(y_pred_test_rnn, y_test)
test_rmse_rnn




0.9343078376488643

## Comparison and Model Selection

In [51]:
model_name = pd.Series(['naive', 'linear regression', 'super vector', 'random forest', 'ANN', 'DNN', 'RNN'])
train_rmse = pd.Series([train_rmse_naive, train_rmse_lm, train_rmse_svm, train_rmse_rf,
                        train_rmse_ann, train_rmse_dnn, train_rmse_rnn])
test_rmse = pd.Series([test_rmse_naive, test_rmse_lm, test_rmse_svm, test_rmse_rf, 
                       test_rmse_ann, test_rmse_dnn, test_rmse_rnn])

model_comparison = pd.DataFrame({'model_name':model_name,
                                 'train_rmse':train_rmse,
                                 
                                 'test_rmse':test_rmse})

model_comparison.sort_values(by='test_rmse')

Unnamed: 0,model_name,train_rmse,test_rmse
1,linear regression,0.928215,0.930762
4,ANN,0.930628,0.934308
6,RNN,0.930628,0.934308
2,super vector,0.929169,0.93618
5,DNN,0.914878,0.970256
0,naive,1.000735,0.997854
3,random forest,0.443448,1.030747


## Intepretation

The best model is multiple linear model

In [52]:
top_labels = fii.params[p_values <= 0.4].sort_values(ascending = False)
top_labels[0:11]

content_lengths    2863.982033
City                 71.492822
Tile                 48.847375
Lotion               36.804364
Game                 33.937468
Super Mario          33.937468
Bottle               32.211304
Blackboard           25.365915
Snow                 19.591248
Snowflake            19.591248
Modern Art           13.602379
dtype: float64

In [53]:
top_labels[len(top_labels)-10:len(top_labels)]

Monitor             -131.564297
Hardware            -131.704574
Computer Hardware   -131.704574
Head                -133.193263
Face                -133.331862
Screen              -133.952495
Water               -136.362167
Sea                 -137.517037
Electronics         -145.208603
Person              -201.619434
dtype: float64

In [54]:
p_values[[p_value in top_labels.index for p_value in p_values.index]]

Children at Home    0.002235
Accessories         0.002233
Adult               0.002231
Advertisement       0.002217
Airport             0.002346
                      ...   
Wine                0.002332
Wine Glass          0.002332
Woman               0.002239
Wood                0.002475
content_lengths     0.002235
Name: P>|t|, Length: 237, dtype: float64

## For other customer group

In [55]:
len(customer_types)

5

In [56]:
def fit_linear_model(n):
    # filter out data and prepare
    ct2 = df.loc[df['Customer Type'] == customer_types[n],:]
    ct2.drop(['Customer Type'], axis=1, inplace=True)
    categorical_varaibles = ct2.dtypes[ct2.dtypes == 'object'].index
    ct2 = pd.get_dummies(ct2,
                        columns = categorical_varaibles,
                        drop_first=True)
    
    # scale and split
    scale= StandardScaler()
    scaled_ct2 = pd.DataFrame(scale.fit_transform(ct2), index=ct2.index, columns=ct2.columns)
    x_train, x_test, y_train, y_test = train_test_split(scaled_ct2.drop(columns=['rating']), scaled_ct2['rating'],
                                                       random_state=2222)
    
    mod = sm.OLS(y_train,x_train)
    fii = mod.fit()
    p_values = fii.summary2().tables[1]['P>|t|']
    
    return fii, p_values
    
    

In [57]:
from itertools import chain

def flatten_chain(matrix):
     return list(chain.from_iterable(matrix))

In [58]:
indexes = []
coefficient_values = []
coefficient_p_values = []

for n in range(1,5):
    fii, p_values = fit_linear_model(n)
    y_pred_train_lm = fii.predict(x_train)
    train_rmse_lm = rmse(y_pred_train_lm, y_train)
    print('train_rmse_lm',train_rmse_lm)
    y_pred_test_lm = fii.predict(x_test)
    test_rmse_lm = rmse(y_pred_test_lm, y_test)
    print('test_rmse_lm',test_rmse_lm)
    
    top_labels = fii.params[p_values <= 0.4].sort_values(ascending = False)
    
    # top labels
    indexes.append(top_labels[0:11].index)
    coefficient_values.append(top_labels[0:11].values)
    coefficient_p_values.append(p_values[[p_value in top_labels.index for p_value in p_values.index]][0:11].values)
    
    # bottom labels
    indexes.append(top_labels[len(top_labels)-10:len(top_labels)].index)
    coefficient_values.append(top_labels[len(top_labels)-10:len(top_labels)].values)
    coefficient_p_values.append(p_values[[p_value in top_labels.index for p_value in p_values.index]][len(top_labels)-10:len(top_labels)].values)
    print(len(flatten_chain(indexes)))

train_rmse_lm 1.0002611355184343
test_rmse_lm 1.0167834151281774
21
train_rmse_lm 0.9772270900184717
test_rmse_lm 0.9742656258845254
23
train_rmse_lm 0.9768973873817824
test_rmse_lm 0.9739444130181198
44
train_rmse_lm 0.9660902751240921
test_rmse_lm 0.9625545451439479
65


In [59]:
# important labels
impor_labels = pd.DataFrame({'index' : flatten_chain(indexes),
                             'coefficient' : flatten_chain(coefficient_values),
                             'p_value' : [round(num,4) for num in flatten_chain(coefficient_p_values)]})

In [60]:
impor_labels.sample(5)

Unnamed: 0,index,coefficient,p_value
48,Super Mario,107.673116,0.0
34,Monitor,-36.97413,0.3929
20,Person,-639.169013,0.1011
28,Super Mario,9.559905,0.3929
54,Painting,43.202116,0.0


In [61]:
# export
# impor_labels.to_csv('impor_labels.csv',index=False)