# VGG predictions
## Load the data

In [1]:
# Importing the libraries to load the data
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
import scipy
import scipy.stats
from scipy.io import loadmat

In [2]:
# Directory that holds the annotation pickles.
# Please provide your own path; "." (the current working directory) is only a
# runnable default so that the cell no longer fails with a SyntaxError.
path = "."
# os.listdir(path)

In [3]:
# Load the training and validation annotation dictionaries.
# - `with` guarantees that both file handles are closed again.
# - os.path.join builds the path portably instead of hard-coding a "\\" separator.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(path, "annotation_training.pkl"), "rb") as infile_annot_training:
    annotation_training = pickle.load(infile_annot_training, encoding = "latin1")

with open(os.path.join(path, "annotation_validation.pkl"), "rb") as infile_annot_validation:
    annotation_validation = pickle.load(infile_annot_validation, encoding = "latin1")

In [4]:
# Reading in the ethnicity and gender data and preparing it to merge with the
# other data frames: drop the YouTube id, lower-case the column names and
# rename the video-name column to the shared merge key "filenames".
# The workbook is read from the user-supplied `path` directory instead of a
# machine-specific absolute location, so the cell also runs on other machines.
excel_data = pd.read_excel(os.path.join(path, "eth_gender_anno_all.xlsx"))
excel_to_merge = excel_data.drop(['YouTubeID'], axis = 1)
excel_to_merge.columns = excel_to_merge.columns.str.lower()
# Re-assign instead of renaming in place so that re-running the cell is idempotent.
excel_to_merge = excel_to_merge.rename(columns = {"videoname" : "filenames"})

## VGG predictions
#### Loading the VGG features

In [5]:
# Directory that holds 'vggfer33fun.mat'.
# Please provide your own path; "." (the current working directory) is only a
# runnable default so that the cell no longer fails with a SyntaxError.
data_dir = "."

# NOTE: the variable keeps its historical "lgbptop" name, but it points at the VGG feature file.
lgbptop_file_path = os.path.join(data_dir, 'vggfer33fun.mat')
feats = loadmat(lgbptop_file_path)

# The MATLAB struct is read as a structured array; unpack every named field into a dict.
mdtype = feats['vggfer33fun'].dtype

vgg_ndata = {n: feats['vggfer33fun'][n][0, 0] for n in mdtype.names}

# Every filename entry is itself wrapped in an array; keep only its first element.
filenames = [filename[0] for filename in vgg_ndata['filename'].squeeze()]

data_vgg = {'filenames': filenames, 'features': vgg_ndata['data']}

print(vgg_ndata['data'].shape)
print(vgg_ndata['filename'].shape)

(8000, 20480)
(8000, 1)


#### Create the training set

In [6]:
# Turn the feature part of the VGG dictionary into a data frame so that it can
# be merged with the annotations later on.

# Work on a shallow copy of the dictionary, leaving `data_vgg` itself untouched
data_3 = dict(data_vgg)

# One row per video, one column per VGG feature
vgg_features_df = pd.DataFrame(data = data_3['features'])

In [7]:
# Work on a shallow copy of the dictionary, leaving `data_vgg` itself untouched
data_4 = dict(data_vgg)

# Single-column data frame with the video filenames (np.array() wrapper removed);
# the default column label 0 is renamed to the merge key "filenames"
vgg_filenames_df = pd.DataFrame(data_4['filenames']).rename(columns = {0 : "filenames"})

In [8]:
# Put the filenames next to their feature rows (column-wise concatenation) and preview the result
df_containing_vgg = pd.concat(objs = [vgg_filenames_df, vgg_features_df], axis = "columns")
df_containing_vgg.head(5)

Unnamed: 0,filenames,0,1,2,3,4,5,6,7,8,...,20470,20471,20472,20473,20474,20475,20476,20477,20478,20479
0,--Ymqszjv54.001.mp4,-0.055683,-0.236555,-19.559164,-0.36774,-0.235484,-8.412696,-0.449136,3.356551,-5.745764,...,-0.268699,-8.778057,-0.267969,-0.335641,-19.149986,0.558402,-7.428394,-3.032744,-18.559444,-0.137366
1,--Ymqszjv54.003.mp4,-0.166791,-0.243937,-16.41888,-0.372782,-0.237834,-6.789306,-0.43195,4.065738,-2.327925,...,-0.250965,-3.500271,-0.261685,-0.29848,-18.654526,3.006815,-7.294409,-2.013493,-15.928368,-0.127522
2,--Ymqszjv54.004.mp4,-0.095448,-0.242685,-16.341707,-0.371635,-0.249516,-12.727262,-0.463436,4.256013,-3.142007,...,-0.223702,0.982952,-0.277151,-0.332205,-14.339534,9.292833,-9.241794,1.561962,-13.497578,-0.114644
3,--Ymqszjv54.005.mp4,-0.344704,-0.252611,-16.521524,-0.392809,-0.26987,-14.835567,-0.472394,4.873293,-3.527074,...,-0.232808,0.702202,-0.284889,-0.318367,-14.908902,3.359067,-7.684659,1.701056,-11.071396,-0.111634
4,-2qsCrkXdWs.001.mp4,0.470167,-0.277294,2.179229,-0.303704,-0.193008,-2.502256,-0.419951,-1.081634,-5.31924,...,-0.206349,3.991546,-0.161194,-0.221222,2.620975,3.629155,-7.599644,-17.188986,-0.24365,-0.116243


In [9]:
# Sanity check: print the first two filenames together with their feature rows
# to verify that filenames and features are stored in the same order
for position in (0, 1):
    print(data_vgg['filenames'][position])
    print(data_vgg['features'][position])

--Ymqszjv54.001.mp4
[ -0.05568334  -0.2365552  -19.559164   ...  -3.0327437  -18.559444
  -0.13736597]
--Ymqszjv54.003.mp4
[ -0.1667905   -0.2439367  -16.41888    ...  -2.0134928  -15.928368
  -0.12752174]


In [10]:
# Build a two-column frame (filenames, interview) from the training annotations:
# the dictionary becomes a single row, is transposed so every video is a row,
# and the index holding the video names is moved into a regular column.
vgg_df_interview = (
    pd.DataFrame(annotation_training["interview"], index = [0])
    .T
    .reset_index()
    .rename(columns = {"index" : "filenames", 0 : "interview"})
)
print(vgg_df_interview.shape)

(6000, 2)


In [11]:
# Attach the VGG features to every annotated training video (left join on the filename)
vgg_1_train = vgg_df_interview.merge(df_containing_vgg, on = "filenames", how = "left")
print(vgg_1_train.shape)

# Checking for missing data in the training set:
# print the name of every column that holds at least one missing value.
# No output means no missing data in the training set.
for column, has_missing in vgg_1_train.isna().any().items():
    if has_missing:
        print(column)

(6000, 20482)


In [12]:
# Summary statistics of the training set (quartiles spelled out; these are the pandas defaults)
vgg_1_train.describe(percentiles = [0.25, 0.5, 0.75])

Unnamed: 0,interview,0,1,2,3,4,5,6,7,8,...,20470,20471,20472,20473,20474,20475,20476,20477,20478,20479
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.50378,0.270279,-0.285081,-11.687315,-0.325762,-0.207463,-8.980567,-0.445502,5.119049,-6.217986,...,-0.234988,-7.431848,-0.196004,-0.308717,-8.130709,-1.737559,-4.992703,-5.760758,-7.486003,-0.125419
std,0.150148,0.62493,0.065183,7.498257,0.059253,0.042329,7.412726,0.094788,8.331264,7.760859,...,0.056034,7.239368,0.042703,0.071241,7.204863,8.712741,7.813078,6.188989,6.417091,0.033693
min,0.0,-1.843723,-0.542196,-44.137951,-0.573281,-0.417524,-37.007336,-0.920841,-38.12236,-50.331089,...,-0.507402,-39.297752,-0.389587,-0.629192,-34.041065,-33.186592,-33.021378,-30.586414,-32.924835,-0.279788
25%,0.401869,-0.157689,-0.326142,-16.231661,-0.364254,-0.234987,-13.602871,-0.504585,-0.199337,-11.053427,...,-0.270404,-11.921796,-0.222966,-0.356357,-12.747404,-7.484886,-10.314631,-9.545395,-11.566561,-0.146197
50%,0.514019,0.253475,-0.281383,-11.121806,-0.323276,-0.206054,-8.674579,-0.441152,5.125813,-5.575384,...,-0.230486,-7.218858,-0.193964,-0.307008,-8.057387,-1.736244,-4.799426,-5.530936,-7.419476,-0.123316
75%,0.609813,0.689761,-0.239574,-6.617743,-0.286216,-0.177989,-3.944145,-0.379699,10.391348,-0.939446,...,-0.195276,-2.571614,-0.166588,-0.258341,-3.661532,3.86013,0.475111,-1.62388,-3.304805,-0.101691
max,1.0,2.983497,0.0,12.586001,0.0,0.0,16.620678,0.0,40.396881,17.629379,...,0.0,20.489363,0.0,0.0,29.269672,33.160877,20.251741,16.21549,17.485193,0.0


#### Create the test set similarly

In [13]:
# Build a two-column frame (filenames, interview) from the validation annotations:
# the dictionary becomes a single row, is transposed so every video is a row,
# and the index holding the video names is moved into a regular column.
vgg_df_interview_test = (
    pd.DataFrame(annotation_validation["interview"], index = [0])
    .T
    .reset_index()
    .rename(columns = {"index" : "filenames", 0 : "interview"})
)
print(vgg_df_interview_test.shape)

(2000, 2)


In [14]:
# Attach the VGG features to every annotated validation video (left join on the filename)
vgg_1_test = vgg_df_interview_test.merge(df_containing_vgg, on = "filenames", how = "left")
print(vgg_1_test.shape)

# Checking for missing data in the test set:
# print the name of every column that holds at least one missing value.
# No output means no missing data in the test set.
for column, has_missing in vgg_1_test.isna().any().items():
    if has_missing:
        print(column)

(2000, 20482)


In [15]:
# Summary statistics of the test set (quartiles spelled out; these are the pandas defaults)
vgg_1_test.describe(percentiles = [0.25, 0.5, 0.75])

Unnamed: 0,interview,0,1,2,3,4,5,6,7,8,...,20470,20471,20472,20473,20474,20475,20476,20477,20478,20479
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.50465,0.273111,-0.284625,-11.56966,-0.323263,-0.205865,-9.085543,-0.443995,5.054893,-6.17626,...,-0.234774,-7.407041,-0.194751,-0.30831,-8.286963,-1.771374,-4.944284,-5.808772,-7.13832,-0.125944
std,0.14516,0.629038,0.06521,7.365011,0.058012,0.041905,7.382323,0.094968,8.368805,7.933972,...,0.056063,7.177917,0.041924,0.070534,7.161254,8.784038,7.645644,6.02773,6.541465,0.03446
min,0.0,-1.67534,-0.575146,-47.315029,-0.531785,-0.376412,-43.38335,-0.885063,-35.396843,-47.439022,...,-0.451487,-32.617569,-0.373997,-0.572558,-37.71365,-34.47686,-30.38068,-30.756332,-34.868942,-0.269901
25%,0.411215,-0.167317,-0.324275,-16.23545,-0.360963,-0.234632,-13.711173,-0.503499,-0.009272,-11.17706,...,-0.269484,-11.950513,-0.219698,-0.35234,-12.904392,-7.654516,-10.270492,-9.617095,-11.398297,-0.146514
50%,0.514019,0.249495,-0.279998,-11.183249,-0.322512,-0.203339,-8.678718,-0.442501,4.887539,-5.539994,...,-0.23145,-7.304997,-0.193672,-0.305563,-8.325303,-1.733344,-4.808083,-5.514301,-7.028734,-0.123758
75%,0.607477,0.684074,-0.238581,-6.701227,-0.285047,-0.17756,-4.269777,-0.37624,10.044693,-0.860253,...,-0.195308,-2.860291,-0.166715,-0.261471,-3.550395,3.990892,0.476962,-1.767284,-2.823713,-0.101118
max,0.915888,2.195411,-0.090423,14.258253,-0.156801,-0.074755,15.895493,-0.186607,39.655376,18.683762,...,-0.090791,18.518583,-0.071463,-0.125923,25.900566,30.128595,17.002779,18.811586,23.617056,-0.04522


<h1 align="center"> Predictions with VGG facial data</h1> 
<h3 align="center"> Multiple Linear Regression</h3>

#### Models that will be trained:
- Model 1 = Job interview ~ Facial
- Model 2 = Job interview ~ Facial + Gender
- Model 3 = Job interview ~ Facial + Ethnicity
- Model 4 = Job interview ~ Facial + Gender + Ethnicity

## Model 1 - Interview ~ facial

#### Regression preparations Model 1

In [16]:
# Model 1 design matrices: every column except the filename and the target is a predictor,
# the 'interview' column is the target (kept two-dimensional).

# Training set
X_1_train = vgg_1_train.drop(columns = ['filenames', 'interview']).to_numpy()
y_1_train = vgg_1_train[['interview']].to_numpy()

# Validation (test) set
X_1_test = vgg_1_test.drop(columns = ['filenames', 'interview']).to_numpy()
y_1_test = vgg_1_test[['interview']].to_numpy()

#### Baseline prediction

In [17]:
# Baseline: predict the mean of the training outcome for every test sample and report 1 - MAE.
# The length of the constant prediction follows the test set instead of a hard-coded 2000,
# so the cell keeps working when the size of the test set changes.
1 - mean_absolute_error(y_1_test, np.full(len(y_1_test), np.mean(y_1_train)))

0.8830107663551402

#### Multiple linear regression Model 1

In [18]:
# Multiple linear regression on all facial features
regressor = LinearRegression()
regressor.fit(X_1_train, y_1_train)

# Predicted interview scores for the test set
y_1_pred = regressor.predict(X_1_test)

# Performance measures; the MAE is computed once and reused for "1 - MAE"
mlr_1_mse = mean_squared_error(y_1_test, y_1_pred)
mlr_1_mae = mean_absolute_error(y_1_test, y_1_pred)

print("MSE {:.4}".format(mlr_1_mse))
print("MAE {:.4}".format(mlr_1_mae))
print("1 - MAE {:.4}".format(1 - mlr_1_mae))

MSE 0.06561
MAE 0.1991
1 - MAE 0.8009


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [19]:
# Use the fitted regression coefficients as feature importances, flattened to one dimension
# (not necessary for Model 1, which has no added variables).
# To list every score, enumerate `importance` and print index and value.
importance = np.ravel(regressor.coef_)

In [20]:
# Smallest and largest coefficient, i.e. the range of the feature importances
print(np.min(importance))
print(np.max(importance))

-0.031157857
0.041963637


In [21]:
# PCA for linear regression
components = [500, 1000, 2000]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    X_1_train_mlr_PCA = pca.fit_transform(X_1_train)
    X_1_test_mlr_PCA = pca.transform(X_1_test)

    # NOTE: this refits the shared `regressor` on the reduced features
    regressor.fit(X_1_train_mlr_PCA, y_1_train)
    mlr_y_1_PCA = regressor.predict(X_1_test_mlr_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_1_test, mlr_y_1_PCA)
    pca_mae = mean_absolute_error(y_1_test, mlr_y_1_PCA)
    print("MLR with", component, "components, MSE Model 1 = {:.4}".format(pca_mse))
    print("MLR with", component, "components, MAE Model 1 = {:.4}".format(pca_mae))
    print("MLR with", component, "components, 1 - MAE Model 1 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 1 = 0.01385
MLR with 500 components, MAE Model 1 = 0.09424
MLR with 500 components, 1 - MAE Model 1 = 0.9058
-------------------------------------------
MLR with 1000 components, MSE Model 1 = 0.01404
MLR with 1000 components, MAE Model 1 = 0.09537
MLR with 1000 components, 1 - MAE Model 1 = 0.9046
-------------------------------------------
MLR with 2000 components, MSE Model 1 = 0.01456
MLR with 2000 components, MAE Model 1 = 0.09584
MLR with 2000 components, 1 - MAE Model 1 = 0.9042
-------------------------------------------


In [22]:
# PCA for linear regression Model 1 for additional testing
components = [500]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    X_1_train_mlr_PCA = pca.fit_transform(X_1_train)
    X_1_test_mlr_PCA = pca.transform(X_1_test)

    # NOTE: this refits the shared `regressor` on the reduced features
    regressor.fit(X_1_train_mlr_PCA, y_1_train)
    mlr_y_1_PCA = regressor.predict(X_1_test_mlr_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_1_test, mlr_y_1_PCA)
    pca_mae = mean_absolute_error(y_1_test, mlr_y_1_PCA)
    print("MLR with", component, "components, MSE Model 1 = {:.4}".format(pca_mse))
    print("MLR with", component, "components, MAE Model 1 = {:.4}".format(pca_mae))
    print("MLR with", component, "components, 1 - MAE Model 1 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 1 = 0.01371
MLR with 500 components, MAE Model 1 = 0.09371
MLR with 500 components, 1 - MAE Model 1 = 0.9063
-------------------------------------------


#### Random forest regression Model 1

In [23]:
# Random forest regression on all facial features (10 trees, fixed seed)
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)

# The target is flattened to one dimension, which is the format the forest expects
rfr.fit(X_1_train, np.ravel(y_1_train))
rfr_y_1 = rfr.predict(X_1_test)

# Performance measures; the MAE is computed once and reused for "1 - MAE"
rfr_1_mse = mean_squared_error(y_1_test, rfr_y_1)
rfr_1_mae = mean_absolute_error(y_1_test, rfr_y_1)

print("MSE {:.4}".format(rfr_1_mse))
print("MAE {:.4}".format(rfr_1_mae))
print("1 - MAE {:.4}".format(1 - rfr_1_mae))

MSE 0.01456
MAE 0.09535
1 - MAE 0.9046


In [24]:
# PCA for random forest regression
components = [500, 1000, 2000]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_1_train_PCA = pca.fit_transform(X_1_train)
    rfr_X_1_test_PCA = pca.transform(X_1_test)

    # NOTE: this refits the shared `rfr` on the reduced features
    rfr.fit(rfr_X_1_train_PCA, y_1_train.ravel())
    rfr_y_1_PCA = rfr.predict(rfr_X_1_test_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_1_test, rfr_y_1_PCA)
    pca_mae = mean_absolute_error(y_1_test, rfr_y_1_PCA)
    print("RFR with", component, "components, MSE Model 1 = {:.4}".format(pca_mse))
    print("RFR with", component, "components, MAE Model 1 = {:.4}".format(pca_mae))
    print("RFR with", component, "components, 1 - MAE Model 1 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

RFR with 500 components, MSE Model 1 = 0.01544
RFR with 500 components, MAE Model 1 = 0.09914
RFR with 500 components, 1 - MAE Model 1 = 0.9009
-------------------------------------------
RFR with 1000 components, MSE Model 1 = 0.01652
RFR with 1000 components, MAE Model 1 = 0.1036
RFR with 1000 components, 1 - MAE Model 1 = 0.8964
-------------------------------------------
RFR with 2000 components, MSE Model 1 = 0.01618
RFR with 2000 components, MAE Model 1 = 0.1017
RFR with 2000 components, 1 - MAE Model 1 = 0.8983
-------------------------------------------


#### Support vector regression Model 1

In [25]:
# Standardise the features for the SVR: the scaler learns its statistics on the
# training set only and is then applied to both the training and the test set
X_sc = StandardScaler().fit(X_1_train)
X_1_train_scale = X_sc.transform(X_1_train)
X_1_test_scale = X_sc.transform(X_1_test)

In [26]:
%%time

svr = SVR(kernel = 'rbf')
svr.fit(X_1_train_scale, y_1_train.ravel())
svr_y_1 = svr.predict(X_1_test_scale)

print("MSE {:.4}".format(mean_squared_error(y_1_test, svr_y_1)))
print("MAE {:.4}".format(mean_absolute_error(y_1_test, svr_y_1)))
print("1 - MAE {:.4}".format(1-mean_absolute_error(y_1_test, svr_y_1)))

MSE 0.01216
MAE 0.08813
1 - MAE 0.9119
Wall time: 8min 4s


In [27]:
# PCA for support vector regression
components = [500, 1000, 2000]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    X_1_train_PCA = pca.fit_transform(X_1_train_scale)
    X_1_test_PCA = pca.transform(X_1_test_scale)

    # NOTE: this refits the shared `svr` on the reduced features
    svr.fit(X_1_train_PCA, y_1_train.ravel())
    svr_y_1_PCA = svr.predict(X_1_test_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_1_test, svr_y_1_PCA)
    pca_mae = mean_absolute_error(y_1_test, svr_y_1_PCA)
    print("SVR with", component, "components, MSE Model 1 = {:.4}".format(pca_mse))
    print("SVR with", component, "components, MAE Model 1 = {:.4}".format(pca_mae))
    print("SVR with", component, "components, 1 - MAE Model 1 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

SVR with 500 components, MSE Model 1 = 0.01223
SVR with 500 components, MAE Model 1 = 0.08835
SVR with 500 components, 1 - MAE Model 1 = 0.9116
-------------------------------------------
SVR with 1000 components, MSE Model 1 = 0.01217
SVR with 1000 components, MAE Model 1 = 0.08807
SVR with 1000 components, 1 - MAE Model 1 = 0.9119
-------------------------------------------
SVR with 2000 components, MSE Model 1 = 0.01214
SVR with 2000 components, MAE Model 1 = 0.08788
SVR with 2000 components, 1 - MAE Model 1 = 0.9121
-------------------------------------------


## Model 2 - Interview ~ facial + gender
#### Creating the training and validation (test) set for Model 2

In [28]:
# Training set for Model 2: the facial features extended with the gender annotation
# (left join on the filename, so every training video is kept)
df_gen_int = excel_to_merge.loc[:, ['filenames', 'gender']]
training_face_gen = vgg_1_train.merge(df_gen_int, how = "left", on = "filenames")

In [29]:
# Validation/ test set for Model 2: the facial features extended with the gender annotation
# (left join on the filename, so every validation video is kept)
val_face_gen = vgg_1_test.merge(df_gen_int, how = "left", on = "filenames")

#### Regression preparations Model 2

In [30]:
%%time

# Splitting the (in)dependent variables
X_2_train = training_face_gen.drop(['filenames','interview'], axis=1).values
y_2_train = training_face_gen.loc[:,['interview']].values

# Splitting the (in)dependent variables of validation set
X_2_test = val_face_gen.drop(['filenames', 'interview'], axis=1).values
y_2_test = val_face_gen.loc[:,['interview']].values

# -------------------------------------------------------
# Make a dummy variable of the categorical values of gender
le = LabelEncoder()
X_2_train[:, -1] = le.fit_transform(X_2_train[:, -1])
X_2_test[:, -1] = le.fit_transform(X_2_test[:, -1])

Wall time: 1.13 s


#### Multiple linear regression Model 2

In [31]:
# Multiple linear regression on the facial features plus gender
regressor = LinearRegression()
regressor.fit(X_2_train, y_2_train)

# Predicted interview scores for the test set
y_2_pred = regressor.predict(X_2_test)

# Performance measures; the MAE is computed once and reused for "1 - MAE"
mlr_2_mse = mean_squared_error(y_2_test, y_2_pred)
mlr_2_mae = mean_absolute_error(y_2_test, y_2_pred)

print("MSE {:.4}".format(mlr_2_mse))
print("MAE {:.4}".format(mlr_2_mae))
print("1 - MAE {:.4}".format(1 - mlr_2_mae))

MSE 0.06529
MAE 0.1988
1 - MAE 0.8012


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [32]:
# Use the fitted regression coefficients as feature importances, flattened to one dimension.
# To list every score, enumerate `importance` and print index and value.
importance = np.ravel(regressor.coef_)

# The gender variable is the last column of the design matrix, hence the last coefficient
print(importance[importance.size - 1])

-0.0317044677696671


In [33]:
# Smallest and largest coefficient, i.e. the range of the feature importances
print(np.min(importance))
print(np.max(importance))

-0.0317044677696671
0.04209157119361864


In [34]:
# PCA for linear regression
components = [500, 1000, 2000]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    X_2_train_mlr_PCA = pca.fit_transform(X_2_train)
    X_2_test_mlr_PCA = pca.transform(X_2_test)

    # NOTE: this refits the shared `regressor` on the reduced features
    regressor.fit(X_2_train_mlr_PCA, y_2_train)
    mlr_y_2_PCA = regressor.predict(X_2_test_mlr_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_2_test, mlr_y_2_PCA)
    pca_mae = mean_absolute_error(y_2_test, mlr_y_2_PCA)
    print("MLR with", component, "components, MSE Model 2 = {:.4}".format(pca_mse))
    print("MLR with", component, "components, MAE Model 2 = {:.4}".format(pca_mae))
    print("MLR with", component, "components, 1 - MAE Model 2 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 2 = 0.01378
MLR with 500 components, MAE Model 2 = 0.09386
MLR with 500 components, 1 - MAE Model 2 = 0.9061
-------------------------------------------
MLR with 1000 components, MSE Model 2 = 0.01391
MLR with 1000 components, MAE Model 2 = 0.09461
MLR with 1000 components, 1 - MAE Model 2 = 0.9054
-------------------------------------------
MLR with 2000 components, MSE Model 2 = 0.01446
MLR with 2000 components, MAE Model 2 = 0.09554
MLR with 2000 components, 1 - MAE Model 2 = 0.9045
-------------------------------------------


In [35]:
# PCA for linear regression Model 2 for additional testing
components = [500]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    X_2_train_mlr_PCA = pca.fit_transform(X_2_train)
    X_2_test_mlr_PCA = pca.transform(X_2_test)

    # NOTE: this refits the shared `regressor` on the reduced features
    regressor.fit(X_2_train_mlr_PCA, y_2_train)
    mlr_y_2_PCA = regressor.predict(X_2_test_mlr_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_2_test, mlr_y_2_PCA)
    pca_mae = mean_absolute_error(y_2_test, mlr_y_2_PCA)
    print("MLR with", component, "components, MSE Model 2 = {:.4}".format(pca_mse))
    print("MLR with", component, "components, MAE Model 2 = {:.4}".format(pca_mae))
    print("MLR with", component, "components, 1 - MAE Model 2 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 2 = 0.01375
MLR with 500 components, MAE Model 2 = 0.09387
MLR with 500 components, 1 - MAE Model 2 = 0.9061
-------------------------------------------


#### Random forest regression Model 2

In [36]:
# Random forest regression on the facial features plus gender (10 trees, fixed seed)
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)

# The target is flattened to one dimension, which is the format the forest expects
rfr.fit(X_2_train, np.ravel(y_2_train))
rfr_y_2 = rfr.predict(X_2_test)

# Performance measures; the MAE is computed once and reused for "1 - MAE"
rfr_2_mse = mean_squared_error(y_2_test, rfr_y_2)
rfr_2_mae = mean_absolute_error(y_2_test, rfr_y_2)

print("MSE {:.4}".format(rfr_2_mse))
print("MAE {:.4}".format(rfr_2_mae))
print("1 - MAE {:.4}".format(1 - rfr_2_mae))

MSE 0.01459
MAE 0.0954
1 - MAE 0.9046


In [37]:
# PCA for random forest regression
components = [500, 1000, 2000]

for component in components:
    # Seed the PCA so that re-running the cell reproduces the same scores.
    # NOTE(review): with the default solver PCA may fall back to a randomized SVD for
    # inputs of this size, which makes unseeded runs differ slightly between executions.
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_2_train_PCA = pca.fit_transform(X_2_train)
    rfr_X_2_test_PCA = pca.transform(X_2_test)

    # NOTE: this refits the shared `rfr` on the reduced features
    rfr.fit(rfr_X_2_train_PCA, y_2_train.ravel())
    rfr_y_2_PCA = rfr.predict(rfr_X_2_test_PCA)

    # Compute every metric once and reuse it in the report
    pca_mse = mean_squared_error(y_2_test, rfr_y_2_PCA)
    pca_mae = mean_absolute_error(y_2_test, rfr_y_2_PCA)
    print("RFR with", component, "components, MSE Model 2 = {:.4}".format(pca_mse))
    print("RFR with", component, "components, MAE Model 2 = {:.4}".format(pca_mae))
    print("RFR with", component, "components, 1 - MAE Model 2 = {:.4}".format(1 - pca_mae))
    print("-------------------------------------------")

RFR with 500 components, MSE Model 2 = 0.01549
RFR with 500 components, MAE Model 2 = 0.09976
RFR with 500 components, 1 - MAE Model 2 = 0.9002
-------------------------------------------
RFR with 1000 components, MSE Model 2 = 0.01637
RFR with 1000 components, MAE Model 2 = 0.1032
RFR with 1000 components, 1 - MAE Model 2 = 0.8968
-------------------------------------------
RFR with 2000 components, MSE Model 2 = 0.01678
RFR with 2000 components, MAE Model 2 = 0.1041
RFR with 2000 components, 1 - MAE Model 2 = 0.8959
-------------------------------------------


#### Support vector regression Model 2

In [38]:
# Standardise the facial features for the SVR. The encoded gender column (last column)
# is excluded from scaling, so the scaler only ever sees the columns before it.
X_sc = StandardScaler().fit(X_2_train[:, :-1])

# Scaled copy of the training set; the gender column keeps its encoded value
X_2_train_scale = X_2_train.copy()
X_2_train_scale[:, :-1] = X_sc.transform(X_2_train[:, :-1])

# Scaled copy of the test set, using the statistics learned on the training set
X_2_test_scale = X_2_test.copy()
X_2_test_scale[:, :-1] = X_sc.transform(X_2_test[:, :-1])

In [39]:
%%time
# Support vector regression

svr = SVR(kernel = 'rbf')
svr.fit(X_2_train_scale, y_2_train.ravel())
svr_y_2 = svr.predict(X_2_test_scale)

print("MSE {:.4}".format(mean_squared_error(y_2_test, svr_y_2)))
print("MAE {:.4}".format(mean_absolute_error(y_2_test, svr_y_2)))
print("1 - MAE {:.4}".format(1-mean_absolute_error(y_2_test, svr_y_2)))

MSE 0.01217
MAE 0.08813
1 - MAE 0.9119
Wall time: 7min 40s


In [40]:
%%time
# PCA for support vector regression
components = [500, 1000, 2000]
    
for component in components:
    pca = PCA(n_components = component)   
    X_2_train_PCA = pca.fit_transform(X_2_train_scale)
    X_2_test_PCA = pca.transform(X_2_test_scale)
    
    svr.fit(X_2_train_PCA, y_2_train.ravel())
    svr_y_2_PCA = svr.predict(X_2_test_PCA)
    print("SVR with", component, "components, MSE Model 2 = {:.4}".format(mean_squared_error(y_2_test, svr_y_2_PCA)))
    print("SVR with", component, "components, MAE Model 2 = {:.4}".format(mean_absolute_error(y_2_test, svr_y_2_PCA)))
    print("SVR with", component, "components, 1 - MAE Model 2 = {:.4}".format(1-mean_absolute_error(y_2_test, svr_y_2_PCA)))
    print("-------------------------------------------")

SVR with 500 components, MSE Model 2 = 0.01223
SVR with 500 components, MAE Model 2 = 0.08832
SVR with 500 components, 1 - MAE Model 2 = 0.9117
-------------------------------------------
SVR with 1000 components, MSE Model 2 = 0.01218
SVR with 1000 components, MAE Model 2 = 0.08811
SVR with 1000 components, 1 - MAE Model 2 = 0.9119
-------------------------------------------
SVR with 2000 components, MSE Model 2 = 0.01213
SVR with 2000 components, MAE Model 2 = 0.08783
SVR with 2000 components, 1 - MAE Model 2 = 0.9122
-------------------------------------------
Wall time: 8min 7s


## Model 3 - Interview ~ facial + ethnicity
#### Creating the training and validation (test) set for Model 3

In [41]:
# Training set for Model 3: the facial features extended with the ethnicity annotation
# taken from the previously prepared annotation frame
# (left join on the filename, so every training video is kept)
df_eth_int = excel_to_merge.loc[:, ['filenames', 'ethnicity']]
training_face_eth = vgg_1_train.merge(df_eth_int, how = "left", on = "filenames")

In [42]:
# Validation/ test set for Model 3: the facial features extended with the ethnicity annotation
# (left join on the filename, so every validation video is kept)
val_face_eth = vgg_1_test.merge(df_eth_int, how = "left", on = "filenames")

#### Regression preparations Model 3

In [43]:
%%time

# Splitting the (in)dependent variables
X_3_train = training_face_eth.drop(['filenames', 'interview'], axis=1).values
y_3_train = training_face_eth.loc[:,['interview']].values

# Splitting the (in)dependent variables of validation set
X_3_test = val_face_eth.drop(['filenames', 'interview'], axis=1).values
y_3_test = val_face_eth.loc[:,['interview']].values

# -------------------------------------------------------
# Make a dummy variable of the categorical values of ethnicity
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
X_3_train = np.array(ct.fit_transform(X_3_train))
X_3_test = np.array(ct.fit_transform(X_3_test))

# -------------------------------------------------------
# Avoid dummy trap
X_3_train_dummy = X_3_train.copy()
X_3_train_dummy = X_3_train_dummy[:,1:]

X_3_test_dummy = X_3_test.copy()
X_3_test_dummy = X_3_test_dummy[:,1:]

Wall time: 12.5 s


#### Multiple linear regression Model 3

In [44]:
# Multiple linear regression on the ethnicity dummies plus the facial features
regressor = LinearRegression()
regressor.fit(X_3_train_dummy, y_3_train)

# Predicted interview scores for the test set
y_3_pred = regressor.predict(X_3_test_dummy)

# Performance measures; the MAE is computed once and reused for "1 - MAE"
mlr_3_mse = mean_squared_error(y_3_test, y_3_pred)
mlr_3_mae = mean_absolute_error(y_3_test, y_3_pred)

print("MSE {:.4}".format(mlr_3_mse))
print("MAE {:.4}".format(mlr_3_mae))
print("1 - MAE {:.4}".format(1 - mlr_3_mae))

MSE 0.06561
MAE 0.1991
1 - MAE 0.8009


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [45]:
# Use the fitted regression coefficients as feature importances, flattened to one dimension.
# To list every score, enumerate `importance` and print index and value.
importance = np.ravel(regressor.coef_)

# Positions of the ethnicity dummies: the first two coefficients
for dummy_coefficient in importance[:2]:
    print(dummy_coefficient)

-0.011388491580105306
0.030659034221820904


In [46]:
# Range of the coefficients: smallest value first, then the largest
for extreme in (importance.min(), importance.max()):
    print(extreme)

-0.03115504514916712
0.04160529942339224


In [47]:
# PCA for linear regression (Model 3).
# For each number of components: fit PCA on the training matrix, project the
# validation matrix with the same PCA, refit `regressor` and report the scores.
# Note: this refits the shared `regressor` object in place.
components = [500, 1000, 2000]

for component in components:
    # Seeded so that re-running the cell reproduces the printed scores;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    X_3_train_mlr_PCA = pca.fit_transform(X_3_train_dummy)
    X_3_test_mlr_PCA = pca.transform(X_3_test_dummy)

    regressor.fit(X_3_train_mlr_PCA, y_3_train)
    mlr_y_3_PCA = regressor.predict(X_3_test_mlr_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    mlr_3_PCA_mse = mean_squared_error(y_3_test, mlr_y_3_PCA)
    mlr_3_PCA_mae = mean_absolute_error(y_3_test, mlr_y_3_PCA)
    print("MLR with", component, "components, MSE Model 3 = {:.4}".format(mlr_3_PCA_mse))
    print("MLR with", component, "components, MAE Model 3 = {:.4}".format(mlr_3_PCA_mae))
    print("MLR with", component, "components, 1 - MAE Model 3 = {:.4}".format(1 - mlr_3_PCA_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 3 = 0.01377
MLR with 500 components, MAE Model 3 = 0.09381
MLR with 500 components, 1 - MAE Model 3 = 0.9062
-------------------------------------------
MLR with 1000 components, MSE Model 3 = 0.01402
MLR with 1000 components, MAE Model 3 = 0.09529
MLR with 1000 components, 1 - MAE Model 3 = 0.9047
-------------------------------------------
MLR with 2000 components, MSE Model 3 = 0.01452
MLR with 2000 components, MAE Model 3 = 0.09572
MLR with 2000 components, 1 - MAE Model 3 = 0.9043
-------------------------------------------


In [48]:
# PCA for linear regression Model 3 for additional testing.
# Only the 500-component setting is run, so that `mlr_y_3_PCA` ends up holding
# the 500-component predictions (used later for the related t-tests).
components = [500]

for component in components:
    # Seeded so that the scores and the stored predictions are reproducible;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    X_3_train_mlr_PCA = pca.fit_transform(X_3_train_dummy)
    X_3_test_mlr_PCA = pca.transform(X_3_test_dummy)

    regressor.fit(X_3_train_mlr_PCA, y_3_train)
    mlr_y_3_PCA = regressor.predict(X_3_test_mlr_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    mlr_3_PCA_mse = mean_squared_error(y_3_test, mlr_y_3_PCA)
    mlr_3_PCA_mae = mean_absolute_error(y_3_test, mlr_y_3_PCA)
    print("MLR with", component, "components, MSE Model 3 = {:.4}".format(mlr_3_PCA_mse))
    print("MLR with", component, "components, MAE Model 3 = {:.4}".format(mlr_3_PCA_mae))
    print("MLR with", component, "components, 1 - MAE Model 3 = {:.4}".format(1 - mlr_3_PCA_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 3 = 0.0138
MLR with 500 components, MAE Model 3 = 0.09415
MLR with 500 components, 1 - MAE Model 3 = 0.9059
-------------------------------------------


#### Random forest regression Model 3

In [49]:
# Random forest regression (Model 3)

# .ravel() flattens the single-column target into the 1-D format used for fitting
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0).fit(X_3_train_dummy, y_3_train.ravel())

# Predict the validation set and report each metric (evaluated once)
rfr_y_3 = rfr.predict(X_3_test_dummy)
rfr_3_mse = mean_squared_error(y_3_test, rfr_y_3)
rfr_3_mae = mean_absolute_error(y_3_test, rfr_y_3)
print("MSE {:.4}".format(rfr_3_mse))
print("MAE {:.4}".format(rfr_3_mae))
print("1 - MAE {:.4}".format(1 - rfr_3_mae))

MSE 0.0146
MAE 0.09543
1 - MAE 0.9046


In [50]:
# PCA for random forest regression (Model 3).
# For each number of components: fit PCA on the training matrix, project the
# validation matrix with the same PCA, refit `rfr` and report the scores.
# Note: this refits the shared `rfr` object in place.
components = [500, 1000, 2000]

for component in components:
    # Seeded so that re-running the cell reproduces the printed scores;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_3_train_PCA = pca.fit_transform(X_3_train_dummy)
    rfr_X_3_test_PCA = pca.transform(X_3_test_dummy)

    rfr.fit(rfr_X_3_train_PCA, y_3_train.ravel())
    rfr_y_3_PCA = rfr.predict(rfr_X_3_test_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    rfr_3_PCA_mse = mean_squared_error(y_3_test, rfr_y_3_PCA)
    rfr_3_PCA_mae = mean_absolute_error(y_3_test, rfr_y_3_PCA)
    print("RFR with", component, "components, MSE Model 3 = {:.4}".format(rfr_3_PCA_mse))
    print("RFR with", component, "components, MAE Model 3 = {:.4}".format(rfr_3_PCA_mae))
    print("RFR with", component, "components, 1 - MAE Model 3 = {:.4}".format(1 - rfr_3_PCA_mae))
    print("-------------------------------------------")

RFR with 500 components, MSE Model 3 = 0.01571
RFR with 500 components, MAE Model 3 = 0.09962
RFR with 500 components, 1 - MAE Model 3 = 0.9004
-------------------------------------------
RFR with 1000 components, MSE Model 3 = 0.0164
RFR with 1000 components, MAE Model 3 = 0.1021
RFR with 1000 components, 1 - MAE Model 3 = 0.8979
-------------------------------------------
RFR with 2000 components, MSE Model 3 = 0.0164
RFR with 2000 components, MAE Model 3 = 0.1021
RFR with 2000 components, 1 - MAE Model 3 = 0.8979
-------------------------------------------


#### Support vector regression Model 3

In [51]:
# Feature scaling for the SVR (Model 3)
# Everything is scaled except the two leading columns, which hold the
# ethnicity dummies.
scaled_cols_3 = np.s_[:, 2:]

# Learn the scaling parameters from the training set only
X_sc = StandardScaler().fit(X_3_train_dummy[scaled_cols_3])

# Work on copies so that the unscaled matrices stay available
X_3_train_scale = X_3_train_dummy.copy()
X_3_train_scale[scaled_cols_3] = X_sc.transform(X_3_train_dummy[scaled_cols_3])

X_3_test_scale = X_3_test_dummy.copy()
X_3_test_scale[scaled_cols_3] = X_sc.transform(X_3_test_dummy[scaled_cols_3])

In [52]:
%%time
# Support vector regression (Model 3), RBF kernel, fitted on the scaled features

svr = SVR(kernel = 'rbf').fit(X_3_train_scale, y_3_train.ravel())
svr_y_3 = svr.predict(X_3_test_scale)

# Each metric is evaluated once and then reported
svr_3_mse = mean_squared_error(y_3_test, svr_y_3)
svr_3_mae = mean_absolute_error(y_3_test, svr_y_3)
print("MSE {:.4}".format(svr_3_mse))
print("MAE {:.4}".format(svr_3_mae))
print("1 - MAE {:.4}".format(1 - svr_3_mae))

MSE 0.01217
MAE 0.08813
1 - MAE 0.9119
Wall time: 7min 41s


In [53]:
%%time
# PCA for support vector regression (Model 3).
# For each number of components: fit PCA on the scaled training matrix, project
# the scaled validation matrix with the same PCA, refit `svr` and report scores.
# Note: this refits the shared `svr` object in place.
components = [500, 1000, 2000]

for component in components:
    # Seeded so that re-running the cell reproduces the printed scores;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    X_3_train_PCA = pca.fit_transform(X_3_train_scale)
    X_3_test_PCA = pca.transform(X_3_test_scale)

    svr.fit(X_3_train_PCA, y_3_train.ravel())
    svr_y_3_PCA = svr.predict(X_3_test_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    svr_3_PCA_mse = mean_squared_error(y_3_test, svr_y_3_PCA)
    svr_3_PCA_mae = mean_absolute_error(y_3_test, svr_y_3_PCA)
    print("SVR with", component, "components, MSE Model 3 = {:.4}".format(svr_3_PCA_mse))
    print("SVR with", component, "components, MAE Model 3 = {:.4}".format(svr_3_PCA_mae))
    print("SVR with", component, "components, 1 - MAE Model 3 = {:.4}".format(1 - svr_3_PCA_mae))
    print("-------------------------------------------")

SVR with 500 components, MSE Model 3 = 0.01224
SVR with 500 components, MAE Model 3 = 0.08832
SVR with 500 components, 1 - MAE Model 3 = 0.9117
-------------------------------------------
SVR with 1000 components, MSE Model 3 = 0.01219
SVR with 1000 components, MAE Model 3 = 0.08812
SVR with 1000 components, 1 - MAE Model 3 = 0.9119
-------------------------------------------
SVR with 2000 components, MSE Model 3 = 0.01214
SVR with 2000 components, MAE Model 3 = 0.08789
SVR with 2000 components, 1 - MAE Model 3 = 0.9121
-------------------------------------------
Wall time: 8min 13s


## Model 4 - Interview ~ facial + gender + ethnicity
#### Creating the training and validation (test) set for Model 4

In [54]:
# Training set for Model 4: left-join the training frame with a lookup table
# holding 'ethnicity' and 'gender' per file name (the join key is 'filenames').

df_gen_eth_int = excel_to_merge.loc[:, ['filenames', 'ethnicity', 'gender']]
training_face_gen_eth = vgg_1_train.merge(df_gen_eth_int, how = "left", on = ["filenames"])
#training_face_gen_eth.head()

In [55]:
# Validation/test set for Model 4: same left join on 'filenames' as for the training set
val_face_gen_eth = vgg_1_test.merge(df_gen_eth_int, how = "left", on = ["filenames"])
#val_face_gen_eth.head()

#### Regression preparations Model 4

In [56]:
%%time

# Regression preparations for Model 4.
# Produces X_4_train_dummy / X_4_test_dummy (design matrices) and
# y_4_train / y_4_test (targets kept as single-column 2-D arrays).

# Splitting the (in)dependent variables
X_4_train = training_face_gen_eth.drop(['filenames', 'interview'], axis=1).values
y_4_train = training_face_gen_eth.loc[:, ['interview']].values

# Splitting the (in)dependent variables of validation set
X_4_test = val_face_gen_eth.drop(['filenames', 'interview'], axis=1).values
y_4_test = val_face_gen_eth.loc[:, ['interview']].values

# -------------------------------------------------------
# Make a dummy variable of the categorical values of gender
# (assumes gender is the last column, hence index -1 -- TODO confirm).
le = LabelEncoder()
X_4_train[:, -1] = le.fit_transform(X_4_train[:, -1])
# Re-use the label mapping learned on the training set; re-fitting on the
# validation set could assign different integer codes to the same label.
X_4_test[:, -1] = le.transform(X_4_test[:, -1])

# -------------------------------------------------------
# Make a dummy variable of the categorical values of ethnicity
# (assumes ethnicity is the second-to-last column, hence index -2 -- TODO confirm).
# With remainder='passthrough' the dummy columns come first in the output.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-2])], remainder='passthrough')
X_4_train = ct.fit_transform(X_4_train)
# Re-use the encoder fitted on the training set instead of re-fitting it on the
# validation set, so both matrices share the same dummy columns.
X_4_test = ct.transform(X_4_test)     # np.array() wrapper around ct... removed

# -------------------------------------------------------
# Avoid dummy trap: drop the first dummy column
X_4_train_dummy = X_4_train[:, 1:].copy()
X_4_test_dummy = X_4_test[:, 1:].copy()

Wall time: 12.4 s


#### Multiple linear regression Model 4

In [57]:
# Multiple linear regression on the Model 4 design matrix
regressor = LinearRegression().fit(X_4_train_dummy, y_4_train)

# -------------------------------------------------------
# Predictions for the validation (test) set
y_4_pred = regressor.predict(X_4_test_dummy)

# -------------------------------------------------------
# Performance scores; each metric is evaluated once and then reported
mlr_4_mse = mean_squared_error(y_4_test, y_4_pred)
mlr_4_mae = mean_absolute_error(y_4_test, y_4_pred)

print("MSE {:.4}".format(mlr_4_mse))
print("MAE {:.4}".format(mlr_4_mae))
print("1 - MAE {:.4}".format(1 - mlr_4_mae))

MSE 0.06531
MAE 0.1989
1 - MAE 0.8011


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [58]:
# Coefficients of the fitted linear model, flattened to 1-D, are used as a
# rough measure of the importance of the added variable(s)
importance = regressor.coef_.ravel()

# Positions of the ethnicity dummies (0 and 1) and the gender dummy (-1)
for position in (0, 1, -1):
    print(importance[position])

# To list every coefficient instead:
#for i,j in enumerate(importance):
    #print('Feature: %0d, Score: %.5f' % (i,j))

-0.010816691292975498
0.03106467110895475
-0.03164995838259116


In [59]:
# Range of the coefficients: smallest value first, then the largest
for extreme in (importance.min(), importance.max()):
    print(extreme)

-0.03164995838259116
0.041891539759025015


In [60]:
# PCA for linear regression (Model 4).
# For each number of components: fit PCA on the training matrix, project the
# validation matrix with the same PCA, refit `regressor` and report the scores.
# Note: this refits the shared `regressor` object in place.
components = [500, 1000, 2000]

for component in components:
    # Seeded so that re-running the cell reproduces the printed scores;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    X_4_train_mlr_PCA = pca.fit_transform(X_4_train_dummy)
    X_4_test_mlr_PCA = pca.transform(X_4_test_dummy)

    regressor.fit(X_4_train_mlr_PCA, y_4_train)
    mlr_y_4_PCA = regressor.predict(X_4_test_mlr_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    mlr_4_PCA_mse = mean_squared_error(y_4_test, mlr_y_4_PCA)
    mlr_4_PCA_mae = mean_absolute_error(y_4_test, mlr_y_4_PCA)
    print("MLR with", component, "components, MSE Model 4 = {:.4}".format(mlr_4_PCA_mse))
    print("MLR with", component, "components, MAE Model 4 = {:.4}".format(mlr_4_PCA_mae))
    print("MLR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1 - mlr_4_PCA_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 4 = 0.01363
MLR with 500 components, MAE Model 4 = 0.09345
MLR with 500 components, 1 - MAE Model 4 = 0.9065
-------------------------------------------
MLR with 1000 components, MSE Model 4 = 0.01394
MLR with 1000 components, MAE Model 4 = 0.09482
MLR with 1000 components, 1 - MAE Model 4 = 0.9052
-------------------------------------------
MLR with 2000 components, MSE Model 4 = 0.01461
MLR with 2000 components, MAE Model 4 = 0.09608
MLR with 2000 components, 1 - MAE Model 4 = 0.9039
-------------------------------------------


In [61]:
# PCA for linear regression Model 4 for additional testing.
# Only the 500-component setting is run, so that `mlr_y_4_PCA` ends up holding
# the 500-component predictions (used later for the related t-tests).
components = [500]

for component in components:
    # Seeded so that the scores and the stored predictions are reproducible;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    X_4_train_mlr_PCA = pca.fit_transform(X_4_train_dummy)
    X_4_test_mlr_PCA = pca.transform(X_4_test_dummy)

    regressor.fit(X_4_train_mlr_PCA, y_4_train)
    mlr_y_4_PCA = regressor.predict(X_4_test_mlr_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    mlr_4_PCA_mse = mean_squared_error(y_4_test, mlr_y_4_PCA)
    mlr_4_PCA_mae = mean_absolute_error(y_4_test, mlr_y_4_PCA)
    print("MLR with", component, "components, MSE Model 4 = {:.4}".format(mlr_4_PCA_mse))
    print("MLR with", component, "components, MAE Model 4 = {:.4}".format(mlr_4_PCA_mae))
    print("MLR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1 - mlr_4_PCA_mae))
    print("-------------------------------------------")

MLR with 500 components, MSE Model 4 = 0.01379
MLR with 500 components, MAE Model 4 = 0.09391
MLR with 500 components, 1 - MAE Model 4 = 0.9061
-------------------------------------------


#### Random forest regression Model 4

In [62]:
# Random forest regression (Model 4)

# .ravel() flattens the single-column target into the 1-D format used for fitting
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0).fit(X_4_train_dummy, y_4_train.ravel())

# Predict the validation set and report each metric (evaluated once)
rfr_y_4 = rfr.predict(X_4_test_dummy)
rfr_4_mse = mean_squared_error(y_4_test, rfr_y_4)
rfr_4_mae = mean_absolute_error(y_4_test, rfr_y_4)
print("MSE {:.4}".format(rfr_4_mse))
print("MAE {:.4}".format(rfr_4_mae))
print("1 - MAE {:.4}".format(1 - rfr_4_mae))

MSE 0.01462
MAE 0.09546
1 - MAE 0.9045


In [63]:
# PCA for random forest regression (Model 4).
# For each number of components: fit PCA on the training matrix, project the
# validation matrix with the same PCA, refit `rfr` and report the scores.
# Note: this refits the shared `rfr` object in place.
components = [500, 1000, 2000]

for component in components:
    # Seeded so that re-running the cell reproduces the printed scores;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_4_train_PCA = pca.fit_transform(X_4_train_dummy)
    rfr_X_4_test_PCA = pca.transform(X_4_test_dummy)

    rfr.fit(rfr_X_4_train_PCA, y_4_train.ravel())
    rfr_y_4_PCA = rfr.predict(rfr_X_4_test_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    rfr_4_PCA_mse = mean_squared_error(y_4_test, rfr_y_4_PCA)
    rfr_4_PCA_mae = mean_absolute_error(y_4_test, rfr_y_4_PCA)
    print("RFR with", component, "components, MSE Model 4 = {:.4}".format(rfr_4_PCA_mse))
    print("RFR with", component, "components, MAE Model 4 = {:.4}".format(rfr_4_PCA_mae))
    print("RFR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1 - rfr_4_PCA_mae))
    print("-------------------------------------------")

RFR with 500 components, MSE Model 4 = 0.01543
RFR with 500 components, MAE Model 4 = 0.09895
RFR with 500 components, 1 - MAE Model 4 = 0.901
-------------------------------------------
RFR with 1000 components, MSE Model 4 = 0.01634
RFR with 1000 components, MAE Model 4 = 0.1029
RFR with 1000 components, 1 - MAE Model 4 = 0.8971
-------------------------------------------
RFR with 2000 components, MSE Model 4 = 0.0166
RFR with 2000 components, MAE Model 4 = 0.1033
RFR with 2000 components, 1 - MAE Model 4 = 0.8967
-------------------------------------------


#### Support vector regression Model 4

In [64]:
# Feature scaling for the SVR (Model 4)
# Everything is scaled except the two leading columns (ethnicity dummies)
# and the last column (gender dummy).
scaled_cols_4 = np.s_[:, 2:-1]

# Learn the scaling parameters from the training set only
X_sc = StandardScaler().fit(X_4_train_dummy[scaled_cols_4])

# Work on copies so that the unscaled matrices stay available
X_4_train_scale = X_4_train_dummy.copy()
X_4_train_scale[scaled_cols_4] = X_sc.transform(X_4_train_dummy[scaled_cols_4])

X_4_test_scale = X_4_test_dummy.copy()
X_4_test_scale[scaled_cols_4] = X_sc.transform(X_4_test_dummy[scaled_cols_4])

In [65]:
%%time
# Support vector regression (Model 4), RBF kernel, fitted on the scaled features

svr = SVR(kernel = 'rbf').fit(X_4_train_scale, y_4_train.ravel())
svr_y_4 = svr.predict(X_4_test_scale)

# Each metric is evaluated once and then reported
svr_4_mse = mean_squared_error(y_4_test, svr_y_4)
svr_4_mae = mean_absolute_error(y_4_test, svr_y_4)
print("MSE {:.4}".format(svr_4_mse))
print("MAE {:.4}".format(svr_4_mae))
print("1 - MAE {:.4}".format(1 - svr_4_mae))

MSE 0.01217
MAE 0.08813
1 - MAE 0.9119
Wall time: 7min 41s


In [66]:
%%time
# PCA for support vector regression (Model 4).
# For each number of components: fit PCA on the scaled training matrix, project
# the scaled validation matrix with the same PCA, refit `svr` and report scores.
# Note: this refits the shared `svr` object in place.
components = [500, 1000, 2000]

for component in components:
    # Seeded so that re-running the cell reproduces the printed scores;
    # an unseeded PCA can give slightly different projections per run.
    pca = PCA(n_components = component, random_state = 0)
    X_4_train_PCA = pca.fit_transform(X_4_train_scale)
    X_4_test_PCA = pca.transform(X_4_test_scale)

    svr.fit(X_4_train_PCA, y_4_train.ravel())
    svr_y_4_PCA = svr.predict(X_4_test_PCA)

    # Evaluate each metric once and reuse it for the "1 - MAE" line
    svr_4_PCA_mse = mean_squared_error(y_4_test, svr_y_4_PCA)
    svr_4_PCA_mae = mean_absolute_error(y_4_test, svr_y_4_PCA)
    print("SVR with", component, "components, MSE Model 4 = {:.4}".format(svr_4_PCA_mse))
    print("SVR with", component, "components, MAE Model 4 = {:.4}".format(svr_4_PCA_mae))
    print("SVR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1 - svr_4_PCA_mae))
    print("-------------------------------------------")

SVR with 500 components, MSE Model 4 = 0.01226
SVR with 500 components, MAE Model 4 = 0.08839
SVR with 500 components, 1 - MAE Model 4 = 0.9116
-------------------------------------------
SVR with 1000 components, MSE Model 4 = 0.01217
SVR with 1000 components, MAE Model 4 = 0.08804
SVR with 1000 components, 1 - MAE Model 4 = 0.912
-------------------------------------------
SVR with 2000 components, MSE Model 4 = 0.01214
SVR with 2000 components, MAE Model 4 = 0.0879
SVR with 2000 components, 1 - MAE Model 4 = 0.9121
-------------------------------------------
Wall time: 8min 16s


-------------

## Extra testing

#### T-tests between the models that have slight differences in MAEs

In [67]:
# Related-samples t-test between the linear-regression predictions of Models 1 and 3.
# NOTE(review): this compares the predictions themselves, not the per-sample
# errors -- confirm this is the intended test for a difference in MAE.
scipy.stats.ttest_rel(a=y_1_pred, b=y_3_pred)

Ttest_relResult(statistic=array([-2.01252288]), pvalue=array([0.04429873]))

In [68]:
# Related t-test MLR PCA
# Snapshot the current PCA-based MLR predictions of each model.
# presumably each mlr_y_*_PCA holds the 500-component run at this point -- verify
MLR_1_500 = mlr_y_1_PCA.copy()
MLR_2_500 = mlr_y_2_PCA.copy()
MLR_3_500 = mlr_y_3_PCA.copy()
MLR_4_500 = mlr_y_4_PCA.copy()

# p-values of Model 1 against Models 2, 3 and 4, in that order
for other_500 in (MLR_2_500, MLR_3_500, MLR_4_500):
    print(scipy.stats.ttest_rel(MLR_1_500, other_500).pvalue)

[0.80233428]
[0.55279233]
[0.51086652]


In [69]:
# Related t-test RFR: p-value for the random forest predictions of models 1 and 4
rfr_1_vs_4 = scipy.stats.ttest_rel(rfr_y_1, rfr_y_4)
print(rfr_1_vs_4.pvalue)

0.576340820033024
