# LGBP-TOP predictions

## Load the data

In [1]:
# Importing the libraries to load the data
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
import scipy
import scipy.stats
from scipy.io import loadmat

In [2]:
# Directory that contains annotation_training.pkl and annotation_validation.pkl.
# Set the ANNOTATION_DIR environment variable or edit the fallback value below;
# a bare "path = " followed only by a comment is a SyntaxError and stops the notebook.
path = os.environ.get("ANNOTATION_DIR", ".")  # Please provide own path
#os.listdir(path)

In [3]:
# Load the pickled annotation dictionaries for the training and validation splits.
# os.path.join keeps the paths portable (the original "\\" separator only works on Windows)
# and the context managers make sure both files are closed again after loading.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted annotation files.
with open(os.path.join(path, "annotation_training.pkl"), "rb") as infile_annot_training:
    annotation_training = pickle.load(infile_annot_training, encoding = "latin1")

with open(os.path.join(path, "annotation_validation.pkl"), "rb") as infile_annot_validation:
    annotation_validation = pickle.load(infile_annot_validation, encoding = "latin1")

In [4]:
# Load the ethnicity and gender annotations and shape them so they can be
# merged with the other data frames on the "filenames" column.
# NOTE(review): hard-coded absolute local path - presumably this should be configurable.
excel_data = pd.read_excel(r"C:\Users\Marie-Claire\Downloads\eth_gender_anno_all.xlsx")

# Drop the YouTube id, lower-case all headers, then align the key column name
excel_to_merge = excel_data.drop(columns = ['YouTubeID'])
excel_to_merge.columns = excel_to_merge.columns.str.lower()
excel_to_merge = excel_to_merge.rename(columns = {"videoname" : "filenames"})

## LGBP-TOP predictions

#### Loading the lgbptop features

In [5]:
# Directory that contains lgbptop.mat.
# Set the LGBPTOP_DIR environment variable or edit the fallback value below;
# a bare "data_dir = " followed only by a comment is a SyntaxError and stops the notebook.
data_dir = os.environ.get("LGBPTOP_DIR", ".")  # Please provide own path

lgbptop_file_path = os.path.join(data_dir, 'lgbptop.mat')
feats = loadmat(lgbptop_file_path)

# feats['lgbptop'] has a structured dtype: unpack every named field into a plain dict
mdtype = feats['lgbptop'].dtype

lgbptop_ndata = {n: feats['lgbptop'][n][0, 0] for n in mdtype.names}

# Each filename entry is wrapped in a container; keep only its first element
filenames = [filename[0] for filename in lgbptop_ndata['filename'].squeeze()]

# Filenames and feature rows are kept together and are expected to share the same order
data = {'filenames': filenames, 'features': lgbptop_ndata['data']}

#pickle.dump(data, open(os.path.join(data_dir, 'lgbptop.pkl'), 'wb'))

In [6]:
# Display the dimensions of the loaded feature matrix as a quick sanity check
lgbptop_ndata['data'].shape

(8000, 50112)

#### Create the training set

In [7]:
# Turn the feature matrix into a DataFrame so it can be merged later on.
# The dict copy is shallow: data_1 still shares the underlying objects with `data`.
data_1 = dict(data)

features_df = pd.DataFrame(data = data_1['features'])

In [8]:
# Shallow copy of the dict, kept separate from the one used for the features
data_2 = dict(data)

# One-column DataFrame holding the filenames, named so it can act as the merge key
filenames_df = pd.DataFrame(data_2['filenames'], columns = ["filenames"])

In [9]:
# Place the filenames next to their feature vectors (rows are aligned on the index)
df_containing_lgbptop = pd.concat(objs = [filenames_df, features_df], axis = "columns")

In [10]:
# Spot-check the first two entries: each filename should be followed by its feature row
for position in (0, 1):
    print(data['filenames'][position])
    print(data['features'][position])

--Ymqszjv54.001.mp4
[0.16947712 0.14913054 0.13073076 ... 0.3226819  0.32539284 0.29858854]
--Ymqszjv54.003.mp4
[0.15545914 0.14307977 0.1280805  ... 0.29946554 0.30811465 0.2924719 ]


In [11]:
# Reshape the {filename: score} annotation dict into a two-column frame
# with one row per video: "filenames" and "interview"
df_interview = (
    pd.DataFrame(annotation_training["interview"], index=[0])
    .T
    .reset_index()
    .rename(columns = {"index" : "filenames", 0 : "interview"})
)
print(df_interview.shape)

(6000, 2)


In [12]:
# Attach the facial features to every annotated training video (left join on the filename)
lgbptop_1_train = df_interview.merge(df_containing_lgbptop, how = "left", on = "filenames")
print(lgbptop_1_train.shape)

(6000, 50114)


In [13]:
# Checking for missing data in the training set:
# collect every column that contains at least one null value and print its name
train_columns_with_missing = [
    column for column in lgbptop_1_train.columns
    if lgbptop_1_train[column].isnull().any()
]
for column in train_columns_with_missing:
    print(column)                           # No output means no missing data in the training set

In [14]:
# Summary statistics per numeric column, used to eyeball value ranges in the training set
lgbptop_1_train.describe()

Unnamed: 0,interview,0,1,2,3,4,5,6,7,8,...,50102,50103,50104,50105,50106,50107,50108,50109,50110,50111
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,0.50378,0.131773,0.130794,0.104524,0.103676,0.140296,0.134568,0.156796,0.144616,0.123416,...,0.270877,0.284464,0.280694,0.293382,0.293395,0.297859,0.310077,0.30851,0.312179,0.292185
std,0.150148,0.021704,0.018125,0.021917,0.023105,0.022883,0.012633,0.014479,0.01715,0.023376,...,0.024335,0.025905,0.027614,0.026429,0.026354,0.0278,0.028314,0.029683,0.030611,0.024684
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.401869,0.118181,0.11838,0.087601,0.087156,0.128297,0.127144,0.148242,0.13578,0.107362,...,0.253954,0.267186,0.261962,0.275575,0.275043,0.27802,0.290144,0.28704,0.290923,0.275801
50%,0.514019,0.132467,0.130464,0.105628,0.104645,0.142826,0.135303,0.158083,0.1468,0.122332,...,0.269839,0.283934,0.280072,0.292645,0.291822,0.296793,0.3092,0.307809,0.310607,0.291694
75%,0.609813,0.146711,0.143214,0.120693,0.120342,0.155087,0.142602,0.166577,0.156039,0.139146,...,0.285992,0.300982,0.297976,0.309925,0.309842,0.316117,0.328546,0.327458,0.331735,0.307382
max,1.0,0.199617,0.20112,0.190379,0.177261,0.221328,0.177877,0.198703,0.208058,0.215836,...,0.397753,0.402384,0.405589,0.421252,0.432532,0.444969,0.43334,0.422226,0.455183,0.406028


#### Create the test set similarly

In [15]:
# Reshape the validation {filename: score} annotation dict into a two-column frame
# with one row per video: "filenames" and "interview"
df_interview_test = (
    pd.DataFrame(annotation_validation["interview"], index=[0])
    .T
    .reset_index()
    .rename(columns = {"index" : "filenames", 0 : "interview"})
)
print(df_interview_test.shape)

(2000, 2)


In [16]:
# Attach the facial features to every annotated validation video (left join on the filename)
lgbptop_1_test = df_interview_test.merge(df_containing_lgbptop, how = "left", on = "filenames")
print(lgbptop_1_test.shape)

(2000, 50114)


In [17]:
# Checking for missing data in the test set:
# collect every column that contains at least one null value and print its name
test_columns_with_missing = [
    column for column in lgbptop_1_test.columns
    if lgbptop_1_test[column].isnull().any()
]
for column in test_columns_with_missing:
    print(column)                           # No output means no missing data in the test set

In [18]:
# Summary statistics per numeric column, used to eyeball value ranges in the test set
lgbptop_1_test.describe()

Unnamed: 0,interview,0,1,2,3,4,5,6,7,8,...,50102,50103,50104,50105,50106,50107,50108,50109,50110,50111
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.50465,0.131936,0.131003,0.104702,0.104102,0.140878,0.134368,0.156608,0.144762,0.122347,...,0.270992,0.284268,0.27996,0.292701,0.293285,0.298175,0.310036,0.308314,0.312029,0.292279
std,0.14516,0.021385,0.01758,0.021373,0.022947,0.022748,0.012506,0.014496,0.017229,0.023267,...,0.023687,0.025359,0.026714,0.025421,0.025818,0.027097,0.027646,0.029096,0.029786,0.023826
min,0.0,0.045359,0.063862,0.050175,0.035888,0.046517,0.08603,0.106256,0.065572,0.051018,...,0.204821,0.214008,0.202515,0.218756,0.215397,0.225334,0.231037,0.231553,0.215236,0.187953
25%,0.411215,0.118335,0.118982,0.088642,0.087917,0.129544,0.127067,0.14784,0.136353,0.106599,...,0.253867,0.266863,0.261872,0.275247,0.274919,0.279009,0.290256,0.288011,0.291608,0.276596
50%,0.514019,0.132221,0.131372,0.10544,0.104743,0.143621,0.13516,0.157726,0.146522,0.121562,...,0.270134,0.282925,0.278809,0.291514,0.291673,0.29736,0.309224,0.307131,0.310641,0.291552
75%,0.607477,0.146833,0.14253,0.120321,0.120804,0.155869,0.142519,0.1667,0.156474,0.137712,...,0.286083,0.301085,0.296979,0.30932,0.310252,0.315926,0.328515,0.327173,0.330728,0.307309
max,0.915888,0.199747,0.19283,0.174323,0.190292,0.210186,0.176856,0.198523,0.20986,0.206451,...,0.381111,0.373208,0.3886,0.431641,0.44247,0.420939,0.427435,0.421934,0.413157,0.392608


<h1 align="center"> Predictions with LGBP-TOP facial data</h1> 
<h3 align="center"> Multiple Linear Regression</h3>

#### Models that will be trained:
- Model 1 = Job interview ~ Facial
- Model 2 = Job interview ~ Facial + Gender
- Model 3 = Job interview ~ Facial + Ethnicity
- Model 4 = Job interview ~ Facial + Gender + Ethnicity

## Model 1 - Interview ~ facial

#### Regression preparations Model 1

In [15]:
# Model 1 inputs: every column except the key and the outcome is a predictor;
# the outcome is kept as a single-column 2-D array
X_1_train = lgbptop_1_train.drop(columns = ['filenames', 'interview']).to_numpy()
y_1_train = lgbptop_1_train[['interview']].to_numpy()

# Same split for the validation (test) set
X_1_test = lgbptop_1_test.drop(columns = ['filenames', 'interview']).to_numpy()
y_1_test = lgbptop_1_test[['interview']].to_numpy()

#### Baseline prediction

In [16]:
# Baseline score (1 - MAE): predict the mean of the training outcome for every test sample.
# The prediction vector is sized from the test set itself instead of a hard-coded 2000,
# so the cell stays correct when the size of the test set changes.
1 - mean_absolute_error(y_1_test, np.full(len(y_1_test), np.mean(y_1_train)))

0.8830107663551402

#### Multiple linear regression Model 1

In [17]:
# Linear Regression: fit on the training set, then predict the test set labels
regressor = LinearRegression().fit(X_1_train, y_1_train)
y_1_pred = regressor.predict(X_1_test)

# -------------------------------------------------------
# Various performance measure scores (each metric is computed once and reused)
mse_1 = mean_squared_error(y_1_test, y_1_pred)
mae_1 = mean_absolute_error(y_1_test, y_1_pred)

print(f"MSE {mse_1:.4}")
print(f"MAE {mae_1:.4}")
print(f"1 - MAE {1 - mae_1:.4}")

MSE 0.01629
MAE 0.1005
1 - MAE 0.8995


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [18]:
# Use the fitted regression coefficients, flattened to 1-D, as feature-importance scores
# (not necessary for Model 1, which has no added variables)
importance = np.ravel(regressor.coef_)

In [19]:
# Range of the feature importance scores: smallest first, then largest
print(np.min(importance))
print(np.max(importance))

-0.29155606
0.29348245


In [20]:
# PCA for linear regression (Model 1)
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]     # 300, 500, 700, 4000, 6000 performed worse

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_1_train_mlr_PCA = pca.fit_transform(X_1_train)
    X_1_test_mlr_PCA = pca.transform(X_1_test)

    # Note: this refits the shared `regressor` on the reduced features
    regressor.fit(X_1_train_mlr_PCA, y_1_train)
    mlr_y_1_PCA = regressor.predict(X_1_test_mlr_PCA)

    mse = mean_squared_error(y_1_test, mlr_y_1_PCA)
    mae = mean_absolute_error(y_1_test, mlr_y_1_PCA)
    print(f"MLR with {component} components, MSE Model 1 = {mse:.4}")
    print(f"MLR with {component} components, MAE Model 1 = {mae:.4}")
    print(f"MLR with {component} components, 1 - MAE Model 1 = {1 - mae:.4}")
    print("-------------------------------------------")

MLR with 1000 components, MSE Model 1 = 0.01342
MLR with 1000 components, MAE Model 1 = 0.09189
MLR with 1000 components, 1 - MAE Model 1 = 0.9081
-------------------------------------------
MLR with 2000 components, MSE Model 1 = 0.01321
MLR with 2000 components, MAE Model 1 = 0.09125
MLR with 2000 components, 1 - MAE Model 1 = 0.9087
-------------------------------------------
MLR with 3000 components, MSE Model 1 = 0.01346
MLR with 3000 components, MAE Model 1 = 0.09132
MLR with 3000 components, 1 - MAE Model 1 = 0.9087
-------------------------------------------


In [21]:
# PCA for linear regression Model 1 for additional testing.
# The predictions are kept in mlr_1_PCA_testing.
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [2000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_1_train_mlr_PCA = pca.fit_transform(X_1_train)
    X_1_test_mlr_PCA = pca.transform(X_1_test)

    # Note: this refits the shared `regressor` on the reduced features
    regressor.fit(X_1_train_mlr_PCA, y_1_train)
    mlr_1_PCA_testing = regressor.predict(X_1_test_mlr_PCA)

    mse = mean_squared_error(y_1_test, mlr_1_PCA_testing)
    mae = mean_absolute_error(y_1_test, mlr_1_PCA_testing)
    print(f"MLR with {component} components, MSE Model 1 = {mse:.4}")
    print(f"MLR with {component} components, MAE Model 1 = {mae:.4}")
    print(f"MLR with {component} components, 1 - MAE Model 1 = {1 - mae:.4}")
    print("-------------------------------------------")

MLR with 2000 components, MSE Model 1 = 0.01323
MLR with 2000 components, MAE Model 1 = 0.09101
MLR with 2000 components, 1 - MAE Model 1 = 0.909
-------------------------------------------


#### Random forest regression Model 1

In [22]:
# Random forest regression with a fixed seed
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)

# The single-column target is flattened to 1-D before fitting
rfr.fit(X_1_train, np.ravel(y_1_train))
rfr_y_1 = rfr.predict(X_1_test)

rfr_mse_1 = mean_squared_error(y_1_test, rfr_y_1)
rfr_mae_1 = mean_absolute_error(y_1_test, rfr_y_1)
print(f"MSE {rfr_mse_1:.4}")
print(f"MAE {rfr_mae_1:.4}")
print(f"1 - MAE {1 - rfr_mae_1:.4}")

MSE 0.01601
MAE 0.1008
1 - MAE 0.8992


In [23]:
# PCA for random forest regression (Model 1)
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_1_train_PCA = pca.fit_transform(X_1_train)
    rfr_X_1_test_PCA = pca.transform(X_1_test)

    # Note: this refits the shared `rfr` on the reduced features
    rfr.fit(rfr_X_1_train_PCA, y_1_train.ravel())
    rfr_y_1_PCA = rfr.predict(rfr_X_1_test_PCA)

    mse = mean_squared_error(y_1_test, rfr_y_1_PCA)
    mae = mean_absolute_error(y_1_test, rfr_y_1_PCA)
    print(f"RFR with {component} components, MSE Model 1 = {mse:.4}")
    print(f"RFR with {component} components, MAE Model 1 = {mae:.4}")
    print(f"RFR with {component} components, 1 - MAE Model 1 = {1 - mae:.4}")
    print("-------------------------------------------")

RFR with 1000 components, MSE Model 1 = 0.01683
RFR with 1000 components, MAE Model 1 = 0.1023
RFR with 1000 components, 1 - MAE Model 1 = 0.8977
-------------------------------------------
RFR with 2000 components, MSE Model 1 = 0.01724
RFR with 2000 components, MAE Model 1 = 0.105
RFR with 2000 components, 1 - MAE Model 1 = 0.895
-------------------------------------------
RFR with 3000 components, MSE Model 1 = 0.01776
RFR with 3000 components, MAE Model 1 = 0.1065
RFR with 3000 components, 1 - MAE Model 1 = 0.8935
-------------------------------------------


#### Support vector regression Model 1

In [24]:
# Standardise the features: the scaling statistics are learned on the training set only
# and then applied unchanged to both sets
X_sc = StandardScaler().fit(X_1_train)
X_1_train_scale = X_sc.transform(X_1_train)
X_1_test_scale = X_sc.transform(X_1_test)

In [25]:
%%time
svr = SVR(kernel = 'rbf')
svr.fit(X_1_train_scale, y_1_train.ravel())
svr_y_1 = svr.predict(X_1_test_scale)

print("MSE {:.4}".format(mean_squared_error(y_1_test, svr_y_1)))
print("MAE {:.4}".format(mean_absolute_error(y_1_test, svr_y_1)))
print("1 - MAE {:.4}".format(1-mean_absolute_error(y_1_test, svr_y_1)))

MSE 0.01179
MAE 0.08667
1 - MAE 0.9133
Wall time: 19min 1s


In [26]:
# PCA for support vector regression (Model 1), applied to the standardised features
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_1_train_PCA = pca.fit_transform(X_1_train_scale)
    X_1_test_PCA = pca.transform(X_1_test_scale)

    # Note: this refits the shared `svr` on the reduced features
    svr.fit(X_1_train_PCA, y_1_train.ravel())
    svr_y_1_PCA = svr.predict(X_1_test_PCA)

    mse = mean_squared_error(y_1_test, svr_y_1_PCA)
    mae = mean_absolute_error(y_1_test, svr_y_1_PCA)
    print(f"SVR with {component} components, MSE Model 1 = {mse:.4}")
    print(f"SVR with {component} components, MAE Model 1 = {mae:.4}")
    print(f"SVR with {component} components, 1 - MAE Model 1 = {1 - mae:.4}")
    print("-------------------------------------------")

SVR with 1000 components, MSE Model 1 = 0.01159
SVR with 1000 components, MAE Model 1 = 0.0857
SVR with 1000 components, 1 - MAE Model 1 = 0.9143
-------------------------------------------
SVR with 2000 components, MSE Model 1 = 0.01161
SVR with 2000 components, MAE Model 1 = 0.08576
SVR with 2000 components, 1 - MAE Model 1 = 0.9142
-------------------------------------------
SVR with 3000 components, MSE Model 1 = 0.01163
SVR with 3000 components, MAE Model 1 = 0.08588
SVR with 3000 components, 1 - MAE Model 1 = 0.9141
-------------------------------------------


## Model 2 - Interview ~ facial + gender

#### Creating the training and validation (test) set for Model 2

In [27]:
# Training set for Model 2: the facial-feature frame plus the gender annotation per video
df_gen_int = excel_to_merge.loc[:, ['filenames', 'gender']]
training_face_gen = lgbptop_1_train.merge(df_gen_int, how = "left", on = "filenames")

In [28]:
# Validation/ test set for Model 2: same left join on the filename as for the training set
val_face_gen = lgbptop_1_test.merge(df_gen_int, how = "left", on = "filenames")

#### Regression preparations Model 2

In [29]:
%%time
# Splitting the (in)dependent variables
X_2_train = training_face_gen.drop(['filenames','interview'], axis=1).values
y_2_train = training_face_gen.loc[:,['interview']].values

# Splitting the (in)dependent variables of validation set
X_2_test = val_face_gen.drop(['filenames', 'interview'], axis=1).values
y_2_test = val_face_gen.loc[:,['interview']].values

# -------------------------------------------------------
# Make a dummy variable of the categorical values of gender
le = LabelEncoder()
X_2_train[:, -1] = le.fit_transform(X_2_train[:, -1])
X_2_test[:, -1] = le.fit_transform(X_2_test[:, -1])

Wall time: 8.87 s


#### Multiple linear regression Model 2

In [30]:
# Linear Regression: fit on the training set, then predict the test set labels
regressor = LinearRegression().fit(X_2_train, y_2_train)
y_2_pred = regressor.predict(X_2_test)

# -------------------------------------------------------
# Various performance measure scores (each metric is computed once and reused)
mse_2 = mean_squared_error(y_2_test, y_2_pred)
mae_2 = mean_absolute_error(y_2_test, y_2_pred)

print(f"MSE {mse_2:.4}")
print(f"MAE {mae_2:.4}")
print(f"1 - MAE {1 - mae_2:.4}")

MSE 0.01634
MAE 0.1007
1 - MAE 0.8993


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [31]:
# Use the fitted regression coefficients, flattened to 1-D, as feature-importance scores
importance = np.ravel(regressor.coef_)

# The gender variable is the last predictor, so its coefficient is the last entry
gender_position = -1
print(importance[gender_position])

-0.03159476334946262


In [32]:
# Range of the feature importance scores: smallest first, then largest
print(np.min(importance))
print(np.max(importance))

-0.29185789516761973
0.29580361483438145


In [33]:
# PCA for linear regression (Model 2)
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_2_train_mlr_PCA = pca.fit_transform(X_2_train)
    X_2_test_mlr_PCA = pca.transform(X_2_test)

    # Note: this refits the shared `regressor` on the reduced features
    regressor.fit(X_2_train_mlr_PCA, y_2_train)
    mlr_y_2_PCA = regressor.predict(X_2_test_mlr_PCA)

    mse = mean_squared_error(y_2_test, mlr_y_2_PCA)
    mae = mean_absolute_error(y_2_test, mlr_y_2_PCA)
    print(f"MLR with {component} components, MSE Model 2 = {mse:.4}")
    print(f"MLR with {component} components, MAE Model 2 = {mae:.4}")
    print(f"MLR with {component} components, 1 - MAE Model 2 = {1 - mae:.4}")
    print("-------------------------------------------")

MLR with 1000 components, MSE Model 2 = 0.0135
MLR with 1000 components, MAE Model 2 = 0.09199
MLR with 1000 components, 1 - MAE Model 2 = 0.908
-------------------------------------------
MLR with 2000 components, MSE Model 2 = 0.01323
MLR with 2000 components, MAE Model 2 = 0.09109
MLR with 2000 components, 1 - MAE Model 2 = 0.9089
-------------------------------------------
MLR with 3000 components, MSE Model 2 = 0.01327
MLR with 3000 components, MAE Model 2 = 0.09094
MLR with 3000 components, 1 - MAE Model 2 = 0.9091
-------------------------------------------


In [34]:
# PCA for linear regression Model 2 for additional testing.
# The predictions are kept in mlr_2_PCA_testing.
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [2000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_2_train_mlr_PCA = pca.fit_transform(X_2_train)
    X_2_test_mlr_PCA = pca.transform(X_2_test)

    # Note: this refits the shared `regressor` on the reduced features
    regressor.fit(X_2_train_mlr_PCA, y_2_train)
    mlr_2_PCA_testing = regressor.predict(X_2_test_mlr_PCA)

    mse = mean_squared_error(y_2_test, mlr_2_PCA_testing)
    mae = mean_absolute_error(y_2_test, mlr_2_PCA_testing)
    print(f"MLR with {component} components, MSE Model 2 = {mse:.4}")
    print(f"MLR with {component} components, MAE Model 2 = {mae:.4}")
    print(f"MLR with {component} components, 1 - MAE Model 2 = {1 - mae:.4}")
    print("-------------------------------------------")

MLR with 2000 components, MSE Model 2 = 0.01326
MLR with 2000 components, MAE Model 2 = 0.09157
MLR with 2000 components, 1 - MAE Model 2 = 0.9084
-------------------------------------------


#### Random forest regression Model 2

In [35]:
# Random forest regression with a fixed seed
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)

# The single-column target is flattened to 1-D before fitting
rfr.fit(X_2_train, np.ravel(y_2_train))
rfr_y_2 = rfr.predict(X_2_test)

rfr_mse_2 = mean_squared_error(y_2_test, rfr_y_2)
rfr_mae_2 = mean_absolute_error(y_2_test, rfr_y_2)
print(f"MSE {rfr_mse_2:.4}")
print(f"MAE {rfr_mae_2:.4}")
print(f"1 - MAE {1 - rfr_mae_2:.4}")

MSE 0.01598
MAE 0.1006
1 - MAE 0.8994


In [36]:
# PCA for random forest regression (Model 2)
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_2_train_PCA = pca.fit_transform(X_2_train)
    rfr_X_2_test_PCA = pca.transform(X_2_test)

    # Note: this refits the shared `rfr` on the reduced features
    rfr.fit(rfr_X_2_train_PCA, y_2_train.ravel())
    rfr_y_2_PCA = rfr.predict(rfr_X_2_test_PCA)

    mse = mean_squared_error(y_2_test, rfr_y_2_PCA)
    mae = mean_absolute_error(y_2_test, rfr_y_2_PCA)
    print(f"RFR with {component} components, MSE Model 2 = {mse:.4}")
    print(f"RFR with {component} components, MAE Model 2 = {mae:.4}")
    print(f"RFR with {component} components, 1 - MAE Model 2 = {1 - mae:.4}")
    print("-------------------------------------------")

RFR with 1000 components, MSE Model 2 = 0.01676
RFR with 1000 components, MAE Model 2 = 0.1036
RFR with 1000 components, 1 - MAE Model 2 = 0.8964
-------------------------------------------
RFR with 2000 components, MSE Model 2 = 0.01729
RFR with 2000 components, MAE Model 2 = 0.105
RFR with 2000 components, 1 - MAE Model 2 = 0.895
-------------------------------------------
RFR with 3000 components, MSE Model 2 = 0.01756
RFR with 3000 components, MAE Model 2 = 0.1055
RFR with 3000 components, 1 - MAE Model 2 = 0.8945
-------------------------------------------


#### Support vector regression Model 2

In [37]:
# Standardise every feature except the gender dummy in the last column.
# The scaling statistics are learned on the training set only.
X_sc = StandardScaler().fit(X_2_train[:, :-1])

# Work on copies so the unscaled arrays stay available
X_2_train_scale = X_2_train.copy()
X_2_train_scale[:, :-1] = X_sc.transform(X_2_train[:, :-1])

X_2_test_scale = X_2_test.copy()
X_2_test_scale[:, :-1] = X_sc.transform(X_2_test[:, :-1])

In [38]:
%%time
svr = SVR(kernel = 'rbf')
svr.fit(X_2_train_scale, y_2_train.ravel())
svr_y_2 = svr.predict(X_2_test_scale)

print("MSE {:.4}".format(mean_squared_error(y_2_test, svr_y_2)))
print("MAE {:.4}".format(mean_absolute_error(y_2_test, svr_y_2)))
print("1 - MAE {:.4}".format(1-mean_absolute_error(y_2_test, svr_y_2)))

MSE 0.01179
MAE 0.08668
1 - MAE 0.9133
Wall time: 18min 36s


In [39]:
# PCA for support vector regression (Model 2), applied to the standardised features
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_2_train_PCA = pca.fit_transform(X_2_train_scale)
    X_2_test_PCA = pca.transform(X_2_test_scale)

    # Note: this refits the shared `svr` on the reduced features
    svr.fit(X_2_train_PCA, y_2_train.ravel())
    svr_y_2_PCA = svr.predict(X_2_test_PCA)

    mse = mean_squared_error(y_2_test, svr_y_2_PCA)
    mae = mean_absolute_error(y_2_test, svr_y_2_PCA)
    print(f"SVR with {component} components, MSE Model 2 = {mse:.4}")
    print(f"SVR with {component} components, MAE Model 2 = {mae:.4}")
    print(f"SVR with {component} components, 1 - MAE Model 2 = {1 - mae:.4}")
    print("-------------------------------------------")

SVR with 1000 components, MSE Model 2 = 0.01159
SVR with 1000 components, MAE Model 2 = 0.08572
SVR with 1000 components, 1 - MAE Model 2 = 0.9143
-------------------------------------------
SVR with 2000 components, MSE Model 2 = 0.01161
SVR with 2000 components, MAE Model 2 = 0.08575
SVR with 2000 components, 1 - MAE Model 2 = 0.9143
-------------------------------------------
SVR with 3000 components, MSE Model 2 = 0.01164
SVR with 3000 components, MAE Model 2 = 0.08588
SVR with 3000 components, 1 - MAE Model 2 = 0.9141
-------------------------------------------


## Model 3 - Interview ~ facial + ethnicity
#### Creating the training and validation (test) set for Model 3

In [19]:
# Training set for Model 3: the facial-feature frame plus the ethnicity annotation per video
# (only the 'filenames' and 'ethnicity' columns of the annotation sheet are merged in)
df_eth_int = excel_to_merge.loc[:, ['filenames', 'ethnicity']]
training_face_eth = lgbptop_1_train.merge(df_eth_int, how = "left", on = "filenames")

In [20]:
# Validation/ test set for Model 3: same left join on the filename as for the training set
val_face_eth = lgbptop_1_test.merge(df_eth_int, how = "left", on = "filenames")

#### Regression preparations Model 3

In [21]:
%%time
# Splitting the (in)dependent variables
X_3_train = training_face_eth.drop(['filenames', 'interview'], axis=1).values
y_3_train = training_face_eth.loc[:,['interview']].values

# Splitting the (in)dependent variables of validation set
X_3_test = val_face_eth.drop(['filenames', 'interview'], axis=1).values
y_3_test = val_face_eth.loc[:,['interview']].values

# -------------------------------------------------------
# Make a dummy variable of the categorical values of ethnicity
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
X_3_train = np.array(ct.fit_transform(X_3_train))
X_3_test = np.array(ct.fit_transform(X_3_test))

# -------------------------------------------------------
# Avoid dummy trap
X_3_train_dummy = X_3_train.copy()
X_3_train_dummy = X_3_train_dummy[:,1:]

X_3_test_dummy = X_3_test.copy()
X_3_test_dummy = X_3_test_dummy[:,1:]

Wall time: 1min 27s


#### Multiple linear regression Model 3

In [22]:
# Linear Regression: fit on the training set, then predict the test set labels
regressor = LinearRegression().fit(X_3_train_dummy, y_3_train)
y_3_pred = regressor.predict(X_3_test_dummy)

# -------------------------------------------------------
# Various performance measure scores (each metric is computed once and reused)
mse_3 = mean_squared_error(y_3_test, y_3_pred)
mae_3 = mean_absolute_error(y_3_test, y_3_pred)

print(f"MSE {mse_3:.4}")
print(f"MAE {mae_3:.4}")
print(f"1 - MAE {1 - mae_3:.4}")

MSE 0.01627
MAE 0.1004
1 - MAE 0.8996


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [23]:
# Use the fitted regression coefficients, flattened to 1-D, as feature-importance scores
importance = np.ravel(regressor.coef_)

# Positions of the ethnicity dummies: the first two coefficients
for dummy_position in (0, 1):
    print(importance[dummy_position])

-0.027912389218651423
-0.02177281987814038


In [24]:
# Range of the feature importance scores: smallest first, then largest
print(np.min(importance))
print(np.max(importance))

-0.29694016922460564
0.29644844689831534


In [25]:
# PCA for linear regression (Model 3)
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_3_train_mlr_PCA = pca.fit_transform(X_3_train_dummy)
    X_3_test_mlr_PCA = pca.transform(X_3_test_dummy)

    # Note: this refits the shared `regressor` on the reduced features
    regressor.fit(X_3_train_mlr_PCA, y_3_train)
    mlr_y_3_PCA = regressor.predict(X_3_test_mlr_PCA)

    mse = mean_squared_error(y_3_test, mlr_y_3_PCA)
    mae = mean_absolute_error(y_3_test, mlr_y_3_PCA)
    print(f"MLR with {component} components, MSE Model 3 = {mse:.4}")
    print(f"MLR with {component} components, MAE Model 3 = {mae:.4}")
    print(f"MLR with {component} components, 1 - MAE Model 3 = {1 - mae:.4}")
    print("-------------------------------------------")

MLR with 1000 components, MSE Model 3 = 0.01348
MLR with 1000 components, MAE Model 3 = 0.09218
MLR with 1000 components, 1 - MAE Model 3 = 0.9078
-------------------------------------------
MLR with 2000 components, MSE Model 3 = 0.01313
MLR with 2000 components, MAE Model 3 = 0.09115
MLR with 2000 components, 1 - MAE Model 3 = 0.9089
-------------------------------------------
MLR with 3000 components, MSE Model 3 = 0.01341
MLR with 3000 components, MAE Model 3 = 0.09137
MLR with 3000 components, 1 - MAE Model 3 = 0.9086
-------------------------------------------


In [26]:
# PCA for linear regression Model 3 for additional testing.
# The predictions are kept in mlr_3_PCA_testing.
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [2000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    X_3_train_mlr_PCA = pca.fit_transform(X_3_train_dummy)
    X_3_test_mlr_PCA = pca.transform(X_3_test_dummy)

    # Note: this refits the shared `regressor` on the reduced features
    regressor.fit(X_3_train_mlr_PCA, y_3_train)
    mlr_3_PCA_testing = regressor.predict(X_3_test_mlr_PCA)

    mse = mean_squared_error(y_3_test, mlr_3_PCA_testing)
    mae = mean_absolute_error(y_3_test, mlr_3_PCA_testing)
    print(f"MLR with {component} components, MSE Model 3 = {mse:.4}")
    print(f"MLR with {component} components, MAE Model 3 = {mae:.4}")
    print(f"MLR with {component} components, 1 - MAE Model 3 = {1 - mae:.4}")
    print("-------------------------------------------")

MLR with 2000 components, MSE Model 3 = 0.01328
MLR with 2000 components, MAE Model 3 = 0.09121
MLR with 2000 components, 1 - MAE Model 3 = 0.9088
-------------------------------------------


#### Random forest regression Model 3

In [27]:
# Random forest regression with a fixed seed
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)

# The single-column target is flattened to 1-D before fitting
rfr.fit(X_3_train_dummy, np.ravel(y_3_train))
rfr_y_3 = rfr.predict(X_3_test_dummy)

rfr_mse_3 = mean_squared_error(y_3_test, rfr_y_3)
rfr_mae_3 = mean_absolute_error(y_3_test, rfr_y_3)
print(f"MSE {rfr_mse_3:.4}")
print(f"MAE {rfr_mae_3:.4}")
print(f"1 - MAE {1 - rfr_mae_3:.4}")

MSE 0.01608
MAE 0.1012
1 - MAE 0.8988


In [28]:
# PCA for random forest regression (Model 3)
# PCA is seeded: without a seed its default solver can pick randomized SVD for data this
# large, so the scores change from run to run.
components = [1000, 2000, 3000]

for component in components:
    # Learn the projection on the training features only and reuse it for the test features
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_3_train_PCA = pca.fit_transform(X_3_train_dummy)
    rfr_X_3_test_PCA = pca.transform(X_3_test_dummy)

    # Note: this refits the shared `rfr` on the reduced features
    rfr.fit(rfr_X_3_train_PCA, y_3_train.ravel())
    rfr_y_3_PCA = rfr.predict(rfr_X_3_test_PCA)

    mse = mean_squared_error(y_3_test, rfr_y_3_PCA)
    mae = mean_absolute_error(y_3_test, rfr_y_3_PCA)
    print(f"RFR with {component} components, MSE Model 3 = {mse:.4}")
    print(f"RFR with {component} components, MAE Model 3 = {mae:.4}")
    print(f"RFR with {component} components, 1 - MAE Model 3 = {1 - mae:.4}")
    print("-------------------------------------------")

RFR with 1000 components, MSE Model 3 = 0.01676
RFR with 1000 components, MAE Model 3 = 0.1038
RFR with 1000 components, 1 - MAE Model 3 = 0.8962
-------------------------------------------
RFR with 2000 components, MSE Model 3 = 0.01709
RFR with 2000 components, MAE Model 3 = 0.1047
RFR with 2000 components, 1 - MAE Model 3 = 0.8953
-------------------------------------------
RFR with 3000 components, MSE Model 3 = 0.0172
RFR with 3000 components, MAE Model 3 = 0.1041
RFR with 3000 components, 1 - MAE Model 3 = 0.8959
-------------------------------------------


#### Support vector regression Model 3

In [29]:
# Standardise every feature except the ethnicity dummies in the first two columns.
# The scaling statistics are learned on the training set only.
X_sc = StandardScaler().fit(X_3_train_dummy[:, 2:])

# Work on copies so the unscaled arrays stay available
X_3_train_scale = X_3_train_dummy.copy()
X_3_train_scale[:, 2:] = X_sc.transform(X_3_train_dummy[:, 2:])

X_3_test_scale = X_3_test_dummy.copy()
X_3_test_scale[:, 2:] = X_sc.transform(X_3_test_dummy[:, 2:])

In [30]:
%%time
svr = SVR(kernel = 'rbf')
svr.fit(X_3_train_scale, y_3_train.ravel())
svr_y_3 = svr.predict(X_3_test_scale)

print("MSE {:.4}".format(mean_squared_error(y_3_test, svr_y_3)))
print("MAE {:.4}".format(mean_absolute_error(y_3_test, svr_y_3)))
print("1 - MAE {:.4}".format(1-mean_absolute_error(y_3_test, svr_y_3)))

MSE 0.01179
MAE 0.08667
1 - MAE 0.9133
Wall time: 18min 34s


In [31]:
%%time
# PCA for support vector regression
components = [1000, 2000, 3000]
    
for component in components:
    pca = PCA(n_components = component)   
    X_3_train_PCA = pca.fit_transform(X_3_train_scale)
    X_3_test_PCA = pca.transform(X_3_test_scale)
    
    svr.fit(X_3_train_PCA, y_3_train.ravel())
    svr_y_3_PCA = svr.predict(X_3_test_PCA)
    print("SVR with", component, "components, MSE Model 3 = {:.4}".format(mean_squared_error(y_3_test, svr_y_3_PCA)))
    print("SVR with", component, "components, MAE Model 3 = {:.4}".format(mean_absolute_error(y_3_test, svr_y_3_PCA)))
    print("SVR with", component, "components, 1 - MAE Model 3 = {:.4}".format(1-mean_absolute_error(y_3_test, svr_y_3_PCA)))
    print("-------------------------------------------")

SVR with 1000 components, MSE Model 3 = 0.0116
SVR with 1000 components, MAE Model 3 = 0.08574
SVR with 1000 components, 1 - MAE Model 3 = 0.9143
-------------------------------------------
SVR with 2000 components, MSE Model 3 = 0.01161
SVR with 2000 components, MAE Model 3 = 0.08578
SVR with 2000 components, 1 - MAE Model 3 = 0.9142
-------------------------------------------
SVR with 3000 components, MSE Model 3 = 0.01163
SVR with 3000 components, MAE Model 3 = 0.08585
SVR with 3000 components, 1 - MAE Model 3 = 0.9142
-------------------------------------------
Wall time: 32min 42s


## Model 4 - Interview ~ facial + gender + ethnicity
#### Creating the training and validation (test) set for Model 4

In [19]:
# Training set for Model 4: the facial-feature frame plus the ethnicity and gender
# annotations per video (ethnicity is merged in before gender, so gender ends up last)
df_gen_eth_int = excel_to_merge.loc[:, ['filenames', 'ethnicity', 'gender']]
training_face_gen_eth = lgbptop_1_train.merge(df_gen_eth_int, how = "left", on = "filenames")

In [20]:
# Validation/test set for Model 4: the same left join on 'filenames', applied
# to the LGBP-TOP test frame.
val_face_gen_eth = lgbptop_1_test.merge(df_gen_eth_int, how = "left", on = "filenames")
# Inspect the result with val_face_gen_eth.head() if needed.

#### Regression preparations Model 4

In [21]:
%%time

# Regression preparation for Model 4.
# Builds the design matrices and targets from the merged frames, encodes the
# two categorical columns (ethnicity = second-to-last, gender = last) and
# drops one ethnicity dummy to avoid the dummy-variable trap.

# Splitting the (in)dependent variables of the training set
X_4_train = training_face_gen_eth.drop(['filenames', 'interview'], axis=1).values
y_4_train = training_face_gen_eth.loc[:,['interview']].values

# Splitting the (in)dependent variables of the validation set
X_4_test = val_face_gen_eth.drop(['filenames', 'interview'], axis=1).values
y_4_test = val_face_gen_eth.loc[:,['interview']].values

# -------------------------------------------------------
# Encode the categorical values of gender (last column) as integers.
# The encoder is fitted on the training data only and then re-used for the
# test data, so both sets share exactly the same label -> integer mapping.
le = LabelEncoder()
X_4_train[:, -1] = le.fit_transform(X_4_train[:, -1])
X_4_test[:, -1] = le.transform(X_4_test[:, -1])

# -------------------------------------------------------
# One-hot encode ethnicity (second-to-last column); the encoded columns are
# placed first and all remaining columns are passed through behind them.
# As above: fit on the training data, only transform the test data, so the
# dummy columns have the same meaning and order in both matrices.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-2])], remainder='passthrough')
X_4_train = ct.fit_transform(X_4_train)
X_4_test = ct.transform(X_4_test)     # the np.array() wrapper around ct... was removed

# -------------------------------------------------------
# Avoid the dummy trap: drop the first one-hot column.
# .copy() keeps the *_dummy matrices independent of X_4_train / X_4_test.
X_4_train_dummy = X_4_train[:, 1:].copy()
X_4_test_dummy = X_4_test[:, 1:].copy()

Wall time: 1min 16s


#### Multiple linear regression Model 4

In [22]:
# Multiple linear regression on the Model 4 design matrix
# (fit() returns the estimator itself, so it can be chained).
regressor = LinearRegression().fit(X_4_train_dummy, y_4_train)

# -------------------------------------------------------
# Predictions for the validation/test rows
y_4_pred = regressor.predict(X_4_test_dummy)

# -------------------------------------------------------
# Error metrics on the validation/test set; the MAE is computed once and
# re-used for the "1 - MAE" score.
mse_4_mlr = mean_squared_error(y_4_test, y_4_pred)
mae_4_mlr = mean_absolute_error(y_4_test, y_4_pred)

print("MSE {:.4}".format(mse_4_mlr))
print("MAE {:.4}".format(mae_4_mlr))
print("1 - MAE {:.4}".format(1-mae_4_mlr))

MSE 0.01632
MAE 0.1006
1 - MAE 0.8994


#### Feature importance

Code adapted from https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [23]:
# Coefficients of the fitted linear model, flattened to 1-D, used here as a
# measure of feature importance for the added variable(s).
importance = regressor.coef_.ravel()

# Coefficients at the positions of the ethnicity and gender dummy variables:
# the first two columns and the last column of the design matrix.
for dummy_position in (0, 1, -1):
    print(importance[dummy_position])

# To list every coefficient instead, enumerate `importance` and print each
# index together with its score.

-0.025855189440576583
-0.018492496185072704
-0.03082542496441801


In [24]:
# Smallest and largest regression coefficient, printed one per line
print(importance.min(), importance.max(), sep = "\n")

-0.29637027557451034
0.2984327157615102


In [25]:
# PCA for linear regression (Model 4)
# For every candidate number of components: fit PCA on the training design
# matrix, project the test matrix with that same fitted PCA, refit the linear
# model on the projection and report the test-set errors.
components = [1000, 2000, 3000]

for component in components:
    # random_state pins the decomposition: with the default svd_solver='auto'
    # scikit-learn may pick a randomized SVD, which would otherwise make the
    # reported scores change from run to run.
    pca = PCA(n_components = component, random_state = 0)
    X_4_train_mlr_PCA = pca.fit_transform(X_4_train_dummy)
    X_4_test_mlr_PCA = pca.transform(X_4_test_dummy)

    # NOTE: this refits the shared `regressor` in place; its coef_ no longer
    # refers to the original (non-PCA) features after this cell has run.
    regressor.fit(X_4_train_mlr_PCA, y_4_train)
    mlr_y_4_PCA = regressor.predict(X_4_test_mlr_PCA)
    print("MLR with", component, "components, MSE Model 4 = {:.4}".format(mean_squared_error(y_4_test, mlr_y_4_PCA)))
    print("MLR with", component, "components, MAE Model 4 = {:.4}".format(mean_absolute_error(y_4_test, mlr_y_4_PCA)))
    print("MLR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1-mean_absolute_error(y_4_test, mlr_y_4_PCA)))
    print("-------------------------------------------")

MLR with 1000 components, MSE Model 4 = 0.01343
MLR with 1000 components, MAE Model 4 = 0.09164
MLR with 1000 components, 1 - MAE Model 4 = 0.9084
-------------------------------------------
MLR with 2000 components, MSE Model 4 = 0.01304
MLR with 2000 components, MAE Model 4 = 0.09048
MLR with 2000 components, 1 - MAE Model 4 = 0.9095
-------------------------------------------
MLR with 3000 components, MSE Model 4 = 0.01324
MLR with 3000 components, MAE Model 4 = 0.09113
MLR with 3000 components, 1 - MAE Model 4 = 0.9089
-------------------------------------------


In [26]:
# PCA for linear regression Model 4 for additional testing
# Re-runs the 2000-component configuration and keeps its predictions in
# `mlr_4_PCA_testing` so they can be compared against the other models later.
components = [2000]

for component in components:
    # random_state pins the decomposition: with the default svd_solver='auto'
    # scikit-learn may pick a randomized SVD, so without a seed these
    # predictions (and any test computed from them) differ between runs.
    pca = PCA(n_components = component, random_state = 0)
    X_4_train_mlr_PCA = pca.fit_transform(X_4_train_dummy)
    X_4_test_mlr_PCA = pca.transform(X_4_test_dummy)

    # NOTE: this refits the shared `regressor` in place.
    regressor.fit(X_4_train_mlr_PCA, y_4_train)
    mlr_4_PCA_testing = regressor.predict(X_4_test_mlr_PCA)
    print("MLR with", component, "components, MSE Model 4 = {:.4}".format(mean_squared_error(y_4_test, mlr_4_PCA_testing)))
    print("MLR with", component, "components, MAE Model 4 = {:.4}".format(mean_absolute_error(y_4_test, mlr_4_PCA_testing)))
    print("MLR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1-mean_absolute_error(y_4_test, mlr_4_PCA_testing)))
    print("-------------------------------------------")

MLR with 2000 components, MSE Model 4 = 0.01322
MLR with 2000 components, MAE Model 4 = 0.09138
MLR with 2000 components, 1 - MAE Model 4 = 0.9086
-------------------------------------------


#### Random forest regression Model 4

In [27]:
# Random forest regression on the Model 4 design matrix.
# The target is a single-column 2-D array, so .ravel() is used to pass it to
# fit() in the flat 1-D format.
rfr = RandomForestRegressor(n_estimators = 10, random_state = 0).fit(X_4_train_dummy, y_4_train.ravel())

# Predictions and error metrics on the validation/test set; the MAE is
# computed once and re-used for the "1 - MAE" score.
rfr_y_4 = rfr.predict(X_4_test_dummy)
mse_4_rfr = mean_squared_error(y_4_test, rfr_y_4)
mae_4_rfr = mean_absolute_error(y_4_test, rfr_y_4)

print("MSE {:.4}".format(mse_4_rfr))
print("MAE {:.4}".format(mae_4_rfr))
print("1 - MAE {:.4}".format(1-mae_4_rfr))

MSE 0.01605
MAE 0.1011
1 - MAE 0.8989


In [28]:
# PCA for random forest regression (Model 4)
# For every candidate number of components: fit PCA on the training design
# matrix, project the test matrix with that same fitted PCA, refit the random
# forest on the projection and report the test-set errors.
components = [1000, 2000, 3000]

for component in components:
    # random_state pins the decomposition: with the default svd_solver='auto'
    # scikit-learn may pick a randomized SVD. The forest itself is already
    # seeded, so seeding PCA as well makes the whole cell reproducible.
    pca = PCA(n_components = component, random_state = 0)
    rfr_X_4_train_PCA = pca.fit_transform(X_4_train_dummy)
    rfr_X_4_test_PCA = pca.transform(X_4_test_dummy)

    # NOTE: this refits the shared `rfr` estimator in place.
    rfr.fit(rfr_X_4_train_PCA, y_4_train.ravel())
    rfr_y_4_PCA = rfr.predict(rfr_X_4_test_PCA)
    print("RFR with", component, "components, MSE Model 4 = {:.4}".format(mean_squared_error(y_4_test, rfr_y_4_PCA)))
    print("RFR with", component, "components, MAE Model 4 = {:.4}".format(mean_absolute_error(y_4_test, rfr_y_4_PCA)))
    print("RFR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1-mean_absolute_error(y_4_test, rfr_y_4_PCA)))
    print("-------------------------------------------")

RFR with 1000 components, MSE Model 4 = 0.0169
RFR with 1000 components, MAE Model 4 = 0.1028
RFR with 1000 components, 1 - MAE Model 4 = 0.8972
-------------------------------------------
RFR with 2000 components, MSE Model 4 = 0.01753
RFR with 2000 components, MAE Model 4 = 0.1054
RFR with 2000 components, 1 - MAE Model 4 = 0.8946
-------------------------------------------
RFR with 3000 components, MSE Model 4 = 0.01755
RFR with 3000 components, MAE Model 4 = 0.1051
RFR with 3000 components, 1 - MAE Model 4 = 0.8949
-------------------------------------------


#### Support vector regression Model 4

In [29]:
# Feature scaling for the support vector regression (X only).
X_sc = StandardScaler()

# Columns to standardise: everything except the first two columns and the
# last one, i.e. every feature but the ethnicity and gender dummies.
columns_to_scale = slice(2, -1)

# Work on copies so the unscaled *_dummy matrices stay untouched.
# The scaler is fitted on the training columns only ...
X_4_train_scale = X_4_train_dummy.copy()
X_4_train_scale[:, columns_to_scale] = X_sc.fit_transform(X_4_train_dummy[:, columns_to_scale])

# ... and the fitted scaler is then applied to the test columns.
X_4_test_scale = X_4_test_dummy.copy()
X_4_test_scale[:, columns_to_scale] = X_sc.transform(X_4_test_dummy[:, columns_to_scale])

In [30]:
%%time
# Support vector regression (RBF kernel) on the scaled Model 4 features.
# fit() returns the estimator itself, so construction and fitting are chained;
# .ravel() passes the single-column target as a flat 1-D array.
svr = SVR(kernel = 'rbf').fit(X_4_train_scale, y_4_train.ravel())
svr_y_4 = svr.predict(X_4_test_scale)

# Error metrics on the validation/test set; the MAE is computed once and
# re-used for the "1 - MAE" score.
mse_4_svr = mean_squared_error(y_4_test, svr_y_4)
mae_4_svr = mean_absolute_error(y_4_test, svr_y_4)

print("MSE {:.4}".format(mse_4_svr))
print("MAE {:.4}".format(mae_4_svr))
print("1 - MAE {:.4}".format(1-mae_4_svr))

MSE 0.01179
MAE 0.08668
1 - MAE 0.9133
Wall time: 18min 46s


In [31]:
%%time
# PCA for support vector regression (Model 4)
# For every candidate number of components: fit PCA on the scaled training
# features, project the scaled test features with that same fitted PCA,
# refit the SVR on the projection and report the test-set errors.
components = [1000, 2000, 3000]

for component in components:
    # random_state pins the decomposition: with the default svd_solver='auto'
    # scikit-learn may pick a randomized SVD, which would otherwise make the
    # reported scores change from run to run.
    pca = PCA(n_components = component, random_state = 0)
    X_4_train_PCA = pca.fit_transform(X_4_train_scale)
    X_4_test_PCA = pca.transform(X_4_test_scale)

    # NOTE: this refits the shared `svr` estimator in place, so after the loop
    # it holds the model trained on the last PCA projection.
    svr.fit(X_4_train_PCA, y_4_train.ravel())
    svr_y_4_PCA = svr.predict(X_4_test_PCA)
    print("SVR with", component, "components, MSE Model 4 = {:.4}".format(mean_squared_error(y_4_test, svr_y_4_PCA)))
    print("SVR with", component, "components, MAE Model 4 = {:.4}".format(mean_absolute_error(y_4_test, svr_y_4_PCA)))
    print("SVR with", component, "components, 1 - MAE Model 4 = {:.4}".format(1-mean_absolute_error(y_4_test, svr_y_4_PCA)))
    print("-------------------------------------------")

SVR with 1000 components, MSE Model 4 = 0.01161
SVR with 1000 components, MAE Model 4 = 0.08574
SVR with 1000 components, 1 - MAE Model 4 = 0.9143
-------------------------------------------
SVR with 2000 components, MSE Model 4 = 0.01161
SVR with 2000 components, MAE Model 4 = 0.08575
SVR with 2000 components, 1 - MAE Model 4 = 0.9142
-------------------------------------------
SVR with 3000 components, MSE Model 4 = 0.01163
SVR with 3000 components, MAE Model 4 = 0.08586
SVR with 3000 components, 1 - MAE Model 4 = 0.9141
-------------------------------------------
Wall time: 31min 3s


-----------

## Extra testing

#### T-tests between the models that have slight differences in MAEs

In [33]:
# Related (paired) t-tests on the MLR + PCA predictions; only the p-value of
# each test is printed.
# NOTE(review): the inputs look like raw model predictions, so this tests for
# a difference in mean prediction rather than directly in MAE - confirm that
# this is the intended comparison.

# Models 1 and 2
ttest_models_1_2 = scipy.stats.ttest_rel(mlr_1_PCA_testing, mlr_2_PCA_testing)
print(ttest_models_1_2.pvalue)

# Models 1 and 3
ttest_models_1_3 = scipy.stats.ttest_rel(mlr_1_PCA_testing, mlr_3_PCA_testing)
print(ttest_models_1_3.pvalue)

In [None]:
# Related (paired) t-test between the MLR + PCA predictions of two models;
# prints only the p-value.
print(scipy.stats.ttest_rel(mlr_1_PCA_testing, mlr_4_PCA_testing).pvalue)         # models 1 and 4