# Machine Learning

## Set up

In [1]:
# Set up folders
from EDA_functions import folders_set_up
import os

# Work with datarames
import pandas as pd
import numpy as np

# Charts
import seaborn as sns
from matplotlib import pyplot as plt

# X, Y preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# SVR
from sklearn.svm import SVR

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree

# Neural Network
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense,Dropout
from keras.optimizers import Adam, SGD

# Pipeline
from sklearn.pipeline import Pipeline

# Evaluate models
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Time
import time

#from scipy.sparse import spmatrixc



In [2]:
# Light GBM
# import lightgbm as lgb

### Folders
Run the code below if you have the following structure:
- Group-project: GitHub folder
- 01 Input
- 02 Output

In [3]:
analysis_folder, input_folder, output_folder = folders_set_up.generate_folders()

## Import and merge data

### Datasets

In [4]:
#Title-level dataset with embeddings
title_embeddings_df = pd.read_pickle(
    os.path.join(output_folder, 'English_fiction_pre_PCA_3_with_av_pool_embeddings')
)

title_embeddings_df.columns

Index(['index', 'Title', 'description', 'authors', 'image', 'previewLink',
       'publisher', 'infoLink', 'categories', 'reviews number',
       'average rating', 'median rating', 'min review date', 'max review date',
       'weighted rating', 'date', 'year', 'description_language', 'Embedding'],
      dtype='object')

### Format data

In [5]:
title_embeddings_df.dtypes

index                     int64
Title                    object
description              object
authors                  object
image                    object
previewLink              object
publisher                object
infoLink                 object
categories               object
reviews number            int64
average rating          float64
median rating           float64
min review date          object
max review date          object
weighted rating         float64
date                     object
year                    float64
description_language     object
Embedding                object
dtype: object

#### Date

In [6]:
dates_columns = ['min review date', 'max review date', 'date']

for date in dates_columns:
    # get date from strings with time
    title_embeddings_df[date] = title_embeddings_df[date].str.split().str[0]
    # convert in datetime
    title_embeddings_df[date] = pd.to_datetime(title_embeddings_df[date])

In [8]:
title_embeddings_df[dates_columns].isna().sum()

min review date    0
max review date    0
date               0
dtype: int64

In [7]:
# what is the max and minimu of the ratings?
print(title_embeddings_df['weighted rating'].min())
print(title_embeddings_df['weighted rating'].max())

1.8228249664151188
4.886083503427672


In [9]:
# NOTE: we work on a subset of data for now to make the ML run faster
#title_embeddings_df = title_embeddings_df.sample(n=1000, random_state=42)

#### Image embeddings
These need may need to be transformed in from arrays to columns if the model we use is not NN

### Indices

In [10]:
title_embeddings_df['index_key'] = title_embeddings_df['index']
title_embeddings_df

Unnamed: 0,index,Title,description,authors,image,previewLink,publisher,infoLink,categories,reviews number,average rating,median rating,min review date,max review date,weighted rating,date,year,description_language,Embedding,index_key
0,3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['fiction'],32,3.718750,5.0,2005-02-14,2006-07-01,3.938400,2005-02-01,2005.0,English,"[0.5179044, -0.7533603, -1.1291503, -0.4418345...",3
1,24,The Forbidden Stories of Marta Veneranda,"Marta Veneranda, a Latina neoyorkina, finds th...",['Sonia Rivera-Valdes'],http://books.google.com/books/content?id=A7aYb...,http://books.google.nl/books?id=A7aYbAvagu8C&p...,Seven Stories Press,http://books.google.nl/books?id=A7aYbAvagu8C&d...,['fiction'],1,5.000000,5.0,2005-01-24,2005-01-24,4.306145,2001-03-06,2001.0,English,"[0.706188, -0.4773652, -0.17887038, 0.07989502...",24
2,42,Tess and the Highlander,"In 1543, on a windswept isle off of Scotland, ...",['May Mcgoldrick'],http://books.google.com/books/content?id=VmCRS...,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,Harper Collins,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,['juvenile fiction'],17,4.235294,5.0,2002-10-22,2011-05-25,4.256189,2002-11-01,2002.0,English,"[2.294651, -0.24902871, -0.6188333, -0.7722471...",42
3,49,"Eight Men And A Lady (Elizabeth Sinclair, Harl...",Eight Men And A Lady by Elizabeth Sinclair rel...,['Elizabeth Sinclair'],http://books.google.com/books/content?id=Z6uzJ...,http://books.google.nl/books?id=Z6uzJgLWViUC&q...,Harlequin Treasury-Harlequin American Romance 90s,http://books.google.nl/books?id=Z6uzJgLWViUC&d...,['fiction'],2,5.000000,5.0,1998-04-16,2000-05-14,4.336313,1997-01-01,1997.0,English,"[0.37794992, -0.6178984, -0.81393754, -0.66795...",49
4,73,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],http://books.google.com/books/content?id=c9icD...,http://books.google.nl/books?id=c9icDQAAQBAJ&p...,Simon and Schuster,http://books.google.nl/books?id=c9icDQAAQBAJ&d...,['juvenile fiction'],134,4.768657,5.0,1996-08-07,2012-09-18,4.701517,2016-12-06,2016.0,English,"[0.34032565, -2.1706967, -0.21470371, -0.10447...",73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26769,212361,Calder Pride,The Long-Awaited Addition to the Beloved Calde...,['Janet Dailey'],http://books.google.com/books/content?id=nlsgd...,http://books.google.com/books?id=nlsgd2-kGq4C&...,Harper Collins,https://play.google.com/store/books/details?id...,['fiction'],28,4.035714,5.0,1999-09-30,2012-04-04,4.137453,2009-03-17,2009.0,English,"[1.1648176, 0.56768346, -0.22511423, -0.185316...",212361
26770,212365,The Road Back,The sequel to the masterpiece All Quiet on the...,['Erich Maria Remarque'],http://books.google.com/books/content?id=obZdA...,http://books.google.com/books?id=obZdAAAAQBAJ&...,Random House Trade Paperbacks,http://books.google.com/books?id=obZdAAAAQBAJ&...,['fiction'],17,4.705882,5.0,1997-05-17,2012-01-23,4.466716,1998-01-27,1998.0,English,"[0.023786038, -1.9050528, -0.38564998, 0.14921...",212365
26771,212394,Final things,Grace's father believes in science and builds ...,['Jenny Offill'],http://books.google.com/books/content?id=UbSFB...,http://books.google.com/books?id=UbSFBAAAQBAJ&...,Vintage,https://play.google.com/store/books/details?id...,['fiction'],1,4.000000,4.0,2012-01-26,2012-01-26,4.260690,2015-03-17,2015.0,English,"[2.2700834, -0.11750376, -2.0253444, -1.039558...",212394
26772,212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],http://books.google.com/books/content?id=J7M-N...,http://books.google.com/books?id=J7M-NwAACAAJ&...,Scholastic Paperbacks,http://books.google.com/books?id=J7M-NwAACAAJ&...,['juvenile fiction'],28,4.678571,5.0,1998-07-10,2011-12-31,4.504800,2000-06-01,2000.0,English,"[2.6904726, -0.96442795, 0.093034565, -1.69420...",212399


In [11]:
title_embeddings_df = title_embeddings_df.set_index('index_key')

In [12]:
title_embeddings_df

Unnamed: 0_level_0,index,Title,description,authors,image,previewLink,publisher,infoLink,categories,reviews number,average rating,median rating,min review date,max review date,weighted rating,date,year,description_language,Embedding
index_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3,3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['fiction'],32,3.718750,5.0,2005-02-14,2006-07-01,3.938400,2005-02-01,2005.0,English,"[0.5179044, -0.7533603, -1.1291503, -0.4418345..."
24,24,The Forbidden Stories of Marta Veneranda,"Marta Veneranda, a Latina neoyorkina, finds th...",['Sonia Rivera-Valdes'],http://books.google.com/books/content?id=A7aYb...,http://books.google.nl/books?id=A7aYbAvagu8C&p...,Seven Stories Press,http://books.google.nl/books?id=A7aYbAvagu8C&d...,['fiction'],1,5.000000,5.0,2005-01-24,2005-01-24,4.306145,2001-03-06,2001.0,English,"[0.706188, -0.4773652, -0.17887038, 0.07989502..."
42,42,Tess and the Highlander,"In 1543, on a windswept isle off of Scotland, ...",['May Mcgoldrick'],http://books.google.com/books/content?id=VmCRS...,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,Harper Collins,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,['juvenile fiction'],17,4.235294,5.0,2002-10-22,2011-05-25,4.256189,2002-11-01,2002.0,English,"[2.294651, -0.24902871, -0.6188333, -0.7722471..."
49,49,"Eight Men And A Lady (Elizabeth Sinclair, Harl...",Eight Men And A Lady by Elizabeth Sinclair rel...,['Elizabeth Sinclair'],http://books.google.com/books/content?id=Z6uzJ...,http://books.google.nl/books?id=Z6uzJgLWViUC&q...,Harlequin Treasury-Harlequin American Romance 90s,http://books.google.nl/books?id=Z6uzJgLWViUC&d...,['fiction'],2,5.000000,5.0,1998-04-16,2000-05-14,4.336313,1997-01-01,1997.0,English,"[0.37794992, -0.6178984, -0.81393754, -0.66795..."
73,73,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],http://books.google.com/books/content?id=c9icD...,http://books.google.nl/books?id=c9icDQAAQBAJ&p...,Simon and Schuster,http://books.google.nl/books?id=c9icDQAAQBAJ&d...,['juvenile fiction'],134,4.768657,5.0,1996-08-07,2012-09-18,4.701517,2016-12-06,2016.0,English,"[0.34032565, -2.1706967, -0.21470371, -0.10447..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212361,212361,Calder Pride,The Long-Awaited Addition to the Beloved Calde...,['Janet Dailey'],http://books.google.com/books/content?id=nlsgd...,http://books.google.com/books?id=nlsgd2-kGq4C&...,Harper Collins,https://play.google.com/store/books/details?id...,['fiction'],28,4.035714,5.0,1999-09-30,2012-04-04,4.137453,2009-03-17,2009.0,English,"[1.1648176, 0.56768346, -0.22511423, -0.185316..."
212365,212365,The Road Back,The sequel to the masterpiece All Quiet on the...,['Erich Maria Remarque'],http://books.google.com/books/content?id=obZdA...,http://books.google.com/books?id=obZdAAAAQBAJ&...,Random House Trade Paperbacks,http://books.google.com/books?id=obZdAAAAQBAJ&...,['fiction'],17,4.705882,5.0,1997-05-17,2012-01-23,4.466716,1998-01-27,1998.0,English,"[0.023786038, -1.9050528, -0.38564998, 0.14921..."
212394,212394,Final things,Grace's father believes in science and builds ...,['Jenny Offill'],http://books.google.com/books/content?id=UbSFB...,http://books.google.com/books?id=UbSFBAAAQBAJ&...,Vintage,https://play.google.com/store/books/details?id...,['fiction'],1,4.000000,4.0,2012-01-26,2012-01-26,4.260690,2015-03-17,2015.0,English,"[2.2700834, -0.11750376, -2.0253444, -1.039558..."
212399,212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],http://books.google.com/books/content?id=J7M-N...,http://books.google.com/books?id=J7M-NwAACAAJ&...,Scholastic Paperbacks,http://books.google.com/books?id=J7M-NwAACAAJ&...,['juvenile fiction'],28,4.678571,5.0,1998-07-10,2011-12-31,4.504800,2000-06-01,2000.0,English,"[2.6904726, -0.96442795, 0.093034565, -1.69420..."


### Clean data
Most of the cleaning is done in '02 Consolidate books dataset':
- English description
- category containing the word 'fiction'
- non-missing date
- non-missing author
- non-missing publisher
- non-missing cover image

## X and y set up

### Train test split

In [13]:
title_embeddings_df.columns

Index(['index', 'Title', 'description', 'authors', 'image', 'previewLink',
       'publisher', 'infoLink', 'categories', 'reviews number',
       'average rating', 'median rating', 'min review date', 'max review date',
       'weighted rating', 'date', 'year', 'description_language', 'Embedding'],
      dtype='object')

In [14]:
# Create X and y including all X features and all all teh possible target variables
# NOTE: we will have to add the description PCA in X_features
X_columns = ['year', 'Embedding', 'index', 'Title']

X = title_embeddings_df[X_columns]
y = title_embeddings_df[['average rating', 'weighted rating']]

In [15]:
X.columns

Index(['year', 'Embedding', 'index', 'Title'], dtype='object')

In [16]:
y.columns

Index(['average rating', 'weighted rating'], dtype='object')

In [17]:
# Create train test split

# Need to create train test split for different combinations of data
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size= 0.2, 
    random_state= 42,
)

In [18]:
# store indices of train test split for the NLP of description
train_indices = X_train[['Title', 'index']]
test_indices = X_test[['Title', 'index']]

train_indices.to_csv(
    os.path.join(output_folder, 'train_indices.csv')
)


test_indices.to_csv(
    os.path.join(output_folder, 'test_indices.csv')
)

### y cuts

In [19]:
# Y train with average rating
y_avg_r_train = y_train['average rating']
y_avg_r_test = y_test['average rating']

# Y train with weighted rating
y_wr_train = y_train['weighted rating']
y_wr_test = y_test['weighted rating']

### Image embeddings X
Transform the arrays into columns so that they can feed into the models

In [20]:
X_images_train = X_train['Embedding'].apply(pd.Series)
X_images_test = X_test['Embedding'].apply(pd.Series)

# Rename columns
X_images_train = X_images_train.add_prefix('image_')
X_images_test = X_images_test.add_prefix('image_')

In [21]:
X_images_train

Unnamed: 0_level_0,image_0,image_1,image_2,image_3,image_4,image_5,image_6,image_7,image_8,image_9,...,image_246,image_247,image_248,image_249,image_250,image_251,image_252,image_253,image_254,image_255
index_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19757,1.310540,0.253238,-0.208362,-0.584103,-0.794551,-1.803357,-2.700018,-0.848385,0.949902,0.030431,...,-0.475167,-0.116055,0.735540,-2.354414,0.956939,-1.065875,-0.428229,-0.285047,1.098027,-1.029737
111405,1.078856,-0.691140,-0.908770,-0.527087,-1.044688,-0.904328,0.210946,-1.238919,2.290273,-0.155667,...,0.833435,-1.053398,-2.031961,-2.716383,0.817275,-0.434370,-1.456125,0.112614,0.199815,-2.946273
12269,0.350093,-0.135313,0.512653,0.466054,0.326599,0.014968,-2.224684,-0.740723,0.951188,0.978717,...,1.401945,-0.270900,-1.967142,-0.814089,0.170715,0.335253,-0.030882,-0.557203,0.037506,-1.951925
186303,4.560193,0.550376,-0.331547,-2.215812,0.023501,-0.370375,-0.838666,-0.905349,2.655245,0.461321,...,1.073267,0.354717,-1.745163,-2.610591,-0.239567,-1.714204,-0.914772,-0.354720,0.098914,-3.272672
134045,2.236698,-0.498289,-1.946595,-0.618339,-2.150359,1.257699,-1.547900,-1.492419,2.133226,1.513557,...,0.192120,-0.731990,-3.023546,-2.410438,-0.701821,-0.403381,0.522945,0.256027,1.826598,-3.091393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167206,0.998582,-1.018484,-0.603367,-0.467136,0.090738,-1.786356,-1.359131,-0.057187,-0.699324,-0.277699,...,0.967072,-0.407894,0.675619,-2.690143,0.811195,0.162421,-0.319673,-0.655105,1.837679,-2.292945
40262,0.993674,-0.236895,-1.161556,-0.831213,-0.535987,1.596271,-1.988977,-0.020737,0.298276,0.706597,...,1.235213,-0.146208,-1.484007,-1.698481,-0.935188,-0.480799,0.807755,0.346888,1.523457,-1.739759
7002,0.622462,-0.793748,-0.837395,-0.745482,-0.137351,0.551706,0.313828,-0.853430,0.280148,-0.483826,...,1.114822,-0.845614,-1.312257,-0.366303,-0.078197,1.005683,0.627290,-1.296975,1.453560,-0.719741
124694,1.551901,0.323544,-1.019795,-0.265791,-0.342785,0.346762,0.363122,-0.480910,0.268287,-0.495149,...,1.396447,0.472859,-1.687484,-2.504541,-0.655285,0.751038,0.302664,-0.154664,1.174814,-2.710322


### Description processed

#### Import data

In [22]:
# Description NLP test
NLP_df_test = pd.read_csv(
    os.path.join(output_folder, 'X_test_tSVD_3000.csv')
)

# Set indices as in train test split
NLP_df_test = NLP_df_test.set_index('index')

NLP_df_test.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'infoLink', 'categories', 'reviews number', 'average rating',
       ...
       'tSVD2991', 'tSVD2992', 'tSVD2993', 'tSVD2994', 'tSVD2995', 'tSVD2996',
       'tSVD2997', 'tSVD2998', 'tSVD2999', 'tSVD3000'],
      dtype='object', length=3018)

In [23]:
NLP_df_test

Unnamed: 0_level_0,Title,description,authors,image,previewLink,publisher,infoLink,categories,reviews number,average rating,...,tSVD2991,tSVD2992,tSVD2993,tSVD2994,tSVD2995,tSVD2996,tSVD2997,tSVD2998,tSVD2999,tSVD3000
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
115,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",['Patricia Cornwell'],http://books.google.com/books/content?id=prefg...,http://books.google.nl/books?id=prefgSxnGOwC&p...,Hachette UK,https://play.google.com/store/books/details?id...,['fiction'],157,3.783439,...,0.004673,0.011035,0.000885,0.011355,0.003110,-0.009888,0.001707,-0.000568,0.000850,0.021092
209,Riverworld and Other Stories,Three stories of a world shared by resurrected...,['Philip José Farmer'],http://books.google.com/books/content?id=TP4oD...,http://books.google.nl/books?id=TP4oDwAAQBAJ&p...,Open Road Media,https://play.google.com/store/books/details?id...,['fiction'],7,4.285714,...,0.000682,-0.000072,-0.007662,0.010220,0.001084,-0.007470,-0.006064,0.012055,-0.007764,0.011430
330,Kenny Doin' Just Fine,"KENNY DOIN' JUST FINE Miriam Greenfield, a pro...",['Sadie Wernick Hurwitz'],http://books.google.com/books/content?id=D6Wgi...,http://books.google.nl/books?id=D6WgitXrr8sC&p...,iUniverse,http://books.google.nl/books?id=D6WgitXrr8sC&d...,['fiction'],1,5.000000,...,-0.005573,-0.004538,0.000019,0.002786,-0.015920,0.004716,-0.000231,-0.003591,0.000867,-0.001421
333,Harry on the Rocks,Harry and his boat become stranded on an islan...,['Susan Meddaugh'],http://books.google.com/books/content?id=u5r79...,http://books.google.nl/books?id=u5r79DAUeIYC&q...,Houghton Mifflin Harcourt,http://books.google.nl/books?id=u5r79DAUeIYC&d...,['juvenile fiction'],2,5.000000,...,0.003240,-0.001592,-0.003397,0.001563,0.008503,0.011397,-0.003413,-0.004257,0.006653,-0.002277
371,The National Review Treasury of Classic Childr...,"A collection of over forty stories, tales, poe...","['William F. Buckley, Jr.']",http://books.google.com/books/content?id=NZm7P...,http://books.google.nl/books?id=NZm7PAAACAAJ&d...,Isi Books,http://books.google.nl/books?id=NZm7PAAACAAJ&d...,['juvenile fiction'],3,5.000000,...,-0.002185,-0.001413,0.003879,0.004376,0.000110,0.002358,0.005980,0.005749,-0.000960,-0.003924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212041,Man For Maggie Moore (Montana Matchmakers) (Ha...,You don't know what love is until you have los...,['Steven Labree'],http://books.google.com/books/content?id=NZpeJ...,http://books.google.com/books?id=NZpeJhtmGo8C&...,Steven LaBree,http://books.google.com/books?id=NZpeJhtmGo8C&...,['fiction'],3,4.666667,...,0.004356,0.004014,0.000388,-0.010747,-0.012375,-0.001525,0.006544,0.013144,0.003183,-0.006146
212144,Prancing Tiger,"To clear the name of his ex-girlfriend's son, ...",['Philip Singerman'],http://books.google.com/books/content?id=68R7S...,http://books.google.com/books?id=68R7SppHYHcC&...,William Morrow,http://books.google.com/books?id=68R7SppHYHcC&...,['fiction'],4,4.250000,...,0.011748,-0.000504,0.006213,-0.006230,-0.008297,-0.009560,-0.002252,0.006993,0.007963,-0.001916
212256,Nude Men: A Novel,The internationally acclaimed debut of a novel...,['Amanda Filipacchi'],http://books.google.com/books/content?id=uM-1A...,http://books.google.com/books?id=uM-1AwAAQBAJ&...,Open Road Media,https://play.google.com/store/books/details?id...,['fiction'],23,3.739130,...,-0.010690,0.012784,-0.001923,0.008254,-0.002761,-0.003366,0.018299,-0.007997,0.004508,0.007875
212260,The Tale of Digby,"In Digby, Willy Wink finds himself in the midd...","['Timothy Lee Bonnette, Jr.']",http://books.google.com/books/content?id=pcgBA...,http://books.google.com/books?id=pcgBAAAACAAJ&...,Publishamerica Incorporated,http://books.google.com/books?id=pcgBAAAACAAJ&...,['fiction'],2,4.000000,...,0.002488,0.002354,0.004941,-0.000441,0.008438,0.012563,0.002361,0.008794,0.000451,-0.006659


In [24]:
# Description NLP train
NLP_df_train = pd.read_csv(
    os.path.join(output_folder, 'X_train_tSVD_3000.csv')
)

NLP_df_train = NLP_df_train.set_index('index')

NLP_df_train.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'infoLink', 'categories', 'reviews number', 'average rating',
       ...
       'tSVD2991', 'tSVD2992', 'tSVD2993', 'tSVD2994', 'tSVD2995', 'tSVD2996',
       'tSVD2997', 'tSVD2998', 'tSVD2999', 'tSVD3000'],
      dtype='object', length=3018)

In [25]:
NLP_df_train

Unnamed: 0_level_0,Title,description,authors,image,previewLink,publisher,infoLink,categories,reviews number,average rating,...,tSVD2991,tSVD2992,tSVD2993,tSVD2994,tSVD2995,tSVD2996,tSVD2997,tSVD2998,tSVD2999,tSVD3000
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['fiction'],32,3.718750,...,0.001714,-0.012096,0.012755,0.004345,0.016297,0.022422,0.026395,-0.001808,-0.028102,0.011717
24,The Forbidden Stories of Marta Veneranda,"Marta Veneranda, a Latina neoyorkina, finds th...",['Sonia Rivera-Valdes'],http://books.google.com/books/content?id=A7aYb...,http://books.google.nl/books?id=A7aYbAvagu8C&p...,Seven Stories Press,http://books.google.nl/books?id=A7aYbAvagu8C&d...,['fiction'],1,5.000000,...,0.000900,0.002294,-0.002052,-0.013243,-0.002329,-0.005818,-0.000012,0.007939,-0.024247,-0.006868
42,Tess and the Highlander,"In 1543, on a windswept isle off of Scotland, ...",['May Mcgoldrick'],http://books.google.com/books/content?id=VmCRS...,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,Harper Collins,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,['juvenile fiction'],17,4.235294,...,-0.002241,-0.007230,-0.005164,0.000416,-0.007837,-0.002487,-0.004066,0.010140,-0.007082,0.001236
49,"Eight Men And A Lady (Elizabeth Sinclair, Harl...",Eight Men And A Lady by Elizabeth Sinclair rel...,['Elizabeth Sinclair'],http://books.google.com/books/content?id=Z6uzJ...,http://books.google.nl/books?id=Z6uzJgLWViUC&q...,Harlequin Treasury-Harlequin American Romance 90s,http://books.google.nl/books?id=Z6uzJgLWViUC&d...,['fiction'],2,5.000000,...,0.004947,-0.003206,0.011330,-0.004898,-0.002988,0.001996,-0.008123,-0.000639,-0.016424,-0.004971
73,Night World: Daughters Of Darkness,"""There’s something strange about the new girls...",['L.J. Smith'],http://books.google.com/books/content?id=c9icD...,http://books.google.nl/books?id=c9icDQAAQBAJ&p...,Simon and Schuster,http://books.google.nl/books?id=c9icDQAAQBAJ&d...,['juvenile fiction'],134,4.768657,...,0.000070,0.003473,-0.013758,0.000954,-0.011287,-0.008795,0.003780,0.002349,-0.013346,0.001237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212361,Calder Pride,The Long-Awaited Addition to the Beloved Calde...,['Janet Dailey'],http://books.google.com/books/content?id=nlsgd...,http://books.google.com/books?id=nlsgd2-kGq4C&...,Harper Collins,https://play.google.com/store/books/details?id...,['fiction'],28,4.035714,...,-0.003849,-0.000571,0.001393,-0.001693,-0.012675,-0.000349,0.002634,-0.004115,0.004868,-0.006720
212365,The Road Back,The sequel to the masterpiece All Quiet on the...,['Erich Maria Remarque'],http://books.google.com/books/content?id=obZdA...,http://books.google.com/books?id=obZdAAAAQBAJ&...,Random House Trade Paperbacks,http://books.google.com/books?id=obZdAAAAQBAJ&...,['fiction'],17,4.705882,...,0.014579,-0.010071,-0.008840,0.011693,0.006862,-0.001960,0.004995,-0.007247,-0.013905,-0.001202
212394,Final things,Grace's father believes in science and builds ...,['Jenny Offill'],http://books.google.com/books/content?id=UbSFB...,http://books.google.com/books?id=UbSFBAAAQBAJ&...,Vintage,https://play.google.com/store/books/details?id...,['fiction'],1,4.000000,...,0.007651,0.005601,0.000670,-0.010949,-0.012086,0.002205,-0.004838,-0.008230,0.000084,-0.002611
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],http://books.google.com/books/content?id=J7M-N...,http://books.google.com/books?id=J7M-NwAACAAJ&...,Scholastic Paperbacks,http://books.google.com/books?id=J7M-NwAACAAJ&...,['juvenile fiction'],28,4.678571,...,-0.012930,0.005578,-0.000719,-0.004401,0.003777,-0.001121,-0.006025,-0.007464,-0.008325,0.005250


#### Keep relevant variables

In [26]:
columns_to_keep = [col for col in NLP_df_test.columns if col.startswith('tSVD')]

NLP_df_train = NLP_df_train[columns_to_keep]
NLP_df_test = NLP_df_test[columns_to_keep]

In [27]:
NLP_df_train

Unnamed: 0_level_0,tSVD1,tSVD2,tSVD3,tSVD4,tSVD5,tSVD6,tSVD7,tSVD8,tSVD9,tSVD10,...,tSVD2991,tSVD2992,tSVD2993,tSVD2994,tSVD2995,tSVD2996,tSVD2997,tSVD2998,tSVD2999,tSVD3000
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.129603,-0.038296,-0.063023,0.002653,-0.035225,0.004487,-0.033247,0.006327,0.040408,0.035931,...,0.001714,-0.012096,0.012755,0.004345,0.016297,0.022422,0.026395,-0.001808,-0.028102,0.011717
24,0.105464,-0.019206,-0.016426,-0.003522,0.027940,-0.001382,-0.020538,0.003146,-0.013301,-0.015466,...,0.000900,0.002294,-0.002052,-0.013243,-0.002329,-0.005818,-0.000012,0.007939,-0.024247,-0.006868
42,0.110316,-0.032971,-0.038233,-0.010302,0.009022,-0.016867,-0.066746,0.042483,0.019502,0.016915,...,-0.002241,-0.007230,-0.005164,0.000416,-0.007837,-0.002487,-0.004066,0.010140,-0.007082,0.001236
49,0.043620,-0.010660,-0.049538,0.428472,-0.049904,-0.034815,-0.042501,-0.027805,-0.038459,-0.016390,...,0.004947,-0.003206,0.011330,-0.004898,-0.002988,0.001996,-0.008123,-0.000639,-0.016424,-0.004971
73,0.112533,-0.023843,-0.024143,-0.009996,0.020481,0.020480,0.000434,0.000725,-0.014176,-0.008300,...,0.000070,0.003473,-0.013758,0.000954,-0.011287,-0.008795,0.003780,0.002349,-0.013346,0.001237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212361,0.182936,-0.055312,-0.095027,0.006525,-0.008528,0.031805,-0.063892,0.001145,0.053651,-0.010439,...,-0.003849,-0.000571,0.001393,-0.001693,-0.012675,-0.000349,0.002634,-0.004115,0.004868,-0.006720
212365,0.169263,-0.042064,0.007502,-0.012853,-0.016271,-0.000023,0.023768,-0.028389,0.026683,0.003763,...,0.014579,-0.010071,-0.008840,0.011693,0.006862,-0.001960,0.004995,-0.007247,-0.013905,-0.001202
212394,0.156620,-0.021632,-0.010062,-0.017675,0.014586,-0.020211,-0.044151,-0.082319,0.016963,0.027734,...,0.007651,0.005601,0.000670,-0.010949,-0.012086,0.002205,-0.004838,-0.008230,0.000084,-0.002611
212399,0.085933,-0.024211,-0.004185,-0.009729,0.047267,-0.017817,-0.034370,-0.028692,-0.017410,0.020350,...,-0.012930,0.005578,-0.000719,-0.004401,0.003777,-0.001121,-0.006025,-0.007464,-0.008325,0.005250


### Description dimension reduction NN

-> Questions/notes:
Inputs to choose:
- number of layers:
    - Description NN
        - input
        - noise
        - hidden layer
        - noise
        - hidden layer
        - final layer
    - Description and image embeddings NN
        - input
        - noise
        - hidden layer
        - noise
        - final layer
    Too many?   
- add dense layers to avoid overfitting?
- activation functions
    - ReLu (Rectified linear activation function): piecewise linear function that will output the input directly if it is positive, otherwise, it will output zero. Simple but effective.
- Use linear in the last layer to obtain a continuous variable
- optimizer: 
    - Adam; works with momentums of first and second order. 
    - sdg: variant of Gradient Descent (Gradient Descent is the most basic but most used optimization algorithm. It’s used heavily in linear regression and classification algorithms. It's easy and works well but there is the risk that the model gets stuck in local minima)
- loss function
    - MSE?
- number of epochs
- which metric to use to evaluate the model?
    - MSE
    - MAE

- Use gridsearch to optimise hyperparameters?

#### Set up

In [28]:
# get number of inputs - second element of shape (i.e. number of columns in X)
input_shape = NLP_df_train.shape[1]

# neurons number
n_neurons = 512

# define a model
baseline_model = keras.Sequential()

# Add input layer
baseline_model.add(layers.Dense(
            n_neurons, # number of neurons
            input_dim = input_shape, # number of inputs 
            activation = 'relu' # activation faunction
            ))

# Hidden - Layers
baseline_model.add(layers.Dense(
                    256, 
                    activation = "linear"))

baseline_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#### Compile

In [29]:
# Compile model

baseline_model.compile(
    optimizer='adam',
    loss=['mean_squared_error'], 
    metrics = ['mae', 'mean_squared_error']
    )

#### Train

In [30]:
# Train model
epochs_hist = baseline_model.fit(
    NLP_df_train, # input
    y_wr_train, # output
    epochs=100, # number of iterations
    batch_size=50, # number of observations taken to train the data
    verbose=1,
    validation_data = (NLP_df_test, y_wr_test),
    shuffle = True
    #validation_split=0.2,    
)

Epoch 1/100
[1m 59/429[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 6ms/step - loss: 16.1463 - mae: 3.9923 - mean_squared_error: 16.1463

[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 5.7757 - mae: 1.6848 - mean_squared_error: 5.7757 - val_loss: 0.0666 - val_mae: 0.1820 - val_mean_squared_error: 0.0648
Epoch 2/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.0606 - mae: 0.1794 - mean_squared_error: 0.0606 - val_loss: 0.0655 - val_mae: 0.1814 - val_mean_squared_error: 0.0639
Epoch 3/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.0634 - mae: 0.1841 - mean_squared_error: 0.0634 - val_loss: 0.0654 - val_mae: 0.1786 - val_mean_squared_error: 0.0639
Epoch 4/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.0616 - mae: 0.1809 - mean_squared_error: 0.0616 - val_loss: 0.0625 - val_mae: 0.1749 - val_mean_squared_error: 0.0611
Epoch 5/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.0592 - mae: 0.1769 - mean_squared_error: 0.0592 - val_l

### Calculate intermediate description features with lower dimensionality

In [31]:
# Predict baseline X train and X test 

NLP_intermediate_train = baseline_model.predict(NLP_df_train)
NLP_intermediate_test = baseline_model.predict(NLP_df_test)

[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [32]:
NLP_intermediate_train

array([[4.4089246, 4.420882 , 4.4168897, ..., 4.41385  , 4.417997 ,
        4.416403 ],
       [4.2271676, 4.231065 , 4.2289457, ..., 4.224927 , 4.231066 ,
        4.2293634],
       [4.1883283, 4.194393 , 4.1940336, ..., 4.195767 , 4.191303 ,
        4.189158 ],
       ...,
       [4.00177  , 3.984882 , 3.9928422, ..., 3.9939249, 3.9958043,
        3.9959364],
       [4.3644314, 4.3600974, 4.362086 , ..., 4.3642035, 4.3628893,
        4.363909 ],
       [4.200646 , 4.206281 , 4.2025747, ..., 4.202437 , 4.211825 ,
        4.1969256]], dtype=float32)

In [33]:
len(NLP_intermediate_train[0])

256

In [34]:
# Store these into a dataframe
NLP_intermediate_train_df = pd.DataFrame(NLP_intermediate_train, index=NLP_df_train.index)
NLP_intermediate_test_df = pd.DataFrame(NLP_intermediate_test, index=NLP_df_test.index)

In [35]:
# Check that indices are correct
NLP_intermediate_train_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,4.408925,4.420882,4.416890,4.418366,4.417753,4.417819,4.419184,4.419909,4.414232,4.415713,...,4.419310,4.408286,4.422114,4.418770,4.415804,4.421103,4.423278,4.413850,4.417997,4.416403
24,4.227168,4.231065,4.228946,4.230300,4.231961,4.235414,4.234513,4.230048,4.229636,4.231685,...,4.236648,4.226399,4.229749,4.232564,4.227556,4.231503,4.229834,4.224927,4.231066,4.229363
42,4.188328,4.194393,4.194034,4.185140,4.193111,4.193242,4.191313,4.192410,4.186793,4.191766,...,4.193043,4.191612,4.194394,4.195267,4.187212,4.196862,4.197066,4.195767,4.191303,4.189158
49,4.525066,4.526392,4.522406,4.529515,4.528418,4.531471,4.528793,4.527687,4.526726,4.536198,...,4.529837,4.530842,4.525315,4.529678,4.528812,4.528993,4.534094,4.525110,4.526510,4.527074
73,4.337204,4.342837,4.341679,4.339909,4.336855,4.340528,4.337700,4.337024,4.336594,4.337668,...,4.342944,4.339314,4.341485,4.337862,4.339727,4.343600,4.340711,4.333910,4.343233,4.339843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212361,4.445068,4.436575,4.432493,4.443172,4.436512,4.438483,4.444669,4.450074,4.446179,4.442899,...,4.453914,4.448549,4.443313,4.438620,4.446571,4.438053,4.443110,4.444717,4.442869,4.436579
212365,4.096858,4.100168,4.090538,4.097077,4.097855,4.086901,4.091972,4.088095,4.093302,4.093448,...,4.098001,4.095216,4.087335,4.103864,4.084136,4.091631,4.090065,4.091859,4.090687,4.088017
212394,4.001770,3.984882,3.992842,3.992184,3.985528,3.998158,4.007915,3.994741,3.993707,3.998885,...,3.997579,4.003650,3.992235,3.993426,4.008648,3.990647,4.016080,3.993925,3.995804,3.995936
212399,4.364431,4.360097,4.362086,4.358554,4.364618,4.365531,4.364735,4.365673,4.358621,4.364763,...,4.368271,4.357871,4.366419,4.363382,4.362494,4.364899,4.362829,4.364203,4.362889,4.363909


### Create X and y cuts
We are going to run two models for two target variables
- Target variable: Average rating
  - baseline (i.e. excluding image embeddings)
  - including image embeddings
- Target variable: weighted rating
  - baseline (i.e. excluding image embeddings)
  - including image embeddings

We therefore need to create the following datsets
- X train and X test with embeddings
- X train and X text without embeddings
- y train and y test using average rating
- y train and y test using weighted rating

#### Baseline X and y

In [36]:
%who


Adam	 Dense	 Dropout	 NLP_df_test	 NLP_df_train	 NLP_intermediate_test	 NLP_intermediate_test_df	 NLP_intermediate_train	 NLP_intermediate_train_df	 
Pipeline	 RandomForestRegressor	 SGD	 SVR	 StandardScaler	 X	 X_columns	 X_images_test	 X_images_train	 
X_test	 X_train	 analysis_folder	 baseline_model	 columns_to_keep	 date	 dates_columns	 epochs_hist	 folders_set_up	 
input_folder	 input_shape	 keras	 layers	 mean_absolute_error	 mean_squared_error	 n_neurons	 np	 os	 
output_folder	 pd	 plt	 sns	 test_indices	 time	 title_embeddings_df	 train_indices	 train_test_split	 
tree	 y	 y_avg_r_test	 y_avg_r_train	 y_test	 y_train	 y_wr_test	 y_wr_train	 


##### SVR

In [37]:
# Baseline model data for SVR
X_baseline_train = pd.merge(
    NLP_df_train,
    X_train['year'],
    right_index = True,
    left_index = True,
    how = 'inner')

X_baseline_test = pd.merge(
    NLP_df_test,
    X_test['year'],
    right_index = True,
    left_index = True,
    how = 'inner')

In [38]:
X_baseline_train

Unnamed: 0,tSVD1,tSVD2,tSVD3,tSVD4,tSVD5,tSVD6,tSVD7,tSVD8,tSVD9,tSVD10,...,tSVD2992,tSVD2993,tSVD2994,tSVD2995,tSVD2996,tSVD2997,tSVD2998,tSVD2999,tSVD3000,year
3,0.129603,-0.038296,-0.063023,0.002653,-0.035225,0.004487,-0.033247,0.006327,0.040408,0.035931,...,-0.012096,0.012755,0.004345,0.016297,0.022422,0.026395,-0.001808,-0.028102,0.011717,2005.0
24,0.105464,-0.019206,-0.016426,-0.003522,0.027940,-0.001382,-0.020538,0.003146,-0.013301,-0.015466,...,0.002294,-0.002052,-0.013243,-0.002329,-0.005818,-0.000012,0.007939,-0.024247,-0.006868,2001.0
42,0.110316,-0.032971,-0.038233,-0.010302,0.009022,-0.016867,-0.066746,0.042483,0.019502,0.016915,...,-0.007230,-0.005164,0.000416,-0.007837,-0.002487,-0.004066,0.010140,-0.007082,0.001236,2002.0
49,0.043620,-0.010660,-0.049538,0.428472,-0.049904,-0.034815,-0.042501,-0.027805,-0.038459,-0.016390,...,-0.003206,0.011330,-0.004898,-0.002988,0.001996,-0.008123,-0.000639,-0.016424,-0.004971,1997.0
73,0.112533,-0.023843,-0.024143,-0.009996,0.020481,0.020480,0.000434,0.000725,-0.014176,-0.008300,...,0.003473,-0.013758,0.000954,-0.011287,-0.008795,0.003780,0.002349,-0.013346,0.001237,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212361,0.182936,-0.055312,-0.095027,0.006525,-0.008528,0.031805,-0.063892,0.001145,0.053651,-0.010439,...,-0.000571,0.001393,-0.001693,-0.012675,-0.000349,0.002634,-0.004115,0.004868,-0.006720,2009.0
212365,0.169263,-0.042064,0.007502,-0.012853,-0.016271,-0.000023,0.023768,-0.028389,0.026683,0.003763,...,-0.010071,-0.008840,0.011693,0.006862,-0.001960,0.004995,-0.007247,-0.013905,-0.001202,1998.0
212394,0.156620,-0.021632,-0.010062,-0.017675,0.014586,-0.020211,-0.044151,-0.082319,0.016963,0.027734,...,0.005601,0.000670,-0.010949,-0.012086,0.002205,-0.004838,-0.008230,0.000084,-0.002611,2015.0
212399,0.085933,-0.024211,-0.004185,-0.009729,0.047267,-0.017817,-0.034370,-0.028692,-0.017410,0.020350,...,0.005578,-0.000719,-0.004401,0.003777,-0.001121,-0.006025,-0.007464,-0.008325,0.005250,2000.0


In [39]:
print(X_baseline_train.shape)
print(X_baseline_test.shape)

(21419, 3001)
(5355, 3001)


In [40]:
X_baseline_train

Unnamed: 0,tSVD1,tSVD2,tSVD3,tSVD4,tSVD5,tSVD6,tSVD7,tSVD8,tSVD9,tSVD10,...,tSVD2992,tSVD2993,tSVD2994,tSVD2995,tSVD2996,tSVD2997,tSVD2998,tSVD2999,tSVD3000,year
3,0.129603,-0.038296,-0.063023,0.002653,-0.035225,0.004487,-0.033247,0.006327,0.040408,0.035931,...,-0.012096,0.012755,0.004345,0.016297,0.022422,0.026395,-0.001808,-0.028102,0.011717,2005.0
24,0.105464,-0.019206,-0.016426,-0.003522,0.027940,-0.001382,-0.020538,0.003146,-0.013301,-0.015466,...,0.002294,-0.002052,-0.013243,-0.002329,-0.005818,-0.000012,0.007939,-0.024247,-0.006868,2001.0
42,0.110316,-0.032971,-0.038233,-0.010302,0.009022,-0.016867,-0.066746,0.042483,0.019502,0.016915,...,-0.007230,-0.005164,0.000416,-0.007837,-0.002487,-0.004066,0.010140,-0.007082,0.001236,2002.0
49,0.043620,-0.010660,-0.049538,0.428472,-0.049904,-0.034815,-0.042501,-0.027805,-0.038459,-0.016390,...,-0.003206,0.011330,-0.004898,-0.002988,0.001996,-0.008123,-0.000639,-0.016424,-0.004971,1997.0
73,0.112533,-0.023843,-0.024143,-0.009996,0.020481,0.020480,0.000434,0.000725,-0.014176,-0.008300,...,0.003473,-0.013758,0.000954,-0.011287,-0.008795,0.003780,0.002349,-0.013346,0.001237,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212361,0.182936,-0.055312,-0.095027,0.006525,-0.008528,0.031805,-0.063892,0.001145,0.053651,-0.010439,...,-0.000571,0.001393,-0.001693,-0.012675,-0.000349,0.002634,-0.004115,0.004868,-0.006720,2009.0
212365,0.169263,-0.042064,0.007502,-0.012853,-0.016271,-0.000023,0.023768,-0.028389,0.026683,0.003763,...,-0.010071,-0.008840,0.011693,0.006862,-0.001960,0.004995,-0.007247,-0.013905,-0.001202,1998.0
212394,0.156620,-0.021632,-0.010062,-0.017675,0.014586,-0.020211,-0.044151,-0.082319,0.016963,0.027734,...,0.005601,0.000670,-0.010949,-0.012086,0.002205,-0.004838,-0.008230,0.000084,-0.002611,2015.0
212399,0.085933,-0.024211,-0.004185,-0.009729,0.047267,-0.017817,-0.034370,-0.028692,-0.017410,0.020350,...,0.005578,-0.000719,-0.004401,0.003777,-0.001121,-0.006025,-0.007464,-0.008325,0.005250,2000.0


In [41]:
# With image embeddings
X_final_train = pd.merge(
    X_images_train,
    X_baseline_train,
    right_index = True,
    left_index = True,
    how = 'inner')
    
X_final_test = pd.merge(
    X_images_test,
    X_baseline_test,
    right_index = True,
    left_index = True,
    how = 'inner')

In [42]:
X_final_train.columns

Index(['image_0', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
       'image_6', 'image_7', 'image_8', 'image_9',
       ...
       'tSVD2992', 'tSVD2993', 'tSVD2994', 'tSVD2995', 'tSVD2996', 'tSVD2997',
       'tSVD2998', 'tSVD2999', 'tSVD3000', 'year'],
      dtype='object', length=3257)

##### NN

In [43]:
# Baseline model data for NN
X_baseline_train_NN = NLP_intermediate_train_df
X_baseline_test_NN = NLP_intermediate_test_df

In [44]:
# Stack description + publish year and images embeddings

X_final_train_NN = pd.merge(
    X_baseline_train_NN, 
    X_images_train, 
    left_index = True, 
    right_index = True)

X_final_test_NN = pd.merge(
    X_baseline_test_NN, 
    X_images_test, 
    left_index = True, 
    right_index = True)

## Run models

### Support Vector Regression & co.

#### Set up

In [45]:
# Create models

# Random forest
rf = RandomForestRegressor()

# SVR
svr_model = SVR(kernel='rbf')  # 'rbf' for radial basis function kernel

# Lightgbm


# Define pipeline steps
rf_pipeline = Pipeline([
    ('rf', rf)  # Random Forest classifier
])

svr_pipeline = Pipeline([
    ('svr', svr_model)  # Neural Network classifier
])

#### Run

In [46]:
# Set up table to run different variations and store the results

evaluation_metrics = pd.DataFrame({
    #'Random Forest': {'model': rf_pipeline, 'prediction' : None, 'MAE' : None, 'MSE' : None},
    'Baseline Support Vector Regression': {'model': svr_pipeline, 'X_train': X_baseline_train, 'X_test' : X_baseline_test, 'prediction': None, 'MAE' : None, 'MSE' : None},
    'Final Support Vector Regression': {'model': svr_pipeline, 'X_train': X_final_train, 'X_test' : X_final_test, 'prediction': None, 'MAE' : None, 'MSE' : None}
}).transpose()

evaluation_metrics = evaluation_metrics.rename(
    columns  = {'index' : 'model name'}
)

evaluation_metrics

Unnamed: 0,model,X_train,X_test,prediction,MAE,MSE
Baseline Support Vector Regression,(SVR()),tSVD1 tSVD2 tSVD3 tSVD4...,tSVD1 tSVD2 tSVD3 tSVD4...,,,
Final Support Vector Regression,(SVR()),image_0 image_1 image_2 image_3...,image_0 image_1 image_2 image_3...,,,


In [47]:
# Fit and predict


for i, row in evaluation_metrics.iterrows():

    start_time = time.time()
    print(i)
    # Call model
    model = row['model']
    
    # Train model
    model.fit(row['X_train'], y_wr_train)
    
    # Time elapsed
    elapsed_time = time.time() - start_time
    minutes = int(elapsed_time // 60)
    seconds = int(elapsed_time % 60)
    print(f"> Training completed. Duration: {minutes:02d}:{seconds:02d}")

    # Calculate predictions
    y_wr_pred = model.predict(row['X_test'])

    # save predictions
    row['prediction'] = y_wr_pred
    
    # Time elapsed
    elapsed_time_2 = time.time() - elapsed_time
    minutes = int(elapsed_time_2 // 60)
    seconds = int(elapsed_time_2 % 60)
    print(f"> Predictions completed. Duration: {minutes:02d}:{seconds:02d}")

    # Calculate metrics
    mse = mean_squared_error(y_wr_test, y_wr_pred)
    mae = mean_absolute_error(y_wr_test, y_wr_pred)

    # Save metrics
    row['MAE'] = mae
    row['MSE'] = mse
    
    # Time elapsed
    elapsed_time_3 = time.time() - elapsed_time_2
    minutes = int(elapsed_time_3 // 60)
    seconds = int(elapsed_time_3 % 60)
    print(f"> Evaluation completed. Duration: {minutes:02d}:{seconds:02d}")

    # Convert elapsed time to minutes and seconds
    total_time = time.time() - start_time
    minutes = int(total_time // 60)
    seconds = int(total_time % 60)

    # Print the time in minutes and seconds
    print(f">> Total time taken: {minutes:02d}:{seconds:02d}")

    print('\n')

Baseline Support Vector Regression


> Training completed. Duration: 02:10
> Predictions completed. Duration: 28540518:25
> Evaluation completed. Duration: 02:10
Total time taken: 03:03


Final Support Vector Regression
> Training completed. Duration: 05:17
> Predictions completed. Duration: 28540518:26
> Evaluation completed. Duration: 05:17
Total time taken: 06:11




In [51]:
evaluation_metrics

Unnamed: 0,model,X_train,X_test,prediction,MAE,MSE
Baseline Support Vector Regression,(SVR()),tSVD1 tSVD2 tSVD3 tSVD4...,tSVD1 tSVD2 tSVD3 tSVD4...,"[4.26317910221547, 4.263480602359525, 4.262913...",0.13147,0.045265
Final Support Vector Regression,(SVR()),image_0 image_1 image_2 image_3...,image_0 image_1 image_2 image_3...,"[4.269796558491903, 4.251500951120229, 4.26040...",0.131182,0.044787


### Neural Network

#### Set up

In [52]:
# Set up table to run different variations and store the results

evaluation_metrics_NN = pd.DataFrame({
    #'Random Forest': {'model': rf_pipeline, 'prediction' : None, 'MAE' : None, 'MSE' : None},
    'Baseline Neural Network': {'model': None, 'X_train': X_baseline_train_NN, 'X_test' : X_baseline_test_NN, 'prediction': None, 'MAE' : None, 'MSE' : None},
    'Final Neural Network': {'model': None, 'X_train': X_final_train_NN, 'X_test' : X_final_test_NN, 'prediction': None, 'MAE' : None, 'MSE' : None}
}).transpose()

evaluation_metrics_NN = evaluation_metrics_NN.rename(
    columns  = {'index' : 'model name'}
)

evaluation_metrics_NN

Unnamed: 0,model,X_train,X_test,prediction,MAE,MSE
Baseline Neural Network,,0 1 2 3 ...,0 1 2 3 ...,,,
Final Neural Network,,0 1 2 3...,0 1 2 3...,,,


In [55]:
# Run model for different moodels


for i, row in evaluation_metrics_NN.iterrows():
    start_time = time.time()
    print(i)

    input_shape = row['X_train'].shape[1]
    print(f"> Input shape: {input_shape}")

    # neurons number
    n_neurons = 512

### define a model
    final_model = keras.Sequential()

    # Add input layer
    final_model.add(layers.Dense(
                n_neurons, # number of neurons
                input_dim = input_shape, # number of inputs 
                activation = 'relu' # activation faunction
                ))

    # Hidden - Layers
    final_model.add(layers.Dropout(
                        0.3, 
                        noise_shape=None, 
                        seed=None))
    final_model.add(layers.Dense(
        256, 
        activation = "relu"))
    final_model.add(layers.Dropout(
                        0.2, 
                        noise_shape=None, 
                        seed=None))
    final_model.add(layers.Dense(
        62, 
        activation = "relu"))
    final_model.add(layers.Dropout(
                        0.2, 
                        noise_shape=None, 
                        seed=None))

    # Final layer
    final_model.add(layers.Dense(
        1, 
        activation = 'linear'))

    final_model.summary()

    # Add model to table
    row['model'] = final_model
    
### Compile the model
    final_model.compile(
    optimizer='adam',
    loss=['mean_squared_error'], 
    metrics = ['mae', 'mean_squared_error']
    )
    
    # Time elapsed
    elapsed_time_2 = time.time() - elapsed_time

### Train the model
    epochs_hist = final_model.fit(
    row['X_train'], # input
    y_wr_train, # output
    epochs=100, # number of iterations
    batch_size=50, # number of observations taken to train the data - 1030 obs/50 -> there are 17 groups (observations are taken once for epoch) so model is trained 17 times in each epoch
    verbose=1,
    validation_data = (row['X_test'], y_wr_test),
    shuffle = True
    #validation_split=0.2,    
    )
    # Time elapsed
    elapsed_time_3 = time.time() - elapsed_time_2


# ### Predictions
    y_pred = final_model.predict(row['X_test'])
    # Store predictions
    row['prediction'] = y_pred
    # Time elapsed
    elapsed_time_4 = time.time() - elapsed_time_3


# ### Evaluation
    mse = mean_squared_error(y_pred, y_wr_test)
    mae = mean_absolute_error(y_pred, y_wr_test)
    row['MAE'] = mae
    row['MSE'] = mse
    
    # Time elapsed
    elapsed_time_5 = time.time() - elapsed_time_4


    # Timings
    minutes = int(elapsed_time // 60)
    seconds = int(elapsed_time % 60)
    print(f"> Model set up completed. Duration: {minutes: 02d}:{seconds: 02d}")

    minutes = int(elapsed_time_2 // 60)
    seconds = int(elapsed_time_2 % 60)
    print(f"> Compilation completed. Duration: {minutes: 02d}:{seconds: 02d}")

    minutes = int(elapsed_time_3 // 60)
    seconds = int(elapsed_time_3 % 60)
    print(f"> Training completed. Duration: {minutes: 02d}:{seconds: 02d}") 

    minutes = int(elapsed_time_4 // 60)
    seconds = int(elapsed_time_4 % 60)
    print(f"> Prediction completed. Duration: {minutes: 02d}:{seconds: 02d}") 

    minutes = int(elapsed_time_5 // 60)
    seconds = int(elapsed_time_5 % 60)
    print(f"> Evaluation completed. Duration: {minutes: 02d}:{seconds: 02d}") 

    total_time = time.time() - start_time
    minutes = int(total_time // 60)
    seconds = int(total_time % 60)

    # Print the time in minutes and seconds
    print(f"Total time taken: {minutes:02d}:{seconds:02d}")

    print('\n')


Baseline Neural Network
> Input shape: 256


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100


[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 2.0489 - mae: 0.9263 - mean_squared_error: 2.0489 - val_loss: 0.9012 - val_mae: 0.9250 - val_mean_squared_error: 0.9026
Epoch 2/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3879 - mae: 0.4986 - mean_squared_error: 0.3879 - val_loss: 1.6791 - val_mae: 1.2778 - val_mean_squared_error: 1.6815
Epoch 3/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2966 - mae: 0.4344 - mean_squared_error: 0.2966 - val_loss: 0.8272 - val_mae: 0.8866 - val_mean_squared_error: 0.8285
Epoch 4/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2570 - mae: 0.4040 - mean_squared_error: 0.2570 - val_loss: 0.9139 - val_mae: 0.9346 - val_mean_squared_error: 0.9153
Epoch 5/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2250 - mae: 0.3780 - mean_squared_error: 0.2250 - val_l

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 3.3820 - mae: 1.1280 - mean_squared_error: 3.3820 - val_loss: 0.6893 - val_mae: 0.8000 - val_mean_squared_error: 0.6912
Epoch 2/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4224 - mae: 0.5173 - mean_squared_error: 0.4224 - val_loss: 1.1109 - val_mae: 1.0308 - val_mean_squared_error: 1.1130
Epoch 3/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3382 - mae: 0.4636 - mean_squared_error: 0.3382 - val_loss: 1.0689 - val_mae: 1.0118 - val_mean_squared_error: 1.0704
Epoch 4/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.2740 - mae: 0.4172 - mean_squared_error: 0.2740 - val_loss: 1.0486 - val_mae: 1.0028 - val_mean_squared_error: 1.0506
Epoch 5/100
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.2436 - mae: 0.3937 - mean_squared_error: 0.

In [56]:
evaluation_metrics_NN

Unnamed: 0,model,X_train,X_test,prediction,MAE,MSE
Baseline Neural Network,"<Sequential name=sequential_3, built=True>",0 1 2 3 ...,0 1 2 3 ...,"[[3.5686584], [3.5541081], [3.5608985], [3.536...",0.707585,0.534406
Final Neural Network,"<Sequential name=sequential_4, built=True>",0 1 2 3...,0 1 2 3...,"[[3.4987206], [3.4354882], [3.4638855], [3.398...",0.804946,0.685819


In [58]:
evaluation_metrics_all = pd.concat([evaluation_metrics, evaluation_metrics_NN])

In [59]:
evaluation_metrics_all

Unnamed: 0,model,X_train,X_test,prediction,MAE,MSE
Baseline Support Vector Regression,(SVR()),tSVD1 tSVD2 tSVD3 tSVD4...,tSVD1 tSVD2 tSVD3 tSVD4...,"[4.26317910221547, 4.263480602359525, 4.262913...",0.13147,0.045265
Final Support Vector Regression,(SVR()),image_0 image_1 image_2 image_3...,image_0 image_1 image_2 image_3...,"[4.269796558491903, 4.251500951120229, 4.26040...",0.131182,0.044787
Baseline Neural Network,"<Sequential name=sequential_3, built=True>",0 1 2 3 ...,0 1 2 3 ...,"[[3.5686584], [3.5541081], [3.5608985], [3.536...",0.707585,0.534406
Final Neural Network,"<Sequential name=sequential_4, built=True>",0 1 2 3...,0 1 2 3...,"[[3.4987206], [3.4354882], [3.4638855], [3.398...",0.804946,0.685819


In [None]:
# Visualise NN

# # Plotting Loss And Mean Square Error For both Training And Test Sets
# plt.plot(epochs_hist.history['mse'])
# plt.plot(epochs_hist.history['val_mse'])
# plt.title('MSE')
# plt.ylabel('mae')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()
# plt.savefig(os.path.join(output_folder, '{i} mse chart.png'))

# # summarize history for loss
# plt.plot(epochs_hist.history['loss'])
# plt.plot(epochs_hist.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.savefig('4.png')
# plt.show()
# plt.savefig(os.path.join(output_folder, '{i} summary chart.png'))

## Select best model

## Hyper parameter tuning

In [None]:
# Grid search?