# Machine Learning

## Set up

In [83]:
#%pip install lightgbm

In [85]:
# Set up folders
from EDA_functions import folders_set_up
import os

# Work with datarames
import pandas as pd
import numpy as np

# Charts
import seaborn as sns
from matplotlib import pyplot as plt

# X, Y preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# SVR
from sklearn.svm import SVR

# Light GBM
# import lightgbm as lgb

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree

# Neural Network
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense,Dropout
from keras.optimizers import Adam, SGD

# Pipeline
from sklearn.pipeline import Pipeline

# Evaluate models
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

### Folders
Run the code below if you have the following structure:
- Group-project: GitHub folder
- 01 Input
- 02 Output

In [4]:
analysis_folder, input_folder, output_folder = folders_set_up.generate_folders()

## Import and merge data

### Datasets

In [5]:
#Title-level dataset
titles_df = pd.read_pickle(
    os.path.join(output_folder, 'English_fiction_pre_PCA_3_with_embeddings')
)

titles_df.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'infoLink', 'categories', 'reviews number', 'average rating',
       'median rating', 'min review date', 'max review date',
       'weighted rating', 'date', 'year', 'description_language', 'Embedding'],
      dtype='object')

In [6]:
# indices are missing in the file above, we get them from another dataset
index_df = pd.read_csv(
    os.path.join(output_folder, 'English_fiction_pre_PCA_3.csv')
)

index_df = index_df[['Title', 'index']]

In [6]:
# NLP
# descriptions_df

### Merge

In [7]:
# Merge titles dataframe with indices dataframe

df = pd.merge(
    titles_df,
    index_df,
    on = 'Title',
    how = 'left'
)

df.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'infoLink', 'categories', 'reviews number', 'average rating',
       'median rating', 'min review date', 'max review date',
       'weighted rating', 'date', 'year', 'description_language', 'Embedding',
       'index'],
      dtype='object')

In [8]:
# Merge titles dataframe and description PCA

# df = pd.merge(
#     titles_df,
#     descriptions_df,
#     on = 'index',
#     how = 'left'
# )

### Format data

In [8]:
df.dtypes

Title                    object
description              object
authors                  object
image                    object
previewLink              object
publisher                object
infoLink                 object
categories               object
reviews number            int64
average rating          float64
median rating           float64
min review date          object
max review date          object
weighted rating         float64
date                     object
year                    float64
description_language     object
Embedding                object
index                     int64
dtype: object

#### Date

In [9]:
dates_columns = ['min review date', 'max review date', 'date']

for date in dates_columns:
    # get date from strings with time
    df[date] = df[date].str.split().str[0]
    # convert in datetime
    df[date] = pd.to_datetime(df[date])

In [10]:
df[dates_columns].isna().sum()

min review date    0
max review date    0
date               0
dtype: int64

In [11]:
# NOTE: we work on a subset of data for now to make the ML run faster
df = df.sample(n=1000, random_state=42)

#### Image embeddings
These need may need to be transformed in from arrays to columns if the model we use is not NN

### Clean data
Most of the cleaning is done in '02 Consolidate books dataset':
- English description
- category containing the word 'fiction'
- non-missing date
- non-missing author
- non-missing publisher
- non-missing cover image

## X and y set up

### Train test split

In [12]:
# Create X and y including all X features and all all teh possible target variables
# NOTE: we will have to add the description PCA in X_features
columns_to_drop = ['description', 'image', 'previewLink',
       'infoLink', 'categories', 'reviews number', 'average rating',
       'median rating', 'min review date', 'max review date',
       'weighted rating', 'date', 'description_language',
       'publisher']

X = df.drop(columns_to_drop, axis = 1)
y = df[['average rating', 'weighted rating']]

In [13]:
# Create train test split

# Need to create train test split for different combinations of data
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size= 0.2, 
    random_state= 42,
)

In [14]:
# store indices of train test split
train_indices = X_train[['Title', 'index']]
test_indices = X_test[['Title', 'index']]

train_indices.to_csv(
    os.path.join(output_folder, 'train_indices.csv')
)


test_indices.to_csv(
    os.path.join(output_folder, 'test_indices.csv')
)

In [15]:
# Remove Title and idex
X_train = X_train.drop(['Title', 'index'], axis = 1)
X_test = X_test.drop(['Title', 'index'], axis = 1)

In [16]:
# We are going to run two models for two target variables
# - Target variable: Average rating
#   - baseline (i.e. excluding image embeddings)
#   - including image embeddings
# - Target variable: weighted rating
#   - baseline (i.e. excluding image embeddings)
#   - including image embeddings

# We therefore need to create the following datsets
# - X train and X test with embeddings
# - X train and X text without embeddings
# - y train and y test using average rating
# - y train and y test using weighted rating

X_baseline_train = X_train.drop('Embedding', axis = 1)
X_baseline_test = X_test.drop('Embedding', axis = 1)
X_images_train = X_train
X_images_test = X_test

y_avg_r_train = y_train['average rating']
y_avg_r_test = y_test['average rating']

y_wr_train = y_train['weighted rating']
y_wr_test = y_test['weighted rating']

In [19]:
X_baseline_train.shape

(800, 2)

### Scale variables

No need to scale variables for now

## Set up pipeline

In [20]:
# Only to test pipeline: set X = 'year'
X_baseline_train = X_baseline_train.drop('authors', axis = 1)
X_baseline_test = X_baseline_test.drop('authors', axis = 1)

In [89]:
# Create models

# Random forest
rf = RandomForestRegressor()

# SVR
svr_model = SVR(kernel='rbf')  # 'rbf' for radial basis function kernel

# Lightgbm


# Define pipeline steps
rf_pipeline = Pipeline([
    ('rf', rf)  # Random Forest classifier
])

svr_pipeline = Pipeline([
    ('svr', svr_model)  # Neural Network classifier
])

In [90]:
evaluation_metrics = pd.DataFrame({
    'Random Forest': {'model': rf_pipeline, 'prediction' : None, 'MAE' : None, 'MSE' : None},
    'Support Vector Regression': {'model': svr_pipeline, 'prediction': None, 'MAE' : None, 'MSE' : None}
}).transpose()
evaluation_metrics

Unnamed: 0,model,prediction,MAE,MSE
Random Forest,(RandomForestRegressor()),,,
Support Vector Regression,(SVR()),,,


In [91]:
# Fit and predict

for i, row in evaluation_metrics.iterrows():

    # Call model
    model = row['model']
    
    # Train model
    model.fit(X_baseline_train, y_wr_train)

    # Calculate predictions
    y_wr_pred = model.predict(X_baseline_test)

    # save predictions
    row['prediction'] = y_wr_pred

    # Calculate metrics
    mse = mean_squared_error(y_wr_test, y_wr_pred)
    mae = mean_absolute_error(y_wr_test, y_wr_pred)

    # Save metrics
    row['MAE'] = mae
    row['MSE'] = mse

In [92]:
evaluation_metrics

Unnamed: 0,model,prediction,MAE,MSE
Random Forest,"((DecisionTreeRegressor(max_features=1.0, rand...","[4.109860440240656, 4.321242721594951, 4.22226...",0.149404,0.04883
Support Vector Regression,(SVR()),"[4.233471938121225, 4.291592268497917, 4.26037...",0.140686,0.047959


## Neural Network

-> Questions/notes:
Inputs to choose:
- number of layers:
    - Description NN
        - input
        - noise
        - hidden layer
        - noise
        - hidden layer
        - final layer
    - Description and image embeddings NN
        - input
        - noise
        - hidden layer
        - noise
        - final layer
    Too many?   
- add dense layers to avoid overfitting?
- activation functions
    - ReLu (Rectified linear activation function): piecewise linear function that will output the input directly if it is positive, otherwise, it will output zero. Simple but effective.
- Use linear in the last layer to obtain a continuous variable
- optimizer: 
    - Adam; works with momentums of first and second order. 
    - sdg: variant of Gradient Descent (Gradient Descent is the most basic but most used optimization algorithm. It’s used heavily in linear regression and classification algorithms. It's easy and works well but there is the risk that the model gets stuck in local minima)
- loss function
    - MSE?
- number of epochs
- which metric to use to evaluate the model?
    - MSE
    - MAE

- Use gridsearch to optimise hyperparameters?

### Baseline model

In [35]:
X_baseline_train.shape

(800, 1)

#### Set up

In [93]:
# get number of inputs - second element of shape (i.e. number of columns in X)
input_shape = X_baseline_train.shape[1]

# neurons number
n_neurons = 512

# define a model
baseline_model = keras.Sequential()

# Add input layer
baseline_model.add(layers.Dense(
            n_neurons, # number of neurons
            input_dim = input_shape, # number of inputs 
            activation = 'relu' # activation faunction
            ))

# Hidden - Layers
baseline_model.add(layers.Dropout(
                    0.3, 
                    noise_shape=None, 
                    seed=None))
baseline_model.add(layers.Dense(
                    50, 
                    activation = "relu"))
baseline_model.add(layers.Dropout(
                    0.2, 
                    noise_shape=None, 
                    seed=None))
baseline_model.add(layers.Dense(
    50, 
    activation = "linear"))


baseline_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#### Compile

In [94]:
# Compile model

baseline_model.compile(
    optimizer='adam',
    loss=['mean_squared_error'], 
    metrics = ['mae', 'mean_squared_error']
    )

#### Train

In [95]:
# Train model
epochs_hist = baseline_model.fit(
    X_baseline_train, # input
    y_wr_train, # output
    epochs=100, # number of iterations
    batch_size=50, # number of observations taken to train the data - 1030 obs/50 -> there are 17 groups (observations are taken once for epoch) so model is trained 17 times in each epoch
    verbose=1,
    validation_data = (X_baseline_test, y_wr_test),
    shuffle = True
    #validation_split=0.2,    
)

Epoch 1/100


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4862.5215 - mae: 46.7214 - mean_squared_error: 4862.5220 - val_loss: 17.9847 - val_mae: 4.2352 - val_mean_squared_error: 17.9847
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 21.6027 - mae: 4.3955 - mean_squared_error: 21.6027 - val_loss: 17.9476 - val_mae: 4.2308 - val_mean_squared_error: 17.9476
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 18.3275 - mae: 4.2576 - mean_squared_error: 18.3275 - val_loss: 17.8998 - val_mae: 4.2252 - val_mean_squared_error: 17.8998
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 18.1123 - mae: 4.2431 - mean_squared_error: 18.1123 - val_loss: 17.8457 - val_mae: 4.2188 - val_mean_squared_error: 17.8457
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 17.9856 - mae: 4.2337 - mean_squared_error: 17

### Calculate description embeddings

In [191]:
# Predict baseline X train and X test 

X_intermediate_train = baseline_model.predict(X_baseline_train)
X_intermediate_test = baseline_model.predict(X_baseline_test)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step


## Image embeddings final model

### Set up

In [249]:
X_intermediate_train_df = pd.DataFrame({'baseline output' : [X_intermediate_test]}, index=X_baseline_train.index)
X_intermediate_test_df = pd.DataFrame({'baseline output' : [X_intermediate_test]}, index=X_baseline_test.index)

In [271]:
# Stack description + publish year and images embeddings

# Make X intermediates arrays
# X_intermediate_train = pd.DataFrame({'baseline output'}: [X_intermediate_train]})
# X_intermediate_test = pd.DataFrame({'baseline output'}: [X_intermediate_test]})

X_final_train = pd.merge(
    X_intermediate_train_df, 
    X_images_train[['Embedding']], left_index = True, right_index = True)

X_final_test = pd.merge(
    X_intermediate_test_df, 
    X_images_test[['Embedding']], left_index = True, right_index = True)

In [264]:
# NN 2: descriptions and images

input_shape = X_final_train.shape[1]

# neurons number
n_neurons = 512

# define a model
final_model = keras.Sequential()

# Add input layer
final_model.add(layers.Dense(
            n_neurons, # number of neurons
            input_dim = input_shape, # number of inputs 
            activation = 'relu' # activation faunction
            ))

# Hidden - Layers
final_model.add(layers.Dropout(
                    0.3, 
                    noise_shape=None, 
                    seed=None))
final_model.add(layers.Dense(
    50, 
    activation = "relu"))
final_model.add(layers.Dropout(
                    0.2, 
                    noise_shape=None, 
                    seed=None))
    
# Final layer
final_model.add(layers.Dense(
    1, 
    activation = 'linear'))

final_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [266]:
# Compile model

final_model.compile(
    optimizer='adam',
    loss=['mean_squared_error'], 
    metrics = ['mae', 'mean_squared_error']
    )

#### Train

In [267]:
# Train model
epochs_hist = final_model.fit(
    X_final_train, # input
    y_wr_train, # output
    epochs=100, # number of iterations
    batch_size=50, # number of observations taken to train the data - 1030 obs/50 -> there are 17 groups (observations are taken once for epoch) so model is trained 17 times in each epoch
    verbose=1,
    validation_data = (X_final_test, y_wr_test),
    shuffle = True
    #validation_split=0.2,    
)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

### Predict

In [48]:
# Predict
y_pred = baseline_model.predict(X_baseline_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


### Evaluate model

In [49]:
# evaluate the model (it will give the metric specified when model is compiled)
score = baseline_model.evaluate(
    X_baseline_test,
    y_wr_test,
    verbose=1
)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 617us/step - loss: 13.6194 - mae: 3.6358 - mean_squared_error: 13.6317


In [51]:
mae = mean_squared_error(y_pred, y_wr_test)

ValueError: y_true and y_pred have different number of output (50!=1)

In [None]:
# Visualise NN

# Plotting Loss And Root Mean Square Error For both Training And Test Sets
plt.plot(epochs_hist.history['mae'])
plt.plot(epochs_hist.history['val_mae'])
plt.title('MAE')
plt.ylabel('mae')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(epochs_hist.history['loss'])
plt.plot(epochs_hist.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.savefig('4.png')
plt.show()

In [None]:
# Interpretation of model performance - example with y continuous
test_predictions_ = model.predict(test_df).flatten()
test_labels_ = test_labels.to_numpy().flatten()

_, ax = plt.subplots(figsize=(14,8))
plt.scatter(
    test_labels_,
    test_predictions_,
    alpha=0.6,
    color='#ff7043',
    lw=1,
    ec='black'
)

lims = [
    0,
    max(test_predictions_.max(), test_labels_.max())
]

plt.plot(lims, lims, lw=1, color='#00acc1')
plt.tight_layout()
plt.show()

## Cross validation

### Tune model?

In [None]:
# Grid search?