# Machine Learning

## Set up

In [9]:
# Work with dataframes
import pandas as pd
import numpy as np

# Charts
import seaborn as sns
from matplotlib import pyplot as plt

# X, Y preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Model evaluation
from sklearn.metrics import r2_score

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

# Neural Network
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense,Dropout
from keras.optimizers import Adam, SGD

# Pipeline
from sklearn.pipeline import Pipeline

## Import and merge data

### Datasets

In [None]:
# Title-level dataset.
# NOTE(review): `titles_df` is not assigned anywhere in the visible notebook;
# presumably it is loaded in a cell that still has to be written - TODO confirm.
titles_df

In [1]:
# Images dataset.
# NOTE(review): `covers_df` is not assigned anywhere in the visible notebook;
# presumably it is loaded in a cell that still has to be written - TODO confirm.
covers_df

In [2]:
# NLP (text) dataset.
# NOTE(review): `descriptions_df` is not assigned anywhere in the visible
# notebook; presumably it is loaded in a cell that still has to be written -
# TODO confirm.
descriptions_df

### Merge

### Format data

### Clean data

-> Questions/notes:
To decide which observations to drop
- non-English description
- no date?
- others?

## X and y set up

### Train test split

-> Note:
- will have to create X including and X excluding images

In [3]:
# Restrict `data` to the columns that are used for ML

# Example code: list the columns to discard, then remove them from `data`
columns_to_drop = []
data = data.drop(columns=columns_to_drop)

In [5]:
# Split `data` into the feature matrix X and the target y

# Example code: fill in the target column name and the feature column names
y_var = ''
x_vars = []

X, y = data[x_vars], data[y_var]

In [6]:
# Hold out 20% of the rows as a test set

# Example code: the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scale variables

-> Questions/notes:
Do we need to fit transform or normalise the data?    
They are mostly categorical or non numerical variables so maybe not?  
If yes:
- same transformation for both models?
- which type of scaler?
- do we need to scale also y?

In [None]:
# If needed, use code below

# Choose ONE scaler. StandardScaler centres each feature on its training mean
# and divides by its training standard deviation. To rescale each feature to
# the [0, 1] range instead, import MinMaxScaler from sklearn.preprocessing and
# instantiate it here.
scaler = StandardScaler()

# Fit on the training data only (fitting on the full X would leak test-set
# information), then apply the same fitted transformation to the test data.
# The scaler returns plain arrays, so the results are wrapped back into
# DataFrames to keep the column names and row index for later cells.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Also fit transform y?

## Random forest

-> Questions/notes:
- any hyperparameter we need to think about?
- which metric are we using to evaluate the model?

### Set up

In [2]:
# Set up RF
# A fixed random_state makes the forest's random choices repeatable, so
# results can be compared between runs (same seed as the train/test split).
rf = RandomForestRegressor(random_state=42)

### Train model

-> Note:
- this will have to be done including and excluding images

In [None]:
# Train the random forest on the training split
rf.fit(X=X_train, y=y_train)

### Predict

In [None]:
# Predict the target for the held-out test rows
rf_pred = rf.predict(X=X_test)

### Feature importance

In [None]:
# Show how much each feature contributes to the fitted random forest.

# Pair every column of X with its importance score, most important first
rf_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart: one bar per feature
ax = sns.barplot(data=rf_importances, x='importance', y='feature')

# Print the score (4 decimals) at the end of each bar
for row, score in enumerate(rf_importances['importance']):
    ax.text(score, row, f'{score:.4f}', ha='left', va='center', fontsize=10)

### Decision tree of the forest

In [4]:
# Fit a single decision tree limited to a depth of 3 and plot it.
# NOTE: this is a stand-alone tree trained on the same training data, not one
# of the trees inside `rf`.

dt_max_3 = DecisionTreeRegressor(max_depth=3)
dt_max_3.fit(X_train, y_train)
dt_max_3_predictions = dt_max_3.predict(X_test)

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), dpi=300)
tree.plot_tree(
    dt_max_3,
    max_depth=3,
    # Names come from X: X_train may be a plain array (no .columns) if it was
    # scaled. A list is passed because not every scikit-learn version accepts
    # a pandas Index here.
    feature_names=list(X.columns),
    filled=True,
    fontsize=8,
    ax=axes,  # draw on the figure created above
)
plt.show()


### Evaluate model

## Neural Network

-> Questions/notes:
Inputs to choose:
- number of layers
- activation functions
- Use softmax in the last layer to obtain the probability distribution of the outcome?
- optimizer: Adam? SGD?
- loss function
- add dropout layers to avoid overfitting?
- number of epochs
- which metric to use to evaluate the model?

- Use gridsearch to optimise hyperparameters?

### Set up

In [None]:
# Number of input features = number of columns in X
input_shape = X.shape[1]

# Width of the first fully connected layer
n_neurons = 512

# Build the whole network in one call:
# - the first Dense layer declares how many inputs the model receives
# - Dropout layers sit between the Dense layers
# - the last layer has a single neuron with a linear activation, because the
#   output is a continuous variable
model = keras.Sequential([
    layers.Dense(n_neurons, input_dim=input_shape, activation='tanh'),
    layers.Dropout(0.3),
    layers.Dense(50, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(50, activation="relu"),
    layers.Dense(1, activation='linear'),
])

model.summary()

### Compile

In [None]:
# Compile model (once: every compile() call replaces the previous
# configuration, so only a single call should be kept).
#
# The target is a continuous variable, so the loss is mean squared error.
# 'mae' is tracked as an extra metric because the training-history plot reads
# history['mae'] and history['val_mae'].
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

### Tune model?

In [None]:
# Grid search?

### Train model

-> Note:
- this will have to be done including and excluding images

In [None]:
# Train the network and keep the per-epoch history for later plotting
epochs_hist = model.fit(
    x=X_train,  # input
    y=y_train,  # output
    batch_size=50,  # observations per training step; each epoch goes through the training rows once, in batches of 50
    epochs=100,  # number of passes over the training data
    shuffle=True,
    validation_data=(X_test, y_test),  # reported every epoch alongside the training values
    verbose=1,
    #validation_split=0.2,
)

### Predict

In [None]:
# Predict the target for the held-out test rows
y_pred = model.predict(x=X_test)

### Evaluate model

In [None]:
# Evaluate the model on the test split (reports the loss and any metrics
# given when the model was compiled)
score = model.evaluate(x=X_test, y=y_test, verbose=1)

In [None]:
# Visualise NN training history

def _plot_history(metric, title, save_as=None):
    """Plot the train and validation curves of one `epochs_hist` entry.

    metric  -- key in `epochs_hist.history`; its 'val_' counterpart is
               plotted as the second line
    title   -- chart title
    save_as -- optional file name; when given, the figure is saved before
               it is shown
    """
    plt.plot(epochs_hist.history[metric])
    plt.plot(epochs_hist.history[f'val_{metric}'])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    if save_as is not None:
        plt.savefig(save_as)
    plt.show()


# Mean absolute error per epoch, training vs. test set
_plot_history('mae', 'MAE')

# Loss per epoch, training vs. test set; this chart is also written to disk
_plot_history('loss', 'model loss', save_as='4.png')

In [None]:
# Interpretation of model performance (y continuous): scatter the predicted
# values against the actual test values. Points on the diagonal line are
# perfect predictions.
test_predictions_ = model.predict(X_test).flatten()
# np.asarray works whether y_test is a pandas Series or already an array
test_labels_ = np.asarray(y_test).flatten()

_, ax = plt.subplots(figsize=(14, 8))
ax.scatter(
    test_labels_,
    test_predictions_,
    alpha=0.6,
    color='#ff7043',
    lw=1,
    ec='black'
)

# Diagonal reference line spanning the full range of both series; the lower
# bound only drops below 0 when the data contains negative values.
lims = [
    min(0, test_predictions_.min(), test_labels_.min()),
    max(test_predictions_.max(), test_labels_.max())
]

ax.plot(lims, lims, lw=1, color='#00acc1')
ax.set_xlabel('actual')
ax.set_ylabel('predicted')
plt.tight_layout()
plt.show()

## Set up pipeline

-> Questions/notes:
- how to integrate NN in the pipeline?

In [None]:
# Example
# Wrap each model together with a scaler, so the scaler is fitted on the
# training data only and re-applied automatically when predicting.
# NOTE: `rf` and `model` are the objects created earlier in the notebook, so
# fitting the pipelines fits those same objects.
# NOTE(review): a raw Keras model is not a scikit-learn estimator; this relies
# only on its fit/predict methods. A dedicated wrapper may be the more robust
# route - TODO confirm against the installed library versions.
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('rf', rf),  # Random Forest regressor
])

nn_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('nn', model),  # Neural Network regressor
])

# Fit Random Forest pipeline
rf_pipeline.fit(X_train, y_train)

# Fit Neural Network pipeline. Keyword arguments prefixed with "<step>__" are
# forwarded to that step's fit(); the values mirror the stand-alone training
# cell (100 epochs, batches of 50).
nn_pipeline.fit(
    X_train,
    y_train,
    nn__epochs=100,
    nn__batch_size=50,
    nn__verbose=0,
)

# Evaluate models. Both are regressors, so they are compared on R^2 (not
# accuracy). The Keras model has no .score() method, so R^2 is computed from
# the predictions for both pipelines. The variable names are kept for
# compatibility with any cell that already reads them.
rf_accuracy = r2_score(y_test, rf_pipeline.predict(X_test))
nn_accuracy = r2_score(y_test, np.ravel(nn_pipeline.predict(X_test)))

print("Random Forest R^2:", rf_accuracy)
print("Neural Network R^2:", nn_accuracy)
