## Implementation of a Neural Network

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from joblib import load


# Load the pre-computed feature matrices and the training labels.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.

# Training labels. The same target vector is paired with both feature
# representations below, so it only needs to be read from disk once.
target = load('../pkl_files/target.pkl')

# TF-IDF feature matrices (training and test)
model_train_tfidf = load('../pkl_files/model_train_tfidf.pkl')
model_test_tfidf = load('../pkl_files/model_test_tfidf.pkl')

# Count Vectorizer feature matrices (training and test)
model_train_count = load('../pkl_files/model_train_count.pkl')
model_test_count = load('../pkl_files/model_test_count.pkl')

In [8]:
# Hold out 20% of the labelled TF-IDF rows as a validation set (fixed seed for a repeatable split)
(
    X_validation_train_tfidf,
    X_validation_test_tfidf,
    y_validation_train_tfidf,
    y_validation_test_tfidf,
) = train_test_split(
    model_train_tfidf,
    target,
    test_size=0.2,
    random_state=42,
)

Perform a cross-validated grid search to determine the best hyperparameters (architecture, L2 penalty, initial learning rate) for the neural network.

In [None]:
# Base estimator for the TF-IDF grid search.
# hidden_layer_sizes, alpha and learning_rate_init are only starting values here:
# every candidate in param_grid below overrides them.
# The SGD-only options (learning_rate='adaptive', momentum, nesterovs_momentum) were
# removed because scikit-learn ignores them when solver='adam'.
mlp_tfidf_cv = MLPClassifier(
    hidden_layer_sizes=(64, 32),   # Two hidden layers, with 64 and 32 neurons respectively
    solver='adam',                 # adam optimizer
    alpha=0.0001,                  # L2 penalty strength (scikit-learn default)
    max_iter=50,                   # Maximum number of iterations (epochs)
    random_state=42,               # Random seed for reproducibility
    batch_size=32,                 # Minibatch size
    learning_rate_init=0.01,       # Initial learning rate
    early_stopping=True,           # Stop when the internal validation score stops improving
    warm_start=False,              # Each fit() starts from freshly initialised weights
    tol=1e-4,                      # Tolerance used by the stopping criterion
    verbose=True,                  # Print loss / validation score per iteration
)

# 3 architectures x 2 alphas x 2 learning rates = 12 candidates
param_grid = {
    'hidden_layer_sizes': [(64, 32), (128,), (64, 64)],  # Different architectures
    'alpha': [0.0001, 0.001],                            # Regularization
    'learning_rate_init': [0.001, 0.01],                 # Learning rates
}

grid_search = GridSearchCV(
    estimator=mlp_tfidf_cv,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Run the grid search on the training portion of the split only
grid_search.fit(X_validation_train_tfidf, y_validation_train_tfidf)

# Output the best parameters and F1 score found
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated F1 score: {grid_search.best_score_:.4f}")

# No extra fit is needed here: with the default refit=True, GridSearchCV has already
# refit the best configuration on all of X_validation_train_tfidf.
best_model_tfidf = grid_search.best_estimator_

# Predict on the 20% hold-out validation split (not the real test set)
y_val_pred_tfidf = best_model_tfidf.predict(X_validation_test_tfidf)

# Weighted F1 on the hold-out split; the printed label calls it the "test set"
f1score = f1_score(y_validation_test_tfidf, y_val_pred_tfidf, average='weighted')
print(f"F1 on test set: {f1score:.4f}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Iteration 1, loss = 0.54861118
Validation score: 0.731527
Iteration 1, loss = 0.56342031
Validation score: 0.758621
Iteration 1, loss = 0.64111006
Iteration 1, loss = 0.56514255
Validation score: 0.714286
Validation score: 0.788177
Iteration 1, loss = 0.63394168
Validation score: 0.795567
Iteration 1, loss = 0.64416335
Validation score: 0.751232
Iteration 2, loss = 0.23879959
Validation score: 0.682266
Iteration 2, loss = 0.26917050
Validation score: 0.657635
Iteration 2, loss = 0.26044986
Validation score: 0.642857
Iteration 2, loss = 0.44511697
Validation score: 0.783251
Iteration 2, loss = 0.45080041
Validation score: 0.736453
Iteration 2, loss = 0.45547688
Validation score: 0.780788
Iteration 1, loss = 0.61818959
Validation score: 0.743842
Iteration 1, loss = 0.62126417
Validation score: 0.780788
Iteration 3, loss = 0.07651488
Validation score: 0.756158
Iteration 3, loss = 0.10282978
Validation score: 0.674877
Iteration 3

Using the best model found by the grid search, retrain it on the entire TF-IDF training dataset and predict the target values for the test set.

In [None]:
# Refit the selected TF-IDF model on every labelled row, then predict the test matrix
best_model_tfidf.fit(model_train_tfidf, target)
y_nn_pred_tfidf = best_model_tfidf.predict(model_test_tfidf)

# Start from the sample submission and fill its 'target' column with the predictions
neural_network_tfidf_submission = pd.read_csv("../Data/sample_submission.csv").assign(
    target=y_nn_pred_tfidf
)

# Write the submission file without the DataFrame index
neural_network_tfidf_submission.to_csv(
    '../csv_files/neural_network_tfidf_submission.csv',
    index=False,
)

Iteration 1, loss = 0.57174146
Iteration 2, loss = 0.38137917
Iteration 3, loss = 0.24825102
Iteration 4, loss = 0.15916433
Iteration 5, loss = 0.10640699
Iteration 6, loss = 0.07546437
Iteration 7, loss = 0.05728817
Iteration 8, loss = 0.04456114
Iteration 9, loss = 0.03965334
Iteration 10, loss = 0.03453431
Iteration 11, loss = 0.03189475
Iteration 12, loss = 0.02924954
Iteration 13, loss = 0.02737406
Iteration 14, loss = 0.02649217
Iteration 15, loss = 0.02566075
Iteration 16, loss = 0.02461917
Iteration 17, loss = 0.02476661
Iteration 18, loss = 0.02333141
Iteration 19, loss = 0.02363084
Iteration 20, loss = 0.02320705
Iteration 21, loss = 0.02150139
Iteration 22, loss = 0.02331961
Iteration 23, loss = 0.02391142
Iteration 24, loss = 0.02367958
Iteration 25, loss = 0.02197073
Iteration 26, loss = 0.02293009
Iteration 27, loss = 0.02401903
Iteration 28, loss = 0.02185660
Iteration 29, loss = 0.01918794
Iteration 30, loss = 0.02092213
Iteration 31, loss = 0.02068775
Iteration 32, los

## Fit count vectorized data and make predictions

In [19]:
# Hold out 20% of the labelled count-vectorized rows as a validation set (fixed seed for a repeatable split)
(
    X_validation_train_count,
    X_validation_test_count,
    y_validation_train_count,
    y_validation_test_count,
) = train_test_split(
    model_train_count,
    target,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Base estimator for the count-vectorizer grid search.
# hidden_layer_sizes, alpha and learning_rate_init are only starting values here:
# every candidate in param_grid below overrides them.
# The SGD-only options (learning_rate='adaptive', momentum, nesterovs_momentum) were
# removed because scikit-learn ignores them when solver='adam'.
mlp_count_cv = MLPClassifier(
    hidden_layer_sizes=(64, 32),   # Two hidden layers, with 64 and 32 neurons respectively
    solver='adam',                 # adam optimizer
    alpha=0.0001,                  # L2 penalty strength (scikit-learn default)
    max_iter=50,                   # Maximum number of iterations (epochs)
    random_state=42,               # Random seed for reproducibility
    batch_size=32,                 # Minibatch size
    learning_rate_init=0.01,       # Initial learning rate
    early_stopping=True,           # Stop when the internal validation score stops improving
    warm_start=False,              # Each fit() starts from freshly initialised weights
    tol=1e-4,                      # Tolerance used by the stopping criterion
    verbose=True,                  # Print loss / validation score per iteration
)

# 3 architectures x 2 alphas x 2 learning rates = 12 candidates
param_grid = {
    'hidden_layer_sizes': [(64, 32), (128,), (64, 64)],  # Different architectures
    'alpha': [0.0001, 0.001],                            # Regularization
    'learning_rate_init': [0.001, 0.01],                 # Learning rates
}

grid_search = GridSearchCV(
    estimator=mlp_count_cv,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Run the grid search on the training portion of the split only
grid_search.fit(X_validation_train_count, y_validation_train_count)

# Output the best parameters and F1 score found
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated F1 score: {grid_search.best_score_:.4f}")

# No extra fit is needed here: with the default refit=True, GridSearchCV has already
# refit the best configuration on all of X_validation_train_count.
best_model_count = grid_search.best_estimator_

# Predict on the 20% hold-out validation split (not the real test set)
y_val_pred_count = best_model_count.predict(X_validation_test_count)

# Weighted F1 on the hold-out split; the printed label calls it the "test set"
f1score = f1_score(y_validation_test_count, y_val_pred_count, average='weighted')
print(f"F1 on test set: {f1score:.4f}")


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Iteration 1, loss = 0.57267931
Validation score: 0.761084
Iteration 1, loss = 0.51534161
Validation score: 0.778325
Iteration 1, loss = 0.58002721
Validation score: 0.738916
Iteration 1, loss = 0.53008816
Validation score: 0.798030
Iteration 1, loss = 0.58609825
Validation score: 0.802956
Iteration 1, loss = 0.53270679
Validation score: 0.743842
Iteration 2, loss = 0.17741808
Validation score: 0.721675
Iteration 2, loss = 0.28981509
Validation score: 0.743842
Iteration 2, loss = 0.30544289
Validation score: 0.726601
Iteration 2, loss = 0.19581190
Validation score: 0.756158
Iteration 2, loss = 0.31072609
Validation score: 0.773399
Iteration 2, loss = 0.22273996
Validation score: 0.726601
Iteration 1, loss = 0.55553821
Validation score: 0.768473
Iteration 1, loss = 0.56059670
Validation score: 0.788177
Iteration 3, loss = 0.07000510
Validation score: 0.741379
Iteration 3, loss = 0.08937903
Validation score: 0.773399
Iteration 3

Now, retrain the neural network with tuned hyperparameters on the entire count vectorized training set and make predictions.

In [21]:
# Refit the tuned estimator chosen by the grid search (best_model_count) on the entire
# count-vectorized training set. The base estimator mlp_count_cv must not be used here:
# it still carries its constructor hyperparameters rather than the tuned ones.
best_model_count.fit(model_train_count, target)

# Make predictions on the test set
y_nn_pred_count = best_model_count.predict(model_test_count)

neural_network_count_submission = pd.read_csv("../Data/sample_submission.csv")

# Replace the empty target column with the predicted values
neural_network_count_submission['target'] = y_nn_pred_count

# Save results for submission
neural_network_count_submission.to_csv('../csv_files/neural_network_count_submission.csv', index=False)

Iteration 1, loss = 0.49552018
Validation score: 0.799213
Iteration 2, loss = 0.24620945
Validation score: 0.783465
Iteration 3, loss = 0.12072545
Validation score: 0.762467
Iteration 4, loss = 0.07628822
Validation score: 0.748031
Iteration 5, loss = 0.05194650
Validation score: 0.755906
Iteration 6, loss = 0.04348739
Validation score: 0.754593
Iteration 7, loss = 0.05102178
Validation score: 0.733596
Iteration 8, loss = 0.05303021
Validation score: 0.754593
Iteration 9, loss = 0.05621832
Validation score: 0.753281
Iteration 10, loss = 0.06291985
Validation score: 0.740157
Iteration 11, loss = 0.05746509
Validation score: 0.740157
Iteration 12, loss = 0.05513306
Validation score: 0.757218
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
