In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd

In [2]:
data = np.load("combined_representations.npz", allow_pickle=True)
X_train_combined = data["X_train"]
X_val_combined = data["X_val"]
test_combined = data["test"]
y_train = data["y_train"]
y_val = data["y_val"]

In [3]:
test_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.26894767,
         0.28410659, -0.05457572],
       [ 0.        ,  0.        ,  0.        , ..., -0.18193519,
         0.56684933,  0.1033013 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.15412894,
         0.41844097, -0.1535055 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.20144454,
         0.56314219, -0.01820332],
       [ 0.        ,  0.        ,  0.11392692, ..., -0.09101021,
         0.50645848,  0.04244453],
       [ 0.        ,  0.        ,  0.        , ..., -0.29217856,
         0.44455622,  0.144129  ]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.12193654,
        -0.0445922 , -0.00576038],
       [ 0.        ,  0.        ,  0.        , ..., -0.1794887 ,
         0.44336581,  0.22292026],
       [ 0.        ,  0.08711945,  0.08898357, ..., -0.03605678,
         0.48175168,  0.20329235],
       ...,
       [ 0.        ,  0.        ,  0. 

In [4]:
X_val_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.28789246,
         0.48497257,  0.15152374],
       [ 0.        ,  0.        ,  0.        , ..., -0.06563767,
         0.40370935,  0.0725085 ],
       [ 0.        ,  0.        ,  0.1388047 , ..., -0.16953171,
         0.3015066 ,  0.14860522],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.10403167,
        -0.09614274,  0.04147864],
       [ 0.        ,  0.        ,  0.        , ..., -0.16573102,
         0.38612467,  0.25377321],
       [ 0.        ,  0.        ,  0.        , ..., -0.121358  ,
         0.53756869,  0.1067788 ]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.36170898,
         0.30741972,  0.08949168],
       [ 0.        ,  0.        ,  0.        , ..., -0.11652934,
         0.34135978,  0.214483  ],
       [ 0.        ,  0.        ,  0.        , ..., -0.20559426,
         0.22731568,  0.1786795 ],
       ...,
       [ 0.        ,  0.        ,  0. 

In [5]:
X_train_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.18555058,
         0.44161261, -0.0627408 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.0345583 ,
         0.04763634, -0.02800948],
       [ 0.        ,  0.        ,  0.        , ..., -0.08426904,
         0.34936657,  0.0997809 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.21587891,
         0.39432905,  0.11549282],
       [ 0.        ,  0.        ,  0.        , ..., -0.23952458,
         0.445471  ,  0.19499586],
       [ 0.        ,  0.        ,  0.        , ..., -0.12781864,
         0.43982258,  0.14028625]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.28434646,
        -0.07843604, -0.0471223 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.0291326 ,
         0.04489934, -0.0054476 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.25649588,
         0.346195  ,  0.22571714],
       ...,
       [ 0.        ,  0.        ,  0. 

In [6]:
y_val.shape

(1562,)

In [7]:
y_train.shape

(4686,)

### Preparing data for the models

In [8]:
# For X_train_combined

train_dict = X_train_combined.item()

description_features_train = train_dict['description']
host_about_features_train = train_dict['host_about']
comments_features_train = train_dict['comments']

X_train_merged = np.concatenate((description_features_train, host_about_features_train, comments_features_train), axis=1)
print("Shape of merged feature matrix:", X_train_merged.shape)

Shape of merged feature matrix: (4686, 1800)


In [9]:
# For X_val_combined

val_dict = X_val_combined.item()

description_features_val = val_dict['description']
host_about_features_val = val_dict['host_about']
comments_features_val = val_dict['comments']

X_val_merged = np.concatenate((description_features_val, host_about_features_val, comments_features_val), axis=1)
print("Shape of merged feature matrix:", X_val_merged.shape)

Shape of merged feature matrix: (1562, 1800)


In [10]:
# For test_combined

test_dict = test_combined.item()

description_features_test = test_dict['description']
host_about_features_test = test_dict['host_about']
comments_features_test = test_dict['comments']

X_test_merged = np.concatenate((description_features_test, host_about_features_test, comments_features_test), axis=1)
print("Shape of merged feature matrix:", X_test_merged.shape)

Shape of merged feature matrix: (695, 1800)


# Models
##### Grid Searches are small because that's not the main point

### Logistic Regression

In [11]:
# Define values for grid search
parameters = {
    'C': [0.005, 0.01, 0.05],  # Regularization
    'penalty': ['l1', 'l2'],  
    'solver': ['liblinear', 'saga'],  # Optimization     
}

best_f1 = 0
best_params = None

# Progress bar
total_combinations = len(parameters['C']) * len(parameters['penalty']) * len(parameters['solver'])
pbar = tqdm(total=total_combinations)

# Iterate over all combinations of parameters
for C, penalty, solver in product(parameters['C'], parameters['penalty'], parameters['solver']):

    pbar.update(1)
    lr = LogisticRegression(C=C, penalty=penalty, solver=solver, random_state=0)

    lr.fit(X_train_merged, y_train)
    f1 = f1_score(y_val, lr.predict(X_val_merged), average='weighted')
    
    # Check if score is the best
    if f1 > best_f1:
        best_f1 = f1
        best_params = {'C': C, 'penalty': penalty, 'solver': solver}

pbar.close()

# Use the best parameter combination found
print("Best parameters found:", best_params)
best_lr = LogisticRegression(**best_params, random_state=0)
best_lr.fit(X_train_merged, y_train)

# Predictions
lr_pred = best_lr.predict(X_val_merged)

print("F1 score on validation set with best parameters:", best_lr.score(X_val_merged, y_val))
print(classification_report(y_val, lr_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, lr_pred))

f1 = f1_score(y_val, lr_pred, average='weighted')
print("F1 score on validation set:", f1)

100%|██████████| 12/12 [00:29<00:00,  2.49s/it]


Best parameters found: {'C': 0.005, 'penalty': 'l1', 'solver': 'liblinear'}
F1 score on validation set with best parameters: 0.7266325224071702
              precision    recall  f1-score   support

           0       0.73      1.00      0.84      1135
           1       0.00      0.00      0.00       427

    accuracy                           0.73      1562
   macro avg       0.36      0.50      0.42      1562
weighted avg       0.53      0.73      0.61      1562

Confusion Matrix:
[[1135    0]
 [ 427    0]]
F1 score on validation set: 0.6115891085889048


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Best F1: 0.8594296203352352 ; for: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

In [None]:
# New one: F1 score on validation set: 0.6115891085889048

### K Nearest Neighbors

In [13]:
# Values for grid search
parameters = {
    'n_neighbors': [3, 5, 7],     # nº of neighbors
    'weights': ['uniform', 'distance'],     # Weight function used 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],     # Algorithm used 
}

best_f1 = 0
best_params = None

# Progress bar
total_combinations = len(parameters['n_neighbors']) * len(parameters['weights']) * len(parameters['algorithm'])
pbar = tqdm(total=total_combinations)

# Iterate over all combinations of parameters
for n_neighbors, weights, algorithm in product(parameters['n_neighbors'], parameters['weights'], parameters['algorithm']):

    pbar.update(1)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
    
    knn.fit(X_train_merged, y_train)
    knn_pred = knn.predict(X_val_merged)
    
    f1 = f1_score(y_val, knn_pred, average='weighted')
    
    # Check if score is the best
    if f1 > best_f1:
        best_f1 = f1
        best_params = {'n_neighbors': n_neighbors, 'weights': weights, 'algorithm': algorithm}

pbar.close()

# Use the best model found
print("Best parameters found:", best_params)
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train_merged, y_train)

# Predictions
knn_pred = best_knn.predict(X_val_merged)

print("F1 score on validation set with best parameters:", best_knn.score(X_val_merged, y_val))
print(classification_report(y_val, knn_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, knn_pred))

f1 = f1_score(y_val, knn_pred, average='weighted')
print("F1 score on validation set:", f1)

100%|██████████| 24/24 [08:12<00:00, 20.53s/it]


Best parameters found: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto'}
F1 score on validation set with best parameters: 0.7125480153649167
              precision    recall  f1-score   support

           0       0.77      0.87      0.81      1135
           1       0.46      0.29      0.36       427

    accuracy                           0.71      1562
   macro avg       0.61      0.58      0.59      1562
weighted avg       0.68      0.71      0.69      1562

Confusion Matrix:
[[989 146]
 [303 124]]
F1 score on validation set: 0.6894710596358977


In [14]:
# Best F1: 0.7863375246559777 ; for: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree'}

In [None]:
# New one: F1 score on validation set: 0.6894710596358977

### MLP

In [15]:
# Parameters for grid search
parameters = {
    'hidden_layer_sizes': [(2, 2), (5, 5), (2, 2, 2)],    # hidden layers
    'activation': ['logistic', 'relu', 'tanh'],    # Activation function
    'solver': ['lbfgs', 'sgd'],     # Optimization algorithm
}

best_f1 = 0
best_params = None

# Progress bar
total_combinations = len(parameters['hidden_layer_sizes']) * len(parameters['activation']) * len(parameters['solver'])
pbar = tqdm(total=total_combinations)

# Perform grid search
for hidden_layer_sizes, activation, solver in product(parameters['hidden_layer_sizes'], parameters['activation'], parameters['solver']):

    pbar.update(1)
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, random_state=0)

    mlp.fit(X_train_merged, y_train)
    mlp_pred = mlp.predict(X_val_merged)

    f1 = f1_score(y_val, mlp_pred, average='weighted')
    
    # Check if best score
    if f1 > best_f1:
        best_f1 = f1
        best_params = {'hidden_layer_sizes': hidden_layer_sizes, 'activation': activation, 'solver': solver}

pbar.close()

# Use the best model found by manual search
print("Best parameters found:", best_params)
best_mlp = MLPClassifier(**best_params, random_state=0)
best_mlp.fit(X_train_merged, y_train)

# Predictions
mlp_pred = best_mlp.predict(X_val_merged)

print("F1 score on validation set with best parameters:", best_mlp.score(X_val_merged, y_val))
print(classification_report(y_val, mlp_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, mlp_pred))

f1 = f1_score(y_val, mlp_pred, average='weighted')
print("F1 score on validation set:", f1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best parameters found: {'hidden_layer_sizes': (2, 2, 2), 'activation': 'relu', 'solver': 'lbfgs'}
F1 score on validation set with best parameters: 0.6754161331626121
              precision    recall  f1-score   support

           0       0.75      0.84      0.79      1135
           1       0.36      0.24      0.29       427

    accuracy                           0.68      1562
   macro avg       0.55      0.54      0.54      1562
weighted avg       0.64      0.68      0.65      1562

Confusion Matrix:
[[953 182]
 [325 102]]
F1 score on validation set: 0.6523929374820174


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [16]:
# Best F1: 0.8607046991130397 ; for: {'hidden_layer_sizes': (2, 2), 'activation': 'tanh', 'solver': 'sgd'}

In [None]:
# New one: F1 score on validation set: 0.6523929374820174

# Extra Models

### Random Forest

In [17]:
# Values for grid search
parameters_rf = {
    'n_estimators': [100, 200],     # nº of trees
    'max_depth': [None, 10, 20],      # max depth
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2]     
}

best_f1_rf = 0
best_params_rf = None

# Progression bar
total_combinations_rf = len(parameters_rf['n_estimators']) * len(parameters_rf['max_depth']) * len(parameters_rf['min_samples_split']) * len(parameters_rf['min_samples_leaf'])
pbar_rf = tqdm(total=total_combinations_rf)

# Perform grid search
for n_estimators, max_depth, min_samples_split, min_samples_leaf in product(parameters_rf['n_estimators'], parameters_rf['max_depth'], parameters_rf['min_samples_split'], parameters_rf['min_samples_leaf']):

    pbar_rf.update(1)

    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=0)
    rf.fit(X_train_merged, y_train)
    
    # Make predictions
    rf_pred = rf.predict(X_val_merged)
    f1_rf = f1_score(y_val, rf_pred, average='weighted')
    
    # Check if it's the best score
    if f1_rf > best_f1_rf:
        best_f1_rf = f1_rf
        best_params_rf = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

pbar_rf.close()

# Use the best model found by manual search for Random Forest
print("Best parameters found for Random Forest:", best_params_rf)
best_rf = RandomForestClassifier(**best_params_rf, random_state=0)
best_rf.fit(X_train_merged, y_train)

# Predictions
rf_pred = best_rf.predict(X_val_merged)

print("F1 score on validation set with best parameters for Random Forest:", best_rf.score(X_val_merged, y_val))
print(classification_report(y_val, rf_pred))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_val, rf_pred))

f1_rf = f1_score(y_val, rf_pred, average='weighted')
print("F1 score on validation set for Random Forest:", f1_rf)

100%|██████████| 36/36 [17:10<00:00, 28.63s/it]


Best parameters found for Random Forest: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1}
F1 score on validation set with best parameters for Random Forest: 0.7458386683738797
              precision    recall  f1-score   support

           0       0.74      0.99      0.85      1135
           1       0.81      0.09      0.16       427

    accuracy                           0.75      1562
   macro avg       0.78      0.54      0.51      1562
weighted avg       0.76      0.75      0.66      1562

Confusion Matrix for Random Forest:
[[1126    9]
 [ 388   39]]
F1 score on validation set for Random Forest: 0.6626234679812726


In [18]:
# Best F1: 0.8804259327087324 ; for: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1}

In [None]:
# New one: F1 score on validation set for Random Forest: 0.6626234679812726

### SVM

In [19]:
parameters_svm = {
    'C': [0.1, 1, 10],      # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf'],     # Kernel type
    'gamma': ['scale', 'auto']      # coefficient
}

best_f1_svm = 0
best_params_svm = None

# progression bar
total_combinations_svm = len(parameters_svm['C']) * len(parameters_svm['kernel']) * len(parameters_svm['gamma'])
pbar_svm = tqdm(total=total_combinations_svm)

# Doing grid search
for C, kernel, gamma in product(parameters_svm['C'], parameters_svm['kernel'], parameters_svm['gamma']):

    pbar_svm.update(1)
    
    svm = SVC(C=C, kernel=kernel, gamma=gamma, random_state=0)
    svm.fit(X_train_merged, y_train)
    
    svm_pred = svm.predict(X_val_merged)
    f1_svm = f1_score(y_val, svm_pred, average='weighted')
    
    if f1_svm > best_f1_svm:
        best_f1_svm = f1_svm
        best_params_svm = {'C': C, 'kernel': kernel, 'gamma': gamma}

pbar_svm.close()

# Use the best model found by manual search for SVM
best_svm = SVC(**best_params_svm, random_state=0)
best_svm.fit(X_train_merged, y_train)

# Predictions
svm_pred = best_svm.predict(X_val_merged)

print("Best parameters found for SVM:", best_params_svm)
print("F1 score on validation set with best parameters for SVM:", best_svm.score(X_val_merged, y_val))
print(classification_report(y_val, svm_pred))
print("Confusion Matrix for SVM:")
print(confusion_matrix(y_val, svm_pred))

f1_svm = f1_score(y_val, svm_pred, average='weighted')
print("F1 score on validation set for SVM:", f1_svm)

100%|██████████| 18/18 [09:24<00:00, 31.36s/it]


Best parameters found for SVM: {'C': 10, 'kernel': 'rbf', 'gamma': 'scale'}
F1 score on validation set with best parameters for SVM: 0.7183098591549296
              precision    recall  f1-score   support

           0       0.75      0.93      0.83      1135
           1       0.46      0.16      0.24       427

    accuracy                           0.72      1562
   macro avg       0.60      0.54      0.53      1562
weighted avg       0.67      0.72      0.67      1562

Confusion Matrix for SVM:
[[1053   82]
 [ 358   69]]
F1 score on validation set for SVM: 0.6663234776328816


In [20]:
# Best F1: 0.850832266325224 ; for: {'C': 1, 'kernel': 'rbf', 'gamma': 'scale'}

In [None]:
# New one: F1 score on validation set for SVM: 0.6663234776328816

## Predictions on Test
##### Using model with the best score in validation -> Random Forest

In [22]:
# Decided to train on the entire dataset for the predictions on test set
X_train_combined = pd.concat([X_train_merged, X_val_merged])
y_train_combined = pd.concat([y_train, y_val])

# Use model with best score from validation
best_rf.fit(X_train_combined, y_train_combined)
test_predictions = best_rf.predict(test_combined)

# Creating the output file
predictions_df = pd.DataFrame({'id': test_combined['id'], 'predicted': test_predictions})
predictions_df.to_csv('Predictions_XX.csv', index=False)

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid