In [18]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
from itertools import product
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [4]:
# Load the pre-computed feature containers and label vectors from the .npz archive.
# NOTE(security): allow_pickle=True lets numpy un-pickle the object arrays stored in the
# archive, and un-pickling can execute arbitrary code -- only load archives from a trusted source.
# The context manager closes the underlying file handle once every array has been read.
with np.load("combined_representations.npz", allow_pickle=True) as data:
    X_train_combined = data["X_train"]
    X_val_combined = data["X_val"]
    test_combined = data["test"]
    y_train = data["y_train"]
    y_val = data["y_val"]

In [5]:
# Inspect the loaded test container; it displays as an object array wrapping a dict
# keyed by text field ('description', 'host_about', ...), each holding a 2-D feature matrix.
test_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.08468467,
         0.16710296,  0.057366  ],
       [ 0.        ,  0.        ,  0.        , ..., -0.01306781,
         0.13182336,  0.0041255 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.13805758,
         0.24629688, -0.0862156 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.13059046,
         0.33800852,  0.01264326],
       [ 0.        ,  0.        ,  0.        , ..., -0.04177414,
         0.33193919,  0.06621849],
       [ 0.        ,  0.        ,  0.        , ..., -0.08864344,
         0.25635894,  0.10819403]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.04349438,
        -0.01102248, -0.00268946],
       [ 0.        ,  0.        ,  0.        , ..., -0.07374551,
         0.27137006,  0.1386768 ],
       [ 0.        ,  0.08407949,  0.08587857, ..., -0.03365872,
         0.21961312,  0.08438464],
       ...,
       [ 0.        ,  0.        ,  0. 

In [6]:
# Inspect the loaded validation container; it displays as an object array wrapping a dict
# keyed by text field ('description', 'host_about', ...), each holding a 2-D feature matrix.
X_val_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.17752816,
         0.33020436,  0.11092447],
       [ 0.        ,  0.        ,  0.        , ..., -0.01434965,
         0.39485991,  0.04780014],
       [ 0.        ,  0.        ,  0.        , ..., -0.07215451,
         0.19116122,  0.10603562],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.12204272,
         0.00810044,  0.0257803 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.15537582,
         0.30471598,  0.19891339],
       [ 0.        ,  0.        ,  0.        , ..., -0.01584726,
         0.248952  ,  0.10495016]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.13266554,
         0.21048408,  0.06905448],
       [ 0.        ,  0.        ,  0.        , ..., -0.07712197,
         0.24841082,  0.07409752],
       [ 0.        ,  0.        ,  0.        , ..., -0.19376324,
         0.16052756,  0.18207438],
       ...,
       [ 0.        ,  0.        ,  0. 

In [7]:
# Inspect the loaded training container; it displays as an object array wrapping a dict
# keyed by text field ('description', 'host_about', ...), each holding a 2-D feature matrix.
X_train_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.1268446 ,
         0.19595661, -0.0083481 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.04427454,
        -0.0050476 , -0.03812569],
       [ 0.        ,  0.        ,  0.        , ..., -0.0455008 ,
         0.21400927,  0.0660574 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.03801016,
         0.34924866,  0.05680282],
       [ 0.        ,  0.        ,  0.        , ..., -0.13847188,
         0.24170396,  0.12646742],
       [ 0.        ,  0.        ,  0.        , ..., -0.01582526,
         0.24599232,  0.05336657]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.06807236,
        -0.0216408 ,  0.01123832],
       [ 0.        ,  0.        ,  0.        , ...,  0.018232  ,
         0.03550534, -0.0068904 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.17294988,
         0.2813098 ,  0.13600118],
       ...,
       [ 0.        ,  0.        ,  0. 

In [8]:
# Shape of the validation label vector (one entry per validation sample)
y_val.shape

(1562,)

In [9]:
# Shape of the training label vector (one entry per training sample)
y_train.shape

(4686,)

### Preparing data for the models

In [10]:
# Training split: unwrap the object array into its dict of per-field feature matrices
train_dict = X_train_combined.item()

# Pull out the feature block of each text field by key
description_features_train, host_about_features_train, comments_features_train = (
    train_dict[field] for field in ('description', 'host_about', 'comments')
)

# Join the three blocks column-wise (axis=1) into a single feature matrix
train_blocks = [
    description_features_train,
    host_about_features_train,
    comments_features_train,
]
X_train_merged = np.concatenate(train_blocks, axis=1)
print("Shape of merged feature matrix:", X_train_merged.shape)

Shape of merged feature matrix: (4686, 1800)


In [11]:
# Validation split: unwrap the object array into its dict of per-field feature matrices
val_dict = X_val_combined.item()

# Pull out the feature block of each text field by key
description_features_val, host_about_features_val, comments_features_val = (
    val_dict[field] for field in ('description', 'host_about', 'comments')
)

# Join the three blocks column-wise (axis=1) into a single feature matrix
val_blocks = [
    description_features_val,
    host_about_features_val,
    comments_features_val,
]
X_val_merged = np.concatenate(val_blocks, axis=1)
print("Shape of merged feature matrix:", X_val_merged.shape)

Shape of merged feature matrix: (1562, 1800)


In [12]:
# Test split: unwrap the object array into its dict of per-field feature matrices
test_dict = test_combined.item()

# Pull out the feature block of each text field by key
description_features_test, host_about_features_test, comments_features_test = (
    test_dict[field] for field in ('description', 'host_about', 'comments')
)

# Join the three blocks column-wise (axis=1) into a single feature matrix
test_blocks = [
    description_features_test,
    host_about_features_test,
    comments_features_test,
]
X_test_merged = np.concatenate(test_blocks, axis=1)
print("Shape of merged feature matrix:", X_test_merged.shape)

Shape of merged feature matrix: (695, 1800)


# Models
##### The grid searches are deliberately small, because hyperparameter tuning is not the main focus of this notebook

### Logistic Regression

In [13]:
# Manual grid search for Logistic Regression.
# Every parameter combination is fitted on the training split and ranked by its
# weighted F1 on the validation split; the best fitted model is kept as `best_lr`.
parameters = {
    'C': [0.005, 0.01, 0.05],  # Regularization
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],  # Optimization
}

# Start below any attainable F1 so the first combination is always recorded.
# With 0 as the floor, a grid whose scores are all 0 would leave best_params as None
# and the code below would fail.
best_f1 = float("-inf")
best_params = None
best_lr = None

total_combinations = len(parameters['C']) * len(parameters['penalty']) * len(parameters['solver'])

# The context manager closes the progress bar even if a fit raises.
with tqdm(total=total_combinations) as pbar:
    # Iterate over all combinations of parameters
    for C, penalty, solver in product(parameters['C'], parameters['penalty'], parameters['solver']):

        lr = LogisticRegression(C=C, penalty=penalty, solver=solver, random_state=0)
        lr.fit(X_train_merged, y_train)
        f1 = f1_score(y_val, lr.predict(X_val_merged), average='weighted')

        # Check if score is the best
        if f1 > best_f1:
            best_f1 = f1
            best_params = {'C': C, 'penalty': penalty, 'solver': solver}
            # Keep the fitted estimator so the winner does not have to be trained a second time
            best_lr = lr

        # Advance only once the combination has actually been evaluated
        pbar.update(1)

print("Best parameters found:", best_params)

# Predictions of the selected model
lr_pred = best_lr.predict(X_val_merged)

# .score() on a classifier is mean accuracy, not F1 -- label it accordingly;
# the weighted F1 is reported on the last line.
print("Accuracy on validation set with best parameters:", best_lr.score(X_val_merged, y_val))
print(classification_report(y_val, lr_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, lr_pred))

f1 = f1_score(y_val, lr_pred, average='weighted')
print("F1 score on validation set:", f1)

100%|██████████| 12/12 [00:26<00:00,  2.22s/it]


Best parameters found: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
F1 score on validation set with best parameters: 0.8553137003841229
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      1135
           1       0.69      0.85      0.76       427

    accuracy                           0.86      1562
   macro avg       0.81      0.85      0.83      1562
weighted avg       0.87      0.86      0.86      1562

Confusion Matrix:
[[974 161]
 [ 65 362]]
F1 score on validation set: 0.8594296203352352


In [14]:
# Best F1: 0.8594296203352352 ; for: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

### K Nearest Neighbors

In [15]:
# Manual grid search for k-Nearest Neighbors.
# Every parameter combination is fitted on the training split and ranked by its
# weighted F1 on the validation split; the best fitted model is kept as `best_knn`.
parameters = {
    'n_neighbors': [3, 5, 7],     # number of neighbors
    'weights': ['uniform', 'distance'],     # Weight function used
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],     # Algorithm used
}

# Start below any attainable F1 so the first combination is always recorded.
# With 0 as the floor, a grid whose scores are all 0 would leave best_params as None
# and the code below would fail.
best_f1 = float("-inf")
best_params = None
best_knn = None

total_combinations = len(parameters['n_neighbors']) * len(parameters['weights']) * len(parameters['algorithm'])

# The context manager closes the progress bar even if a fit raises.
with tqdm(total=total_combinations) as pbar:
    # Iterate over all combinations of parameters
    for n_neighbors, weights, algorithm in product(parameters['n_neighbors'], parameters['weights'], parameters['algorithm']):

        knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
        knn.fit(X_train_merged, y_train)
        knn_pred = knn.predict(X_val_merged)

        f1 = f1_score(y_val, knn_pred, average='weighted')

        # Check if score is the best
        if f1 > best_f1:
            best_f1 = f1
            best_params = {'n_neighbors': n_neighbors, 'weights': weights, 'algorithm': algorithm}
            # Keep the fitted estimator so the winner does not have to be trained a second time
            best_knn = knn

        # Advance only once the combination has actually been evaluated
        pbar.update(1)

print("Best parameters found:", best_params)

# Predictions of the selected model
knn_pred = best_knn.predict(X_val_merged)

# .score() on a classifier is mean accuracy, not F1 -- label it accordingly;
# the weighted F1 is reported on the last line.
print("Accuracy on validation set with best parameters:", best_knn.score(X_val_merged, y_val))
print(classification_report(y_val, knn_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, knn_pred))

f1 = f1_score(y_val, knn_pred, average='weighted')
print("F1 score on validation set:", f1)

100%|██████████| 24/24 [07:03<00:00, 17.63s/it]


Best parameters found: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree'}
F1 score on validation set with best parameters: 0.7797695262483995
              precision    recall  f1-score   support

           0       0.88      0.80      0.84      1135
           1       0.58      0.72      0.64       427

    accuracy                           0.78      1562
   macro avg       0.73      0.76      0.74      1562
weighted avg       0.80      0.78      0.79      1562

Confusion Matrix:
[[912 223]
 [121 306]]
F1 score on validation set: 0.7863375246559777


In [16]:
# Best F1: 0.7863375246559777 ; for: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree'}

### MLP

In [24]:
# Manual grid search for a Multi-Layer Perceptron.
# Every parameter combination is fitted on the training split and ranked by its
# weighted F1 on the validation split; the best fitted model is kept as `best_mlp`.
# NOTE(review): the recorded output shows lbfgs stopping at the iteration limit for some
# combinations; consider raising max_iter or scaling the features -- TODO confirm.
parameters = {
    'hidden_layer_sizes': [(2, 2), (5, 5), (2, 2, 2)],    # hidden layers
    'activation': ['logistic', 'relu', 'tanh'],    # Activation function
    'solver': ['lbfgs', 'sgd'],     # Optimization algorithm
}

# Start below any attainable F1 so the first combination is always recorded.
# With 0 as the floor, a grid whose scores are all 0 would leave best_params as None
# and the code below would fail.
best_f1 = float("-inf")
best_params = None
best_mlp = None

total_combinations = len(parameters['hidden_layer_sizes']) * len(parameters['activation']) * len(parameters['solver'])

# The context manager closes the progress bar even if a fit raises.
with tqdm(total=total_combinations) as pbar:
    # Perform grid search
    for hidden_layer_sizes, activation, solver in product(parameters['hidden_layer_sizes'], parameters['activation'], parameters['solver']):

        mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, random_state=0)
        mlp.fit(X_train_merged, y_train)
        mlp_pred = mlp.predict(X_val_merged)

        f1 = f1_score(y_val, mlp_pred, average='weighted')

        # Check if best score
        if f1 > best_f1:
            best_f1 = f1
            best_params = {'hidden_layer_sizes': hidden_layer_sizes, 'activation': activation, 'solver': solver}
            # Keep the fitted estimator so the winner does not have to be trained a second time
            best_mlp = mlp

        # Advance only once the combination has actually been evaluated
        pbar.update(1)

print("Best parameters found:", best_params)

# Predictions of the selected model
mlp_pred = best_mlp.predict(X_val_merged)

# .score() on a classifier is mean accuracy, not F1 -- label it accordingly;
# the weighted F1 is reported on the last line.
print("Accuracy on validation set with best parameters:", best_mlp.score(X_val_merged, y_val))
print(classification_report(y_val, mlp_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, mlp_pred))

f1 = f1_score(y_val, mlp_pred, average='weighted')
print("F1 score on validation set:", f1)


 17%|█▋        | 5/30 [00:43<03:36,  8.67s/it]

 11%|█         | 2/18 [00:02<00:23,  1.48s/it][A
 17%|█▋        | 3/18 [00:07<00:41,  2.74s/it][A
 22%|██▏       | 4/18 [00:13<00:54,  3.89s/it][A
 28%|██▊       | 5/18 [00:20<01:05,  5.06s/it][A

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)

 44%|████▍     | 8/18 [00:52<01:30,  9.01s/it][A
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


 61%|██████    | 11/18 [01:29<01:30, 12.96s/it][A

 72%|███████▏  | 13/18 [01:55<01:09, 13.87s/it][A
 78%|███████▊  | 14/18 [02:01<00:45, 11.32s/it][A
STOP: TOTAL NO. of ITERATIO

Best parameters found: {'hidden_layer_sizes': (2, 2), 'activation': 'tanh', 'solver': 'sgd'}
F1 score on validation set with best parameters: 0.8572343149807938
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1135
           1       0.70      0.83      0.76       427

    accuracy                           0.86      1562
   macro avg       0.82      0.85      0.83      1562
weighted avg       0.87      0.86      0.86      1562

Confusion Matrix:
[[984 151]
 [ 72 355]]
F1 score on validation set: 0.8607046991130397




In [None]:
# Best F1: 0.8607046991130397 ; for: {'hidden_layer_sizes': (2, 2), 'activation': 'tanh', 'solver': 'sgd'}

# Extra Models

### Random Forest

In [25]:
# Manual grid search for a Random Forest.
# Every parameter combination is fitted on the training split and ranked by its
# weighted F1 on the validation split; the best fitted model is kept as `best_rf`.
parameters_rf = {
    'n_estimators': [100, 200],     # number of trees
    'max_depth': [None, 10, 20],      # max depth
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2]
}

# Start below any attainable F1 so the first combination is always recorded.
# With 0 as the floor, a grid whose scores are all 0 would leave best_params_rf as None
# and the code below would fail.
best_f1_rf = float("-inf")
best_params_rf = None
best_rf = None

total_combinations_rf = len(parameters_rf['n_estimators']) * len(parameters_rf['max_depth']) * len(parameters_rf['min_samples_split']) * len(parameters_rf['min_samples_leaf'])

# The context manager closes the progress bar even if a fit raises.
with tqdm(total=total_combinations_rf) as pbar_rf:
    # Perform grid search
    for n_estimators, max_depth, min_samples_split, min_samples_leaf in product(parameters_rf['n_estimators'], parameters_rf['max_depth'], parameters_rf['min_samples_split'], parameters_rf['min_samples_leaf']):

        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=0)
        rf.fit(X_train_merged, y_train)

        # Make predictions
        rf_pred = rf.predict(X_val_merged)
        f1_rf = f1_score(y_val, rf_pred, average='weighted')

        # Check if it's the best score
        if f1_rf > best_f1_rf:
            best_f1_rf = f1_rf
            best_params_rf = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}
            # Keep the fitted estimator so the winner does not have to be trained a second time
            best_rf = rf

        # Advance only once the combination has actually been evaluated
        pbar_rf.update(1)

print("Best parameters found for Random Forest:", best_params_rf)

# Predictions of the selected model
rf_pred = best_rf.predict(X_val_merged)

# .score() on a classifier is mean accuracy, not F1 -- label it accordingly;
# the weighted F1 is reported on the last line.
print("Accuracy on validation set with best parameters for Random Forest:", best_rf.score(X_val_merged, y_val))
print(classification_report(y_val, rf_pred))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_val, rf_pred))

f1_rf = f1_score(y_val, rf_pred, average='weighted')
print("F1 score on validation set for Random Forest:", f1_rf)

100%|██████████| 81/81 [59:02<00:00, 43.74s/it] 


Best parameters found for Random Forest: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1}
F1 score on validation set with best parameters for Random Forest: 0.8783610755441741
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      1135
           1       0.75      0.84      0.79       427

    accuracy                           0.88      1562
   macro avg       0.84      0.87      0.85      1562
weighted avg       0.88      0.88      0.88      1562

Confusion Matrix for Random Forest:
[[1014  121]
 [  69  358]]
F1 score on validation set for Random Forest: 0.8804259327087324


In [None]:
# Best F1: 0.8804259327087324 ; for: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1}

### SVM

In [26]:
# Manual grid search for a Support Vector Machine.
# Every parameter combination is fitted on the training split and ranked by its
# weighted F1 on the validation split; the best fitted model is kept as `best_svm`.
parameters_svm = {
    'C': [0.1, 1, 10],      # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf'],     # Kernel type
    'gamma': ['scale', 'auto']      # coefficient
}

# Start below any attainable F1 so the first combination is always recorded.
# With 0 as the floor, a grid whose scores are all 0 would leave best_params_svm as None
# and the code below would fail.
best_f1_svm = float("-inf")
best_params_svm = None
best_svm = None

total_combinations_svm = len(parameters_svm['C']) * len(parameters_svm['kernel']) * len(parameters_svm['gamma'])

# The context manager closes the progress bar even if a fit raises.
with tqdm(total=total_combinations_svm) as pbar_svm:
    # Doing grid search
    for C, kernel, gamma in product(parameters_svm['C'], parameters_svm['kernel'], parameters_svm['gamma']):

        svm = SVC(C=C, kernel=kernel, gamma=gamma, random_state=0)
        svm.fit(X_train_merged, y_train)

        svm_pred = svm.predict(X_val_merged)
        f1_svm = f1_score(y_val, svm_pred, average='weighted')

        if f1_svm > best_f1_svm:
            best_f1_svm = f1_svm
            best_params_svm = {'C': C, 'kernel': kernel, 'gamma': gamma}
            # Keep the fitted estimator so the winner does not have to be trained a second time
            best_svm = svm

        # Advance only once the combination has actually been evaluated
        pbar_svm.update(1)

# Predictions of the selected model
svm_pred = best_svm.predict(X_val_merged)

print("Best parameters found for SVM:", best_params_svm)
# .score() on a classifier is mean accuracy, not F1 -- label it accordingly;
# the weighted F1 is reported on the last line.
print("Accuracy on validation set with best parameters for SVM:", best_svm.score(X_val_merged, y_val))
print(classification_report(y_val, svm_pred))
print("Confusion Matrix for SVM:")
print(confusion_matrix(y_val, svm_pred))

f1_svm = f1_score(y_val, svm_pred, average='weighted')
print("F1 score on validation set for SVM:", f1_svm)

100%|██████████| 18/18 [07:17<00:00, 24.33s/it]


Best parameters found for SVM: {'C': 1, 'kernel': 'rbf', 'gamma': 'scale'}
F1 score on validation set with best parameters for SVM: 0.850832266325224
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      1135
           1       0.68      0.85      0.76       427

    accuracy                           0.85      1562
   macro avg       0.81      0.85      0.82      1562
weighted avg       0.87      0.85      0.86      1562

Confusion Matrix for SVM:
[[966 169]
 [ 64 363]]
F1 score on validation set for SVM: 0.8553811517151816


In [27]:
# Best F1: 0.8553811517151816 ; for: {'C': 1, 'kernel': 'rbf', 'gamma': 'scale'}

## Predictions on Test
##### Using the model with the best validation score -> Random Forest