In [49]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
## Classification Models (4th / 5th steps guidelines):

In [3]:
data = np.load("combined_representations.npz", allow_pickle=True)
X_train_combined = data["X_train"]
X_val_combined = data["X_val"]
test_combined = data["test"]
y_train = data["y_train"]
y_val = data["y_val"]

In [4]:
test_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.08468467,
         0.16710296,  0.057366  ],
       [ 0.        ,  0.        ,  0.        , ..., -0.01306781,
         0.13182336,  0.0041255 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.13805758,
         0.24629688, -0.0862156 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.13059046,
         0.33800852,  0.01264326],
       [ 0.        ,  0.        ,  0.        , ..., -0.04177414,
         0.33193919,  0.06621849],
       [ 0.        ,  0.        ,  0.        , ..., -0.08864344,
         0.25635894,  0.10819403]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.04349438,
        -0.01102248, -0.00268946],
       [ 0.        ,  0.        ,  0.        , ..., -0.07374551,
         0.27137006,  0.1386768 ],
       [ 0.        ,  0.08407949,  0.08587857, ..., -0.03365872,
         0.21961312,  0.08438464],
       ...,
       [ 0.        ,  0.        ,  0. 

In [5]:
X_val_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.17752816,
         0.33020436,  0.11092447],
       [ 0.        ,  0.        ,  0.        , ..., -0.01434965,
         0.39485991,  0.04780014],
       [ 0.        ,  0.        ,  0.        , ..., -0.07215451,
         0.19116122,  0.10603562],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.12204272,
         0.00810044,  0.0257803 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.15537582,
         0.30471598,  0.19891339],
       [ 0.        ,  0.        ,  0.        , ..., -0.01584726,
         0.248952  ,  0.10495016]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.13266554,
         0.21048408,  0.06905448],
       [ 0.        ,  0.        ,  0.        , ..., -0.07712197,
         0.24841082,  0.07409752],
       [ 0.        ,  0.        ,  0.        , ..., -0.19376324,
         0.16052756,  0.18207438],
       ...,
       [ 0.        ,  0.        ,  0. 

In [6]:
X_train_combined

array({'description': array([[ 0.        ,  0.        ,  0.        , ..., -0.1268446 ,
         0.19595661, -0.0083481 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.04427454,
        -0.0050476 , -0.03812569],
       [ 0.        ,  0.        ,  0.        , ..., -0.0455008 ,
         0.21400927,  0.0660574 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.03801016,
         0.34924866,  0.05680282],
       [ 0.        ,  0.        ,  0.        , ..., -0.13847188,
         0.24170396,  0.12646742],
       [ 0.        ,  0.        ,  0.        , ..., -0.01582526,
         0.24599232,  0.05336657]]), 'host_about': array([[ 0.        ,  0.        ,  0.        , ..., -0.06807236,
        -0.0216408 ,  0.01123832],
       [ 0.        ,  0.        ,  0.        , ...,  0.018232  ,
         0.03550534, -0.0068904 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.17294988,
         0.2813098 ,  0.13600118],
       ...,
       [ 0.        ,  0.        ,  0. 

In [7]:
y_val.shape

(1562,)

In [8]:
y_train.shape

(4686,)

### Preparing data for the models

In [28]:
# For X_train_combined

train_dict = X_train_combined.item()

description_features_train = train_dict['description']
host_about_features_train = train_dict['host_about']
comments_features_train = train_dict['comments']

X_train_merged = np.concatenate((description_features_train, host_about_features_train, comments_features_train), axis=1)
print("Shape of merged feature matrix:", X_train_merged.shape)

Shape of merged feature matrix: (4686, 1800)


In [29]:
# For X_val_combined

val_dict = X_val_combined.item()

description_features_val = val_dict['description']
host_about_features_val = val_dict['host_about']
comments_features_val = val_dict['comments']

X_val_merged = np.concatenate((description_features_val, host_about_features_val, comments_features_val), axis=1)
print("Shape of merged feature matrix:", X_val_merged.shape)

Shape of merged feature matrix: (1562, 1800)


In [30]:
# For test_combined

test_dict = test_combined.item()

description_features_test = test_dict['description']
host_about_features_test = test_dict['host_about']
comments_features_test = test_dict['comments']

X_test_merged = np.concatenate((description_features_test, host_about_features_test, comments_features_test), axis=1)
print("Shape of merged feature matrix:", X_test_merged.shape)

Shape of merged feature matrix: (695, 1800)


# Models

### Logistic Regression

In [48]:
# Initialize logistic
lr = LogisticRegression(random_state=0)

# Train the model
lr.fit(X_train_merged, y_train)

# Predictions
lr_pred = lr.predict(X_val_merged)

print(classification_report(y_val, lr_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, lr_pred))

f1 = f1_score(y_val, lr_pred, average='weighted')
print("F1 score on validation set:", f1)

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1135
           1       0.66      0.76      0.71       427

    accuracy                           0.83      1562
   macro avg       0.78      0.81      0.79      1562
weighted avg       0.84      0.83      0.83      1562

Confusion Matrix:
[[971 164]
 [103 324]]
F1 score on validation set: 0.8324030027318112


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### K Nearest Neighbors

In [51]:
knn = KNeighborsClassifier()

# Fit the KNN model on the entire training dataset
knn.fit(X_train_merged, y_train)

# Predict on the validation set
knn_pred = knn.predict(X_val_merged)

print(classification_report(y_val, knn_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, knn_pred))

# Calculate F1 score on the validation set
f1 = f1_score(y_val, knn_pred, average='weighted')
print("F1 score on validation set:", f1)

              precision    recall  f1-score   support

           0       0.88      0.80      0.84      1135
           1       0.57      0.71      0.64       427

    accuracy                           0.78      1562
   macro avg       0.73      0.76      0.74      1562
weighted avg       0.80      0.78      0.78      1562

Confusion Matrix:
[[909 226]
 [122 305]]
F1 score on validation set: 0.7839531255961923


### MLP