In [1]:
# Mount Google Drive at /content/drive so the files under "My Drive"
# (datasets and saved models) are reachable through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [7]:
# Load the pre-transformed feature matrices and the multi-label targets.
# The directory is named once so it can be changed in a single place.
DATA_DIR = '/content/drive/My Drive/CSVs'

X_train = np.load(f'{DATA_DIR}/X_train_transformed.npy')
X_test = np.load(f'{DATA_DIR}/X_test_transformed.npy')

# Targets are stored as parquet tables; convert them to plain NumPy arrays so
# individual label columns can be sliced positionally (y[:, i]).
# `to_numpy()` is the pandas-recommended replacement for `.values`.
y_train = pd.read_parquet(f'{DATA_DIR}/y_train.parquet').to_numpy()
y_test = pd.read_parquet(f'{DATA_DIR}/y_test.parquet').to_numpy()

# Fail fast on mismatched files instead of deep inside fit()/metric calls.
assert X_train.shape[0] == y_train.shape[0], 'X_train / y_train row counts differ'
assert X_test.shape[0] == y_test.shape[0], 'X_test / y_test row counts differ'
assert X_train.shape[1] == X_test.shape[1], 'train / test feature counts differ'
assert y_train.shape[1] == y_test.shape[1], 'train / test label counts differ'


In [8]:
# Sanity check: confirm what kind of container np.load returned for the features.
type(X_train)


numpy.ndarray

In [9]:
# Sanity check: confirm the labels are now a NumPy array rather than a DataFrame.
type(y_train)

numpy.ndarray

In [10]:
# Peek at the label matrix: one row per sample, one column per label
# (each column is used as the target of its own classifier further down).
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, classification_report

In [12]:
# One independent random forest per label column (binary-relevance setup).
# Every forest uses the same fixed seed so reruns are reproducible.
n_labels = y_train.shape[1]
classifiers = [
    RandomForestClassifier(random_state=42)
    for _ in range(n_labels)
]

In [13]:
# Train each forest on the full feature matrix against its own label column.
# Iterating over y_train.T yields the columns in order, so the k-th model is
# paired with the k-th label; tqdm reports progress over the models.
for model, label_column in zip(tqdm(classifiers), y_train.T):
    model.fit(X_train, label_column)

100%|██████████| 18/18 [01:44<00:00,  5.81s/it]


In [21]:
import joblib

In [23]:
# Persist every per-label forest to Drive as rf_classifier_label_<index>.pkl.
MODEL_DIR = '/content/drive/My Drive/Models'

# joblib.dump does not create missing directories; make sure the target exists
# so the cell also works on a Drive where the folder has not been created yet.
os.makedirs(MODEL_DIR, exist_ok=True)

for i, clf in enumerate(classifiers):
    joblib.dump(clf, os.path.join(MODEL_DIR, f'rf_classifier_label_{i}.pkl'))

In [14]:
# Predict each label with its own forest, then stack the per-label vectors and
# transpose so rows are test samples and columns are labels (same layout as y_test).
per_label_predictions = [model.predict(X_test) for model in classifiers]
y_pred = np.array(per_label_predictions).T

In [15]:
# Inspect the stacked predictions: rows are test samples, columns are labels.
y_pred

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

(1585, 1491)
(1323, 1155)
(1181, 925)
(726, 388)
(729, 501)
(702, 309)
(642, 321)
(610, 408)
(502, 212)
(599, 269)
(298, 55)
(255, 68)
(273, 27)
(250, 199)
(267, 17)
(210, 117)
(176, 138)
(192, 88)


In [19]:
# Evaluate model
# Multi-label evaluation on the held-out set.
# Hamming loss is computed over the whole label matrix; F1 is reported under
# both micro and macro averaging.
hamming_loss_score = hamming_loss(y_test, y_pred)
f1_score_micro, f1_score_macro = (
    f1_score(y_test, y_pred, average=mode) for mode in ('micro', 'macro')
)

print(f'Hamming Loss: {hamming_loss_score:.4f}')
print(f'F1 Score (Micro): {f1_score_micro:.4f}')
print(f'F1 Score (Macro): {f1_score_macro:.4f}')

Hamming Loss: 0.0844
F1 Score (Micro): 0.6388
F1 Score (Macro): 0.5378


In [22]:
# Per-label precision, recall, F1 and support.
# zero_division=0 reports undefined metrics as 0 (the same value the default
# "warn" setting uses) without emitting an UndefinedMetricWarning after the table.
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1585
           1       0.82      0.72      0.77      1323
           2       0.84      0.66      0.74      1181
           3       0.78      0.42      0.55       726
           4       0.74      0.51      0.60       729
           5       0.78      0.34      0.48       702
           6       0.83      0.41      0.55       642
           7       0.78      0.52      0.62       610
           8       0.84      0.36      0.50       502
           9       0.84      0.38      0.52       599
          10       0.75      0.14      0.23       298
          11       0.90      0.24      0.38       255
          12       0.85      0.08      0.15       273
          13       0.91      0.72      0.81       250
          14       0.82      0.05      0.10       267
          15       0.85      0.48      0.61       210
          16       0.84      0.66      0.74       176
          17       0.83    

  _warn_prf(average, modifier, msg_start, len(result))
