### The point of this notebook is to prepare data to training, build and evaluate classifier.

In [1]:
import numpy as np
import os

In [2]:
packages_path = "../data/interim/distilBERT_output/"
data_packages = os.listdir(packages_path)

In [3]:
finaly_matrix = np.empty((0, 769)) # number of columns is equal to number of hidden layers
finaly_matrix.shape

(0, 769)

In [4]:
for pack in data_packages:
    element = np.load("{}/{}".format(packages_path, pack))
    finaly_matrix = np.append(finaly_matrix, element, axis=0)

In [8]:
finaly_matrix.shape

(26709, 769)

In [9]:
# Prepare data

In [10]:
labels = finaly_matrix[:, -1]

In [11]:
features = np.delete(finaly_matrix, -1, axis=1)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30, random_state=101)

In [14]:
# Train model

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
rff = RandomForestClassifier(n_jobs=-1)
rff.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [19]:
predicted = rff.predict(X_test)

In [20]:
# Evaluate model

In [21]:
from sklearn.metrics import classification_report

In [24]:
with open("../reports/model_metrics.txt", "w") as handle:
    handle.write(classification_report(y_test, predicted))
    
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

         0.0       0.80      0.87      0.83      4532
         1.0       0.81      0.71      0.76      3481

    accuracy                           0.80      8013
   macro avg       0.80      0.79      0.79      8013
weighted avg       0.80      0.80      0.80      8013



In [25]:
from sklearn.dummy import DummyClassifier

In [30]:
dummy = DummyClassifier(random_state=101, strategy="stratified")
dummy.fit(X_train, y_train)
dummy_predicted = dummy.predict(X_test)

with open("../reports/dummy_model_metrics.txt", "w") as handle:
    handle.write(classification_report(y_test, dummy_predicted))
    
print(classification_report(y_test, dummy_predicted))

              precision    recall  f1-score   support

         0.0       0.56      0.55      0.55      4532
         1.0       0.43      0.44      0.43      3481

    accuracy                           0.50      8013
   macro avg       0.49      0.49      0.49      8013
weighted avg       0.50      0.50      0.50      8013



In [31]:
# save model

In [32]:
import joblib

In [33]:
joblib.dump(rff, "../models/classifier")

['../models/classifier']