# Training a Classifier

## Loading the data

In [17]:
import json

In [18]:
with open("./datasets/labeled_reduced_gradients_for_training.json", "r") as f:
    labeled_reduced_gradients_for_training = json.load(f)

In [19]:
list(labeled_reduced_gradients_for_training[0].keys())

['user',
 'labeled_texts',
 'label',
 'labeled_gradients',
 'labeled_reduced_gradients']

In [20]:
# creating lists of data and labels

training_data_for_classifier = []
training_labels_for_classifier = []

for entry in labeled_reduced_gradients_for_training:
    labeled_gradients = entry["labeled_reduced_gradients"]

    entry_gradients = [
        dic["reduced_gradient"]
        for dic in labeled_gradients
    ]
    entry_labels = [
        dic["label"]
        for dic in labeled_gradients
    ]
    training_data_for_classifier += entry_gradients
    training_labels_for_classifier += entry_labels

In [21]:
len(training_data_for_classifier)

22413

In [22]:
len(training_labels_for_classifier)

22413

In [23]:
set(training_labels_for_classifier)

{0, 1}

## Initializing the classifier

In [24]:
from sklearn.linear_model import SGDClassifier

In [25]:
SGDClassifier()

SGDClassifier()

In [26]:
classfier = SGDClassifier(
    loss="log" # logistic regression
    # change for different kinds of classifiers
)

In [27]:
classfier.fit(
    X=training_data_for_classifier,
    y=training_labels_for_classifier
)

SGDClassifier(loss='log')

In [28]:
classfier.predict(
    X=training_data_for_classifier[:2],
)

array([0, 0])

In [29]:
classfier.predict_proba(
    X=training_data_for_classifier[:2],
)

array([[0.73482555, 0.26517445],
       [0.68047564, 0.31952436]])

In [30]:
# simple metrics on training data
from sklearn.metrics import precision_recall_fscore_support

In [31]:
precision_recall_fscore_support(
    y_pred=classfier.predict(training_data_for_classifier),
    y_true=training_labels_for_classifier,
    average="binary"
)

(0.32786885245901637, 0.00586596275113653, 0.011525716755510734, None)

In [32]:
# saving the classifier
import os
import pickle
training_dir = "training"
with open(
    os.path.join(training_dir, "gradient_classifier.pk"), "wb"
) as f:
    pickle.dump(classfier,f)