# Making predictions on test dataset


## Loading gradient classifier, topic model, and dimension reduction tools

In [1]:
# Loading the previously trained artifacts: the gradient classifier, the topic
# model and the PCA reduction.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files from a trusted source (presumably the training step of this project —
# confirm before pointing training_dir at files obtained elsewhere).
import os
import pickle

training_dir = "training"

with open(os.path.join(training_dir, "gradient_classifier.pk"), "rb") as f:
    gradient_classifier = pickle.load(f)

with open(os.path.join(training_dir, "topic_model.pk"), "rb") as f:
    topic_model = pickle.load(f)

with open(os.path.join(training_dir, "pca_reduction.pk"), "rb") as f:
    pca_reduction = pickle.load(f)


CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  ../c10/cuda/CUDAFunctions.cpp:112.)



## Defining classification for users

So far, we only have a classifier for individual gradients.

We have to use this classifier to assign a label to a user, given their documents.


As we are doing early risk detection, note that false negatives are to be avoided.

This is the algorithm for classifying users based on their documents:

- The documents must be given in chronological order as a list
- Using the topic model, we calculate the topic probabilities for each document
- We calculate the logits of the topic probabilities
- We calculate the gradients as the difference between the logits of consecutive documents and, using PCA, we reduce their dimension
    - Given that calculating the gradients reduces the list length by 1, we add a dummy document (empty string) at the beginning of the document list.
- Using the gradient classifier, we go through the gradients:
    - On a gradient, calculate the probabilities that it corresponds to the positive or the negative label
    - If one of the probabilities is above a predefined threshold (for example, 80%), assign the corresponding label to the user, and stop the classification. 
        - This is done because we focus on EARLY detection.
    - If none of the probabilities is above the predefined threshold, move on to the next gradient
    - Iterate until a label is assigned.
    - If no label is assigned, label the user as at risk (assign the positive label)
        - This is done to minimize false negatives  


In [2]:
from myutils.utils import basic_text_cleaning
from myutils.utils import calculate_logits
from myutils.utils import CLASSIFICATION_THRESHOLD

In [3]:
# Classification threshold imported from myutils.utils: a gradient settles the
# user's label once one of the class probabilities reaches this value
CLASSIFICATION_THRESHOLD

0.8

In [4]:
def classify_based_on_documents(
    document_list, # sequence of strings, in chronological order
    topic_model, # gives probabilities to belong to a topic
    dimensionality_reduction, # PCA or similar
    gradient_classifier, # probabilistic classifier for the gradients
    classification_threshold=None, # None -> use CLASSIFICATION_THRESHOLD
):
    """
    Classify a user from their documents, stopping at the first decisive one.

    A dummy empty document is prepended, so that there is one gradient per
    real document. Each document is cleaned, `topic_model.transform` gives
    its topic probabilities, `calculate_logits` turns them into logits, and
    the differences between consecutive logits (the gradients) are reduced
    with `dimensionality_reduction.transform`. The gradients are then fed
    one by one to `gradient_classifier.predict_proba`; as soon as one of the
    two probabilities reaches the threshold, the matching label is returned.

    Parameters:
        document_list: the user's documents (strings), oldest first.
        topic_model: object whose `transform(docs)` returns `(topics, probs)`.
        dimensionality_reduction: object with a `transform(gradients)` method.
        gradient_classifier: object whose `predict_proba(X=[gradient])`
            returns `[[prob_neg, prob_pos]]` (labels 0 and 1, in that order).
        classification_threshold: probability needed to assign a label.
            When None, the module-level CLASSIFICATION_THRESHOLD is used.

    Returns:
        A dict with two keys:
        - "prediction": 0 (negative) or 1 (positive / at risk)
        - "necessary_documents": on which document the decision was made
          (using 1-indexing)

        For example, {"prediction": 1, "necessary_documents": 3} means that
        the user was assigned label 1 after looking at document number 3.

        If no gradient is decisive, the user is labelled as at risk (1) to
        minimize false negatives, and "necessary_documents" is the total
        number of documents. A user without documents is therefore labelled
        1 with "necessary_documents" equal to 0, without calling the models.
    """
    if classification_threshold is None:
        classification_threshold = CLASSIFICATION_THRESHOLD

    # accept any sequence of strings, not only lists
    document_list = list(document_list)

    # no documents means no gradients: fall back to "at risk" right away
    # instead of handing an empty array to the dimensionality reduction
    if not document_list:
        return {"prediction": 1, "necessary_documents": 0}

    # adding dummy document and cleaning the documents
    document_list = [""] + document_list
    document_list = [basic_text_cleaning(doc) for doc in document_list]
    n_docs = len(document_list)

    # calculating logits (the topic assignments themselves are not needed)
    _, probs = topic_model.transform(document_list)
    user_logits = calculate_logits(probs)

    # calculating gradients and reducing dimensionality
    user_gradients = [
        list(next_logits - previous_logits)
        for (previous_logits, next_logits)
        in zip(
            user_logits[0:n_docs-1], user_logits[1:n_docs]
        )
    ]
    reduced_gradients = dimensionality_reduction.transform(user_gradients)

    # go through the gradients one by one
    prediction = None
    necessary_documents = None

    for document_index, red_gradient in enumerate(reduced_gradients):
        # get the probabilities for labels 0 and 1
        [[prob_neg, prob_pos]] = gradient_classifier.predict_proba(
            X = [red_gradient]
            )
        # assign a category; the positive label is checked first, so it wins
        # if both probabilities reach the threshold (threshold <= 0.5)
        if prob_pos >= classification_threshold:
            prediction = 1
        elif prob_neg >= classification_threshold:
            prediction = 0
        if prediction is not None:
            necessary_documents = document_index + 1 # 1-indexing
            break

    # if no category is assigned, assign as "at risk"
    if prediction is None:
        prediction = 1
        necessary_documents = len(reduced_gradients)

    result = {
        "prediction" : prediction,
        "necessary_documents" : necessary_documents
    }
    return result



# Quick sanity check on a handful of made-up documents
classify_based_on_documents(
    document_list=[
        "Hello, it is a good day outside",
        "Today I want to eat pasta",
        "Tell me what the problem is",
        "I cannot wait to go to the park",
        "Where is the nearest pizza place?",
    ],
    topic_model=topic_model,
    dimensionality_reduction=pca_reduction,
    gradient_classifier=gradient_classifier,
)


{'prediction': 1, 'necessary_documents': 5}

## Applying on test data

In [5]:
# loading test data
import json
datasets_dir = "datasets"

# JSON text is UTF-8 by specification; stating the encoding keeps decoding
# independent of the platform's locale default.
with open(
    os.path.join(datasets_dir, "test_dataset.json"), "r", encoding="utf-8"
) as f:
    test_dataset = json.load(f)

In [6]:
# number of users in test dataset (the dataset holds one entry per user)
len(test_dataset)

11959

In [7]:
# inspecting the first entry to see the structure of a user record
test_dataset[0]

{'user': '00kate00',
 'labeled_texts': [{'text': 'its been pouring with rain for 2 days maybe we wont have water restrictions after this rain i hope so',
   'polarity': 1},
  {'text': 'arrrrhhh, i did it again i past 100 and now 200 i was gunna say it was my 200th update but i didnt get to again',
   'polarity': 0},
  {'text': 'i wanna go see jason mraz in concert now after seeing him on rove but its sold out',
   'polarity': 0},
  {'text': 'thats was the fastest shower of my life, somebody kept turning on the water and it was going cold',
   'polarity': 0},
  {'text': 'nick is such a stud muffin', 'polarity': 1}],
 'label': 0}

In [8]:
from tqdm import tqdm

In [9]:
# Run the user-level classifier on every test user and collect, per user,
# the true label, the predicted label and how many documents were needed.

evaluation_results = []

for entry in tqdm(test_dataset):
    user_name = entry["user"]
    true_label = entry["label"]

    # texts of this user, in the order they are stored in the dataset
    texts = [item["text"] for item in entry["labeled_texts"]]

    outcome = classify_based_on_documents(
        texts,
        topic_model=topic_model,
        dimensionality_reduction=pca_reduction,
        gradient_classifier=gradient_classifier,
    )

    evaluation_results.append(
        {
            "user": user_name,
            "true_label": true_label,
            "total_documents": len(texts),
            "predicted_label": outcome["prediction"],
            "necessary_documents": outcome["necessary_documents"],
        }
    )

  0%|          | 0/11959 [00:00<?, ?it/s]

100%|██████████| 11959/11959 [7:28:05<00:00,  2.25s/it]  


In [10]:
# first evaluation record
evaluation_results[0]

{'user': '00kate00',
 'true_label': 0,
 'total_documents': 5,
 'predicted_label': 1,
 'necessary_documents': 5}

In [18]:
# second evaluation record
evaluation_results[1]

{'user': '061004',
 'true_label': 0,
 'total_documents': 5,
 'predicted_label': 0,
 'necessary_documents': 2}

In [19]:
# Directory for the evaluation outputs. exist_ok=True makes this cell safe to
# re-run and avoids the gap between checking for the directory and creating it.
evaluation_dir = "evaluation"
os.makedirs(evaluation_dir, exist_ok=True)

In [20]:
# Write the per-user evaluation records to disk as indented JSON
results_path = os.path.join(evaluation_dir, "classification_results.json")
with open(results_path, "w") as out_file:
    json.dump(evaluation_results, out_file, indent=2)