# Calculating Gradients

## Loading the topic model and the probabilities

In [1]:
import pickle
import os

In [2]:
# Directory that holds the pickled artifacts this notebook reads and writes
# (topics, probabilities, logits, fitted PCA).
training_dir = "training"


In [3]:
#with open(os.path.join(training_dir, "topic_model.pk"), "rb") as f:
#    topic_model = pickle.load(f)

In [4]:
# Read the pickled topics from the training directory.
# NOTE: pickle executes arbitrary code on load; only use trusted files.
topics_path = os.path.join(training_dir, "topics.pk")
with open(topics_path, "rb") as topics_file:
    topics = pickle.load(topics_file)

In [5]:
# Read the pickled training probabilities from the training directory.
# NOTE: pickle executes arbitrary code on load; only use trusted files.
probs_path = os.path.join(training_dir, "training_probabilities.pk")
with open(probs_path, "rb") as probs_file:
    probs = pickle.load(probs_file)

## Calculating Logits

In [6]:
# many probabilities are rounded to zero, 
# so, to calculate logits, we need to add a threshold
from myutils.utils import LOGIT_THRESHOLD
LOGIT_THRESHOLD # for smoothing

0.0001

In [7]:
# Add the smoothing constant so that zero probabilities have a finite log.
# NOTE: this augmented assignment is not idempotent -- re-running this cell
# without reloading `probs` first adds the threshold a second time.
probs += LOGIT_THRESHOLD

In [8]:
# we calculate the logits using the natural logarithm
from numpy import log

In [9]:
log(2.718)

0.999896315728952

In [10]:
# Natural log of the probabilities; `probs` already includes LOGIT_THRESHOLD
# at this point, so zero entries do not produce -inf.
training_logits = log(probs)

In [11]:
# Persist the logits alongside the other training artifacts.
logits_path = os.path.join(training_dir, "training_logits.pk")
with open(logits_path, "wb") as logits_file:
    pickle.dump(training_logits, logits_file)

## Calculating and annotating Gradients

The gradients are calculated as the difference in logits from one document to the next, so a user with n documents yields n − 1 gradients.

Gradients are annotated with the same label as the user they come from.

In [12]:
# loading training data
import json
from myutils.utils import using_downsampled_train_dataset

# Choose which dataset file to read, then load it in a single place.
if using_downsampled_train_dataset:
    train_dataset_path = "./datasets/downsampled_train_dataset.json"
else:
    train_dataset_path = "./datasets/train_dataset.json"

with open(train_dataset_path, "r") as dataset_file:
    train_dataset = json.load(dataset_file)

In [13]:
train_dataset[0]

{'user': 'sfannah',
 'labeled_texts': [{'text': "wants mauds ice cream real bad   stupid england don't sell it",
   'polarity': 0},
  {'text': 'is soo not ready for maths', 'polarity': 0},
  {'text': 'has had terrible signal in culford so had not been on twitter or able to text',
   'polarity': 0},
  {'text': "i have and you haven't replied", 'polarity': 0},
  {'text': 'i wish i could give you that hug right now', 'polarity': 0},
  {'text': 'i secretly want to be a pokï¿½mon', 'polarity': 1},
  {'text': 'its at home  i miss it', 'polarity': 0},
  {'text': "why is everyone watching f1 but me  i'm stuck watching parent trap... oh the joy!",
   'polarity': 0},
  {'text': 'and this happens to be one of them', 'polarity': 1},
  {'text': "i don't think people would know how much that made me smile  i'm cheered up (y)",
   'polarity': 1},
  {'text': 'thinks people should group hug more', 'polarity': 1},
  {'text': 'sorry blame the phone!  me no means it', 'polarity': 0},
  {'text': ":o not th

In [14]:
import numpy as np

In [15]:
len(train_dataset)

2391

In [16]:
labeled_gradients_for_training = []

# `document_count` is the running row offset into `training_logits`;
# it advances by each user's number of documents.
document_count = 0
user_count = 0

for entry in train_dataset:
    user_label = entry["label"]
    n_docs = len(entry["labeled_texts"])

    # Rows of `training_logits` that belong to this user's documents.
    first_row = document_count
    last_row = document_count + n_docs
    user_logits = training_logits[first_row:last_row]

    # Difference between each document's logits and the previous document's,
    # i.e. one gradient per consecutive pair of documents.
    user_gradients = [
        list(step)
        for step in np.diff(user_logits, axis=0)
    ]

    # Shallow copy of the original entry, extended with the gradients.
    # gradients inherit the user label, not the text polarity!
    new_entry = dict(entry)
    labeled_gradients = []
    for gradient in user_gradients:
        labeled_gradients.append(
            {"gradient": list(gradient), "label": user_label}
        )
    new_entry["labeled_gradients"] = labeled_gradients

    labeled_gradients_for_training.append(new_entry)
    document_count += n_docs
    user_count += 1


In [17]:
len(labeled_gradients_for_training)

2391

In [18]:
labeled_gradients_for_training[0]["labeled_gradients"][0]

{'gradient': [-2.39789527279837,
  -2.39789527279837,
  -2.39789527279837,
  0.0,
  -0.6466271649250519,
  0.0,
  -0.6808770879681312,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  -1.0360919316867756,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4978384282391799,
  -0.8873031950009036,
  0.0,
  0.0,
  0.0,
  -1.570598079117837,
  0.0,
  0.0,
  0.0,
  -0.3894647667617237,
  -0.2795848622191617,
  0.0,
  -0.3894647667617237,
  -0.4978384282391799,
  -0.6690496289808854,
  -0.4978384282391799,
  0.0,
  -0.6768866596881651,
  -0.21825356602001822,
  0.0,
  0.0,
  0.0,
  0.2795848622191617,
  0.0,
  0.2795848622191617,
  0.0,
  -0.8286926725561692,
  0.0,
  0.5331106685554259,
  -0.9604619501872929,
  0.0,
  0.0,
  0.0,
  0.0,
  -0.6690496289808854,
  0.0,
  0.0,
  -0.8873031950009036,
  -0.3894647667617237,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  -1.7512681078733188,
  0.0,
  -2.6964606674075444,
  0.0,
  0.0,
  -0.6690496289808854,
  -0.2795848622191617,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  

## Dimensionality reduction

We can see that the gradient arrays are very sparse, so I use PCA for dimensionality reduction.

In [19]:
from myutils.utils import N_GRADIENT_COMPONENTS

In [20]:
# Flatten every user's gradients into one list, keeping user order and,
# within each user, document order.
all_training_gradients = [
    labeled_gradient["gradient"]
    for entry in labeled_gradients_for_training
    for labeled_gradient in entry["labeled_gradients"]
]


In [21]:
len(all_training_gradients)

22413

In [22]:
from sklearn.decomposition import PCA
from myutils.utils import N_GRADIENT_COMPONENTS

# PCA model that projects gradients onto N_GRADIENT_COMPONENTS components.
pca_reduction = PCA(n_components=N_GRADIENT_COMPONENTS)

In [23]:
%%time
pca_reduction.fit(all_training_gradients)

CPU times: user 6.73 s, sys: 20.2 s, total: 27 s
Wall time: 1.88 s


PCA(n_components=20)

In [24]:
# save
# Persist the fitted PCA alongside the other training artifacts.
pca_path = os.path.join(training_dir, "pca_reduction.pk")
with open(pca_path, "wb") as pca_file:
    pickle.dump(pca_reduction, pca_file)

In [25]:
# Project every training gradient with the fitted PCA (one output row per gradient).
all_scaled_training_gradients = pca_reduction.transform(all_training_gradients)

In [26]:
all_scaled_training_gradients.shape

(22413, 20)

In [27]:
labeled_reduced_gradients_for_training = []

# Row offset into `all_scaled_training_gradients`. That array holds one row
# per gradient, built by concatenating every user's "labeled_gradients" in
# this same user order. A user with n documents only has n - 1 gradients,
# so stepping by the document count would hand each user rows that belong
# to the following users. The offset therefore advances by gradient count.
gradient_offset = 0
user_count = 0

for entry in labeled_gradients_for_training:
    new_entry = {}
    new_entry.update(entry)
    user_label = entry["label"]

    # Number of rows this user contributed to the flattened gradient list.
    n_gradients = len(entry["labeled_gradients"])

    user_reduced_gradients = all_scaled_training_gradients[
        gradient_offset : gradient_offset + n_gradients
    ]

    # gradients inherit the user label, not the text polarity!
    new_entry["labeled_reduced_gradients"] = [
        {
            "reduced_gradient": list(gradi),
            "label": user_label,
        }
        for gradi in user_reduced_gradients
    ]

    user_count += 1
    gradient_offset += n_gradients
    labeled_reduced_gradients_for_training.append(new_entry)

# Every reduced row must have been assigned to exactly one user; a mismatch
# means the reduced array and the labeled entries come from different runs.
assert gradient_offset == len(all_scaled_training_gradients), (
    "reduced gradients are out of sync with labeled_gradients_for_training"
)


In [28]:
labeled_reduced_gradients_for_training[0]["labeled_reduced_gradients"][0]

{'reduced_gradient': [2.8440679818556456,
  2.608103924314116,
  -1.4816889170857397,
  -1.121766689497795,
  -4.967322755437853,
  1.8053913339787224,
  0.6283579344076613,
  0.7047822062907131,
  1.9792394964200222,
  -0.15551180287707114,
  -0.37031140988710914,
  -0.13041313040467628,
  -0.9325245392118704,
  -0.11451198568048084,
  -0.7045756635652524,
  0.09302408276935789,
  -0.5332816819558618,
  0.09452667449349066,
  0.25658897015932874,
  0.18461801047191642],
 'label': 1}

In [29]:
# Write the per-user entries (with their reduced gradients) as indented JSON.
reduced_gradients_path = "./datasets/labeled_reduced_gradients_for_training.json"
with open(reduced_gradients_path, "w") as reduced_gradients_file:
    json.dump(labeled_reduced_gradients_for_training, reduced_gradients_file, indent=2)