## Homework 01. Simple text processing.

In [1]:
#dl_made_env2

In [2]:
# !pip install matplotlib pandas sklearn nltk
# !pip install gdown

In [5]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from IPython import display
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
from collections import Counter
from tqdm import tqdm as tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

from sklearn.metrics import accuracy_score
from utils import plot_train_process
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
import math
from sklearn.naive_bayes import GaussianNB
from gensim.models import Word2Vec
import gensim
import gdown
import gzip
import shutil


from embedding_functions import text_to_bow, splitter, computeReviewTFDict, computeCountDict, computeIDFDict, \
computeReviewTFIDFDict, computeTFIDFVector, get_phrase_embedding
from train import train_model

### Toxic or not
Your main goal in this assignment is to classify, whether the comments are toxic or not. And practice with both classical approaches and PyTorch in the process.

*Credits: This homework is inspired by YSDA NLP_course.*

*Disclaimer: The used dataset may contain obscene language and is used only as an example of real unfiltered data.*

In [None]:
# In colab uncomment this cell
# ! wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/homeworks/homework01/utils.py -nc

In [None]:
# Load the comments dataset from the working directory; if the file is not
# there yet, download it from the course repository and read it again.
try:
    data = pd.read_csv('comments.tsv', sep='\t')
except FileNotFoundError:
    # IPython shell escape; `-nc` asks wget not to overwrite an existing file.
    ! wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/comments_small_dataset/comments.tsv -nc
    data = pd.read_csv("comments.tsv", sep='\t')

In [None]:
# Pull the comment texts and their labels out of the frame as arrays.
texts, target = data['comment_text'].values, data['should_ban'].values

# Show a small, evenly spaced sample of rows (every 200th, starting at row 50).
data[50::200]

In [None]:
# Hold out half of the data for evaluation; the fixed seed keeps the split reproducible.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts,
    target,
    test_size=0.5,
    random_state=42,
)

__Note:__ it is generally a good idea to split data into train/test before anything is done to them.

It guards you against possible data leakage in the preprocessing stage. For example, should you decide to select words present in obscene tweets as features, you should only count those words over the training set. Otherwise your algorithm can cheat the evaluation.

### Preprocessing and tokenization

Comments contain raw text with punctuation, upper/lowercase letters and even newline symbols.

To simplify all further steps, we'll split text into space-separated tokens using one of nltk tokenizers.

Generally, the `nltk` library ([link](https://www.nltk.org)) is widely used in NLP. It is not strictly necessary here, but it is mentioned to introduce it to you.

In [None]:
tokenizer = TweetTokenizer()


def preprocess(text):
    """Lower-case `text`, tokenize it and join the tokens with single spaces."""
    lowered = text.lower()
    return ' '.join(tokenizer.tokenize(lowered))


# Quick demonstration of what the preprocessing does to a sentence.
text = 'How to be a grown-up at work: replace "I don\'t want to do that" with "Ok, great!".'
print("before:", text)
print("after:", preprocess(text))

In [None]:
# task: preprocess each comment in train and test
# The same `preprocess` function is mapped over both splits.
texts_train = np.array(list(map(preprocess, texts_train)))
texts_test = np.array(list(map(preprocess, texts_test)))

In [None]:
# Small check that everything is done properly:
# two known comments must come out of preprocessing exactly as expected ...
for observed, expected in (
    (texts_train[5], 'who cares anymore . they attack with impunity .'),
    (texts_test[89], 'hey todds ! quick q ? why are you so gay'),
):
    assert observed == expected

# ... and the test texts must stay aligned with their labels.
assert len(texts_test) == len(y_test)

### Step 1: bag of words

One traditional approach to such problem is to use bag of words features:
1. build a vocabulary of frequent words (use train data only)
2. for each training sample, count the number of times a word occurs in it (for each word in vocabulary).
3. consider this count a feature for some classifier

__Note:__ in practice, you can compute such features using sklearn. __Please don't do that in the current assignment, though.__
* `from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer`

In [None]:
# task: find up to k most frequent tokens in texts_train,
# sort them by number of occurrences (highest first)

# Token -> number of occurrences, counted over the training comments only.
bw_v = Counter(token for comment in texts_train for token in comment.split())

# Keep at most 10000 tokens (fewer if the training set has fewer distinct tokens).
k = min(10000, len(bw_v))

# Extra placeholder entry with a zero count; it sorts after every real token.
bw_v['<UNK>'] = 0

# Stable sort by descending count: ties keep their first-seen order.
ranked = sorted(bw_v.items(), key=lambda pair: pair[1], reverse=True)
bow_vocabulary = dict(ranked[:k])
tokens = list(bow_vocabulary)

print('example features:', sorted(bow_vocabulary)[::100])

In [None]:
# One bag-of-words row per comment, built over the same `tokens` list for both splits.
X_train_bow, X_test_bow = (
    np.stack([text_to_bow(comment, tokens) for comment in split])
    for split in (texts_train, texts_test)
)

In [None]:
# Small check that everything is done properly
k_max = len(set(' '.join(texts_train).split()))
assert X_train_bow.shape == (len(texts_train), min(k, k_max))
assert X_test_bow.shape == (len(texts_test), min(k, k_max))
assert np.all(X_train_bow[5:10].sum(-1) == np.array([len(s.split()) for s in  texts_train[5:10]]))
assert len(bow_vocabulary) <= min(k, k_max)
assert X_train_bow[6, list(bow_vocabulary.keys()).index('.')] == texts_train[6].split().count('.')

Now let's do the trick with `sklearn` logistic regression implementation:

In [None]:
# Baseline: sklearn logistic regression with default settings on the BoW counts.
bow_model = LogisticRegression()
bow_model = bow_model.fit(X_train_bow, y_train)

In [None]:
# ROC curves of the sklearn bag-of-words model on the train and test splits.
for name, X, y, model in (
    ('train', X_train_bow, y_train, bow_model),
    ('test ', X_test_bow, y_test, bow_model),
):
    # Column 1 of predict_proba is used as the score for ROC/AUC.
    proba = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, proba)
    fpr, tpr = roc_curve(y, proba)[:2]
    plt.plot(fpr, tpr, label='%s AUC=%.4f' % (name, auc))

# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()

Seems alright. Now let's create the simple logistic regression using PyTorch. Just like in the classwork.

In [None]:
# Logistic regression in PyTorch: a single linear layer mapping the
# bag-of-words features to two class scores.
n_features = len(tokens)
model = nn.Sequential()
model.add_module('l1', nn.Linear(n_features, 2))

Remember what we discussed about loss functions! `nn.CrossEntropyLoss` combines both log-softmax and `NLLLoss`.

__Be careful with it! The criterion `nn.CrossEntropyLoss` will still work with log-softmax output, but it won't allow you to converge to the optimum.__ Next comes a small demonstration:

In [None]:
# Two separate cross-entropy objects are kept under both names used in this
# notebook: `criterion` is passed to train_model, `loss_function` is used for
# the example loss below.
criterion = nn.CrossEntropyLoss()
loss_function = nn.CrossEntropyLoss()

# Adam over the model parameters, with a plateau-based learning-rate scheduler.
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = ReduceLROnPlateau(opt, patience=100)

In [None]:
# Features are plain model inputs, not trainable parameters, so they must not
# track gradients: with requires_grad=True every backward pass would also
# accumulate a dataset-sized `.grad` tensor on the inputs that nothing resets.
# float32 matches the default dtype of the nn.Linear weights.
X_train_bow_torch = torch.tensor(X_train_bow, dtype=torch.float32)
X_test_bow_torch = torch.tensor(X_test_bow, dtype=torch.float32)

# nn.CrossEntropyLoss expects integer class indices as targets.
y_train_torch = torch.tensor(y_train, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

Let's test that everything is fine

In [None]:
# example loss
# Forward the first three training rows and score them against their labels.
example_logits = model(X_train_bow_torch[:3])
loss = loss_function(example_logits, y_train_torch[:3])

In [None]:
# The example loss must be a scalar tensor that converts to a plain Python float.
assert type(loss.item()) is float

Here comes a small function to train the model. In the future we will move it into a separate file, but for this homework it's OK to implement it here.

In [None]:
# def train_model(
#     model,
#     opt,
#     criterion,
#     lr_scheduler,
#     X_train_torch,
#     y_train_torch,
#     X_val_torch,
#     y_val_torch,
#     n_iterations=300,
#     batch_size=32,
#     warm_start=False,
#     show_plots=True,
#     eval_every=10
# ):
#     if not warm_start:
#         for name, module in model.named_children():
# #             print('resetting ', name)
#             try:
#                 module.reset_parameters()
#             except AttributeError as e:
#                 print('Cannot reset {} module parameters: {}'.format(name, e))

#     train_loss_history = []
#     train_acc_history = []
#     val_loss_history = []
#     val_acc_history = []

#     local_train_loss_history = []
#     local_train_acc_history = []
#     for i in range(n_iterations):

#         # sample `batch_size` random observations (indices are drawn with replacement)
#         ix = np.random.randint(0, len(X_train_torch), batch_size)
#         x_batch = X_train_torch[ix]
#         y_batch = y_train_torch[ix]

#         # predict log-probabilities or logits
#         y_predicted = model(x_batch) ### YOUR CODE
# #         print(y_predicted)

#         # compute loss, just like before
#         ### YOUR CODE
#         loss = criterion(y_predicted, y_batch)
        
#         # compute gradients
#         ### YOUR CODE
#         loss.backward()

#         # Adam step
#         ### YOUR CODE
#         opt.step()

#         # clear gradients
#         ### YOUR CODE
#         opt.zero_grad()


#         local_train_loss_history.append(loss.data.numpy())
# #         lr_scheduler.step(local_train_loss_history[-1])
#         local_train_acc_history.append(
#             accuracy_score(
#                 y_batch.to('cpu').detach().numpy(),
#                 y_predicted.to('cpu').detach().numpy().argmax(axis=1)
#             )
#         )

#         if i % eval_every == 0:
#             train_loss_history.append(np.mean(local_train_loss_history))
#             train_acc_history.append(np.mean(local_train_acc_history))
#             local_train_loss_history, local_train_acc_history = [], []

#             predictions_val = model(X_val_torch)
#             val_loss_history.append(loss_function(predictions_val, y_val_torch).to('cpu').detach().item())

#             acc_score_val = accuracy_score(y_val_torch.cpu().numpy(), predictions_val.to('cpu').detach().numpy().argmax(axis=1))
#             val_acc_history.append(acc_score_val)
#             lr_scheduler.step(train_loss_history[-1])

#             if show_plots:
#                 display.clear_output(wait=True)
#                 plot_train_process(train_loss_history, val_loss_history, train_acc_history, val_acc_history)
#     return model

Let's run it on the data. Note that here we use the `test` part of the data for validation. That is not a good idea in general, but in this task our main goal is practice.

In [None]:
# Train the PyTorch logistic regression on the BoW features; the test split
# doubles as the validation set here.
train_model(
    model,
    opt,
    criterion,
    lr_scheduler,
    X_train_bow_torch,
    y_train_torch,
    X_test_bow_torch,
    y_test_torch,
)

In [None]:
# ROC curves for the PyTorch bag-of-words model.
# The network outputs one raw logit per class. The class-1 logit alone is not a
# valid ranking score (the class-0 logit differs from sample to sample), so the
# logits are turned into P(class 1) with a softmax before computing ROC/AUC.
# Evaluation needs no autograd graph, hence torch.no_grad().
with torch.no_grad():
    for name, X, y in [
        ('train', X_train_bow_torch, y_train),
        ('test ', X_test_bow_torch, y_test),
    ]:
        proba = torch.softmax(model(X), dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y, proba)
        plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))

# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()

Try to vary the number of tokens `k` and check how the model performance changes. Show it on a plot.

In [None]:
# Sweep the vocabulary size and record train/test ROC AUC for each value.
# Sweep-specific names are used throughout so that this cell does not overwrite
# `k`, `tokens`, `bow_vocabulary`, the BoW matrices or `model` from the cells above.
results = {
    'train': [],
    'test': [],
    'k': []
}

# The frequency ranking and the label tensors do not depend on the vocabulary
# size, so they are computed once, outside the loop.
ranked_tokens = [token for token, _count in sorted(bw_v.items(), key=lambda x: -x[1])]
y_train_torch = torch.tensor(y_train, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

for vocab_size in range(100, 10100, 500):
    results['k'].append(vocab_size)
    sweep_tokens = ranked_tokens[:vocab_size]

    # Inputs are plain data: no requires_grad, float32 to match the layer weights.
    X_train_sweep = torch.tensor(
        np.stack([text_to_bow(x, sweep_tokens) for x in texts_train]),
        dtype=torch.float32,
    )
    X_test_sweep = torch.tensor(
        np.stack([text_to_bow(x, sweep_tokens) for x in texts_test]),
        dtype=torch.float32,
    )

    # Fresh model, optimizer and scheduler for every vocabulary size.
    sweep_model = nn.Sequential()
    sweep_model.add_module('l1', nn.Linear(len(sweep_tokens), 2))
    sweep_opt = torch.optim.Adam(sweep_model.parameters(), lr=1e-3)
    sweep_scheduler = ReduceLROnPlateau(sweep_opt, patience=100)
    sweep_criterion = nn.CrossEntropyLoss()

    train_model(
        sweep_model, sweep_opt, sweep_criterion, sweep_scheduler,
        X_train_sweep, y_train_torch, X_test_sweep, y_test_torch,
        show_plots=False,
    )

    # Score with P(class 1) from a softmax over the logits; a single raw logit
    # is not a valid ranking score for ROC AUC.
    with torch.no_grad():
        for name, X, y in [
            ('train', X_train_sweep, y_train),
            ('test', X_test_sweep, y_test),
        ]:
            proba = torch.softmax(sweep_model(X), dim=1)[:, 1].cpu().numpy()
            results[name].append(roc_auc_score(y, proba))

plt.plot(results['k'], results['train'], color='orange', label='Train AUC by k')
plt.plot(results['k'], results['test'], color='black', label='Test AUC by k')
plt.xlabel('k (number of vocabulary tokens)')
plt.ylabel('ROC AUC')
plt.legend(fontsize='large')
plt.grid()

### Step 2: implement TF-IDF features

Not all words are equally useful. One can prioritize rare words and downscale words like "and"/"or" by using __tf-idf features__. This abbreviation stands for __term frequency / inverse document frequency__ and means exactly that:

$$ feature_i = { Count(word_i \in x) \times { log {N \over Count(word_i \in D) + \alpha} }}, $$


where x is a single text, D is your dataset (a collection of texts), N is a total number of documents and $\alpha$ is a smoothing hyperparameter (typically 1). 
And $Count(word_i \in D)$ is the number of documents where $word_i$ appears.

It may also be a good idea to normalize each data sample after computing tf-idf features.

__Your task:__ implement tf-idf features, train a model and evaluate ROC curve. Compare it with basic BagOfWords model from above.

__Please don't use sklearn/nltk builtin tf-idf vectorizers in your solution :)__ You can still use 'em for debugging though.

Blog post about implementing the TF-IDF features from scratch: https://triton.ml/blog/tf-idf-from-scratch

In [None]:
# Split every comment with `splitter` (copies of the text arrays are passed in).
data_train_tfidf = list(map(splitter, texts_train.copy()))
data_test_tfidf = list(map(splitter, texts_test.copy()))

# One TF dictionary per comment.
tfDict = [computeReviewTFDict(review) for review in data_train_tfidf]
tfDict_test = [computeReviewTFDict(review) for review in data_test_tfidf]

# Count and IDF statistics are derived from the training split only.
countDict = computeCountDict(tfDict)
idfDict = computeIDFDict(data_train_tfidf, countDict)

# Stores the TF-IDF dictionaries; the test split reuses the training IDF values.
tfidfDict = list(map(lambda review: computeReviewTFIDFDict(review, idfDict), tfDict))
tfidfDict_test = list(map(lambda review: computeReviewTFIDFDict(review, idfDict), tfDict_test))

# Alphabetically ordered list of the keys of countDict.
wordDict = sorted(countDict.keys())

In [None]:
# TF-IDF vectors: one vector per comment, built against the shared `wordDict`.
tfidfVector, tfidfVector_test = (
    [computeTFIDFVector(review, wordDict) for review in split]
    for split in (tfidfDict, tfidfDict_test)
)

In [None]:
# Creating model: a single linear layer over the TF-IDF features, two class scores.
n_tfidf_features = len(tfidfVector[0])
model = nn.Sequential()
model.add_module('l1', nn.Linear(n_tfidf_features, 2))

criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = ReduceLROnPlateau(opt, patience=10)

Same stuff about the model and optimizers here (or just omit it if you are using the same model as before).

In [None]:
# Creating torch tensors.
# Inputs are plain data, so they do not track gradients; float32 matches the
# dtype of the nn.Linear weights.
X_train_tfidf_torch = torch.tensor(tfidfVector, dtype=torch.float32)
X_test_tfidf_torch = torch.tensor(tfidfVector_test, dtype=torch.float32)
# nn.CrossEntropyLoss expects integer class indices, so the dtype is set
# explicitly, as in the other cells of this notebook.
y_train_torch = torch.tensor(y_train, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

# Training model
train_model(
    model, opt, criterion, lr_scheduler,
    X_train_tfidf_torch, y_train_torch, X_test_tfidf_torch, y_test_torch,
    n_iterations=300, show_plots=True,
)

# Plotting results.
# The model outputs raw logits; a softmax turns them into P(class 1), which is
# the score ROC/AUC needs (a single logit is not a valid ranking score).
with torch.no_grad():
    for name, X, y in [
        ('train', X_train_tfidf_torch, y_train),
        ('test ', X_test_tfidf_torch, y_test),
    ]:
        proba = torch.softmax(model(X), dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y, proba)
        plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))

plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()

Fit your model to the data. Do not hesitate to vary the number of iterations, the learning rate and so on.

_Note: due to very small dataset, increasing the complexity of the network might not be the best idea._

### Step 3: Comparing it with Naive Bayes

Naive Bayes classifier is a good choice for such small problems. Try to tune it for both BOW and TF-iDF features. Compare the results with Logistic Regression.

In [None]:
# BOW: rebuild the features over the full frequency-sorted vocabulary (no top-k cut).
full_ranking = sorted(bw_v.items(), key=lambda x: -x[1])
bow_vocabulary = dict(full_ranking)
tokens = list(bow_vocabulary.keys())
X_train_bow, X_test_bow = (
    np.stack([text_to_bow(comment, tokens) for comment in split])
    for split in (texts_train, texts_test)
)

# Gaussian Naive Bayes on the count features.
bow_nb_model = GaussianNB()
bow_nb_model = bow_nb_model.fit(X_train_bow, y_train)

for name, X, y, model in (
    ('train', X_train_bow, y_train, bow_nb_model),
    ('test ', X_test_bow, y_test, bow_nb_model),
):
    # Column 1 of predict_proba is used as the score for ROC/AUC.
    proba = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, proba)
    fpr, tpr = roc_curve(y, proba)[:2]
    plt.plot(fpr, tpr, label='%s AUC=%.4f' % (name, auc))

plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()

In [None]:
# TFIDF: Gaussian Naive Bayes on the TF-IDF vectors.
tfidf_nb_model = GaussianNB()
tfidf_nb_model = tfidf_nb_model.fit(tfidfVector, y_train)

for name, X, y, model in (
    ('train', tfidfVector, y_train, tfidf_nb_model),
    ('test ', tfidfVector_test, y_test, tfidf_nb_model),
):
    # Column 1 of predict_proba is used as the score for ROC/AUC.
    proba = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, proba)
    fpr, tpr = roc_curve(y, proba)[:2]
    plt.plot(fpr, tpr, label='%s AUC=%.4f' % (name, auc))

plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()

Share some thoughts on the results you acquired. Which model has shown the best performance? Did changing the learning rate/lr scheduler help?

_Your beautiful thoughts here_

### Step 4: Using the external knowledge.

Use the `gensim` word2vec pretrained model to translate words into vectors. Use several models with this new encoding technique. Compare the results, share your thoughts.

In [None]:
# Train word2vec on the split training comments and keep only the word vectors (`.wv`).
# NOTE(review): the `size` keyword matches older gensim releases; newer ones may
# expect `vector_size` instead - confirm against the installed version.
model_w2v = Word2Vec(
    data_train_tfidf,
    window=15,
    min_count=5,  # consider words that occurred at least 5 times
    size=256,     # embedding vector size
).wv

In [None]:
# One embedding vector per comment, computed with the locally trained word vectors.
X_train_w2v = np.array([get_phrase_embedding(x, model_w2v) for x in texts_train])
X_test_w2v = np.array([get_phrase_embedding(x, model_w2v) for x in texts_test])

# Inputs are plain data, so they do not track gradients; float32 matches the
# dtype of the nn.Linear weights.
X_train_w2v_torch = torch.tensor(X_train_w2v, dtype=torch.float32)
X_test_w2v_torch = torch.tensor(X_test_w2v, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

# Logistic regression over the embedding features.
model = nn.Sequential()
model.add_module('l1', nn.Linear(len(X_train_w2v[0]), 2))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = ReduceLROnPlateau(opt, patience=10)
criterion = nn.CrossEntropyLoss()
train_model(
    model, opt, criterion, lr_scheduler,
    X_train_w2v_torch, y_train_torch, X_test_w2v_torch, y_test_torch,
    batch_size=128, show_plots=True,
)

# The model outputs raw logits; a softmax turns them into P(class 1), which is
# the score ROC/AUC needs (a single logit is not a valid ranking score).
with torch.no_grad():
    for name, X, y in [
        ('train', X_train_w2v_torch, y_train),
        ('test', X_test_w2v_torch, y_test),
    ]:
        proba = torch.softmax(model(X), dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y, proba)
        plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))

# Decorate the axes once, after all curves are drawn: a bare plt.grid() call
# toggles the grid, so repeating it per curve would switch it back off.
plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()

In [None]:
# Download the gzipped GoogleNews word2vec archive from Google Drive.
# Shareable link: https://drive.google.com/file/d/1qhOAD0Xm7W1KwGubcfU6UVxFs4RSKIHp/view?usp=sharing
# NOTE(review): this cell downloads again on every run; consider skipping it
# when the archive is already on disk.
output = 'GoogleNews-vectors-negative300.bin.gz'
url = 'https://drive.google.com/uc?id=1qhOAD0Xm7W1KwGubcfU6UVxFs4RSKIHp'
gdown.download(url, output, quiet=False)

In [None]:
# Decompress the archive next to it, streaming so the whole file is never held in memory.
with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in, \
        open("GoogleNews-vectors-negative300.bin", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [6]:
# Load the decompressed vectors (binary word2vec format) as gensim KeyedVectors.
g_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [14]:
# Smoke test of the loaded vectors: list the nearest neighbours of "dog".
g_model.most_similar(positive=['dog'])

[('dogs', 0.8680489659309387),
 ('puppy', 0.8106428384780884),
 ('pit_bull', 0.780396044254303),
 ('pooch', 0.7627377510070801),
 ('cat', 0.7609456777572632),
 ('golden_retriever', 0.7500902414321899),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437614798545837),
 ('beagle', 0.7418621778488159),
 ('pup', 0.740691065788269)]

In [None]:
# One embedding vector per comment, computed with the pretrained GoogleNews vectors.
X_train_w2v_google = np.array([get_phrase_embedding(x, g_model) for x in texts_train])
X_test_w2v_google = np.array([get_phrase_embedding(x, g_model) for x in texts_test])

# Inputs are plain data, so they do not track gradients; float32 matches the
# dtype of the nn.Linear weights.
X_train_w2v_torch_google = torch.tensor(X_train_w2v_google, dtype=torch.float32)
X_test_w2v_torch_google = torch.tensor(X_test_w2v_google, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

# Logistic regression over the pretrained embedding features.
model = nn.Sequential()
model.add_module('l1', nn.Linear(len(X_train_w2v_google[0]), 2))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = ReduceLROnPlateau(opt, patience=100)
criterion = nn.CrossEntropyLoss()

train_model(
    model, opt, criterion, lr_scheduler,
    X_train_w2v_torch_google, y_train_torch, X_test_w2v_torch_google, y_test_torch,
    batch_size=2056, n_iterations=2000, show_plots=True,
)

# The model outputs raw logits; a softmax turns them into P(class 1), which is
# the score ROC/AUC needs (a single logit is not a valid ranking score).
with torch.no_grad():
    for name, X, y in [
        ('train', X_train_w2v_torch_google, y_train),
        ('test', X_test_w2v_torch_google, y_test),
    ]:
        proba = torch.softmax(model(X), dim=1)[:, 1].cpu().numpy()
        auc = roc_auc_score(y, proba)
        plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))

plt.plot([0, 1], [0, 1], '--', color='black')
plt.legend(fontsize='large')
plt.grid()