In [8]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from IPython import display
from collections import Counter
import re
import operator
import nltk
import heapq
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from collections import defaultdict
from math import log
import gensim.downloader as api

In [6]:
def text_to_bow(text):
    """ Convert a text string to an array of token counts. Uses bow_vocabulary.

    The text is tokenized with the notebook-level `tokenizer`; the result has
    one entry per token of `bow_vocabulary`, in vocabulary order, holding how
    many times that token occurs in the text (0 when it does not occur).

    :param text: input string to vectorize
    :return: 1-D numpy array with len(bow_vocabulary) counts
    """
    # Count every token once up front instead of rescanning the token list
    # for each vocabulary entry; Counter yields 0 for absent tokens.
    token_counts = Counter(tokenizer.tokenize(text))
    return np.asarray([token_counts[token] for token in bow_vocabulary])

In [None]:
def create_model(input_size, lr=0.1):
  """Build a single-layer linear classifier and its SGD optimizer.

  :param input_size: number of input features of the linear layer
  :param lr: learning rate handed to torch.optim.SGD
  :return: (model, optimizer) tuple; the model is an nn.Sequential whose only
           child, named 'l1', maps input_size features to 2 outputs
  """
  net = nn.Sequential()
  linear_layer = nn.Linear(in_features=input_size, out_features=2)
  net.add_module('l1', linear_layer)

  optimizer = torch.optim.SGD(net.parameters(), lr=lr)
  return net, optimizer

In [None]:
def train_model(
    model,
    opt,
    lr_scheduler,
    X_train_torch,
    y_train_torch,
    X_val_torch,
    y_val_torch,
    n_iterations=500,
    batch_size=32,
    warm_start=False,
    show_plots=True,
    eval_every=10
):
    """Train `model` on random mini-batches and periodically evaluate it.

    Relies on two names defined elsewhere in the notebook: `loss_function`
    (called as loss_function(predictions, targets)) and `plot_train_process`
    (called with the four history lists when `show_plots` is set).

    :param model: torch module mapping a feature batch to per-class scores
    :param opt: optimizer updating the parameters of `model`
    :param lr_scheduler: scheduler whose step() receives the latest averaged
        training loss at every evaluation point
    :param X_train_torch: training features tensor
    :param y_train_torch: training labels tensor
    :param X_val_torch: validation features tensor
    :param y_val_torch: validation labels tensor
    :param n_iterations: number of mini-batch updates to perform
    :param batch_size: number of rows sampled (with replacement) per update
    :param warm_start: when False, reset_parameters() is called on every
        direct child of `model` before training
    :param show_plots: when True, redraw the training curves at every
        evaluation point
    :param eval_every: evaluate on the validation set every this many iterations
    :return: the trained model (same object as `model`)
    """
    if not warm_start:
        for name, module in model.named_children():
            print('resetting ', name)
            try:
                module.reset_parameters()
            except AttributeError as e:
                print('Cannot reset {} module parameters: {}'.format(name, e))

    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    # Per-iteration metrics accumulated between two evaluation points.
    local_train_loss_history = []
    local_train_acc_history = []
    for i in range(n_iterations):

        # Sample `batch_size` random training rows (with replacement).
        ix = np.random.randint(0, len(X_train_torch), batch_size)
        x_batch = X_train_torch[ix]
        y_batch = y_train_torch[ix]

        # Forward pass: per-class scores for the batch.
        y_predicted = model(x_batch)
        loss = loss_function(y_predicted, y_batch)

        # Clear gradients *before* backward so nothing left over from earlier
        # code (or a previous call) leaks into this update.
        opt.zero_grad()
        loss.backward()
        opt.step()

        # .item() works on any device and detaches from the autograd graph.
        local_train_loss_history.append(loss.item())
        local_train_acc_history.append(
            accuracy_score(
                y_batch.to('cpu').detach().numpy(),
                y_predicted.to('cpu').detach().numpy().argmax(axis=1)
            )
        )

        if i % eval_every == 0:
            train_loss_history.append(np.mean(local_train_loss_history))
            train_acc_history.append(np.mean(local_train_acc_history))
            local_train_loss_history, local_train_acc_history = [], []

            # Validation needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                predictions_val = model(X_val_torch)
                val_loss = loss_function(predictions_val, y_val_torch).item()
            val_loss_history.append(val_loss)

            acc_score_val = accuracy_score(
                y_val_torch.cpu().numpy(),
                predictions_val.cpu().numpy().argmax(axis=1)
            )
            val_acc_history.append(acc_score_val)

            # NOTE(review): the scheduler is driven by the averaged *training*
            # loss, not the validation loss — confirm this is intended.
            lr_scheduler.step(train_loss_history[-1])

            if show_plots:
                display.clear_output(wait=True)
                plot_train_process(train_loss_history, val_loss_history, train_acc_history, val_acc_history)
    return model

In [None]:
def full_cycle(my_k, x_train, x_test):
  """Train a fresh model on the first `my_k` feature columns and score it.

  Uses the notebook-level `y_train_torch` / `y_test_torch` tensors for
  training and the notebook-level `y_train` / `y_test` labels for scoring.

  :param my_k: number of leading feature columns to keep
  :param x_train: 2-D training feature array (sliced as x_train[:, :my_k])
  :param x_test: 2-D test feature array (sliced as x_test[:, :my_k])
  :return: (train ROC-AUC, test ROC-AUC) tuple
  """
  my_X_train_torch = torch.tensor(x_train[:,:my_k], dtype=torch.float32)
  my_X_test_torch = torch.tensor(x_test[:,:my_k], dtype=torch.float32)

  # Size the input layer from the actual slice: if my_k exceeds the number of
  # available columns the slice is narrower than my_k.
  my_model, my_opt = create_model(my_X_train_torch.shape[1])
  my_lr_scheduler = ReduceLROnPlateau(my_opt, patience=5)

  train_model(my_model, my_opt, my_lr_scheduler, my_X_train_torch, y_train_torch, my_X_test_torch, y_test_torch, show_plots=False)

  # Rank by P(class 1), not by the raw class-1 logit: with two outputs the
  # probability depends on the difference of both logits, so the raw logit
  # can order samples differently and distort the AUC.
  with torch.no_grad():
    train_scores = torch.softmax(my_model(my_X_train_torch), dim=1)[:, 1].cpu().numpy()
    test_scores = torch.softmax(my_model(my_X_test_torch), dim=1)[:, 1].cpu().numpy()

  my_auc_train = roc_auc_score(y_train, train_scores)
  my_auc_test = roc_auc_score(y_test, test_scores)
  return (my_auc_train,my_auc_test)

In [None]:
def plot_dynamics(results, ks):
  """Plot train and test ROC-AUC curves on one figure.

  :param results: sequence of pairs; element 0 of each pair is drawn on the
                  'Train roc-auc' curve, element 1 on the 'Test roc-auc' curve
  :param ks: labels for the x-axis ticks, one per position
  """
  plt.figure(figsize=[15,10])
  plt.ylim((0,1))

  train_curve = [pair[0] for pair in results]
  plt.plot(train_curve, label='Train roc-auc')

  test_curve = [pair[1] for pair in results]
  plt.plot(test_curve, label='Test roc-auc')

  tick_positions = range(len(ks))
  plt.xticks(tick_positions, ks)
  plt.legend()

In [None]:
# Calculate TF
def twit_tf(twit):
  """Compute the term frequency of every whitespace-separated word of a tweet.

  :param twit: tweet text as a string
  :return: dict mapping each distinct word to its share of the tweet's words
           (an empty dict when the tweet contains no words)
  """
  tokens = twit.split()
  n_tokens = len(tokens)
  tf_by_word = {}
  for token in tokens:
    # Every occurrence contributes an equal 1/n_tokens share.
    if token in tf_by_word:
      tf_by_word[token] += 1/n_tokens
    else:
      tf_by_word[token] = 0 + 1/n_tokens
  return tf_by_word

In [None]:
# Calculate TF-IDF
def get_tf_idf(twit_tfs):
  """Turn a tweet's term frequencies into TF-IDF weights.

  Each TF value is multiplied by the word's entry in the notebook-level
  `words_idfs` mapping; words missing from that mapping get an IDF of 0.

  :param twit_tfs: dict mapping word -> term frequency
  :return: dict mapping the same words -> TF * IDF
  """
  return {
    term: tf_value*words_idfs.get(term, 0)
    for term, tf_value in twit_tfs.items()
  }

In [None]:
# Make vectorization
def vectorise(twit_tf_idf):
  """Lay a tweet's TF-IDF weights out as a dense vector over `words`.

  Position i of the result holds the weight of words[i] when that word is a
  key of `twit_tf_idf`, and 0 otherwise. `words` is the notebook-level
  vocabulary sequence.

  :param twit_tf_idf: dict mapping word -> TF-IDF weight
  :return: 1-D numpy array of length len(words)
  """
  result = np.zeros(len(words))
  # One pass over the vocabulary with a dict lookup per entry, instead of
  # rescanning the whole vocabulary for every word of the tweet. Walking the
  # vocabulary (rather than the tweet) still fills every position of a word
  # that appears in `words` more than once.
  for i, vocab_word in enumerate(words):
    if vocab_word in twit_tf_idf:
      result[i] = twit_tf_idf[vocab_word]
  return result

In [None]:
# Choose LR
def full_cycle_lr(my_lr, x_train, x_test, n_features=1000):
  """Train a fresh model with learning rate `my_lr` and score it.

  Uses the notebook-level `y_train_torch` / `y_test_torch` tensors for
  training and the notebook-level `y_train` / `y_test` labels for scoring.

  :param my_lr: learning rate for the optimizer created by create_model
  :param x_train: 2-D training feature array
  :param x_test: 2-D test feature array
  :param n_features: number of leading feature columns to keep (default 1000)
  :return: (train ROC-AUC, test ROC-AUC) tuple
  """
  my_X_train_torch = torch.tensor(x_train[:,:n_features], dtype=torch.float32)
  my_X_test_torch = torch.tensor(x_test[:,:n_features], dtype=torch.float32)

  # Pass the learning rate through: without it every call trains with
  # create_model's default lr and the comparison across values is meaningless.
  # The input layer is sized from the actual slice width.
  my_model, my_opt  = create_model(my_X_train_torch.shape[1], lr=my_lr)
  my_lr_scheduler = ReduceLROnPlateau(my_opt, patience=5)

  train_model(my_model, my_opt, my_lr_scheduler, my_X_train_torch, y_train_torch, my_X_test_torch, y_test_torch, show_plots=False)

  # Rank by P(class 1), not by the raw class-1 logit: with two outputs the
  # probability depends on the difference of both logits.
  with torch.no_grad():
    train_scores = torch.softmax(my_model(my_X_train_torch), dim=1)[:, 1].cpu().numpy()
    test_scores = torch.softmax(my_model(my_X_test_torch), dim=1)[:, 1].cpu().numpy()

  my_auc_train = roc_auc_score(y_train, train_scores)
  my_auc_test = roc_auc_score(y_test, test_scores)
  return (my_auc_train,my_auc_test)

In [None]:
def train_nb_clf(X, y):
  """Estimate the tables used by the Naive Bayes classifier.

  :param X: 2-D array of feature values, indexed as X[i, :]
  :param y: labels, indexed as y[i]; one label per row of X
  :return: (classes, freq) where classes maps label -> share of samples with
           that label, and freq maps (label, feature value) -> number of
           occurrences of that value among the label's rows divided by the
           number of rows with that label
  """
  n_samples = len(y)
  class_stats = defaultdict(int)
  pair_stats = defaultdict(int)

  # Pass 1: raw counts of labels and of (label, feature value) pairs.
  for row_idx in range(n_samples):
    current_label = y[row_idx]
    class_stats[current_label] += 1
    for value in X[row_idx, :]:
      pair_stats[current_label, value] += 1

  # Pass 2: normalize pair counts by the (still raw) class counts first,
  # then turn the class counts into shares.
  for pair_key in pair_stats:
    pair_stats[pair_key] /= class_stats[pair_key[0]]
  for class_key in class_stats:
    class_stats[class_key] /= n_samples

  return class_stats, pair_stats

In [None]:
def classify(classifier, feats):
    """Pick the label with the lowest negative log-score for `feats`.

    :param classifier: (classes, prob) pair; classes maps label -> prior,
        prob maps (label, feature) -> likelihood
    :param feats: iterable of feature values of the sample to classify
    :return: the label minimizing -log(prior) plus the sum of -log(likelihood)
        over the features; pairs missing from prob use 10**(-7)
    """
    classes, prob = classifier

    def negative_log_score(cl):
        # Cost of assigning label `cl`; lower is better.
        prior_cost = -log(classes[cl])
        feature_cost = sum(
            -log(prob.get((cl, feat), 10**(-7))) for feat in feats
        )
        return prior_cost + feature_cost

    return min(classes.keys(), key=negative_log_score)

In [None]:
# Load the pretrained 'glove-twitter-25' word vectors through gensim's
# downloader API; twit_to_vec below looks words up in this object.
# NOTE(review): presumably fetched over the network on first use and cached
# locally afterwards — confirm before running offline.
pretrained_w2v = api.load('glove-twitter-25')

In [None]:
# Word to vec
def twit_to_vec(twit):
  """Embed a tweet as the mean of its words' pretrained vectors.

  Words without a vector in `pretrained_w2v` are skipped.

  :param twit: tweet as an iterable of word tokens, or as a raw string
               (split on whitespace)
  :return: 1-D numpy array: the element-wise mean of the found word vectors,
           or a zero vector of the embedding size when no word was found
  """
  # A full Word2Vec-style model keeps its vectors under `.wv`, while the
  # KeyedVectors object returned by api.load exposes get_vector directly
  # (and has no `.wv` in gensim 4). Support both instead of letting an
  # AttributeError be swallowed and every tweet silently become zeros.
  vectors = getattr(pretrained_w2v, 'wv', pretrained_w2v)

  # Iterating a raw string would look up single characters, not words.
  tokens = twit.split() if isinstance(twit, str) else twit

  twit_vec = []
  for word in tokens:
    try:
      twit_vec.append(vectors.get_vector(word))
    except KeyError:
      # Out-of-vocabulary word: deliberately ignored.
      continue

  if len(twit_vec) == 0:
    return np.zeros(vectors.vector_size)
  return np.mean(twit_vec, axis=0)