In [None]:
import pandas as pd
import numpy as np
import torch
!pip install fastai==0.7.0
!pip install torchtext==0.2.3

import fastai
from fastai.imports import *
from fastai.torch_imports import *
from fastai.core import *
from fastai.model import fit
from fastai.dataset import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *
from fastai.text import *

import dill as pickle
!pip install spacy
!python -m spacy download en
import spacy
import html

import pickle
import collections

In [1]:
class Prep:
    """Text-preparation helpers that turn raw tweets into encoded model input.

    Every helper is a stateless ``@staticmethod``; it can be called on the
    class (``Prep.cleaning(x)``) or on an instance.
    """

    def __init__(self, tweet, label, vocab):
        """Store the raw tweet, its label and the token-to-index vocabulary."""
        self.tweet = tweet
        self.label = label
        self.vocab = vocab

    @staticmethod
    def cleaning(x):
        """Return ``x`` with hashtags signs, URLs and repeated spaces removed.

        Steps: drop ``#``, turn ``&amp;`` into ``&``, unescape remaining HTML
        entities, strip ``http(s)://`` URLs, then collapse runs of two or more
        spaces into a single space.
        """
        multi_space = re.compile(r'  +')
        x = x.replace('#','').replace('&amp;', '&')
        without_urls = re.sub('https?://[A-Za-z0-9./]+', '', html.unescape(x))
        return multi_space.sub(' ', without_urls)

    @staticmethod
    def get_texts(df, n_lbls=1):
        """Tokenize the text columns of ``df`` and integer-encode its labels.

        Parameters
        ----------
        df : DataFrame with integer column names; the first ``n_lbls`` columns
            hold labels, the remaining columns hold text.
        n_lbls : number of leading label columns.

        Returns
        -------
        (tok, labels) : the tokenizer output and a list with one integer code
            per label value (the inverse index from ``np.unique``).
        """
        BOS = 'xbos'
        # np.unique(..., return_inverse=True) returns (unique, inverse); the
        # inverse is always at position 1, independent of n_lbls.
        _, inverse = np.unique(df.iloc[:, range(n_lbls)].values, return_inverse=True)
        # ravel: newer numpy versions return the inverse in the input's shape.
        labels = np.ravel(inverse)

        texts = f'\n{BOS} ' + df[n_lbls].astype(str)
        for i in range(n_lbls+1, len(df.columns)):
            texts += df[i].astype(str)
        texts = texts.apply(Prep.cleaning).values.astype(str)

        # Tokenizer / partition_by_cores are expected to come from fastai.text.
        tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
        return tok, list(labels)

    @staticmethod
    def get_all(df, n_lbls):
        """Run ``get_texts`` over every chunk in ``df`` and concatenate results.

        ``df`` is an iterable of DataFrames (e.g. a chunked ``read_csv``
        reader). The chunk index is printed as a progress indicator.
        """
        tok, labels = [], []
        for i, r in enumerate(df):
            print(i)
            tok_, labels_ = Prep.get_texts(r, n_lbls)
            tok += tok_
            labels += labels_
        return tok, labels

    @staticmethod
    def tokenize(tweet, vocab, label = '0', chunksize = 1,
                 folder_name = 'Test_lm', file_name = 'text'):
        """Tokenize and encode a single tweet.

        The tweet is written to ``folder_name/file_name.csv`` and read back in
        chunks so that it goes through the same ``get_all`` pipeline as bulk
        data. Tokens missing from ``vocab`` are replaced by ``'_unk_'``.

        Returns
        -------
        dict with ``"Tokens"`` (list of token strings of the first tweet) and
        ``"Encoded_Tokens"`` (their ``vocab`` indices).
        """
        import os

        text = np.array(pd.Series(tweet))
        labels = np.array(pd.Series(label))
        textdf = pd.DataFrame({'text': text, 'labels': labels},
                              columns=['labels', 'text'])

        # Make sure the target folder exists before writing the scratch CSV.
        if folder_name:
            os.makedirs(folder_name, exist_ok=True)
        csv_path = folder_name+'/'+file_name+'.csv'
        textdf.to_csv(csv_path, header=False, index=False)

        chunks = pd.read_csv(csv_path, header=None, chunksize=chunksize)
        token_lists, _ = Prep.get_all(chunks, 1)

        # Only the first tweet is returned, so only the first tweet is encoded.
        tokens = [t if t in vocab else '_unk_' for t in token_lists[0]]
        encoded = [vocab[t] for t in tokens]

        return {
            "Tokens": tokens,
            "Encoded_Tokens": encoded
        }

    @staticmethod
    def OneHot(sequences, dimension):
        """Return a ``(len(sequences), dimension)`` 0/1 matrix.

        Row ``i`` has ones at the index (or indices) given by ``sequences[i]``.
        """
        results = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            results[i, sequence] = 1
        return results

In [1]:
# activation functions
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def relu(x):
    """Rectified linear unit: element-wise maximum of ``0`` and ``x``."""
    return np.maximum(0, x)
    
def softmax(x, axis=None):
    """Numerically stable softmax.

    Parameters
    ----------
    x : array-like of scores.
    axis : axis along which to normalise. With the default ``None`` the
        softmax is taken over *all* elements of ``x`` (the whole array sums to
        1); pass e.g. ``axis=-1`` to normalise each row independently.

    Returns
    -------
    Array of the same shape as ``x`` holding the normalised exponentials.
    """
    if axis is None:
        # Subtracting the maximum keeps exp() from overflowing.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

In [3]:
class LSTM:
    """NumPy forward pass of a three-layer stacked LSTM using loaded torch weights.

    ``wgts`` is a mapping whose values support slicing and ``.numpy()`` (torch
    tensors in the notebook) under keys of the form
    ``'0.rnns.<stage>.module.weight_ih_l0'`` etc.
    """

    def __init__(self, input, wgts, nh):
        """Store the input sequence, the weight mapping and the hidden size.

        The per-layer hidden/cell state attributes start out empty and are
        filled by :meth:`stacked`.
        """
        self.input = input
        self.wgts = wgts
        self.nh = nh
        self.em_sz = []
        self.hidden_state_l0 = []
        self.hidden_state_l1 = []
        self.hidden_state_l2 = []
        self.cell_state_l0 = []
        self.cell_state_l1 = []
        self.cell_state_l2 = []

    @staticmethod
    def single(input, wgts, nh, stage = '0'):
        """Run one LSTM layer over ``input`` and return all hidden/cell states.

        Parameters
        ----------
        input : sequence of input vectors, one per time step.
        wgts : weight mapping (see class docstring).
        nh : number of hidden units of this layer.
        stage : layer index as a string, used to build the weight keys.

        Returns
        -------
        (hidden_state, cell_state) : two float arrays of shape
            ``(len(input), nh)``; row ``t`` is the state after step ``t``.
        """
        prefix = '0.rnns.' + stage + '.module.'

        # The four gate blocks (input, forget, cell, output) are stacked along
        # the first axis, nh rows each; only those 4*nh rows are used.
        w_ih = np.asarray(wgts[prefix + 'weight_ih_l0'][:4*nh].numpy())
        b_ih = np.asarray(wgts[prefix + 'bias_ih_l0'][:4*nh].numpy())
        w_hh = np.asarray(wgts[prefix + 'weight_hh_l0_raw'][:4*nh].numpy())
        b_hh = np.asarray(wgts[prefix + 'bias_hh_l0'][:4*nh].numpy())
        bias = b_ih + b_hh

        steps = len(input)
        # Pre-allocate instead of growing with np.append on every step.
        hidden_state = np.zeros((steps, nh))
        cell_state = np.zeros((steps, nh))
        hs, cs = np.zeros(nh), np.zeros(nh)

        ## LSTM process:
        # per time step compute input gate, forget gate, cell gate and output
        # gate, then update and store the cell state and the hidden state.
        for t in range(steps):
            x_t = np.asarray(input[t]).ravel()
            # Both terms are weight-times-vector: W_ih @ x_t and W_hh @ h_{t-1}.
            # (h @ W_hh would silently apply the transposed recurrent weights,
            # since W_hh blocks are square.)
            gates = np.matmul(w_ih, x_t) + np.matmul(w_hh, hs) + bias
            ig = sigmoid(gates[:nh])
            fg = sigmoid(gates[nh:2*nh])
            cg = np.tanh(gates[2*nh:3*nh])
            og = sigmoid(gates[3*nh:4*nh])
            cs = fg * cs + ig * cg
            hs = og * np.tanh(cs)
            hidden_state[t] = hs
            cell_state[t] = cs

        return hidden_state, cell_state

    def stacked(self):
        """Run the three stacked layers and store every layer's states on ``self``.

        Layers '0' and '1' use ``self.nh`` hidden units; layer '2' uses the
        input embedding size (length of the first input vector) as its hidden
        size. Returns ``self``.
        """
        self.em_sz = len(self.input[0])

        # First LSTM layer
        self.hidden_state_l0, self.cell_state_l0 = LSTM.single(
            input = self.input, wgts = self.wgts, nh = self.nh, stage = '0')

        # Second LSTM layer, fed with the first layer's hidden states
        self.hidden_state_l1, self.cell_state_l1 = LSTM.single(
            input = self.hidden_state_l0, wgts = self.wgts, nh = self.nh, stage = '1')

        # Third LSTM layer, hidden size = embedding size
        self.hidden_state_l2, self.cell_state_l2 = LSTM.single(
            input = self.hidden_state_l1, wgts = self.wgts, nh = self.em_sz, stage = '2')

        return self

In [4]:
class Prediction:
    """Next-word prediction helpers for the language model."""

    @staticmethod
    def pred_sentence(input, pos):
        """Join ``input['Tokens'][:pos]`` with spaces and append token ``pos``.

        With ``pos=-1`` this reproduces the full token sequence as one string.
        """
        tokens = input['Tokens']
        return ' '.join(tokens[:pos]) + ' ' + tokens[pos]

    @staticmethod
    def predict(input, model, vocab, tweet, pred_length = 10):
        """Return the most probable next words after the last time step.

        Parameters
        ----------
        input : 2-D array of hidden states, one row per time step.
        model : mapping holding the decoder matrix under ``'1.decoder.weight'``.
        vocab : token -> index mapping; its iteration order is assumed to match
            the row order of the decoder matrix.
        tweet : dict with a ``'Tokens'`` list (as returned by ``Prep.tokenize``).
        pred_length : number of candidate words to return.

        Returns
        -------
        ``[[sentence], [output]]`` where ``sentence`` is the joined input
        tokens and ``output`` is a DataFrame with columns ``word`` and
        ``probability`` for the ``pred_length`` best candidates.
        """
        logits = np.asarray(np.matmul(input, np.transpose(model['1.decoder.weight'])))
        # Normalise only the last time step: a softmax over the whole
        # (time x vocabulary) matrix would make this row sum to less than 1.
        pred_vec = softmax(logits[-1])
        pred_tok = pred_vec.argsort()[::-1][:pred_length]

        stoi_df = pd.DataFrame(list(vocab.keys()), list(vocab.values()), columns = ['word'])
        stoi_df['probability'] = list(pred_vec)
        output = stoi_df.iloc[pred_tok]

        sentence = Prediction.pred_sentence(tweet, -1)
        return [[sentence], [output]]

In [5]:
class Classifier:
    """Step-by-step NumPy demonstration of the tweet sentiment classifier."""

    @staticmethod
    def prep(tweet, wgts, vocab, nh = 1150):
        """Tokenize ``tweet`` and return the last LSTM layer's hidden states.

        The tweet is encoded with ``vocab``, one-hot encoded, multiplied with
        the encoder matrix ``wgts['0.encoder.weight']`` to get embeddings and
        passed through the three stacked LSTM layers.
        """
        testtweet = Prep.tokenize(tweet, vocab)
        onehot    = Prep.OneHot(testtweet['Encoded_Tokens'], dimension = len(vocab))
        embedding = np.asarray(np.matmul(onehot, wgts['0.encoder.weight']))
        st_lstm   = LSTM(embedding, wgts, nh).stacked()
        return st_lstm.hidden_state_l2

    @staticmethod
    def concat_pooling(input):
        """Concatenate the last row, the column-wise max and the column-wise mean.

        ``input`` is a 2-D array with one row per time step; the result is a
        1-D vector three times as long as one row.
        """
        maxpool  = np.max(input, axis = 0)
        meanpool = np.mean(input, axis = 0)
        return np.concatenate((input[-1], maxpool, meanpool), axis = 0)

    @staticmethod
    def relu_layer(input, wgts):
        """Apply the linear + ReLU layer ``'1.layers.0.lin'`` to ``input``.

        Returns a ``np.matrix`` row (shape ``(1, n_out)`` for a 1-D ``input``).
        """
        # pulling model parameters for ReLU layer
        relu_wgts = np.matrix(wgts['1.layers.0.lin.weight'].numpy())
        relu_bias = np.matrix(wgts['1.layers.0.lin.bias'].numpy())
        # relu calculation
        return relu(np.matmul(relu_wgts, input) + relu_bias)

    @staticmethod
    def clas_predict(input, wgts):
        """Apply the final linear + softmax layer and name the three classes.

        ``input`` may be a 1-D feature vector or a single-row 2-D array/matrix
        (as returned by :meth:`relu_layer`); it is flattened before use so both
        give the same result.

        Returns a dict with keys ``"negative"``, ``"neutral"``, ``"positive"``.
        """
        # pulling model parameters for softmax layer
        softmax_wgts = np.asarray(wgts['1.layers.1.lin.weight'].numpy())
        softmax_bias = np.asarray(wgts['1.layers.1.lin.bias'].numpy())
        # Flatten so a plain vector does not broadcast into a 3x3 result.
        features = np.asarray(input).ravel()
        # softmax calculation
        pred = softmax(np.matmul(softmax_wgts, features) + softmax_bias)

        return {
            "negative": pred[0],
            "neutral": pred[1],
            "positive": pred[2]
        }

In [1]:
# function to show the effect of the freezing
def show_freezing(wgts):
    """Collect the first ten entries of the first row of selected weight matrices.

    One column each for the embedding matrix, the input weights of the three
    LSTM layers and the decoder matrix, so that two models can be compared
    side by side to see which layers changed.
    """
    column_keys = (
        ('embedding', '0.encoder.weight'),
        ('lstm_0',    '0.rnns.0.module.weight_ih_l0'),
        ('lstm_1',    '0.rnns.1.module.weight_ih_l0'),
        ('lstm_2',    '0.rnns.2.module.weight_ih_l0'),
        ('decoder',   '1.decoder.weight'),
    )

    output = pd.DataFrame()
    for column, key in column_keys:
        first_row = wgts[key][0]
        output[column] = first_row[0:10]

    return output

In [1]:
def data_loader(path, trn_clas, val_clas, trn_labels, val_labels, bs, pad_idx):
    """Bundle encoded training/validation tweets into a fastai ``ModelData``.

    Parameters
    ----------
    path : passed through to ``ModelData``.
    trn_clas, val_clas : indexable collections of encoded tweets.
    trn_labels, val_labels : labels matching ``trn_clas`` / ``val_clas``.
    bs : batch size for both loaders and for the training sampler.
    pad_idx : token index used for padding; forwarded to both data loaders.

    Returns
    -------
    The ``ModelData`` object wrapping the training and validation loaders.
    """
    # fastai class TextDataset stores the encoded tweets and the labels
    # together in one object
    trn_ds = TextDataset(trn_clas, trn_labels)
    val_ds = TextDataset(val_clas, val_labels)

    # fastai samplers, both keyed on the length of each encoded tweet
    trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]),
                              bs=bs)
    val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))

    # fastai data loaders; use the caller's pad_idx rather than a fixed value
    trn_dl = DataLoader(trn_ds, bs, transpose=True, num_workers=1,
                        pad_idx=pad_idx, sampler=trn_samp)
    val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1,
                        pad_idx=pad_idx, sampler=val_samp)

    # fastai ModelData stores all model inputs provided by the data loaders
    md = ModelData(path, trn_dl, val_dl)
    return md