In [1]:
#from utils import *

import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import re
from os import mkdir
from os.path import join, isfile, isdir, exists
import bcolz
import pickle 
import emoji
import torch.nn as nn
import torch.nn.functional as F


In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload

## Data Loader
This class loads the tweet datasets, then cleans and tokenizes them. It is also intended to serve the data for training and testing.
TODO : Loading for training and testing

In [3]:
class DataLoader(object):
    """Load, clean and tokenize the tweet datasets and build the vocabulary.

    Attributes:
        path: dict mapping a split name ('train', 'testA', ...) to its TSV path.
        data: dict mapping a split name to the DataFrame read from that path,
            with an extra 'token' column holding the token list of each tweet.
        all_words_freq: dict mapping each token to its number of occurrences
            over every loaded split.
        all_words: list of the distinct tokens (keys of ``all_words_freq``).
    """

    # Share of upper-case tokens above which a tweet gets the '<allcaps>' tag.
    ALLCAPS_RATIO = 0.6

    # Emojis translated to words before every remaining emoji is dropped.
    EMOJI_TRANSLATIONS = {'♥️': ' love ',
                          '❤️' : ' love ',
                          '❤' : ' love ',
                          '😘' : ' kisses ',
                          '😭' : ' cry ',
                          '💪' : ' strong ',
                          '🌍' : ' earth ',
                          '💰' : ' money ',
                          '👍' : ' ok ',
                          '👌' : ' ok ',
                          '😡' : ' angry ',
                          '🍆' : ' dick ',
                          '🤣' : ' haha ',
                          '😂' : ' haha ',
                          '🖕' : ' fuck you '}

    def __init__(self):
        # All the Text Data path
        self.path = {}
        self.path['train'] = join('..','data','start-kit','training-v1','offenseval-training-v1.tsv')
        #self.path['trial'] = join('..','data','start-kit','trial-data','offenseval-trial.txt')
        self.path['testA'] = join('..','data','Test A Release','testset-taska.tsv')
        self.path['testB'] = join('..','data','Test B Release','testset-taskb.tsv')
        self.path['testC'] = join('..','data','Test C Release','test_set_taskc.tsv')

        # Load, Clean and Tokenize the Datasets
        print('---- Load, Clean and Tokensize Dataset : ',end='')
        self.data = {}
        self.inital_dataload()
        print('Done')

        # Compute List of All words in the datasets
        print('---- List all tokenized words : ',end='')
        self.all_words_freq = {}
        self.all_words = []
        self.compute_wordlist()
        print('Done')

    def inital_dataload(self):
        """Read every file of ``self.path`` and add a 'token' column to it."""
        for split, file_path in self.path.items():
            frame = pd.read_table(file_path, index_col='id')
            frame['token'] = frame['tweet'].apply(self.clean_tweet)
            self.data[split] = frame

    def compute_wordlist(self):
        """Count every token of every loaded split.

        The counters are rebuilt from scratch on each call, so calling the
        method again does not double the frequencies.
        """
        self.all_words_freq = {}
        for frame in self.data.values():
            # Iterate the column directly: a positional .iloc lookup per row
            # builds a new Series each time and is much slower.
            for tokens in frame['token']:
                for token in tokens:
                    self.all_words_freq[token] = 1 + self.all_words_freq.get(token, 0)
        self.all_words = list(self.all_words_freq)

    def clean_tweet(self,text):
        """Normalize one raw tweet and return its list of lower-case tokens.

        Args:
            text: raw tweet string.

        Returns:
            list of str. '<allcaps>' is appended when more than
            ``ALLCAPS_RATIO`` of the tokens are upper case.
        """
        # =========== TEXT ===========
        # Replace @USER by <user>
        text = re.sub(r'@USER', r'<user>', text)

        # Remove Hashtags (#)
        text = re.sub(r'#', r'', text)

        # Replace URL by <url>
        text = re.sub(r'URL', r'<url>', text)

        # Remove numbers
        text = re.sub(r'[0-9]+', r' ', text)

        # Remove some special characters
        text = re.sub(r'([_\{\}\[\]¬•$,:;/@#|\^*%().”"“-])', r' ', text)

        # Surround the remaining special characters with white spaces
        text = re.sub(r'([$&+,:;=?@#|\'.^*()%!"’“-])', r' \1 ', text)

        # Spell out some special characters
        text = text.replace('&', 'and').replace('+', 'plus')

        # Handle Emoji : translate some and delete the others
        text = self.handle_emoji(text)

        # Split CamelCase words; two passes because matches cannot overlap,
        # so one pass only cuts every other boundary of a long chain.
        camel_case = re.compile(r'([a-z]+|[A-Z]+|[A-Z][a-z]+)([A-Z][a-z]+)')
        text = camel_case.sub(r'\1 \2', text)
        text = camel_case.sub(r'\1 \2', text)

        # =========== TOKENS ===========
        # Tokenize on single spaces and drop the empty / blank tokens
        tokens = [tok for tok in text.split(' ') if tok not in ('', ' ')]

        # Collapse consecutive repetitions (!!! => !). The first token is
        # always kept: comparing it with tokens[-1] would wrap around to the
        # last token of the tweet.
        tokens = [tok for i, tok in enumerate(tokens)
                  if i == 0 or tok != tokens[i - 1]]

        # Tag mostly upper-case tweets (guarded so an empty tweet does not
        # take the mean of an empty list).
        if tokens and np.mean([tok.isupper() for tok in tokens]) > self.ALLCAPS_RATIO:
            tokens.append('<allcaps>')

        # Lower Case
        return [tok.lower() for tok in tokens]

    def handle_emoji(self,text):
        """Translate the emojis of ``EMOJI_TRANSLATIONS`` and strip the others.

        Args:
            text: tweet string.

        Returns:
            The string with translated emojis replaced by words and every
            other emoji / symbol in the removed ranges deleted.
        """
        for symbol, word in self.EMOJI_TRANSLATIONS.items():
            text = text.replace(symbol, word)

        # Remove ALL remaining emojis. get_emoji_regexp() no longer exists in
        # emoji >= 2.0, where replace_emoji() is the replacement API.
        if hasattr(emoji, 'get_emoji_regexp'):
            text = emoji.get_emoji_regexp().sub(r' ', text)
        else:
            text = emoji.replace_emoji(text, replace=' ')

        # Skin-tone modifiers, then everything outside the Basic Multilingual Plane
        text = re.sub("([\U0001f3fb-\U0001f3ff])", r'', text)
        text = re.sub("([\U00010000-\U0010ffff])", r'', text)
        # (c), (r), the U+2000-U+3300 symbol range and stray surrogate pairs
        text = re.sub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", r'', text)
        return text

In [4]:
# Build the loader: reads every dataset file, tokenizes the tweets and counts the vocabulary
mydata = DataLoader()

---- Load, Clean and Tokensize Dataset : 

NameError: name 'handle_emoji' is not defined

In [None]:
# Vocabulary as an array, then the 50 tokens with the highest counts (most frequent first)
words = np.array(list(mydata.all_words_freq))
list(words[np.argsort(list(mydata.all_words_freq.values()))[::-1][:50]])

In [None]:
# Preview the first rows of the training split, including the generated 'token' column
mydata.data['train'].head(10)


## Word Embeddings : GloVe
This class loads the GloVe embedding, processes it, and creates a word embedding for the vocabulary of the given DataLoader.

In [None]:
class GloVe_embedding(object):
    """Embedding of the DataLoader vocabulary initialised from GloVe vectors.

    The original GloVe text file is converted once into a bcolz array plus two
    pickle files (word list and word -> row index); later runs load that
    cache. Vocabulary words without a GloVe vector get a random normal vector.

    Args:
        dataLoader: object exposing ``all_words`` (list of tokens).
        dim_vect: dimension of the GloVe vectors; selects the file name.
    """

    def __init__(self, dataLoader,dim_vect = 25 ):
        ########## VARIABLES ##########

        # Defining variables for GloVe:
        self.words = []
        self.word2idx = {}
        self.glove_dict = {}

        # Defining variables for our Embedding:
        self.dim_vect = dim_vect
        self.size_vocab = len(dataLoader.all_words)
        self.emb_vocab = dataLoader.all_words
        self.emb_matrix = torch.zeros((self.size_vocab,self.dim_vect))

        ########## LOADING GLOVE DATA ##########

        # Defining path for GloVe Data :
        self.path = join('..','data','glove') # Path of glove
        self.path_glove = join(self.path,'glove.twitter.27B.'+str(dim_vect))
        if not(isdir(self.path_glove)):
            mkdir(self.path_glove)
        self.path_vec_original = join(self.path,'glove.twitter.27B.'+str(dim_vect)+'d.txt') # Path of glove original vectors
        self.path_vec_save = join(self.path_glove,'glove.twitter.27B.'+str(dim_vect)+'d.vectors.dat')  # Path of glove saved vectors
        self.path_words = join(self.path_glove,'glove.twitter.27B.'+str(dim_vect)+'d.words.pkl')
        self.path_word2idx = join(self.path_glove,'glove.twitter.27B.'+str(dim_vect)+'d.word2idx.pkl')

        if not(isdir(self.path_vec_save) and isfile(self.path_words) and isfile(self.path_word2idx)) :
            # The processed cache is missing or incomplete: build it first
            print('---- Processing the GloVe files : ',end='')
            self.process_GloVe()
            print('Done')

        # Load the wordvec files
        print('---- Loading the processed GloVe files : ',end='')
        self.load_GloVe()
        print('Done')

        # Adding the unknown words to the embedding
        print('---- Processing the Missing Vocabulary  : ',end='')
        nb_added = self.add_wordsData()
        print('Done')
        # Guard the percentage against an empty vocabulary
        missing_pct = 100*nb_added/self.size_vocab if self.size_vocab else 0.0
        print('---- Words added : {0:} - Words in total : {1:}  -  Unreferenced Vocabulary : {2:.2f}%'.format(nb_added,self.size_vocab,missing_pct))

        ########## TORCH EMBEDDING ##########
        # Creating the Pytorch Embedding Layer :
        print('---- Creating the Pytorch Embedding Layer  : ',end='')
        self.emb_layer = nn.Embedding(self.size_vocab, self.dim_vect)
        self.create_emb_layer(non_trainable=True)
        print('Done')

    def process_GloVe(self):
        ''' Processes the GloVe Dataset - Saves files'''
        words = []
        word2idx = {}

        # The flat on-disk array is seeded with one dummy 0.0 that is
        # dropped again before the final reshape below.
        vectors = bcolz.carray(np.zeros(1), rootdir=self.path_vec_save, mode='w')

        with open(self.path_vec_original, 'rb') as f:
            for raw_line in f:
                line = raw_line.decode().split()
                # Skip malformed rows (e.g. a token that split() treats as
                # whitespace) so the flat buffer stays a multiple of dim_vect
                # and the row indices stay aligned with the vectors.
                if len(line) != self.dim_vect + 1:
                    continue
                word = line[0]
                word2idx[word] = len(words)
                words.append(word)
                # np.float was removed from NumPy; float64 is the same dtype
                vectors.append(np.array(line[1:]).astype(np.float64))

        # [1:] removes the dummy seed element; with it the element count is
        # not divisible by dim_vect and the reshape raises.
        vectors = bcolz.carray(vectors[1:].reshape((-1, self.dim_vect)), rootdir=self.path_vec_save, mode='w')
        vectors.flush()

        with open(self.path_words, 'wb') as f:
            pickle.dump(words, f)
        with open(self.path_word2idx, 'wb') as f:
            pickle.dump(word2idx, f)

    def load_GloVe(self):
        ''' Loads previously processed dataset'''
        vectors = bcolz.open(self.path_vec_save)[:]

        # NOTE: pickle.load executes arbitrary code from the file; these files
        # are the ones written by process_GloVe, do not point the paths at
        # untrusted data.
        with open(self.path_words, 'rb') as f:
            self.words = pickle.load(f)
        with open(self.path_word2idx, 'rb') as f:
            self.word2idx = pickle.load(f)

        self.glove_dict = {w: vectors[self.word2idx[w]] for w in self.words}

    def add_wordsData(self):
        """Fill ``emb_matrix`` row by row, in the order of ``emb_vocab``.

        Returns:
            Number of vocabulary words that had no GloVe vector and received
            a random normal vector (scale 0.6) instead.
        """
        nb_added = 0
        for i, word in enumerate(self.emb_vocab) :
            vector = self.glove_dict.get(word)
            if vector is None:
                nb_added += 1
                vector = np.random.normal(scale=0.6, size=(self.dim_vect,))
            self.emb_matrix[i,:] = torch.as_tensor(vector, dtype=self.emb_matrix.dtype)
        return nb_added

    def create_emb_layer(self, non_trainable=True):
        """Copy ``emb_matrix`` into ``emb_layer``; freeze it if ``non_trainable``."""
        self.emb_layer.load_state_dict({'weight': self.emb_matrix})
        if non_trainable:
            self.emb_layer.weight.requires_grad = False

    def word2vec_(self,word):
        """Return the GloVe vector of ``word``; raises KeyError if unknown."""
        # glove_dict[word] is vectors[word2idx[word]] by construction; the
        # previous lookup went through self.vectors, which was never assigned.
        return self.glove_dict[word]


In [None]:
# Build the 25-dimensional GloVe embedding for the vocabulary collected by mydata
myEmbedding = GloVe_embedding(mydata,dim_vect=25)

## Classifier 
Set of classes used as classifiers for the tweets.

In [None]:
# Classification NN : 
class FFNN(nn.Module):
    """Feed-forward classifier over the mean of the word embeddings.

    Args:
        embedding: either an embedding layer (``nn.Embedding``), an object
            exposing such a layer as ``emb_layer``, or an int giving the
            embedding dimension, in which case a fresh
            ``nn.Embedding(vocab_size, embedding)`` is created.
        hidden_dim: size of the hidden layer.
        vocab_size: number of rows of the embedding; only used when
            ``embedding`` is an int.
        max_len: padded sentence length; kept for interface compatibility,
            not used by the layers.
        num_classes: size of the output layer.
    """

    def __init__(self, embedding, hidden_dim, vocab_size, max_len, num_classes):
        print('----Creating FFNN : ',end='')
        super(FFNN, self).__init__()

        #embedding (lookup layer) layer
        if isinstance(embedding, int):
            embedding = nn.Embedding(vocab_size, embedding)
        else:
            embedding = getattr(embedding, 'emb_layer', embedding)
        self.embedding = embedding

        #hidden layer: its input size is read from the embedding layer itself
        self.fc1 = nn.Linear(self.embedding.embedding_dim, hidden_dim)

        #activation
        self.relu1 = nn.ReLU()

        #output layer
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        print('Done')

    def forward(self, x):
        """Return the raw output of the last layer (no activation applied).

        ``x`` is passed to the embedding layer and the result is averaged
        over dimension 1 before the two linear layers.
        """
        embedded = self.embedding(x)

        # we average the embeddings of words in a sentence
        # (batch size, max sent length, embedding dim) to (batch size, embedding dim)
        averaged = embedded.mean(1)

        hidden = self.relu1(self.fc1(averaged))
        return self.fc2(hidden)


## Main Class : Trainer
Main class for loading, training, testing, etc.

In [None]:
class OffensiveClassifier(object):
    ''' Main Class for the loading, training, testing etc ...

    Args:
        dim_vect: dimension of the GloVe vectors.
        cType: 'FFNN' builds the feed-forward network; 'logistic' is a
            placeholder that leaves ``self.model`` set to None.
        hidden_dim: hidden layer size of the FFNN.
    '''
    def __init__(self, dim_vect=25,cType='FFNN', hidden_dim=50):

        self.dim_vect = dim_vect

        # Loading the Tweet Data :
        self.dataloader = DataLoader()

        # Loading the GloVe Embedding
        self.embedding = GloVe_embedding(self.dataloader, dim_vect=dim_vect)

        # Classification :
        self.model = None
        if cType == 'FFNN':
            # Creating the Neuronal Network on top of the GloVe layer.
            # One output unit: binary classification with a logit output.
            self.model = FFNN(self.embedding.emb_layer, hidden_dim,
                              self.embedding.size_vocab, None, 1)
        elif cType == 'logistic':
            # Not implemented yet
            pass
        else:
            raise ValueError('Unknown classifier type : {0}'.format(cType))

    @staticmethod
    def accuracy(output, target):
        """Share of entries where round(sigmoid(output)) equals target.

        Returns:
            0-dim tensor.
        """
        output = torch.round(torch.sigmoid(output))
        correct = (output == target).float()
        acc = correct.sum()/len(correct)
        return acc

    def train(self,nb_epochs, feature=None, target=None, lr=0.5):
        """Train ``self.model`` with full-batch SGD and a BCE-with-logits loss.

        Args:
            nb_epochs: number of passes over the data.
            feature: input tensor given to the model as-is.
            target: tensor of 0/1 labels, one per row of ``feature``.
            lr: learning rate of the SGD optimizer.

        Returns:
            list of ``(loss, accuracy)`` float pairs, one per epoch.

        Raises:
            RuntimeError: if no model was built (e.g. cType='logistic').
            ValueError: if ``feature`` or ``target`` is missing.
        """
        if self.model is None:
            raise RuntimeError('No model to train')
        if feature is None or target is None:
            raise ValueError('train() needs the feature and target tensors')

        model = self.model
        # BCEWithLogitsLoss expects floating point targets
        target = target.float()

        # we use the stochastic gradient descent (SGD) optimizer; frozen
        # parameters (the pre-trained embedding) are left out
        trainable = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.SGD(trainable, lr=lr)

        # binary cross-entropy loss with sigmoid (applied to logits): the
        # model applies no activation to its output layer
        loss_fn = nn.BCEWithLogitsLoss()

        history = []
        for epoch in range(1, nb_epochs+1):

            # keeps layers such as dropout in training mode
            model.train()

            # gradients accumulate otherwise
            optimizer.zero_grad()

            # predictions are (batch size, 1): drop the dimension of size 1
            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)
            acc = self.accuracy(predictions.detach(), target)
            # calculate the gradient of each parameter
            loss.backward()
            # update the parameters using the gradients and optimizer algorithm
            optimizer.step()

            epoch_loss = loss.item()
            epoch_acc = acc.item()
            history.append((epoch_loss, epoch_acc))

            print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc*100:.2f}%')

        return history


