In [1]:
#from utils import *

import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
import re
from os import mkdir
from os.path import join, isfile, isdir, exists
import bcolz
import pickle 
import emoji
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from pattern.en import spelling
from tqdm import tqdm
import ast

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload

## Word Embedings : GloVe
This Class Loads the GloVe Embeding, processes it, and create a word embedding given the DataLoader.

In [3]:
class GloVe_embedding(object):
    def __init__(self,dim_vect = 25 ):
        ########## VARIABLES ##########
        self.dim_vect = dim_vect

        # Defining variables for GloVe: 
        self.words = []
        self.word2idx = {}
        self.glove_dict = {}
        
        ########## LOADING GLOVE DATA ##########
        
        # Defining path for GloVe Data : 
        self.path = join('..','data','glove') # Path of glove
        self.path_glove = join(self.path,'glove.twitter.27B.'+str(dim_vect))
        if not(isdir(self.path_glove)):
            mkdir(self.path_glove)
        self.path_vec_original = join(self.path,'glove.twitter.27B.'+str(dim_vect)+'d.txt') # Path of glove original vectors
        self.path_vec_save = join(self.path_glove,'glove.twitter.27B.'+str(dim_vect)+'d.vectors.dat')  # Path of glove saved vectors
        self.path_words = join(self.path_glove,'glove.twitter.27B.'+str(dim_vect)+'d.words.pkl')
        self.path_word2idx = join(self.path_glove,'glove.twitter.27B.'+str(dim_vect)+'d.word2idx.pkl')
                
        if not(isdir(self.path_vec_save) and isfile(self.path_words) and isfile(self.path_word2idx)) : 
            # If files are allready processed, just load them
            print('---- Processing the GloVe files : ',end='')
            self.process_GloVe()
            print('Done')
            
        # Load the wordvec files
        print('---- Loading the processed GloVe files : ',end='')
        self.load_GloVe()
        print('Done')
        
        ########## TORCH EMBEDDING ##########
        
        # Defining variables for our Embedding:
        self.size_vocab = len(self.words)
        
        # Creating the Pytorch Embedding Layer : 
        print('---- Creating the Pytorch Embedding Layer  : ',end='')
        self.emb_layer = nn.Embedding(self.size_vocab, self.dim_vect)
        self.create_emb_layer(non_trainable=True)
        print('Done')

               
    def process_GloVe(self):
        ''' Processes the GloVe Dataset - Saves files'''
        words = []
        word2idx = {}
        
        vectors = bcolz.carray(np.zeros(1) , rootdir=self.path_vec_save , mode='w' ) # defining vector saved
        
        # Adding Padding vector : 
        word2idx['<pad>'] = 0
        words.append('<pad>')
        #vect = np.random.normal(scale=0.6, size=(self.dim_vect , )) # random padding vect
        vect = np.zeros((self.dim_vect , )) # 0's padding vect. 
        vectors.append(vect)
        
        idx = 1
        with open(self.path_vec_original, 'rb') as f:
            for l in f:
                line = l.decode().split()
                word = line[0]
                words.append(word)
                word2idx[word] = idx
                idx += 1
                vect = np.array(line[1:]).astype(np.float)
                vectors.append(vect)
                

        vectors = bcolz.carray(vectors[:].reshape((-1, self.dim_vect)), rootdir=self.path_vec_save, mode='w')

        vectors.flush()
        pickle.dump(words, open(self.path_words, 'wb'))
        pickle.dump(word2idx, open(self.path_word2idx, 'wb'))
        
    def load_GloVe(self):
        ''' Loads previously processed dataset'''
        
        vectors = bcolz.open(self.path_vec_save)[:]
        
        self.words = pickle.load(open(self.path_words, 'rb'))
        self.word2idx = pickle.load(open(self.path_word2idx, 'rb'))
        
        self.glove_dict = {w: vectors[self.word2idx[w]] for w in self.words}
        self.emb_matrix = torch.Tensor(vectors)
    
    def create_emb_layer(self, non_trainable=True):
        self.emb_layer.load_state_dict({'weight': self.emb_matrix})
        if non_trainable:
            self.emb_layer.weight.requires_grad = False
        


In [4]:
myEmbedding = GloVe_embedding(dim_vect=25)

---- Loading the processed GloVe files : Done
---- Creating the Pytorch Embedding Layer  : Done


## Data Loader
This Class Loads the Tweet Dataset, Cleans it. It also enables the loading for the training and testing. 
TODO : Loading for training and testing

In [112]:
class TestTweetDataset(Dataset):
    ''' 
    Pytorch Dataset for the Test set. 
    initialisation : - data : training pandas dataframe
                     - subtask : subtask we are working on {'subtask_a', 'subtask_b', 'subtask_c', }
                     - balanced : if we balance the dataset by oversampling it in the smallest classes
    '''
    def __init__(self,data, subtask):
        self.id = data.index.tolist()
        self.token = data.token.tolist()
        self.token_id = data.token_id.tolist()
        
    def __getitem__(self, index):
        return torch.LongTensor(self.token_id[index]), torch.FloatTensor([self.id[index]])

    def __len__(self):
        return len(self.token) 

        
    
class TweetDataset(Dataset):
    ''' 
    Pytorch Dataset for the Training set. 
    initialisation : - data : training pandas dataframe
                     - subtask : subtask we are working on {'subtask_a', 'subtask_b', 'subtask_c', }
                     - balanced : if we balance the dataset by oversampling it in the smallest classes
    '''
    def __init__(self,data,subtask):        
        # Save in lists the ids, labels, label_id, token, and token_id . 
        self.id = data.index.tolist()
        self.label_id = data[subtask].tolist()
        self.token = data.token.tolist()
        self.token_id = data.token_id.tolist()
        
    def __getitem__(self, index):
        return torch.LongTensor(self.token_id[index]), torch.FloatTensor([self.label_id[index]])

    def __len__(self):
        return len(self.token) 
    
    

In [211]:
class DataHandling(object):
    def __init__(self, embedding, pValid):
        print('-- Data Handling : ')
        
        self.embedding = embedding
        
        self.defineClasses()

        # All the Text Data path
        self.definePath()
        
        self.data = {}
        
        processed_ = True
        for f in self.path:
            processed_ = processed_ and isfile(self.path_clean[f])
        
        if  not(processed_) : 
            ### PROCESSING OF THE ORIGINAL DATASET
            # Load, Clean and Tokenize the Datasets
            print('---- Load, Clean and Tokensize Dataset : ',end='')
            self.inital_dataload()
            print('Done')
            
            # Compute List of All words in the datasets
            print('---- Finalize tokenized words and translation to id : ',end='')
            self.compute_wordlist()
            self.token2id()
            print('Done')

            # Add Embedding and correct clean the words not in embedding : 
            print('---- Adapt Dataset for Embedding : ',end='')
            self.adaptDataset()
            print('Done')

            # Save the Cleaned Datasets
            print('---- Saving all tokenized words : ',end='')
            self.save_cleanDataset()
            print('Done')
        else : 
            # Save the Cleaned Datasets
            print('---- Load the Clean Adapted Dataset : ',end='')
            self.load_cleanDataset()
            
            # Compute List of All words in the datasets
            self.compute_wordlist()
            print('Done')
        
        # Create Validation Set (split the test dataset) for every subtask
        self.splitValidation(p=pValid)
        self.prepareLabels()
        
        
    def defineClasses(self):
        ''' Function that defines the classes labels and id per subtask '''
        self.classes_dict = {}
        self.classes_dict['subtask_a'] = {'NOT' : 0 ,'OFF' : 1}
        self.classes_dict['subtask_b'] = {'UNT' : 0 ,'TIN' : 1}
        self.classes_dict['subtask_c'] = {'IND' : 0 ,'OTH' : 1, 'GRP' : 2}
        
    def definePath(self):
        ''' Function that defines all the paths of the datasets. '''
        self.path = {}
        self.path_clean = {}
        
        self.path['train'] = join('..','data','start-kit','training-v1','offenseval-training-v1.tsv')
        self.path_clean['train'] = join('..','data','start-kit','training-v1','clean-offenseval-training-v1.tsv')
        
        self.path['subtask_a'] = join('..','data','Test A Release','testset-taska.tsv')
        self.path_clean['subtask_a'] = join('..','data','Test A Release','clean-testset-taska.tsv')
        
        self.path['subtask_b'] = join('..','data','Test B Release','testset-taskb.tsv')
        self.path_clean['subtask_b'] = join('..','data','Test B Release','clean-testset-taskb.tsv')
        
        self.path['subtask_c'] = join('..','data','Test C Release','test_set_taskc.tsv')
        self.path_clean['subtask_c'] = join('..','data','Test C Release','clean-test_set_taskc.tsv')
        
    def getDataset(self, dataT='train',subtask='subtask_a',balanced = True):
        ''' Returns the pytorch Dataset
            - file : {'train','test','validation'}
            - subtask : {'subtask_a','subtask_b','subtask_c'} '''
        
            
        if dataT == 'train':
            if balanced : 
                data_train = self.balanceData(self.data[dataT][subtask],subtask)
            else : 
                data_train = self.data[dataT][subtask]
            dataset = TweetDataset(data_train, subtask)
        elif dataT == 'validation':
            dataset = TweetDataset(self.data[dataT][subtask], subtask)
        elif dataT == 'test':
            dataset = TestTweetDataset(self.data[subtask], subtask)
            
        return dataset
    
    def token2id(self):
        ''' Function that translates the list of tokens into a list of token id of the embedding.
            Adds a new 'token_id' column to the dataframe '''
        for f in self.path : 
            def token2id_x(x):
                
                return [self.embedding.word2idx[k] for k in x if k in self.embedding.words]
            self.data[f]['token_id'] = self.data[f]['token'].apply(lambda x : token2id_x(x))

    def save_cleanDataset(self):
        ''' Saves at the defined path the cleaned dataset '''
        for f in self.path : 
            self.data[f].to_csv(self.path_clean[f])
        
    def load_cleanDataset(self):
        ''' Loads at the defined path the cleaned dataset '''
        for f in self.path : 
            self.data[f] = pd.read_csv(self.path_clean[f],index_col='id')
            self.data[f]['token'] = self.data[f]['token'].apply(lambda x : ast.literal_eval(x))
            self.data[f]['token_id'] = self.data[f]['token_id'].apply(lambda x : ast.literal_eval(x))
           
               
    def adaptDataset(self):
        ''' Function that finds all the words which are not in the embedding and tries to 
            correct them with the pattern.en package by taking the most probable replacement.
            If the suggested word in very unlikely, the word is removed from the tweets. 
        '''
        # Find all words wich are not in the Embedding :
        missing_words = []
        for i, word in enumerate(self.all_words) :
            if self.embedding.word2idx.get(word) == None : 
                missing_words.append(word)
        
        # Correct if possible the missing_words : 
        ### We use theshold over which we correct the word. Under which we discard the word
        t = 0.5 # threshold
        rejected_words = []
        corrected_words = {}
        for word in tqdm(missing_words) : 
            suggestion, prob = spelling.suggest(word)[0]
            if prob < t : 
                rejected_words.append(word)
            else : 
                corrected_words[word] = suggestion
        
        # Modify the Original Datasets with those corrected_words : 
        for f in self.path : 
            self.data[f]['token'] = self.data[f]['token'].apply(lambda x : [corrected_words.get(k,k) for k in x])
            self.data[f]['token'] = self.data[f]['token'].apply(lambda x : [k for k in x if k not in rejected_words ])
        nb_rejected = len(rejected_words)
        nb_corrected = len(corrected_words)
        nb_vocab = len(self.embedding.glove_dict)
        p_rejected = 100* nb_rejected / nb_vocab
        p_corrected = 100* nb_corrected / nb_vocab
        print('---- Words removed   : {0:} / {1:.2f} - {2:} %'.format(nb_rejected,nb_vocab,p_rejected))
        print('---- Words corrected : {0:} / {1:.2f} - {2:} %'.format(nb_corrected,nb_vocab,p_corrected))
        
    def inital_dataload(self):
        for f in self.path : 
            self.data[f] = pd.read_table(self.path[f],index_col='id')
            self.data[f]['token'] = self.data[f]['tweet'].apply(lambda x : self.clean_tweet(x))
            
    def compute_wordlist(self):
        self.all_words_freq = {}
        self.all_words = []
        
        for f in self.data : 
            for i in range(len(self.data[f])):
                for e in self.data[f].iloc[i].token:
                    self.all_words_freq[e] = 1 + self.all_words_freq.get(e,0)
        self.all_words = list(self.all_words_freq.keys())
        
    def splitValidation(self,p):
        ''' Creates the validation set by  taking p % of the train dataset '''
        data = self.data['train'].copy()
        self.data['train'] = {}
        self.data['validation'] = {}

        for subtask in self.classes_dict: # per subtask
            self.data['train'][subtask] = pd.DataFrame()
            for label in self.classes_dict[subtask]: #per label in this subtask 
                data_label =  data[data[subtask]==label]
                self.data['train'][subtask] = self.data['train'][subtask].append(data.loc[data_label.index])
                nb_valid = int(len(data_label)*p)
                # Select randmoly (without repetition) the indexes of the selected vaidation tweets
                index_valid = np.random.choice(data_label.index,(nb_valid,),replace=False)
                # Add the the selected validation tweets to the new dataframe
                self.data['validation'][subtask] = self.data['train'][subtask].loc[index_valid,:]
                # Drop the selected validation tweets from the training set
                self.data['train'][subtask] = self.data['train'][subtask].drop(index = index_valid)
                
    def prepareLabels(self) : 
        ''' Transform the labels into classes id '''
        for subtask in self.classes_dict: # per subtask
            self.data['validation'][subtask][subtask] =self.data['validation'][subtask][subtask].apply(lambda x : self.classes_dict[subtask][x])  
            self.data['train'][subtask][subtask] = self.data['train'][subtask][subtask].apply(lambda x : self.classes_dict[subtask][x])  

    def balanceData(self,data,subtask):
        ''' Augments the Data given in input in order to balance the dataset'''
        class_size = {}
        for label in self.classes_dict[subtask]:
            class_size[label] = len(data[data[subtask]==self.classes_dict[subtask][label]])
        largest_class = max(class_size, key=class_size.get)
        print('---- Augmenting the Data : ')
        print('Before Augmentation : ',class_size)

        for label in self.classes_dict[subtask]:  
            if label != largest_class:
                id_list = data[data[subtask]==self.classes_dict[subtask][label]].index
                nb_augmentation = class_size[largest_class] - class_size[label]
                id_augmentation = np.random.choice(id_list, (nb_augmentation,))
                data = data.append(data.loc[id_augmentation,:])
        # Check if it went well
        for label in self.classes_dict[subtask]:
            class_size[label] = len(data[data[subtask]==self.classes_dict[subtask][label]])
        
        print('After Augmentation : ',class_size)
        return data
    
    def clean_tweet(self,text):
        ''' Function that is applied to every to tweet in the dataset '''
        
        # =========== TEXT ===========
        # Replace @USER by <user>
        text = re.compile(r'@USER').sub(r'<user>',text)

        # Replace URL by <url>
        text = re.compile(r'URL').sub(r'<url>',text)

        # Remove numbers :
        text = re.compile(r'[0-9]+').sub(r' ',text)

        # Remove some special characters
        text = re.compile(r'([\xa0_\{\}\[\]¬•$,:;/@#|\^*%().~`”"“-])').sub(r' ',text) 

        # Space the special characters with white spaces
        text = re.compile(r'([$&+,:;=?@#|\'.^*()%!"’“-])').sub(r' \1 ',text)
        
        # Replace some special characters : 
        replace_dict = {r'&' : 'and' , 
                        r'\+' : 'plus'}
        for cha in replace_dict:
            text = re.compile(str(cha)).sub(str(replace_dict[cha]),text)
            
        # Handle Emoji : translate some and delete the others
        text = self.handle_emoji(text)
        
        # Word delengthening : 
        text = re.compile(r'(.)\1{3,}').sub(r'\1\1',text)

        # Cut the words with caps in them : 
        text = re.compile(r'([a-z]+|[A-Z]+|[A-Z][a-z]+)([A-Z][a-z]+)').sub(r'\1 \2',text)
        text = re.compile(r'([a-z]+|[A-Z]+|[A-Z][a-z]+)([A-Z][a-z]+)').sub(r'\1 \2',text)        
        # =========== TOKENS ===========
        # TOKENIZE 
        text = text.split(' ')

        # Remove white spaces tokens
        text = [text[i] for i in range(len(text)) if text[i] != ' ']

        # Remove empty tokens
        text = [text[i] for i in range(len(text)) if text[i] != '']

        # Remove repetition in tokens (!!! => !)
        text = [text[i] for i in range(len(text)) if text[i] != text[i-1]]

        #  Handle the ALL CAPS Tweets 
        ### if ratio of caps in the word > 75% add allcaps tag <allcaps>
        caps_r = np.mean([text[i].isupper() for i in range(len(text))])
        if caps_r > 0.6 : 
            text.append('<allcaps>')

        # Lower Case : 
        text = [text[i].lower() for i in range(len(text))]

        return text

    def handle_emoji(self,text):
        # Dictionnary of "important" emojis : 
        emoji_dict =  {'♥️': ' love ',
                       '❤️' : ' love ',
                       '❤' : ' love ',
                       '😘' : ' kisses ',
                      '😭' : ' cry ',
                      '💪' : ' strong ',
                      '🌍' : ' earth ',
                      '💰' : ' money ',
                      '👍' : ' ok ',
                       '👌' : ' ok ',
                      '😡' : ' angry ',
                      '🍆' : ' dick ',
                      '🤣' : ' haha ',
                      '😂' : ' haha ',
                      '🖕' : ' fuck you '}

        for cha in emoji_dict:
            text = re.compile(str(cha)).sub(str(emoji_dict[cha]),text)
        # Remove ALL emojis
        text = emoji.get_emoji_regexp().sub(r' ',text) 
        text = re.compile("([\U0001f3fb-\U0001f3ff])").sub(r'',text) 
        text = re.compile("([\U00010000-\U0010ffff])").sub(r'',text) 
        text = re.compile("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])").sub(r'',text)

        # Add Space between  the Emoji Expressions : 
        text = re.compile("([\U00010000-\U0010ffff])").sub(r' \1 ',text) 
        return text

In [212]:
mydata = DataHandling(myEmbedding, pValid=0.15)

-- Data Handling : 
---- Load the Clean Adapted Dataset : Done


In [215]:
train = mydata.getDataset(dataT='train',subtask='subtask_a', balanced=True)

---- Augmenting the Data : 
Before Augmentation :  {'NOT': 7514, 'OFF': 3740}
After Augmentation :  {'NOT': 7514, 'OFF': 7514}


## Classifier 
Set of Classes used as classifier for the tweets. 

In [216]:
# Classification NN : 
class FFNN(nn.Module):
    
    def __init__(self, embedding, hidden_dim , num_classes ,embedding_dim):
        print('------ Creating FFNN : ',end='')
        
        super(FFNN, self).__init__()
        
        # Embedding
        self.embedding = embedding
        
        # Fully Connected Layers
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes) 
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.xavier_uniform_(self.fc2.weight)


        
        # Activation Layers
        self.relu1 = nn.ReLU()
        self.output = nn.Sigmoid()
        print('Done')
    
    def forward(self, x):
        
        embedded = self.embedding(x)
        # we average the embeddings of words in a sentence
        
        non_zero_nb = (x!=0).sum(1,keepdim=True)
        #print(x.shape, non_zero_nb,embedded.sum(1).shape)
        averaged = embedded.sum(1) / non_zero_nb.float()
        #averaged = embedded.mean(1)
        # (batch size, max sent length, embedding dim) to (batch size, embedding dim)

        out = self.fc1(averaged)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.output(out)
        return out
    
    def loss_fn(self):
        ''' Returns the loss function best associated with the model'''
        return nn.BCELoss()


Pad Data Method : Used with the pytorch DataLoader in order to pad the length of the tweets by batch. 

In [217]:
def padding_tweet(batch):
    '''
    Pad Data Method : Used with the pytorch DataLoader in order to pad the length of the tweets by batch. 
    args: 
        batch - List of elements ( x , label )
    return 
        batch - Padded ( list(x) , list(label))
    
    '''
    batch = list(zip(*batch))
    max_len = max([len(t) for t in batch[0]])
    batch[0] = torch.stack([pad_tensor(vec=t, pad=max_len, dim=0) for t in batch[0]],dim=0)
    batch[1] = torch.stack(batch[1])
    return batch[0] , batch[1]

def pad_tensor(vec, pad, dim):
    """
    args:
        vec - tensor to pad
        pad - the size to pad to
        dim - dimension to pad

    return:
        a new tensor padded to 'pad' in dimension 'dim'
    """
    pad_size = list(vec.shape)
    pad_size[dim] = pad - vec.size(dim)
    return torch.cat([vec, torch.zeros(*pad_size,dtype=torch.long)], dim=dim)


## Main Class : Trainer
Main Class for the loading, training, testing etc ...

In [256]:
class OffensiveClassifier(object):
    ''' Main Class for the loading, training, testing etc ...'''
    def __init__(self,subtask='subtask_a', dim_vect=25,cType='FFNN',pValid = 0.15):
        
        self.dim_vect = dim_vect
        
        self.subtask = subtask

        # Loading the GloVe Embedding and Torch Formating of this Embedding
        self.GloVe = GloVe_embedding(dim_vect= dim_vect )
        self.embedding = self.GloVe.emb_layer
        
        # Loading the Data Handler : 
        self.dataHandler = DataHandling(self.GloVe,pValid=pValid)
        
        # Retrieving Training DataSet (pytorch)
        self.train_set = self.dataHandler.getDataset('train',subtask,balanced=True)
        
        # Retrieving the Validation Set (pytorch)
        self.valid_set = self.dataHandler.getDataset('validation',subtask)

        # Retrieving Test DataSet (pytorch)
        self.test_set = self.dataHandler.getDataset('test',subtask)
        
        # Classification : 
        if cType == 'FFNN':
            # Creating the Neural Network
            self.model = FFNN(self.embedding, 100, 1, self.dim_vect)
        elif cType == 'RNN':
            
            pass
        

    def accuracy(self, output, target ):

        output = torch.round(output)
        correct = (output == target).float()
        acc = correct.sum()/len(correct)
        return acc, correct.sum()
    
    def train( self, nb_epochs, lr=0.01, batch_size = 1000 ):
        
        self.train_generator = DataLoader(self.train_set, batch_size=batch_size,collate_fn=padding_tweet, shuffle=True)

        optimizer = optim.RMSprop(self.model.parameters(), lr=lr)#, weight_decay=0.005)
        
        loss_fn = self.model.loss_fn()

        for epoch in range(nb_epochs):
            i_batch = 0
            epoch_correct = 0
            self.model.train() 
            
            for tokens, target  in self.train_generator :
                
                i_batch += 1
                target = target.float()
                tokens = tokens.long()
                #to ensure the dropout (exlained later) is "turned on" while training
                #good practice to include even if do not use here
                self.model.train()

                #we zero the gradients as they are not removed automatically
                optimizer.zero_grad()

                predictions = self.model(tokens)
                
                loss = loss_fn(predictions, target)
                acc, correct = self.accuracy(predictions, target)
                
                #calculate the gradient of each parameter
                loss.backward()
                
                #update the parameters using the gradients and optimizer algorithm 
                optimizer.step()

                epoch_loss = loss.item()

                if i_batch % 10  !=0:
                    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {acc*100:.2f}%')
            
            self.validation(batch_size = 50)
            
    def validation(self, batch_size = 1000):
        all_correct = 0
        self.validation_generator = DataLoader(self.valid_set, batch_size=batch_size, collate_fn=padding_tweet, shuffle=True)
        loss_fn = self.model.loss_fn()
        nb_valid = len(taskAclassifier.valid_set)
        self.model.eval()  # set model to evaluation mode
        with torch.no_grad(): 
            for tokens, target  in self.validation_generator :

                predictions = self.model(tokens)   
                loss = loss_fn(predictions, target)
                acc, correct = self.accuracy(predictions, target)
                all_correct += correct
        print(f'|+| Validation Accuracy : {100*all_correct/nb_valid:.2f} %')
            
            

    def test(self):
        ''' 
            Test Function : Tests the Network on the Test Data of the Subtask and Saves in a file
        '''
        self.test_generator = DataLoader(self.test_set,collate_fn= padding_tweet )




In [257]:
taskAclassifier = OffensiveClassifier(subtask='subtask_a', dim_vect=200, cType='FFNN',pValid = 0.2)

---- Loading the processed GloVe files : Done
---- Creating the Pytorch Embedding Layer  : Done
-- Data Handling : 
---- Load the Clean Adapted Dataset : Done
---- Augmenting the Data : 
Before Augmentation :  {'NOT': 7072, 'OFF': 3520}
After Augmentation :  {'NOT': 7072, 'OFF': 7072}
------ Creating FFNN : Done


In [258]:
taskAclassifier.train(500,lr=0.01,batch_size=500)

| Epoch: 00 | Train Loss: 0.719 | Train Acc: 47.60%
| Epoch: 00 | Train Loss: 6.380 | Train Acc: 51.60%
| Epoch: 00 | Train Loss: 2.119 | Train Acc: 51.20%
| Epoch: 00 | Train Loss: 1.036 | Train Acc: 51.80%
| Epoch: 00 | Train Loss: 0.983 | Train Acc: 54.60%
| Epoch: 00 | Train Loss: 0.619 | Train Acc: 65.80%
| Epoch: 00 | Train Loss: 0.622 | Train Acc: 62.00%
| Epoch: 00 | Train Loss: 0.607 | Train Acc: 66.80%
| Epoch: 00 | Train Loss: 0.611 | Train Acc: 65.40%
| Epoch: 00 | Train Loss: 0.611 | Train Acc: 65.60%
| Epoch: 00 | Train Loss: 0.676 | Train Acc: 57.00%
| Epoch: 00 | Train Loss: 0.738 | Train Acc: 58.20%
| Epoch: 00 | Train Loss: 0.643 | Train Acc: 60.60%
| Epoch: 00 | Train Loss: 0.574 | Train Acc: 70.80%
| Epoch: 00 | Train Loss: 0.589 | Train Acc: 68.00%
| Epoch: 00 | Train Loss: 0.576 | Train Acc: 70.80%
| Epoch: 00 | Train Loss: 0.594 | Train Acc: 65.40%
| Epoch: 00 | Train Loss: 0.545 | Train Acc: 72.00%
| Epoch: 00 | Train Loss: 0.581 | Train Acc: 70.40%
| Epoch: 00 

| Epoch: 05 | Train Loss: 0.525 | Train Acc: 70.80%
| Epoch: 05 | Train Loss: 0.491 | Train Acc: 77.40%
| Epoch: 05 | Train Loss: 0.432 | Train Acc: 78.47%
|+| Validation Accuracy : 77.73 %
| Epoch: 06 | Train Loss: 0.499 | Train Acc: 76.00%
| Epoch: 06 | Train Loss: 0.495 | Train Acc: 74.80%
| Epoch: 06 | Train Loss: 0.494 | Train Acc: 73.60%
| Epoch: 06 | Train Loss: 0.487 | Train Acc: 75.00%
| Epoch: 06 | Train Loss: 0.501 | Train Acc: 74.40%
| Epoch: 06 | Train Loss: 0.499 | Train Acc: 74.00%
| Epoch: 06 | Train Loss: 0.541 | Train Acc: 71.40%
| Epoch: 06 | Train Loss: 0.526 | Train Acc: 71.40%
| Epoch: 06 | Train Loss: 0.509 | Train Acc: 71.40%
| Epoch: 06 | Train Loss: 0.564 | Train Acc: 68.80%
| Epoch: 06 | Train Loss: 0.508 | Train Acc: 74.20%
| Epoch: 06 | Train Loss: 0.503 | Train Acc: 74.80%
| Epoch: 06 | Train Loss: 0.482 | Train Acc: 75.40%
| Epoch: 06 | Train Loss: 0.462 | Train Acc: 80.40%
| Epoch: 06 | Train Loss: 0.477 | Train Acc: 78.60%
| Epoch: 06 | Train Loss: 0.46

| Epoch: 11 | Train Loss: 0.449 | Train Acc: 81.80%
| Epoch: 11 | Train Loss: 0.408 | Train Acc: 80.60%
| Epoch: 11 | Train Loss: 0.430 | Train Acc: 79.40%
| Epoch: 11 | Train Loss: 0.457 | Train Acc: 79.00%
| Epoch: 11 | Train Loss: 0.471 | Train Acc: 77.00%
| Epoch: 11 | Train Loss: 0.502 | Train Acc: 73.80%
| Epoch: 11 | Train Loss: 0.441 | Train Acc: 79.00%
| Epoch: 11 | Train Loss: 0.511 | Train Acc: 78.47%
|+| Validation Accuracy : 85.80 %
| Epoch: 12 | Train Loss: 0.451 | Train Acc: 78.00%
| Epoch: 12 | Train Loss: 0.522 | Train Acc: 75.00%
| Epoch: 12 | Train Loss: 0.420 | Train Acc: 81.00%
| Epoch: 12 | Train Loss: 0.444 | Train Acc: 81.20%
| Epoch: 12 | Train Loss: 0.428 | Train Acc: 79.00%
| Epoch: 12 | Train Loss: 0.431 | Train Acc: 80.80%
| Epoch: 12 | Train Loss: 0.398 | Train Acc: 82.80%
| Epoch: 12 | Train Loss: 0.441 | Train Acc: 78.40%
| Epoch: 12 | Train Loss: 0.416 | Train Acc: 82.20%
| Epoch: 12 | Train Loss: 0.479 | Train Acc: 76.20%
| Epoch: 12 | Train Loss: 0.45

| Epoch: 17 | Train Loss: 0.473 | Train Acc: 75.20%
| Epoch: 17 | Train Loss: 0.429 | Train Acc: 80.80%
| Epoch: 17 | Train Loss: 0.420 | Train Acc: 82.00%
| Epoch: 17 | Train Loss: 0.424 | Train Acc: 77.40%
| Epoch: 17 | Train Loss: 0.400 | Train Acc: 81.80%
| Epoch: 17 | Train Loss: 0.366 | Train Acc: 83.00%
| Epoch: 17 | Train Loss: 0.446 | Train Acc: 77.40%
| Epoch: 17 | Train Loss: 0.431 | Train Acc: 80.00%
| Epoch: 17 | Train Loss: 0.411 | Train Acc: 78.20%
| Epoch: 17 | Train Loss: 0.406 | Train Acc: 81.20%
| Epoch: 17 | Train Loss: 0.372 | Train Acc: 81.40%
| Epoch: 17 | Train Loss: 0.393 | Train Acc: 81.25%
|+| Validation Accuracy : 76.93 %
| Epoch: 18 | Train Loss: 0.407 | Train Acc: 79.80%
| Epoch: 18 | Train Loss: 0.420 | Train Acc: 81.20%
| Epoch: 18 | Train Loss: 0.427 | Train Acc: 80.60%
| Epoch: 18 | Train Loss: 0.378 | Train Acc: 83.80%
| Epoch: 18 | Train Loss: 0.398 | Train Acc: 80.60%
| Epoch: 18 | Train Loss: 0.369 | Train Acc: 82.00%
| Epoch: 18 | Train Loss: 0.36

| Epoch: 23 | Train Loss: 0.343 | Train Acc: 86.20%
| Epoch: 23 | Train Loss: 0.352 | Train Acc: 85.40%
| Epoch: 23 | Train Loss: 0.327 | Train Acc: 86.40%
| Epoch: 23 | Train Loss: 0.340 | Train Acc: 85.40%
| Epoch: 23 | Train Loss: 0.299 | Train Acc: 87.60%
| Epoch: 23 | Train Loss: 0.377 | Train Acc: 81.20%
| Epoch: 23 | Train Loss: 0.389 | Train Acc: 81.80%
| Epoch: 23 | Train Loss: 0.417 | Train Acc: 79.40%
| Epoch: 23 | Train Loss: 0.367 | Train Acc: 81.00%
| Epoch: 23 | Train Loss: 0.373 | Train Acc: 83.60%
| Epoch: 23 | Train Loss: 0.295 | Train Acc: 88.40%
| Epoch: 23 | Train Loss: 0.358 | Train Acc: 82.80%
| Epoch: 23 | Train Loss: 0.362 | Train Acc: 81.60%
| Epoch: 23 | Train Loss: 0.369 | Train Acc: 82.80%
| Epoch: 23 | Train Loss: 0.412 | Train Acc: 79.60%
| Epoch: 23 | Train Loss: 0.416 | Train Acc: 77.40%
| Epoch: 23 | Train Loss: 0.475 | Train Acc: 75.00%
| Epoch: 23 | Train Loss: 0.421 | Train Acc: 78.47%
|+| Validation Accuracy : 76.02 %
| Epoch: 24 | Train Loss: 0.35

| Epoch: 29 | Train Loss: 0.329 | Train Acc: 85.60%
| Epoch: 29 | Train Loss: 0.317 | Train Acc: 84.80%
| Epoch: 29 | Train Loss: 0.311 | Train Acc: 86.80%
| Epoch: 29 | Train Loss: 0.359 | Train Acc: 84.00%
| Epoch: 29 | Train Loss: 0.388 | Train Acc: 80.60%
| Epoch: 29 | Train Loss: 0.406 | Train Acc: 80.20%
| Epoch: 29 | Train Loss: 0.369 | Train Acc: 84.40%
| Epoch: 29 | Train Loss: 0.347 | Train Acc: 85.60%
| Epoch: 29 | Train Loss: 0.279 | Train Acc: 89.40%
| Epoch: 29 | Train Loss: 0.327 | Train Acc: 86.20%
| Epoch: 29 | Train Loss: 0.292 | Train Acc: 88.60%
| Epoch: 29 | Train Loss: 0.284 | Train Acc: 87.80%
| Epoch: 29 | Train Loss: 0.280 | Train Acc: 89.00%
| Epoch: 29 | Train Loss: 0.294 | Train Acc: 87.20%
| Epoch: 29 | Train Loss: 0.293 | Train Acc: 86.20%
| Epoch: 29 | Train Loss: 0.299 | Train Acc: 87.20%
| Epoch: 29 | Train Loss: 0.298 | Train Acc: 85.80%
| Epoch: 29 | Train Loss: 0.316 | Train Acc: 86.20%
| Epoch: 29 | Train Loss: 0.298 | Train Acc: 85.42%
|+| Validati

| Epoch: 35 | Train Loss: 0.302 | Train Acc: 85.60%
| Epoch: 35 | Train Loss: 0.269 | Train Acc: 88.60%
| Epoch: 35 | Train Loss: 0.231 | Train Acc: 90.60%
| Epoch: 35 | Train Loss: 0.255 | Train Acc: 89.20%
| Epoch: 35 | Train Loss: 0.265 | Train Acc: 89.40%
| Epoch: 35 | Train Loss: 0.244 | Train Acc: 91.40%
| Epoch: 35 | Train Loss: 0.251 | Train Acc: 89.80%
| Epoch: 35 | Train Loss: 0.295 | Train Acc: 88.40%
| Epoch: 35 | Train Loss: 0.318 | Train Acc: 86.80%
| Epoch: 35 | Train Loss: 0.298 | Train Acc: 86.80%
| Epoch: 35 | Train Loss: 0.311 | Train Acc: 86.60%
| Epoch: 35 | Train Loss: 0.252 | Train Acc: 89.80%
| Epoch: 35 | Train Loss: 0.264 | Train Acc: 88.40%
| Epoch: 35 | Train Loss: 0.311 | Train Acc: 85.40%
| Epoch: 35 | Train Loss: 0.313 | Train Acc: 85.20%
| Epoch: 35 | Train Loss: 0.314 | Train Acc: 85.00%
| Epoch: 35 | Train Loss: 0.297 | Train Acc: 86.00%
| Epoch: 35 | Train Loss: 0.265 | Train Acc: 87.20%
| Epoch: 35 | Train Loss: 0.266 | Train Acc: 86.81%


KeyboardInterrupt: 