In [1]:
pwd

'/home/mohit19014/Hostility Detection/Code'

In [2]:
cd ..

/home/mohit19014/Hostility Detection


In [3]:
ls

 [0m[01;34mASL[0m/    [01;34mData[0m/   [01;34m'Final Experiments'[0m/   [01;34mPhotos[0m/      [01;34mTensorboard[0m/
 [01;34mCode[0m/   [01;34mDumps[0m/   [01;34mModels[0m/               sample.tex


In [4]:
!nvidia-smi

Wed Mar 10 08:26:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 455.45.01    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:5E:00.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
!ls

 ASL    Data   'Final Experiments'   Photos	  Tensorboard
 Code   Dumps   Models		     sample.tex


In [6]:
### General
import re
import copy
import string
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings

### Sklearn
import joblib
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

### Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

### Transformers
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

#### Configuration Class


In [7]:
class Config1:
    """Single container for every hyper-parameter and feature switch used below.

    Creating an instance also loads the tokenizer that belongs to ``MODEL_PATH``.
    """

    def __init__(self):
        super().__init__()

        self.SEED = 42
        # Other checkpoints that were tried: 'Models/IndicBert' ('ai4bharat/indic-bert')
        self.MODEL_PATH = 'monsoon-nlp/hindi-bert'

        # ---- Data ----
        self.PERCENTAGE_DATA = 100      # share (in %) of each pickle that is kept
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH)
        self.MAX_LENGTH = 128
        self.BATCH_SIZE = 16
        self.VALIDATION_SPLIT = 0.10
        self.PREPROCESS_INPUT = True

        # ---- Coarse-grained vs. fine-grained ----
        # 1 -> coarse-grained; 3 -> two combined classes plus a "neither" slot
        # (see get_labels); 4 -> all four fine-grained classes.
        self.NUM_LABELS = 3
        self.COARSE_GRAINED = self.NUM_LABELS == 1
        # One of 'Hostile', 'Fake', 'Defamation', 'Hate' or 'Offensive'
        self.COARSE_GRAINED_CLASS = 'Hostile'

        # ---- Combining hostile dimensions (toggled per experiment further down) ----
        self.COMBINE_HATE_OFFENSIVE = False
        self.COMBINE_DEFAMATION_FAKE = False

        # ---- Optional input features ----
        self.SUPERVISED_LEXICON_ATTENTION = True
        self.HOSTILITY_LEXICON = True
        self.LEXICON_AVERAGE = False
        self.EMOJI_INFO = True
        # When True no preprocessing is done; the emoji text is simply part of the post
        self.EMBED_EMOJI = False
        self.HASHTAG_INFO = False

        # ---- Model / optimisation ----
        self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.NUMBER_OF_BERT_LAYERS = 1
        self.FULL_FINETUNING = True
        self.LR = 0.01   # 2e-5 was the earlier value
        self.OPTIMIZER = 'AdamW'
        # Alternative that was considered: 'BCEWithLogitsLoss' (optionally with pos_weight)
        self.CRITERION = 'CrossEntropyLoss'
        self.OUTPUT_HIDDEN_STATE = True
        self.OUTPUT_ATTENTIONS = True
        self.EPOCHS = 5

        # ---- Per-class loss weights ----
        self.DEFAMATION_LOSS_LAMBDA = 1.0
        self.FAKE_LOSS_LAMBDA = 1.0
        self.HATE_LOSS_LAMBDA = 1.0
        self.OFFENSIVE_LOSS_LAMBDA = 1.0

        # ---- Checkpointing ----
        self.SAVE_BEST_ONLY = True
        self.LOAD_CHECKPOINT = False
        self.FINE_TUNE_COARSE = False
        self.CHECKPOINT_PATH = ""
        self.MODEL_FOLDER = "Models/Supervised Lexicon Attention/"

        # ---- Visualisation ----
        self.VISUALIZE_EMBEDDINGS = False
        self.PLOT_LOSS = True

        # ---- Evaluation ----
        self.THRESHOLD = 0.5

# Module-level configuration object; the helper functions, dataset and models
# below all read (and some later cells mutate) this single instance.
config = Config1()

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings("ignore")
# Narrower alternatives that were tried:
# warnings.filterwarnings("ignore", category=ResourceWarning)
# warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
print(config.NUM_LABELS)
print(config.COARSE_GRAINED)      

3
False


<h3> Helper Functions

In [9]:
def get_texts(df):
    """Return the text column fed to the tokenizer.

    Picks 'replace_emoji' when ``config.EMBED_EMOJI`` is set, otherwise 'Post'.
    """
    column_name = 'replace_emoji' if config.EMBED_EMOJI else 'Post'
    return df[column_name]

In [10]:
def get_tokenized_texts(df):
    """Return the pre-tokenized text column.

    Picks 'replace_emoji' when ``config.EMBED_EMOJI`` is set, otherwise
    'Tokenized Post'.
    """
    column_name = 'replace_emoji' if config.EMBED_EMOJI else 'Tokenized Post'
    return df[column_name]

In [11]:
def get_labels(df):
    """Build the per-post target vectors selected by the global ``config``.

    Branches, checked in this order:
      * ``config.COARSE_GRAINED``          -> ``[[value]]`` of ``config.COARSE_GRAINED_CLASS``
      * ``config.COMBINE_HATE_OFFENSIVE``  -> ``[Hate, Offensive, neither]``
      * ``config.COMBINE_DEFAMATION_FAKE`` -> ``[Defamation, Fake, neither]``
      * otherwise                          -> ``[Defamation, Fake, Hate, Offensive]``

    "neither" is 1.0 when both combined classes are 0, else 0.0.

    Rows are read positionally, so the frame's index does not have to be
    0..n-1 (the previous ``df[col][i]`` lookups were label-based and failed
    or mis-aligned on filtered / re-indexed frames).

    Returns:
        list[list[float]]: one label vector per row of ``df``, in row order.
    """
    def _label_rows(columns, add_neither):
        # Walk the requested columns in lock-step, one row at a time.
        rows = []
        for values in zip(*(df[name] for name in columns)):
            row = [float(value) for value in values]
            if add_neither:
                row.append(1.0 if all(value == 0 for value in values) else 0.0)
            rows.append(row)
        return rows

    if(config.COARSE_GRAINED):
        return [[float(i)] for i in df[config.COARSE_GRAINED_CLASS]]

    if(config.COMBINE_HATE_OFFENSIVE):
        return _label_rows(['Hate', 'Offensive'], add_neither=True)

    if(config.COMBINE_DEFAMATION_FAKE):
        return _label_rows(['Defamation', 'Fake'], add_neither=True)

    return _label_rows(['Defamation', 'Fake', 'Hate', 'Offensive'], add_neither=False)

In [12]:
def get_gold_attention_vectors(df):
    """Collect the four gold-attention cells of every row.

    Rows are read positionally, so any index works (the earlier
    ``df[col][i]`` lookups required a 0..n-1 index).

    Returns:
        list: one entry per row, each a 4-element list ordered
        [Defamation, Fake, Hate, Offensive] holding that row's cell values.
    """
    columns = ('Defamation Gold Attention',
               'Fake Gold Attention',
               'Hate Gold Attention',
               'Offensive Gold Attention')
    return [list(cells) for cells in zip(*(df[name] for name in columns))]

In [13]:
def get_emoji_vectors(df):
    """Return the 'emoji2vec' column as an array, one entry per row."""
    emoji_column = df['emoji2vec']
    return emoji_column.values

In [14]:
def get_lexicons(df):
    """Return the hostility-lexicon feature column as an array.

    'Softmaxed Hostility Lexicon Average' is used when
    ``config.LEXICON_AVERAGE`` is set, otherwise
    'Softmaxed Hostility Lexicon Padded'.
    """
    if config.LEXICON_AVERAGE:
        column_name = 'Softmaxed Hostility Lexicon Average'
    else:
        column_name = 'Softmaxed Hostility Lexicon Padded'
    return df[column_name].values

In [15]:
def preprocess(df):
    """Normalise the 'Post' column of ``df`` in place and return ``df``.

    Each post is lower-cased, URLs become 'https://someurl', user mentions
    become '@someuser', characters matched by the emoji pattern are dropped,
    a leading '#' is stripped from every word, and finally all ASCII
    punctuation is removed (which also strips the '@', ':' and '/' of the
    placeholders inserted above).

    The patterns and the translation table are built once instead of once per
    row, and rows are processed positionally so the frame does not need a
    0..n-1 index.

    Args:
        df: DataFrame with a string 'Post' column.

    Returns:
        The same DataFrame object, with 'Post' rewritten.
    """
    url_pattern = re.compile('http[a-zA-Z0-9./:]*')
    mention_pattern = re.compile('@[a-zA-Z0-9_]*')
    # NOTE(review): several ranges below are far wider than emoji
    # (e.g. U+24C2-U+1F251 and every supplementary-plane code point).
    emoji_pattern = re.compile("["  u"\U0001F600-\U0001F64F"  # emoticons
                                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                    u"\U00002500-\U00002BEF"
                                    u"\U00002702-\U000027B0"
                                    u"\U00002702-\U000027B0"
                                    u"\U000024C2-\U0001F251"
                                    u"\U0001f926-\U0001f937"
                                    u"\U00010000-\U0010ffff"
                                    u"\u2640-\u2642"
                                    u"\u2600-\u2B55"
                                    u"\u200d"
                                    u"\u23cf"
                                    u"\u23e9"
                                    u"\u231a"
                                    u"\ufe0f"
                                    u"\u3030"
                                    "]+", flags=re.UNICODE)
    punctuation_table = str.maketrans("", "", string.punctuation)

    def _clean(text):
        # Apply the normalisation steps to a single post, in the original order.
        text = text.lower()
        text = url_pattern.sub('https://someurl', text)
        text = mention_pattern.sub('@someuser', text)
        text = emoji_pattern.sub(r'', text)
        # str.split() never yields empty words, so word[0] is always valid.
        text = ' '.join(word[1:] if word[0] == '#' else word for word in text.split())
        return text.translate(punctuation_table)

    # Assigning a list is positional: the index of df is left untouched.
    df['Post'] = [_clean(text) for text in df['Post']]

    return df

In [16]:
def plot_tsne(embeddings,labels):
    """Scatter-plot a 2-D t-SNE projection of ``embeddings`` coloured by label.

    With ``config.COARSE_GRAINED`` a single plot is drawn for
    ``config.COARSE_GRAINED_CLASS``; otherwise a 2x2 grid is drawn with one
    panel per label column (Defamation, Fake, Hate, Offensive by position).

    Fixes over the previous version: 'Fake' mapped to the Defamation legend
    names; a label matrix with fewer than four columns raised IndexError
    (unused panels are now hidden); the legend was rebuilt for every label.

    Args:
        embeddings: array-like accepted by ``TSNE.fit_transform``.
        labels: 1-D labels (coarse) or a 2-D label matrix (fine-grained);
            values are expected to be 0 or 1.
    """
    tsne = TSNE(n_components=2, random_state=0)

    outer_cname = { '0' : {1: 'Defamation', 0:'Non-Defamation'},
                    '1' : {1: 'Fake', 0:'Non-Fake'},
                    '2' : {1: 'Hate', 0:'Non-Hate'},
                    '3' : {1: 'Offensive', 0:'Non-Offensive'},
                    '4' : {1: 'Hostile', 0:'Non-Hostile'}}

    class_mapping = {'Defamation': '0',
                     'Fake': '1',
                     'Hate': '2',
                     'Offensive': '3',
                     'Hostile': '4'}

    cdict = {1: 'red', 0: 'blue'}

    # Both branches need the same projection, so compute it once.
    transformed = tsne.fit_transform(embeddings)

    if(config.COARSE_GRAINED):
        cname = outer_cname[class_mapping[config.COARSE_GRAINED_CLASS]]
        fig, ax = plt.subplots()
        for label in np.unique(labels):
            indices = [i for i, l in enumerate(labels) if l == label]
            x = np.take(transformed[:,0], indices)
            y = np.take(transformed[:,1], indices)
            ax.scatter(x,y, color = cdict[label], label=cname[label])

        ax.legend(loc='best')
        plt.show()

    else:
        all_labels = np.asarray(labels)
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,12))

        for counter, panel in enumerate(axes.flat):
            if counter >= all_labels.shape[1]:
                # Fewer label columns than panels: leave the spare panel blank.
                panel.set_visible(False)
                continue

            column_labels = all_labels[:,counter]
            cname = outer_cname[str(counter)]
            for label in np.unique(column_labels):
                indices = [i for i, l in enumerate(column_labels) if l == label]
                x = np.take(transformed[:,0], indices)
                y = np.take(transformed[:,1], indices)
                panel.scatter(x,y, color = cdict[label], label=cname[label], alpha =0.5)
            panel.legend(loc='best')
        plt.show()

<h3>Dataset Class


In [17]:
class HindiHostilityDataset(Dataset):
    """Torch dataset over a pickled DataFrame of Hindi posts.

    On construction the first ``config.PERCENTAGE_DATA`` percent of rows are
    kept, optionally cleaned with ``preprocess`` and split into the feature
    columns returned by the ``get_*`` helpers. Each item is tokenized lazily
    in ``__getitem__``.
    """

    def __init__(self, dataset_path):
        """Load ``dataset_path`` (a pickle holding a DataFrame) and cache its columns."""
        super(HindiHostilityDataset, self).__init__()

        # NOTE(security): unpickling runs arbitrary code; only load trusted files.
        # The context manager closes the handle (it used to leak, raising
        # a ResourceWarning).
        with open(dataset_path, 'rb') as handle:
            df = pickle.load(handle)

        num_samples = int(config.PERCENTAGE_DATA*len(df)/100)
        # reset_index gives a fresh frame with a 0..n-1 index: preprocess then
        # writes to an independent object instead of a slice, and the integer
        # lookups in __getitem__ line up with row positions.
        df          = df.iloc[:num_samples,:].reset_index(drop=True)

        print("Taking ",num_samples," only")
        if(config.PREPROCESS_INPUT):
            df = preprocess(df)

        self.posts             = get_texts(df)
        self.tokenized_posts   = get_tokenized_texts(df)
        self.labels            = get_labels(df)
        self.gold_attn_vectors = get_gold_attention_vectors(df) # per row: four gold-attention vectors
        self.lexicon_vectors   = get_lexicons(df)               # per row: padded or averaged lexicon feature
        self.emoji_vectors     = get_emoji_vectors(df)          # per row: emoji2vec feature
        self.max_length        = config.MAX_LENGTH
        self.tokenizer         = config.TOKENIZER

    def __len__(self):
        """Number of posts kept."""
        return len(self.posts)

    def __getitem__(self, index):
        """Tokenize post ``index`` and bundle it with its features and labels.

        Returns:
            dict with 'posts' (text, tokenized_text, lexicon_vector,
            gold_attn_vectors, emoji_vector, input_ids, attention_masks)
            and 'labels'.
        """
        tokenized_conts = self.tokenizer.encode_plus( self.posts[index],
                                                      max_length=self.max_length,
                                                      padding='max_length',
                                                      truncation=True,
                                                      return_attention_mask=True,
                                                      return_token_type_ids=False,
                                                      return_tensors='pt')

        # return_tensors='pt' adds a leading batch axis; squeeze drops it.
        post_input_ids       = tokenized_conts['input_ids'].squeeze()
        post_attention_masks = tokenized_conts['attention_mask'].squeeze()

        return { 'posts': {
                          'text': self.posts[index],
                          'tokenized_text': self.tokenized_posts[index],
                          'lexicon_vector': torch.Tensor(self.lexicon_vectors[index]),
                          'gold_attn_vectors': torch.Tensor(self.gold_attn_vectors[index]),
                          'emoji_vector': torch.Tensor(self.emoji_vectors[index]),
                          'input_ids': post_input_ids,
                          'attention_masks': post_attention_masks
                        },

                 'labels': torch.Tensor(self.labels[index])
                }


#### Model Class - BERT + LSTM + Concatenation of Hidden Layers

Parameters:


*   Dropout = 0.25
*   LSTM hidden size = 32; bidirectional, so the output size is 64
*   Second-to-last layer size = 128
*   Last layer = `config.NUM_LABELS` nodes



In [18]:
class HO_Model(nn.Module):
    """Hate + Offensive classifier head on top of a pretrained transformer.

    Combines three feature groups before two linear layers:
      * mean-pooled transformer output,
      * lexicon features (BiLSTM -> linear) concatenated with projected emoji features,
      * two last-layer attention slices (indices 2 and 3 on axis 1) passed
        through a multi-head attention layer.

    NOTE(review): layer names and registration order are kept as-is because
    saved checkpoints are loaded into this class by state_dict.
    """
    def __init__(self):
        super(HO_Model, self).__init__()
    
        # Backbone is asked to also return hidden states and attentions (per config).
        self.bert_model          = AutoModel.from_pretrained(config.MODEL_PATH, output_hidden_states = config.OUTPUT_HIDDEN_STATE, output_attentions = config.OUTPUT_ATTENTIONS)
        self.dropout             = nn.Dropout(0.25)
        
        # Default batch_first=False: inputs are laid out (sequence, batch, embed).
        self.complete_mha = nn.MultiheadAttention(embed_dim = 50, num_heads=5)
        
        # Lexicon branch: 4 features per step -> BiLSTM (2*32 = 64) -> 32.
        self.lstm_lexicon                  = nn.LSTM(input_size = 4, hidden_size = 32, batch_first=True, bidirectional=True )
        self.linear_lexicon                = nn.Linear(64, 32)
        # Emoji branch: 300 -> 32.
        self.linear_emoji                  = nn.Linear(300, 32)
        
        # 420 = pooled backbone width + 64 (lexicon+emoji) + 100 (attention);
        # this implies a backbone hidden size of 256 — TODO confirm for MODEL_PATH.
        self.concat_fc1                    = nn.Linear(420, 128)
        self.concat_fc2                    = nn.Linear(128, config.NUM_LABELS)



    def forward(self, input_ids, attention_masks, gold_attn_vectors=None, lexicon_vector=None, emoji_vector=None, hashtag_vecto=None):
        """Return ``(logits, model_attentions)``.

        ``model_attentions`` pairs gold vectors 2 and 3 with the matching
        attention slices. ``hashtag_vecto`` is accepted but never used.

        NOTE(review): although ``gold_attn_vectors``, ``lexicon_vector`` and
        ``emoji_vector`` default to None, the body indexes / feeds all three,
        so passing None fails.
        """
        
        # Relies on the backbone output holding exactly three entries, in this order.
        output_embeddings, hidden_states, attention = self.bert_model(input_ids = input_ids, attention_mask = attention_masks).values()

        # Attention of the last layer; presumably (batch, heads, seq, seq) — TODO confirm.
        bert_attn = attention[-1]
        
        # Index 2 / 3 on axis 1, restricted to the first 50 positions on both remaining axes.
        h_bert_attn = bert_attn[:,2,:50,:50]
        o_bert_attn = bert_attn[:,3,:50,:50]

        # Average over axis 1 of each [batch, 50, 50] slice -> [batch, 50].
        h_bert_attn = torch.mean(h_bert_attn, 1)
        o_bert_attn = torch.mean(o_bert_attn, 1)

        
        # Stack the two [batch, 50] maps as a length-2 sequence: [2, batch, 50].
        concat_attn       = torch.cat((h_bert_attn, o_bert_attn),0)
        concat_attn       = concat_attn.reshape(2, len(input_ids), 50)
        concat_attn       = self.complete_mha(concat_attn, concat_attn, concat_attn)
        # [0] is the attention output; permute to [batch, 2, 50], then flatten.
        concat_attn       = concat_attn[0].permute(1,0,2)
        concat_attn       = torch.flatten(concat_attn, start_dim=1)                ### [batch, 100]
        
        
        
        
        ### Main stem, augmented with the lexicon and emoji vectors
        
        # Mean over axis 1 (the token axis, presumably) -> [batch, hidden].
        output_embeddings = torch.mean(output_embeddings, 1)

        
        lexicon_all_hidden_states, (lexicon_last_hidden_state, lexicon_cell_state) = self.lstm_lexicon(lexicon_vector) # [batch, steps, 64]
        lexicon_sum_hidden_states = torch.sum(lexicon_all_hidden_states, 1)  ### [batch, 64]
        lexicon_drop = self.dropout(lexicon_sum_hidden_states)               ### [batch, 64]
        lexicon_linear = self.linear_lexicon(lexicon_drop)                   ### [batch, 32]

        emoji_vector      = self.linear_emoji(emoji_vector)                  ### [batch, 32]  (300 --> 32)
        lex_emo           = torch.cat((lexicon_linear, emoji_vector),dim=1)  ### [batch, 64]
        lex_emo_drop      = self.dropout(lex_emo)        

        concat            = torch.cat((output_embeddings, lex_emo_drop,concat_attn),dim=1) ### [main stem, lexicon+emoji, attention]

        # No non-linearity is applied between the two linear layers.
        model_outputs     = self.concat_fc1(concat)
        model_outputs     = self.concat_fc2(model_outputs)
        
        
        ### Pairs of (gold attention, model attention) for the supervised attention loss
        model_attentions    = [[gold_attn_vectors[:,2,:], h_bert_attn],
                               [gold_attn_vectors[:,3,:], o_bert_attn]]


        return model_outputs, model_attentions
        



In [19]:
class DF_Model(nn.Module):
    """Defamation + Fake classifier head on top of a pretrained transformer.

    Same layout as the Hate + Offensive model, except that it reads attention
    indices 0 and 1 (axis 1) and gold vectors 0 and 1.

    NOTE(review): layer names and registration order are kept as-is because
    saved checkpoints are loaded into this class by state_dict.
    """
    def __init__(self):
        super(DF_Model, self).__init__()
    
        # Backbone is asked to also return hidden states and attentions (per config).
        self.bert_model          = AutoModel.from_pretrained(config.MODEL_PATH, output_hidden_states = config.OUTPUT_HIDDEN_STATE, output_attentions = config.OUTPUT_ATTENTIONS)
        self.dropout             = nn.Dropout(0.25)
        
        # Default batch_first=False: inputs are laid out (sequence, batch, embed).
        self.complete_mha = nn.MultiheadAttention(embed_dim = 50, num_heads=5)
        
        # Lexicon branch: 4 features per step -> BiLSTM (2*32 = 64) -> 32.
        self.lstm_lexicon                  = nn.LSTM(input_size = 4, hidden_size = 32, batch_first=True, bidirectional=True )
        self.linear_lexicon                = nn.Linear(64, 32)
        # Emoji branch: 300 -> 32.
        self.linear_emoji                  = nn.Linear(300, 32)
        
        # 420 = pooled backbone width + 64 (lexicon+emoji) + 100 (attention);
        # this implies a backbone hidden size of 256 — TODO confirm for MODEL_PATH.
        self.concat_fc1                    = nn.Linear(420, 128)
        self.concat_fc2                    = nn.Linear(128, config.NUM_LABELS)



    def forward(self, input_ids, attention_masks, gold_attn_vectors, lexicon_vector=None, emoji_vector=None, hashtag_vecto=None):
        """Return ``(logits, model_attentions)``.

        ``model_attentions`` pairs gold vectors 0 and 1 with the matching
        attention slices. ``hashtag_vecto`` is accepted but never used.

        NOTE(review): ``lexicon_vector`` and ``emoji_vector`` default to None
        but are fed straight into layers, so passing None fails.
        """
        
        # Relies on the backbone output holding exactly three entries, in this order.
        output_embeddings, hidden_states, attention = self.bert_model(input_ids = input_ids, attention_mask = attention_masks).values()

        # Attention of the last layer; presumably (batch, heads, seq, seq) — TODO confirm.
        bert_attn = attention[-1]
        
        # Index 0 / 1 on axis 1, restricted to the first 50 positions on both remaining axes.
        d_bert_attn = bert_attn[:,0,:50,:50]
        f_bert_attn = bert_attn[:,1,:50,:50]
     
        # Average over axis 1 of each [batch, 50, 50] slice -> [batch, 50].
        d_bert_attn = torch.mean(d_bert_attn, 1)
        f_bert_attn = torch.mean(f_bert_attn, 1)
        
        # Stack the two [batch, 50] maps as a length-2 sequence: [2, batch, 50].
        concat_attn       = torch.cat((d_bert_attn, f_bert_attn),0)
        concat_attn       = concat_attn.reshape(2, len(input_ids), 50)
        concat_attn       = self.complete_mha(concat_attn, concat_attn, concat_attn)
        # [0] is the attention output; permute to [batch, 2, 50], then flatten.
        concat_attn       = concat_attn[0].permute(1,0,2)
        concat_attn       = torch.flatten(concat_attn, start_dim=1)                ### [batch, 100]
        
        
        ### Main stem, augmented with the lexicon and emoji vectors
        
        # Mean over axis 1 (the token axis, presumably) -> [batch, hidden].
        output_embeddings = torch.mean(output_embeddings, 1)

        
        lexicon_all_hidden_states, (lexicon_last_hidden_state, lexicon_cell_state) = self.lstm_lexicon(lexicon_vector) # [batch, steps, 64]
        lexicon_sum_hidden_states = torch.sum(lexicon_all_hidden_states, 1)  ### [batch, 64]
        lexicon_drop = self.dropout(lexicon_sum_hidden_states)               ### [batch, 64]
        lexicon_linear = self.linear_lexicon(lexicon_drop)                   ### [batch, 32]

        emoji_vector      = self.linear_emoji(emoji_vector)                  ### [batch, 32]  (300 --> 32)
        lex_emo           = torch.cat((lexicon_linear, emoji_vector),dim=1)  ### [batch, 64]
        lex_emo_drop      = self.dropout(lex_emo)        

        concat            = torch.cat((output_embeddings, lex_emo_drop,concat_attn),dim=1) ### [main stem, lexicon+emoji, attention]

        # No non-linearity is applied between the two linear layers.
        model_outputs     = self.concat_fc1(concat)
        model_outputs     = self.concat_fc2(model_outputs)
        
        
        ### Pairs of (gold attention, model attention) for the supervised attention loss
        model_attentions    = [[gold_attn_vectors[:,0,:], d_bert_attn],
                               [gold_attn_vectors[:,1,:], f_bert_attn]]


        return model_outputs, model_attentions
        



#### Predict Labels

In [20]:
def predict_labels(dataloader, checkpoint, labels, model_name=None ):
    """Run one of the two-class models over ``dataloader`` and return hard predictions.

    Changes from the previous version:
      * no optimizer / scheduler is built — they were only created to receive
        checkpoint state that inference never uses, and doing so required the
        global ``train_dataloader`` and ``config.FULL_FINETUNING`` to be set;
      * an unknown ``model_name`` raises ``ValueError`` instead of failing
        later on an unbound ``model``;
      * warnings are silenced only while the weights load, instead of
        resetting the process-wide filter to "default" afterwards.

    Args:
        dataloader: iterable of batches shaped like HindiHostilityDataset items.
        checkpoint: dict with a 'state_dict' entry, or None to keep the
            freshly initialised weights.
        labels: class names for the run; kept for call compatibility, unused.
        model_name: "Hate + Offensive" or "Defamation + Fake".

    Returns:
        list of numpy arrays: for every sample, the first two rounded sigmoid
        outputs (the third, "neither", output is dropped).

    Raises:
        ValueError: if ``model_name`` is not one of the two supported names.
    """
    print("Predicting Labels for : \n",model_name,"\n\n")

    if(model_name == "Hate + Offensive"):
        model = HO_Model()
    elif(model_name == "Defamation + Fake"):
        model = DF_Model()
    else:
        raise ValueError(
            "Unsupported model_name {!r}; expected 'Hate + Offensive' or "
            "'Defamation + Fake'".format(model_name))
    model.to(config.DEVICE)

    if(checkpoint is not None):
        print("\n\n------------------- Loading Checkpoint-----------------------\n\n")
        # Scoped suppression: the caller's warning filters are restored on exit.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.load_state_dict(checkpoint['state_dict'])
        print("\n\n------------------- Checkpoint Loaded-----------------------\n\n")

    pred  = []

    # Evaluation mode disables dropout.
    model.eval()

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in dataloader:
            posts = batch['posts']
            b_input_ids       = posts['input_ids'].to(config.DEVICE)
            b_attention_masks = posts['attention_masks'].to(config.DEVICE)

            # Optional features are passed as None when switched off in config.
            b_lexicon = posts['lexicon_vector'].to(config.DEVICE) if config.HOSTILITY_LEXICON else None
            b_emoji   = posts['emoji_vector'].to(config.DEVICE) if config.EMOJI_INFO else None
            b_hashtag = None   # hashtag features are not implemented
            if(config.SUPERVISED_LEXICON_ATTENTION):
                b_gold_attn_vectors = posts['gold_attn_vectors'].to(config.DEVICE)
            else:
                b_gold_attn_vectors = None

            logits, _ = model(b_input_ids, b_attention_masks, b_gold_attn_vectors, b_lexicon,b_emoji,b_hashtag)

            # Sigmoid then round: an independent 0/1 decision per output.
            rounded = np.round(torch.sigmoid(logits).cpu().numpy())

            # Keep only the two real classes of this head.
            pred.extend(row[:2] for row in rounded)

    print(len(pred))

    return pred

In [22]:
torch.manual_seed(config.SEED)

# Switch the shared config to the Hate + Offensive setting before building the
# datasets: get_labels then emits [Hate, Offensive, neither] (3 values).
config.COMBINE_HATE_OFFENSIVE  = True
config.COMBINE_DEFAMATION_FAKE = False
config.NUM_LABELS = 3

# The labels are materialised inside the constructor, so the config values
# above must be set before these three lines run.
train_dataset = HindiHostilityDataset("Data/Old Pickles/pkl_fine_train.pkl")
val_dataset   = HindiHostilityDataset("Data/Old Pickles/pkl_fine_valid.pkl")
test_dataset  = HindiHostilityDataset("Data/Old Pickles/pkl_fine_test.pkl")

Taking  5728  only
Taking  811  only
Taking  1653  only


In [23]:
torch.manual_seed(config.SEED)

# shuffle=False keeps every split in file order, so predictions can later be
# matched to rows by position.
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=False)
val_dataloader   = DataLoader(val_dataset,   batch_size=config.BATCH_SIZE, shuffle=False)
test_dataloader  = DataLoader(test_dataset,  batch_size=config.BATCH_SIZE, shuffle=False)

In [24]:
# Pull a single batch to sanity-check the tensor shapes the models will receive.
batch = next(iter(test_dataloader))

b_texts             = batch['posts']['text']
b_tokenized_texts   = batch['posts']['tokenized_text']
b_input_ids         = batch['posts']['input_ids']
b_attention_masks   = batch['posts']['attention_masks']
b_lexicon           = batch['posts']['lexicon_vector']
b_gold_attn_vectors = batch['posts']['gold_attn_vectors']
b_emoji             = batch['posts']['emoji_vector']
b_labels            = batch['labels']


# NOTE(review): b_labels.shape is printed twice (first and last line).
print(b_labels.shape)
print(b_gold_attn_vectors.shape)
print(b_lexicon.shape)
print(b_labels.shape)

torch.Size([16, 3])
torch.Size([16, 4, 50])
torch.Size([16, 50, 4])
torch.Size([16, 3])


In [25]:
# Shape smoke test only: no checkpoint is loaded here, so the head layers are
# randomly initialised. The batch was built with the Hate + Offensive label
# setting, but labels are not passed to the model.
model = DF_Model()
outputs, attentions = model(b_input_ids, b_attention_masks, b_gold_attn_vectors, b_lexicon,b_emoji)
outputs

tensor([[-0.3379,  0.5068,  0.3279],
        [-0.9140,  0.7598,  0.2832],
        [-0.1779,  0.5209,  0.3080],
        [-0.3962,  0.5482,  0.1996],
        [-0.4645,  0.3286,  0.3209],
        [-0.5997,  0.6249,  0.1806],
        [-0.4810,  0.4724,  0.3608],
        [-0.2981,  0.4703,  0.2339],
        [-0.4881,  0.3831,  0.1130],
        [-0.5429,  0.7935,  0.7432],
        [-0.5459,  0.3077,  0.2132],
        [-0.7081,  0.3350,  0.2339],
        [-0.5634,  0.3643,  0.0875],
        [-0.6795,  0.6869,  0.2196],
        [-0.2786,  0.2684,  0.1079],
        [-0.1590,  0.3773,  0.6171]], grad_fn=<AddmmBackward>)

<h4> HO - Fine Grained Evaluation

In [27]:
torch.manual_seed(config.SEED)

# Hate + Offensive setting: labels are [Hate, Offensive, neither].
# (Same steps as the earlier dataset cell; repeated so this section can be
# run on its own.)
config.COMBINE_HATE_OFFENSIVE  = True
config.COMBINE_DEFAMATION_FAKE = False
config.NUM_LABELS = 3

train_dataset = HindiHostilityDataset("Data/Old Pickles/pkl_fine_train.pkl")
val_dataset   = HindiHostilityDataset("Data/Old Pickles/pkl_fine_valid.pkl")
test_dataset  = HindiHostilityDataset("Data/Old Pickles/pkl_fine_test.pkl")

Taking  5728  only
Taking  811  only
Taking  1653  only


In [28]:
torch.manual_seed(config.SEED)

# Unshuffled loaders: prediction i corresponds to row i of the split.
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=False)
val_dataloader   = DataLoader(val_dataset,   batch_size=config.BATCH_SIZE, shuffle=False)
test_dataloader  = DataLoader(test_dataset,  batch_size=config.BATCH_SIZE, shuffle=False)

In [29]:
# Point the shared config at the saved Hate + Offensive model.
# HO_CHECKPOINT_PATH is a new attribute added here; it is not declared in the
# config class, whose own CHECKPOINT_PATH stays "".
config.FINE_TUNE_COARSE = False
config.LOAD_CHECKPOINT  = True
config.HO_CHECKPOINT_PATH  = "Models/Final/Supervised Attention/Two Models/128 HO HindiBert 4BertAttn CustomLoss -  F1 - 0.6108531947502773.pt"


In [30]:
# Load the Hate + Offensive checkpoint (when enabled) and predict on the test split.
if(config.LOAD_CHECKPOINT):

    ho_checkpoint_path = config.HO_CHECKPOINT_PATH
    ho_checkpoint      = torch.load(ho_checkpoint_path, map_location=torch.device(config.DEVICE))
    # Report the path that was actually loaded (config.CHECKPOINT_PATH is "").
    print("\n\n------------HO Checkpoint Loaded-----------------\n\n", ho_checkpoint_path)
else:
    # predict_labels treats None as "no checkpoint"; without this branch the
    # call below hit an undefined name.
    ho_checkpoint = None


labels = ["Hate", "Offensive"]
pred_HO = predict_labels(test_dataloader, ho_checkpoint, labels, model_name = "Hate + Offensive")



------------HO Checkpoint Loaded-----------------

 
Predicting Labels for : 
 Hate + Offensive 




------------------- Loading Checkpoint-----------------------




------------------- Checkpoint Loaded-----------------------


1653


In [31]:
pred_HO[:5]

  and should_run_async(code)


[array([1., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 0.], dtype=float32)]

<h4> DF - Fine Grained Evaluation

In [32]:
torch.manual_seed(config.SEED)

# Defamation + Fake setting: get_labels now emits [Defamation, Fake, neither].
# The datasets must be rebuilt because labels are computed in the constructor.
config.COMBINE_DEFAMATION_FAKE  = True
config.COMBINE_HATE_OFFENSIVE   = False
config.NUM_LABELS = 3

train_dataset = HindiHostilityDataset("Data/Old Pickles/pkl_fine_train.pkl")
val_dataset   = HindiHostilityDataset("Data/Old Pickles/pkl_fine_valid.pkl")
test_dataset  = HindiHostilityDataset("Data/Old Pickles/pkl_fine_test.pkl")

  df          = pickle.load(open(dataset_path, 'rb'))


Taking  5728  only


  df          = pickle.load(open(dataset_path, 'rb'))


Taking  811  only


  df          = pickle.load(open(dataset_path, 'rb'))


Taking  1653  only


In [33]:
torch.manual_seed(config.SEED)

# Unshuffled loaders, so these predictions line up row-for-row with pred_HO.
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=False)
val_dataloader   = DataLoader(val_dataset,   batch_size=config.BATCH_SIZE, shuffle=False)
test_dataloader  = DataLoader(test_dataset,  batch_size=config.BATCH_SIZE, shuffle=False)

In [34]:
# Point the shared config at the saved Defamation + Fake model.
# DF_CHECKPOINT_PATH is a new attribute added here, separate from CHECKPOINT_PATH.
config.LOAD_CHECKPOINT  = True
config.DF_CHECKPOINT_PATH  = "Models/Final/Supervised Attention/Two Models/128 DF HindiBert 4BertAttn CustomLoss -  F1 - 0.6127511129760648.pt"


In [35]:
# Load the Defamation + Fake checkpoint (when enabled) and predict on the test split.
if(config.LOAD_CHECKPOINT):
    df_checkpoint_path = config.DF_CHECKPOINT_PATH
    df_checkpoint      = torch.load(df_checkpoint_path, map_location=torch.device(config.DEVICE))
    # Report the path that was actually loaded (config.CHECKPOINT_PATH is "").
    print("\n\n------------DF Checkpoint Loaded-----------------\n\n", df_checkpoint_path)
else:
    # predict_labels treats None as "no checkpoint"; without this branch the
    # call below hit an undefined name.
    df_checkpoint = None


labels = ["Defamation", "Fake"]
pred_DF = predict_labels(test_dataloader, df_checkpoint, labels, model_name = "Defamation + Fake")



------------DF Checkpoint Loaded-----------------

 
Predicting Labels for : 
 Defamation + Fake 




------------------- Loading Checkpoint-----------------------




------------------- Checkpoint Loaded-----------------------


1653


In [36]:
pred_DF[:5]

  and should_run_async(code)


[array([0., 0.], dtype=float32),
 array([0., 1.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 1.], dtype=float32),
 array([0., 0.], dtype=float32)]

In [37]:
pred_HO[:5]

[array([1., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([0., 0.], dtype=float32)]

In [38]:
# Join the two 2-class predictions of every sample into one 4-value row,
# ordered [Defamation, Fake, Hate, Offensive].
pred = [list(df_pair) + list(ho_pair) for df_pair, ho_pair in zip(pred_DF, pred_HO)]

print(pred[:5])

[[0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]


In [39]:
# Load the ground-truth frame; the context manager closes the file handle,
# which the bare open() call leaked (ResourceWarning in the cell output).
# NOTE(security): unpickling runs arbitrary code; only load trusted files.
# NOTE(review): predictions were made on "Data/Old Pickles/pkl_fine_test.pkl",
# but this reads "Data/pkl_fine_test.pkl" — confirm both hold the same rows in
# the same order before comparing.
with open("Data/pkl_fine_test.pkl", 'rb') as handle:
    test_df = pickle.load(handle)
# test_df = test_df[test_df['Hostile']==1]
# test_df = test_df.reset_index(drop=True)


# Back to the plain fine-grained setting so get_labels returns
# [Defamation, Fake, Hate, Offensive], matching the order of the merged predictions.
config.COMBINE_DEFAMATION_FAKE  = False
config.COMBINE_HATE_OFFENSIVE   = False
config.NUM_LABELS = 4

true = get_labels(test_df)
print(true[:5])
test_df.head(5)


  test_df = pickle.load(open("Data/pkl_fine_test.pkl", 'rb'))


[[1.0, 1.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]


Unnamed: 0,Unique ID,Post,Processed With Emoji,Processed Without Emoji,Tokenized Post,All IndicFT Embeddings,Padded IndicFT Embeddings,Softmaxed Lex Padded,Softmaxed Lex Average,Softmaxed Lex Mask,...,emojis_list,replace_emoji,emoji2text,emoji2vec,Hostile,Defamation,Fake,Hate,Offensive,Labels Set
0,1,कीस की को रोजगार चाहिए फिर नहीं कहना रोजगार नह...,कीस की को रोजगार चाहिए फिर नहीं कहना रोजगार नह...,कीस की को रोजगार चाहिए फिर नहीं कहना रोजगार नह...,▁की स ▁की ▁को ▁रोजगार ▁चाहिए ▁फिर ▁नहीं ▁कहना ...,"[[-0.010175878, 0.17647043, 0.32597065, -0.152...","[[-0.010175878, 0.17647043, 0.32597065, -0.152...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...","[0.12281525615536089, 0.10788757219638383, 0.1...","[0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, ...",...,,कीस की को रोजगार चाहिए फिर नहीं कहना रोजगार नह...,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,1,1,0,1,"defamation,fake,offensive"
1,2,पटना: BMP कैंप में पुरुष और महिला कांस्टेबल ने...,पटना BMP कैंप में पुरुष और महिला कांस्टेबल ने ...,पटना BMP कैंप में पुरुष और महिला कांस्टेबल ने ...,▁पटना ▁कैंप ▁में ▁पुरुष ▁और ▁महिला ▁कांस्टेबल ...,"[[-0.02822564, -0.34272358, 0.015664268, -0.26...","[[-0.02822564, -0.34272358, 0.015664268, -0.26...","[[0.21712413297477984, 0.4650181247330267, 0.1...","[0.10860063058899874, 0.2558063704043224, 0.10...","[1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, ...",...,,पटना: BMP कैंप में पुरुष और महिला कांस्टेबल ने...,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,0,0,0,non-hostile
2,3,"कोई भी कांग्रेसी, ऊंची छत पर, रेलवे लाइन पर, ऊ...",कोई भी कांग्रेसी ऊंची छत पर रेलवे लाइन पर ऊंची...,कोई भी कांग्रेसी ऊंची छत पर रेलवे लाइन पर ऊंची...,▁कोई ▁भी ▁कांग्रेस ी ▁ऊंची ▁छत ▁पर ▁रेलवे ▁लाइ...,"[[0.076749325, 0.09945227, 0.01090258, 0.20269...","[[0.076749325, 0.09945227, 0.01090258, 0.20269...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...","[0.1490461053213771, 0.19102224971784912, 0.10...","[0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, ...",...,🙏😂👍,"कोई भी कांग्रेसी, ऊंची छत पर, रेलवे लाइन पर, ऊ...",folded_hands face_with_tears_of_joy thumbs_up,"[0.045116836205124855, -6.391700298991054e-05,...",1,0,0,1,0,hate
3,4,अंडरवर्ल्ड डॉन छोटा राजन के भाई को बीजेपी द्वा...,अंडरवर्ल्ड डॉन छोटा राजन के भाई को बीजेपी द्वा...,अंडरवर्ल्ड डॉन छोटा राजन के भाई को बीजेपी द्वा...,▁ अंडरवर्ल्ड ▁डॉन ▁छोटा ▁राज न ▁के ▁भाई ▁को ▁ब...,"[[0.24119985, -0.019852951, -0.09618474, 0.396...","[[0.24119985, -0.019852951, -0.09618474, 0.396...","[[0.0, 0.0, 0.0, 0.0], [0.25, 0.25, 0.25, 0.25...","[0.2565532888839337, 0.16772352341047086, 0.09...","[0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, ...",...,,अंडरवर्ल्ड डॉन छोटा राजन के भाई को बीजेपी द्वा...,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,1,0,0,fake
4,5,RT @_Pb_swain_: इन पंचर छापों को कोन समझाए कि ...,RT someuser इन पंचर छापों को कोन समझाए कि उनके...,RT someuser इन पंचर छापों को कोन समझाए कि उनके...,▁इन ▁पंच र ▁छाप ों ▁को ▁को न ▁समझा ए ▁कि ▁उनके...,"[[0.015281931, 0.2259393, -0.053103715, 0.0909...","[[0.015281931, 0.2259393, -0.053103715, 0.0909...","[[0.0, 0.0, 0.0, 0.0], [0.20008834622120195, 0...","[0.13704723817167858, 0.13018843364746632, 0.1...","[0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, ...",...,👇😂😂😂😂,RT @_Pb_swain_: इन पंचर छापों को कोन समझाए कि ...,backhand_index_pointing_down face_with_tears_o...,"[-0.00757233202457428, -0.017534792050719263, ...",1,1,0,0,0,defamation


In [40]:
# NOTE(review): the messages below say "Hostile Posts Only", but the
# hostile-only filter in the test-loading cell is commented out, so these
# scores appear to cover every test post — confirm before reporting.

# average=None yields one F1 value per label column, so this is the per-class
# breakdown (it was previously printed under a "Weighted" heading).
per_class_f1 = f1_score(true, pred, average=None)
print("\n Per-class F1 - Score (For Fine Grained - Hostile Posts Only): ", per_class_f1)
# `f1` ends up holding the weighted score, as it did before.
f1 = f1_score(true, pred, average='weighted')
print("\n Weighted F1 - Score (For Fine Grained - Hostile Posts Only): ", f1)


# Silence warnings only while the report is produced. catch_warnings restores
# the previous filter configuration on exit instead of overwriting the global
# filters with 'default'.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print("\n\n",classification_report(true,pred))


 Weighted F1 - Score (For Fine Grained - Hostile Posts Only):  [0.26724138 0.42050391 0.41608392 0.46021505]

 Weighted F1 - Score (For Fine Grained - Hostile Posts Only):  0.40142555655052653


               precision    recall  f1-score   support

           0       0.49      0.18      0.27       169
           1       0.30      0.72      0.42       334
           2       0.35      0.51      0.42       234
           3       0.43      0.49      0.46       219

   micro avg       0.34      0.52      0.41       956
   macro avg       0.39      0.48      0.39       956
weighted avg       0.38      0.52      0.40       956
 samples avg       0.26      0.26      0.25       956



  and should_run_async(code)


In [147]:
# Label name for each slot of a prediction vector, in slot order.
label_names = ("defamation", "fake", "hate", "offensive")

# Build one [Unique ID, Labels Set] row per prediction. IDs are 1-based
# positions in `pred`. Loop variables are named so they neither shadow the
# builtin `id` nor overwrite the notebook-level `labels` list.
rows = []
for post_id, flags in enumerate(pred, start=1):
    # Keep the names whose flag is exactly 1.0, preserving slot order.
    active = [name for name, flag in zip(label_names, flags) if flag == 1.0]
    # A post with no active label is reported as non-hostile.
    label_set = ",".join(active) if active else "non-hostile"
    rows.append([post_id, label_set])


df = pd.DataFrame(data = rows, columns= ["Unique ID", "Labels Set"])
print(df.shape)
df.head()
            

(1653, 2)


Unnamed: 0,Unique ID,Labels Set
0,1,fake
1,2,fake
2,3,"fake,hate"
3,4,fake
4,5,fake


In [148]:
# List the current working directory (the next cell writes into Dumps/).
ls

[0m[01;34mCode[0m/  [01;34mData[0m/  [01;34mDumps[0m/  [01;34mModels[0m/  [01;34mPhotos[0m/  [01;34mTensorboard[0m/


  and should_run_async(code)


In [149]:
# Write the (Unique ID, Labels Set) frame to CSV; index=False omits the pandas index column.
df.to_csv("Dumps/ensemble_check.csv", index=False)