In [None]:
################################
# Tunisian sentiment analysis
# The model is a pre-trained ELMo encoder with linear layers stacked on top of it for classification


In [None]:
#deal with tensors
import torch   
import sys
#handling text data
from torchtext import data    
import pandas as pd
from torchtext.vocab import Vectors
from collections import Counter

In [None]:
import os
import json

In [None]:
from time import time

In [None]:
#Reproducing same results
SEED = 2

# Seed every random number generator the notebook relies on *before* any data
# shuffling or weight initialisation takes place. pandas' DataFrame.sample falls
# back to NumPy's global generator when no random_state is passed, so seeding
# torch alone is not enough.
import random
import numpy as np

random.seed(SEED)
np.random.seed(SEED)

#Torch
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # documented as a silent no-op when CUDA is unavailable

#Cuda algorithms
torch.backends.cudnn.deterministic = True
# The cuDNN autotuner may select different kernels from run to run; keep it off.
torch.backends.cudnn.benchmark = False

In [None]:
# Load the labelled training set from the working directory.
# NOTE(review): the name `train` is rebound further down by `def train(...)`,
# so this DataFrame is only reachable under this name until that cell runs.
train = pd.read_csv("train.csv")


In [None]:
# Make sure every entry of the text column is a Python string, then preview the frame
train["text"] = train["text"].map(str)
train.head()

In [None]:
# corpus collects one token list per training sentence, in row order
corpus = []


def to_corpus(row):
    """Whitespace-tokenize ``row``, record the tokens in ``corpus`` and return them."""
    tokens = str(row).split()
    corpus.append(tokens)
    return tokens


# Side effect: every call above also appends to the module-level ``corpus`` list
train["samples"] = train["text"].apply(to_corpus)

In [None]:
# Flatten the whole training text into a single token stream for vocabulary counting.
# Split on arbitrary whitespace (str.split() without an argument), exactly as
# to_corpus does: split(" ") would produce empty-string tokens for repeated
# spaces and would not break on tabs/newlines, so the vocabulary would not match
# the tokenized samples.
texts = " ".join(train['text'].tolist())
words = texts.split()

In [None]:
# Token frequencies over the whole training text
dictionary = Counter(words)
print("Size of Vocab", len(dictionary))

# Special symbols first, then every token from most to least frequent
sorted_vocab = ["<S>", "</S>", "<UNK>"] + [token for token, _ in dictionary.most_common()]

In [None]:
# Write the vocabulary, one token per line.
# The context manager guarantees the file is closed even if a write fails, and the
# explicit UTF-8 encoding keeps non-ASCII tokens writable on platforms whose
# default text encoding is not UTF-8.
with open("train_vocab.txt", "w", encoding="utf-8") as fp:
    fp.writelines(token + "\n" for token in sorted_vocab)



In [None]:
# Shuffle the rows. Passing random_state makes the shuffle (and everything that
# depends on row order downstream) repeatable across kernel restarts; SEED is the
# notebook-wide seed defined near the top.
train_d = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
train_d.head()

In [None]:
# Number of tokens in each sample
train_d["count"] = train_d["samples"].map(len)

In [None]:
# Keep only sentences of at most MAX_SENTENCE_TOKENS tokens.
# Named constant instead of a bare literal so the cut-off is easy to find and tune.
MAX_SENTENCE_TOKENS = 80
# .copy() gives an independent frame, so later column assignments on train_d
# cannot trigger pandas' chained-assignment warning.
train_d = train_d[train_d["count"] <= MAX_SENTENCE_TOKENS].copy()

In [None]:
# Display how many samples there are for each sentence length (in tokens)
train_d["count"].value_counts()

In [None]:
# Import locally so this cell also runs on a fresh kernel: the notebook's main
# import of Elmo only happens in a later cell, which made this call a NameError
# under "Restart & Run All".
from allennlp.modules.elmo import Elmo
help(Elmo)

In [None]:
import torch.nn as nn
from allennlp.modules.elmo import Elmo, batch_to_ids


class Classifier(nn.Module):
    """Sentence classifier: an ELMo encoder followed by a small feed-forward head.

    Args:
        vocab_size: accepted for interface compatibility; not used by this class.
        embedding_dim: input width of the first linear layer; it has to equal the
            size of the vectors ELMo produces.
        hidden_dim: width of the hidden dense layer.
        output_dim: number of output units.
        dropout: dropout probability, passed to ELMo and used before the output layer.
        options_file: path of the ELMo options JSON file.
        weight_file: path of the ELMo weights file.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 dropout, options_file, weight_file):
        super().__init__()

        # ELMo encoder; the third positional argument (1) is, per the allennlp docs,
        # the number of output representations. requires_grad=True fine-tunes it.
        self.elmo = Elmo(options_file, weight_file, 1, requires_grad=True, dropout=dropout)

        # Feed-forward head: linear -> ReLU -> dropout -> linear -> sigmoid
        self.linear = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.act = nn.Sigmoid()

    def forward(self, sentences):
        """Score a batch of sentences.

        Args:
            sentences: character-id tensor as expected by ELMo
                (presumably the output of ``batch_to_ids`` - confirm with callers).

        Returns:
            Tensor of shape [batch size, output_dim]. The values have already been
            passed through a sigmoid, so they should be paired with a loss that
            expects probabilities rather than one that applies its own sigmoid.
        """
        elmo_output = self.elmo(sentences)

        # Only one representation was requested, hence index 0:
        # shape [batch size, sent_len, emb dim]
        token_vectors = elmo_output["elmo_representations"][0]

        # The vector of the first token stands in for the whole sentence; the slice
        # is already [batch size, emb dim], so no extra reshaping is required.
        sentence_vector = token_vectors[:, 0, :]

        hidden = self.dropout(self.relu(self.linear(sentence_vector)))
        return self.act(self.fc(hidden))

In [None]:
#No. of trainable parameters
def count_parameters(model):
    """Return the total number of elements in the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
import torch.optim as optim
from torch import LongTensor

#define metric
def binary_accuracy(preds, y):
    """Fraction of predictions that equal the labels after rounding.

    Args:
        preds: tensor of scores in [0, 1]; rounded to the nearest integer before
            the comparison.
        y: tensor of 0/1 labels holding the same number of elements as ``preds``.

    Returns:
        0-d float tensor with the share of matching elements.

    Raises:
        RuntimeError: if ``preds`` and ``y`` hold a different number of elements.
    """
    # Flatten both sides first. Comparing a [N, 1] tensor against a [N] tensor would
    # broadcast to [N, N] and silently produce a wrong accuracy, and a 0-d tensor
    # (single-sample batch after squeeze) has no len().
    rounded_preds = torch.round(preds).reshape(-1)
    targets = y.reshape(-1)

    correct = (rounded_preds == targets).float()
    acc = correct.sum() / correct.numel()
    return acc
    


In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator`` and report the epoch metrics.

    Relies on the module-level names ``device``, ``time``, ``format_time`` and
    ``binary_accuracy``.

    Args:
        model: module called as ``model(inputs)``; expected to return one score per
            sample, shaped [batch size, 1].
        iterator: sized iterable of batches in which ``batch[0]`` holds the inputs
            and ``batch[1]`` the labels.
        optimizer: optimizer that is stepped once per batch.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``. Both are unweighted means over the
        batches, so a smaller final batch counts as much as a full one.
    """
    t0 = time()
    print("\ntraining process...:")

    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    #set the model in training phase
    model.train()
    iterator_length = len(iterator)

    for step, batch in enumerate(iterator):

        # Progress report every 50 batches (skipping the very first one)
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time() - t0)
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, iterator_length, elapsed))

        # [0]: input ids
        # [1]: labels
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        #resets the gradients after every batch
        optimizer.zero_grad()

        # Drop only the trailing singleton dimension: [batch, 1] -> [batch].
        # A bare squeeze() would also remove the batch dimension when the last
        # batch holds a single sample, leaving a 0-d tensor that no longer lines
        # up with the labels.
        predictions = model(b_input_ids).squeeze(-1)
        b_labels = b_labels.type_as(predictions)

        #compute the loss
        loss = criterion(predictions, b_labels)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, b_labels)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print("\ntraining_epoch_loss: ",epoch_loss/iterator_length,"\ntraining_epoch_acc: ",epoch_acc/iterator_length)

    return epoch_loss / iterator_length , epoch_acc / iterator_length

In [None]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    Relies on the module-level names ``device``, ``time``, ``format_time`` and
    ``binary_accuracy``.

    Args:
        model: module called as ``model(inputs)``; expected to return one score per
            sample, shaped [batch size, 1].
        iterator: sized iterable of batches in which ``batch[0]`` holds the inputs
            and ``batch[1]`` the labels.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, both unweighted means over the batches.
    """
    t0 = time()
    print("\nvalidation process...:")
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for step, batch in enumerate(iterator):

            # Progress report every 40 batches (skipping the very first one)
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time() - t0)
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(iterator), elapsed))

            # [0]: input ids
            # [1]: labels
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            # Drop only the trailing singleton dimension: [batch, 1] -> [batch].
            # A bare squeeze() would collapse a single-sample batch to a 0-d tensor.
            predictions = model(b_input_ids).squeeze(-1)
            b_labels = b_labels.type_as(predictions)

            #compute loss and accuracy
            loss = criterion(predictions, b_labels)
            acc = binary_accuracy(predictions, b_labels)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    print("\nvalidation_epoch_loss: ",epoch_loss/len(iterator),"\nValidation_epoch_acc: ",epoch_acc/len(iterator))
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def predict(model,iterator):
    """Run the model over ``iterator`` and collect its outputs with the labels.

    Relies on the module-level name ``device``.

    Args:
        model: module called as ``model(inputs)``; expected to return one score per
            sample, shaped [batch size, 1].
        iterator: iterable of batches in which ``batch[0]`` holds the inputs and
            ``batch[1]`` the labels.

    Returns:
        Tuple ``(preds, labels)`` of 1-D tensors on ``device``; ``labels`` is cast to
        the dtype of the predictions. Both are empty when ``iterator`` yields nothing.
    """
    model.eval()
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():

        for batch in iterator:
            # [0]: input ids
            # [1]: labels
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            # Drop only the trailing singleton dimension: [batch, 1] -> [batch].
            # A bare squeeze() turns a single-sample batch into a 0-d tensor, which
            # torch.cat refuses to concatenate.
            predictions = model(b_input_ids).squeeze(-1)

            pred_chunks.append(predictions)
            label_chunks.append(b_labels.type_as(predictions))

    if not pred_chunks:
        # Same result the accumulate-from-empty approach gave for an empty iterator
        return torch.FloatTensor().to(device), torch.FloatTensor().to(device)

    # Concatenate once at the end instead of re-copying the growing result per batch
    return torch.cat(pred_chunks), torch.cat(label_chunks)
        

        

In [None]:
# Token lists and their labels, in the row order of the shuffled, length-filtered frame
sentences = train_d["samples"].tolist()
labels = train_d["label"].values

#sentences

In [None]:
#help(batch_to_ids)

In [None]:
#input_ids of shape [len(batch), max sentence length, max word length]
# batch_to_ids comes from allennlp.modules.elmo; it converts the token lists into
# the character-id tensor that the Elmo module consumes.
input_ids = batch_to_ids(sentences)

In [None]:
# Sanity check: display the dimensions of the character-id tensor
input_ids.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 18% of the samples for validation; random_state pins the split
(train_inputs, validation_inputs,
 train_labels, validation_labels) = train_test_split(
    input_ids,
    labels,
    test_size=0.18,
    random_state=2020,
)

In [None]:
# Sanity check: display the dimensions of the training split
train_inputs.shape

In [None]:
# torch.as_tensor converts NumPy arrays and passes existing tensors through
# untouched. torch.tensor(...) on something that is already a tensor makes a
# needless copy and emits a UserWarning about copy-constructing from a tensor.
train_inputs = torch.as_tensor(train_inputs)
validation_inputs = torch.as_tensor(validation_inputs)

train_labels = torch.as_tensor(train_labels)
validation_labels = torch.as_tensor(validation_labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 128


def _build_loader(inputs, targets, sampler_cls):
    """Wrap tensors in a TensorDataset and return (dataset, sampler, DataLoader)."""
    dataset = TensorDataset(inputs, targets)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn in random order
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_labels, RandomSampler)

# Validation batches are read sequentially
validation_data, validation_sampler, validation_dataloader = _build_loader(
    validation_inputs, validation_labels, SequentialSampler)

from shutil import copyfile

# Directory holding the pre-trained ELMo artefacts. It defaults to the location
# used originally; set the ELMO_SOURCE_DIR environment variable to run the
# notebook on another machine without editing this cell.
ELMO_SOURCE_DIR = os.environ.get(
    "ELMO_SOURCE_DIR",
    "/home/mahmoud/Desktop/Tunisian Dialect Language Model TDLM/Sentiment analysis/ELMo/swb",
)

# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs("model", exist_ok=True)

# (path relative to ELMO_SOURCE_DIR, destination inside ./model)
for relative_source, destination in (
    ("vocab.txt", "model/vocab.txt"),
    ("checkpoint/options.json", "model/options.json"),
    ("swb_weights.hdf5", "model/swb_weights.hdf5"),
):
    copyfile(os.path.join(ELMO_SOURCE_DIR, relative_source), destination)

#set n_characters to 262 in options.json
# NOTE(review): presumably the character-vocabulary size allennlp's ELMo expects - confirm.
# Context managers make sure the file is closed even when parsing or dumping fails.
with open("model/options.json", "r") as a_file:
    json_object = json.load(a_file)

#print(json_object)
json_object["char_cnn"]["n_characters"] = 262

with open("model/options.json", "w") as a_file:
    json.dump(json_object, a_file)

In [None]:
# Vocabulary file written earlier in this notebook.
# NOTE(review): not referenced again in the visible part of the notebook.
vocab_file = "train_vocab.txt"

#parameters of the model
# ELMo options and weights, as copied into ./model
options_file = "model/options.json"
weight_file = "model/swb_weights.hdf5"

# Passed to Classifier, which accepts it but does not use it
vocab_size = len(sorted_vocab)
# Input width of the classifier's first linear layer; has to equal the size of the ELMo vectors
embedding_dim = 128    #Size of ELMO pretrained embeddings
hidden_dim = 32  # width of the hidden dense layer
output_dim = 1   # a single score per sentence
n_layers = 1   # NOTE(review): not referenced again in the visible part of the notebook
dropout = 0.5  # used inside ELMo and before the output layer




In [None]:
# Build the classifier on top of the pre-trained ELMo files
model = Classifier(vocab_size,embedding_dim,hidden_dim,output_dim,dropout,
              options_file,weight_file)

# Move to the GPU only when one is available. An unconditional model.cuda()
# raises on CPU-only machines, although the rest of the notebook already falls
# back to the CPU. Module.to returns the module, so the cell still displays it.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
# Count once, then report
N_trainable_params = count_parameters(model)
print("model parameters:\n", N_trainable_params)

In [None]:
from IPython.display import clear_output
from torch import optim
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#push to cuda if available
# Done before the optimizer is built, as the torch.optim documentation recommends.
model = model.to(device)

optimizer = optim.Adam(model.parameters())

# Classifier.forward already ends in a Sigmoid, so the model outputs are
# probabilities. BCEWithLogitsLoss applies a sigmoid of its own, which squashed
# the outputs twice and distorted the loss and its gradients. BCELoss takes
# probabilities directly.
criterion = nn.BCELoss()
criterion = criterion.to(device)

In [None]:
import random
import numpy as np
# Re-seed Python, NumPy and torch (CPU and every CUDA device) right before training
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
#import time
import datetime
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first; durations of a day or
    more carry the day prefix that ``datetime.timedelta`` adds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
epochs = 6

# Validation metrics, one entry per epoch
loss_values_eval, acc_values_eval = [], []

t1 = time()
for epoch in range(epochs):
    # The training metrics are printed inside train(); only validation ones are kept
    train(model, train_dataloader, optimizer, criterion)

    loss_eval, acc_eval = evaluate(model, validation_dataloader, criterion)
    loss_values_eval += [loss_eval]
    acc_values_eval += [acc_eval]

elapsed = format_time(time() - t1)
print("elapsed time: ", elapsed)

In [None]:
import plotly.express as px
# One row per epoch; the frame index serves as the epoch number
f = pd.DataFrame(loss_values_eval, columns=['Loss'])
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(
    title='Evaluation loss of the Model',
    xaxis_title='Epoch',
    yaxis_title='Loss',
)
fig.show()

In [None]:
# One row per epoch; the frame index serves as the epoch number
f = pd.DataFrame(acc_values_eval, columns=['Accuracy'])
fig = px.line(f, x=f.index, y=f.Accuracy)
fig.update_layout(
    title='Evaluation accuracy of the Model',
    xaxis_title='Epoch',
    yaxis_title='Accuracy',
)
fig.show()

In [None]:
# Load the held-out test set and make sure every text entry is a Python string
test = pd.read_csv("test.csv")
test["text"] = test["text"].map(str)

test.head()

In [None]:
def tokenize(row):
    """Split ``row`` on runs of whitespace and return the list of tokens."""
    tokens = row.split()
    return tokens

# Tokenize every test sentence and preview the result
test["samples"] = test["text"].map(tokenize)
test.head()

In [None]:
# Token lists and labels of the test set (these rebind the names used for training)
sentences = test["samples"].tolist()
labels = test["label"].values

In [None]:
# Convert the test token lists to ELMo character ids; rebinds `input_ids` from the training data
input_ids = batch_to_ids(sentences)

In [None]:
# Sanity check: display the dimensions of the test character-id tensor
input_ids.shape

In [None]:
# torch.as_tensor passes an existing tensor through and converts the NumPy label
# array. torch.tensor(...) on a tensor copies it needlessly and emits a
# UserWarning about copy-constructing from a tensor.
test_inputs = torch.as_tensor(input_ids)
test_labels = torch.as_tensor(labels)

# Create the DataLoader for our test set (read sequentially, no shuffling).
test_data = TensorDataset(test_inputs, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Score the whole test set. `labels` is rebound here from the NumPy array read
# from test.csv to the tensor that predict returns.
predictions, labels = predict(model,test_dataloader)

In [None]:
# Test-set accuracy as a plain Python float
accuracy = binary_accuracy(predictions, labels).item()

In [None]:
# Report the final test-set accuracy
print(accuracy)