In [2]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm

# Model hyperparameters (used as constructor defaults by the models below).
EMBED_DIM = 300
HIDDEN_DIM = 128

USE_TRAINABLE_EMBEDDINGS = True
# Number of validation checks without improvement tolerated before early stopping.
PATIENCE_PARAMETER = 4
# Validation is run every this many epochs.
VALIDATION_LOSS_COMPUTE_STEP = 2
# Conv1d kernel widths and channels-per-width used by CNNModel.
FILTER_SIZES = [3,4,5,6,7]
NUM_FILTERS = 50


device_cpu = torch.device('cpu')

# Pick the fastest backend that is actually usable at runtime.
# `torch.has_mps` / `torch.has_cuda` are deprecated and only say whether the
# installed build was compiled with that backend, so a CUDA build on a machine
# without a GPU would wrongly select 'cuda'.
if torch.backends.mps.is_available():
    device_fast = torch.device('mps')
elif torch.cuda.is_available():
    device_fast = torch.device('cuda')
else:
    device_fast = torch.device('cpu')

# NOTE(review): seeding is disabled, so weight initialisation and batch order
# differ between runs; uncomment both lines for reproducible results.
#torch.manual_seed(0)
#np.random.seed(0)
# spaCy pipeline used by preprocess_text to tokenise text and flag punctuation.
nlp = spacy.load('en_core_web_sm')


In [3]:

# Load pretrained GloVe vectors with torchtext's default arguments
# (presumably the 300-d set, matching EMBED_DIM -- TODO confirm).
glove =   GloVe()
# Alternative embedding source, currently disabled.
#fasttext_model = fasttext.load_model('./crawl-300d-2M-subword/crawl-300d-2M-subword.bin')

In [None]:
# Inspect how many tokens the loaded GloVe vocabulary contains.
len(glove.itos)

In [None]:
# Repeats the vocabulary-size check from the previous cell.
len(glove.itos)

In [None]:
from torchtext.vocab import vocab

UNK_TOKEN = "<unk>"

# `vocab()` interprets the mapping's values as token frequencies and drops every
# token whose value is below `min_freq` (default 1). `glove.stoi` maps token ->
# row index, so with the default the token at index 0 would be dropped and every
# remaining index would shift by one relative to `glove.vectors`.
# `min_freq=0` keeps all tokens in their original order.
glove_vocab = vocab(glove.stoi, min_freq=0)
pretrained_embeddings = glove.vectors

# Append the unknown token (with an all-zero vector) at the end of the
# vocabulary. The index and vector width are derived from the loaded data rather
# than hard-coded, and the guard makes the cell safe to re-run.
if UNK_TOKEN not in glove_vocab:
    glove_vocab.insert_token(UNK_TOKEN, len(glove_vocab))
    pretrained_embeddings = torch.cat(
        (pretrained_embeddings, torch.zeros(1, pretrained_embeddings.shape[1]))
    )
# Out-of-vocabulary lookups on `glove_vocab` resolve to the unknown token.
glove_vocab.set_default_index(glove_vocab[UNK_TOKEN])

# Write the extended vocabulary and vectors back so later `glove[...]` lookups see them.
glove.stoi = glove_vocab.get_stoi()
glove.itos = glove_vocab.get_itos()

glove.vectors = pretrained_embeddings


In [None]:
# Look up the index of the literal word 'pad'.
# NOTE(review): this is an ordinary vocabulary word, not a dedicated padding token -- confirm intent.
glove.stoi['pad']

In [4]:

# Lookup set of ASCII punctuation characters, plus the accumulators that
# preprocess_text fills in (punct_in_file is currently never written to).
punctuations = set(string.punctuation)
punct_in_file = set()
vocab_words = set()

# Inspection files written during preprocessing; they stay open until the
# cell that processes the training CSV closes them.
file = open('processed_text.txt', 'w')
punctuation_words = open('punct.txt', 'w')
final_words = open('words.txt', 'w')
def preprocess_text(text):
    """Clean one raw review and return it as a space-separated token string.

    Steps, in order: remove ``<br /><br />`` markers, strip remaining HTML with
    BeautifulSoup, lower-case, remove ``http...`` URLs, keep only word tokens
    (letters/digits/underscore/apostrophe) and the symbols ``. , ! ; / "``,
    then run spaCy and drop punctuation tokens except ``/`` and ``"``.

    Side effects:
        * every resulting token is added to the module-level ``vocab_words`` set;
        * tokens containing an ASCII punctuation character are appended to
          ``punctuation_words`` and the cleaned text to ``file``. These log
          writes are skipped once the files have been closed, so the function
          stays usable after the training cell has closed them.

    Args:
        text: raw review text (may contain HTML).

    Returns:
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    text = re.sub(r'<br /><br />',"",text)
    text = BeautifulSoup(text,'lxml').get_text().strip()
    text = text.lower()

    text = re.sub(r"http\S+", "", text)
    # Tokens are joined with single spaces, so no further whitespace cleanup is needed.
    text = ' '.join(re.findall(r"[\w']+|[.,!;/\"]", text))

    doc = nlp(text)
    text = " ".join(
        token.text
        for token in doc
        if not token.is_punct or token.text == '/' or token.text == "\""
    ).strip()

    # The log files are closed by a later cell; writing to a closed file would
    # raise ValueError, so logging becomes a no-op from then on.
    log_punctuation = not punctuation_words.closed
    for word in text.split(" "):
        vocab_words.add(word)
        if log_punctuation and any(punct in word for punct in punctuations):
            punctuation_words.write(word + "\n")

    if not file.closed:
        file.write(text + '\n')
    return text


In [5]:

# Preprocess every review of the training CSV, keep the result in memory and
# mirror it to 'processed_dataset.jsonl' so later sessions can skip this step.
preprocessed_dataset = []
train_dataset_labels = []
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed correctly. The jsonlines writer is context-managed
# so it is flushed and closed even if preprocessing raises part-way through.
with open("./Train dataset.csv", newline='') as csvfile, \
        jsonlines.open('processed_dataset.jsonl','w') as json_writer:
    csvFile = csv.reader(csvfile)
    next(csvFile, None)  # skip the header row (tolerates an empty file)

    for line in csvFile:
        processed_text = preprocess_text(line[0])
        # Any label other than 'positive' is treated as the negative class.
        label = 1.0 if line[1] == 'positive' else 0.0
        train_dataset_labels.append(label)
        sample = {"text":processed_text,"label":label}
        json_writer.write(sample)
        preprocessed_dataset.append(sample)

train_dataset_labels = np.array(train_dataset_labels)

# Dump the collected vocabulary and close the preprocessing log files.
final_words.write(str(vocab_words))
final_words.close()
file.close()
punctuation_words.close()




In [None]:
# Reload the preprocessed samples from the JSON-lines cache: one JSON object
# per line, each carrying a 'label' entry that is collected into a NumPy array.
with open('processed_dataset.jsonl') as f:
    preprocessed_dataset = [json.loads(raw_line) for raw_line in f]
train_dataset_labels = np.array([record['label'] for record in preprocessed_dataset])


In [None]:
def getWordEmbeddingforText(text,glove=glove):
    """Embed a text token by token.

    The text is split on single spaces and every piece is looked up with
    ``glove[token]``.

    Args:
        text: space-separated tokens.
        glove: object supporting ``glove[token]`` lookups; defaults to the
            module-level ``glove`` bound when this function is defined.

    Returns:
        ``(embeddings, length)`` where ``embeddings`` stacks the per-token
        vectors along a new first dimension and ``length`` is the token count.
    """
    tokens = text.split(' ')
    vectors = [glove[token] for token in tokens]
    return torch.stack(vectors), len(tokens)

In [None]:
# Turn every preprocessed review into its embedding matrix, token count and label.
processed_dataset = [
    {'text': embedding, 'length': length, 'label': review['label']}
    for review in preprocessed_dataset
    for embedding, length in [getWordEmbeddingforText(review['text'])]
]
 

In [None]:
class ReviewDataSet(Dataset):
    """Map-style dataset that wraps an indexable collection of samples."""

    def __init__(self, data):
        """Store ``data``; it must support ``len()`` and integer indexing."""
        super(ReviewDataSet, self).__init__()
        self.data = data

    def __len__(self):
        """Return the number of wrapped samples."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, index):
        """Return the sample stored at position ``index``, unchanged."""
        sample = self.data[index]
        return sample

# Wrap the embedded reviews so they can be served through DataLoaders.
dataset = ReviewDataSet(processed_dataset)

In [None]:
# Train/validation split (80/20) over sample indices, stratified on the labels
# so both subsets keep the same class distribution; random_state fixes the split.
train_idx,valid_idx = train_test_split(np.arange(train_dataset_labels.shape[0]), 
    test_size=0.2,
    shuffle= True,
    stratify= train_dataset_labels,
    random_state=0
)

def collate_function(batch_data):
    """Merge a list of samples into one padded batch.

    Args:
        batch_data: list of dicts with keys ``'text'`` (tensor whose first
            dimension is the sequence length), ``'length'`` and ``'label'``.

    Returns:
        Dict with ``'input'`` (texts zero-padded to the longest sequence,
        batch first), ``'lengths'`` (list of the original lengths) and
        ``'labels'`` (tensor of shape ``(batch, 1)``).
    """
    texts, lengths, raw_labels = [], [], []
    for sample in batch_data:
        texts.append(sample['text'])
        lengths.append(sample['length'])
        raw_labels.append(sample['label'])

    label_column = torch.tensor(raw_labels).unsqueeze(1)
    padded_texts = pad_sequence(texts, batch_first=True)
    return {'input': padded_texts, 'lengths': lengths, 'labels': label_column}

    

# Each loader draws random batches of 64 from its own index subset of the
# shared dataset and pads them with collate_function.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
train_dataloader = DataLoader(
    dataset,
    batch_size=64,
    sampler=train_sampler,
    collate_fn=collate_function,
)
valid_dataloader = DataLoader(
    dataset,
    batch_size=64,
    sampler=valid_sampler,
    collate_fn=collate_function,
)



In [None]:
class DAN(nn.Module):
    """Averaging classifier: mean of the word vectors -> Linear -> ReLU -> Linear -> sigmoid.

    Args:
        embed_dim: size of each input word vector.
        hidden_dim: width of the hidden layer.
        droput_prob: stored as ``word_dropout_prob``; not applied in ``forward``.
        train_device: kept for backward compatibility and stored as
            ``train_device``. ``forward`` no longer relies on it: the mask is
            built on the input's own device, so the model keeps working after
            ``model.to(other_device)``.
    """

    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM, droput_prob = 0.3, train_device = device_cpu):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.embed_dim,self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid  = nn.Sigmoid()
        self.word_dropout_prob = droput_prob

        self.train_device = train_device

    def forward(self,inp,inp_len):
        """Return one probability per sequence.

        Args:
            inp: padded word vectors of shape ``(batch, seq_len, embed_dim)``.
            inp_len: per-sequence number of valid (non-padding) positions;
                a list of ints or an integer tensor.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``(0, 1)``.
        """
        device = inp.device
        lengths = torch.as_tensor(inp_len, device=device).reshape(-1, 1)
        positions = torch.arange(inp.shape[1], device=device).unsqueeze(0)
        # 1.0 for real tokens, 0.0 for padding; built without a Python loop
        # and directly on the same device as the input.
        inp_mask = (positions < lengths).to(inp.dtype)

        # Number of averaged positions per row; clamped so an empty sequence
        # yields a zero vector instead of NaN from a division by zero.
        token_counts = inp_mask.sum(dim=1, keepdim=True).clamp(min=1)

        total = torch.sum(inp * inp_mask.unsqueeze(2), dim=1)
        vector_average = total / token_counts
        ans = F.relu(self.fc1(vector_average))
        ans = self.sigmoid(self.fc2(ans))
        return ans
dan = DAN(train_device=device_cpu)


In [None]:
# bidirectional
# rnn_type
# use_cnn
# num_layers


class SentimentModel(nn.Module):
    """Recurrent classifier: RNN final hidden states -> Linear -> Linear -> sigmoid.

    The final hidden states of all layers and directions are concatenated and
    fed to the two linear layers (there is no non-linearity between them).

    Args:
        embed_dim: size of each input word vector.
        hidden_dim: hidden size of the recurrent layer(s).
        bidirectional: whether the RNN runs in both directions.
        rnn_type: ``'gru'`` or ``'lstm'``.
        num_layers: number of stacked recurrent layers.
        dropout: dropout between recurrent layers (passed to the RNN).

    Raises:
        ValueError: if ``rnn_type`` is not supported. Previously this left
            ``self.rnn`` as ``None`` and only failed later inside ``forward``.
    """

    def __init__(self,
            embed_dim=EMBED_DIM,hidden_dim =HIDDEN_DIM,bidirectional=False,
            rnn_type = 'gru',num_layers=1,dropout = 0.0):

        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.rnn_type = rnn_type

        rnn_classes = {'gru': nn.GRU, 'lstm': nn.LSTM}
        if rnn_type not in rnn_classes:
            raise ValueError(
                "Unsupported rnn_type " + repr(rnn_type)
                + "; expected one of " + str(sorted(rnn_classes))
            )
        self.rnn = rnn_classes[rnn_type](input_size = self.embed_dim,
            hidden_size = self.hidden_dim,
            num_layers = self.num_layers,
            batch_first = True,
            dropout = self.dropout,
            bidirectional = self.bidirectional
        )

        # One final hidden state per (layer, direction) pair is concatenated.
        bidirectional_factor = 2 if self.bidirectional else 1
        self.dnn_input_size= bidirectional_factor * self.num_layers * self.hidden_dim

        self.fc1 = nn.Linear(self.dnn_input_size,128)
        self.fc2 = nn.Linear(128,1)
        self.sigmoid  = nn.Sigmoid()

    def forward(self,x,x_len):
        """Return one probability per sequence.

        Args:
            x: padded word vectors of shape ``(batch, seq_len, embed_dim)``.
            x_len: per-sequence valid lengths, as accepted by
                ``pack_padded_sequence`` (sequences need not be sorted).

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``(0, 1)``.
        """
        packed_input = pack_padded_sequence(x,x_len,batch_first=True,enforce_sorted=False)
        # Only the final hidden state is used, so the per-step outputs are
        # not unpacked.
        _, hidden = self.rnn(packed_input)
        if isinstance(hidden, tuple):
            # LSTM returns (h_n, c_n); classify from h_n like the GRU path.
            hidden = hidden[0]
        # (layers * directions, batch, hidden) -> (batch, layers * directions * hidden)
        hidden = hidden.permute(1,0,2)
        hidden = hidden.contiguous().view((hidden.shape[0],-1))

        out = self.fc1(hidden)
        ans = self.sigmoid(self.fc2(out))
        return ans

sent = SentimentModel(bidirectional=True,num_layers=3)

In [None]:
# Tiny random batch for a smoke test of `sent`: 5 sequences of 10 vectors of
# size 300, all at full length; the first four labelled 1.0, the last 0.0.
inp = torch.randn((5, 10, 300))
length = [10] * 5
true_label = torch.ones((5, 1))
true_label[4:] = 0.0
crit = nn.BCELoss()
optimizer = optim.Adam(sent.parameters(), lr=0.01)

In [None]:
# One optimisation step on the random smoke-test batch.
sent.train()
# Clear gradients first: backward() accumulates into .grad, so without this
# every re-run of the cell would step on the sum of all previous gradients.
optimizer.zero_grad()
out = sent(inp,length)
loss = crit(out,true_label)
print(loss.item())
loss.backward()
optimizer.step()

In [None]:
# Display the module structure of the recurrent model.
sent

In [None]:
# ADD Dropout Term

class CNNModel(nn.Module):
    """Convolutional classifier over word vectors.

    One ``Conv1d`` per kernel width is applied along the sequence, each output
    is ReLU-activated and max-pooled over time, and the pooled features are
    concatenated and passed through Linear -> ReLU -> Linear -> sigmoid.

    Args:
        embed_dim: size of each input word vector (Conv1d input channels).
        hidden_dim: width of the hidden linear layer.
        filter_sizes: kernel widths, one convolution per entry.
        n_filters: output channels of every convolution.
        dropout: stored as ``self.dropout``; not applied in ``forward`` yet
            (see the "ADD Dropout Term" TODO).
    """

    def __init__(self, embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM,filter_sizes = FILTER_SIZES, n_filters = NUM_FILTERS,dropout = 0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.filter_sizes = filter_sizes
        self.num_filters = n_filters
        self.dropout = dropout

        self.modulelist = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                out_channels=self.num_filters,
                kernel_size=filter_size
            )
            for filter_size in self.filter_sizes
        ])

        self.fc1 = nn.Linear(self.num_filters*len(self.filter_sizes),hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x : torch.Tensor,xlen = None):
        """Return one probability per sequence.

        Args:
            x: padded word vectors of shape ``(batch, seq_len, embed_dim)``.
            xlen: unused; accepted so all models share the same call signature.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``(0, 1)``.
        """
        # Conv1d raises when the sequence is shorter than its kernel, so
        # zero-pad short batches up to the widest kernel.
        min_len = max(self.filter_sizes)
        if x.shape[1] < min_len:
            x = F.pad(x, (0, 0, 0, min_len - x.shape[1]))

        # Conv1d expects (batch, channels, seq_len).
        permuted_x = x.permute(0,2,1)
        x_conv_list = [F.relu(conv(permuted_x)) for conv in self.modulelist]
        # Max over the whole time axis -> (batch, n_filters, 1) per kernel width.
        x_max_pool_list = [F.max_pool1d(x_conv,kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]

        feature = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_max_pool_list],dim=1)

        out = F.relu(self.fc1(feature))
        out = self.fc2(out)
        return self.sigmoid(out)



# Use the default dimensions (EMBED_DIM, HIDDEN_DIM): the previous trial values
# (embed_dim=10, hidden_dim=2) do not match the EMBED_DIM-sized vectors the
# data loaders feed the model, so training would fail in the first Conv1d.
cnnmodel = CNNModel()
    

In [None]:
class EnsembleModel(nn.Module):
    """Placeholder for an ensemble model; it has no layers or logic yet."""

    def __init__(self) -> None:
        super(EnsembleModel, self).__init__()

    def forward(self):
        """Not implemented yet; takes no inputs and returns ``None``."""
        return None

In [None]:
from torch.utils.tensorboard import SummaryWriter
from datetime import  datetime

def _evaluate(model, dataloader, criterion, device_eval):
    """Return ``(mean per-sample loss, accuracy in percent)`` of ``model`` on ``dataloader``."""
    loss_sum = 0.0
    correct_count = 0
    sample_count = 0
    model.eval()
    with torch.no_grad():  # evaluation needs no autograd graph
        for data in dataloader:
            input_reviews,inp_lengths,output_labels = data['input'], data['lengths'],data['labels']
            targets = output_labels.float()
            output = model(input_reviews.to(device_eval),inp_lengths).to(device_cpu)
            batch_size = input_reviews.shape[0]
            # The criterion returns the batch mean; weight it by the batch size
            # so the final division by the sample count is a true average.
            loss_sum += criterion(output,targets).item() * batch_size
            correct_count += int((torch.round(output) == targets).sum().item())
            sample_count += batch_size
    if sample_count == 0:
        raise ValueError("validation dataloader yielded no samples")
    return loss_sum / sample_count, 100.0 * correct_count / sample_count


def train(model,train_dataloader,valid_dataloader,num_epochs,loss_fn,optimizer_name,check_point_name,tensorboard_name,learning_rate=0.01,device_train = device_fast,use_rnn = False):
    """Train ``model`` with early stopping, checkpointing and TensorBoard logging.

    Args:
        model: module called as ``model(inputs, lengths)`` and returning
            probabilities of shape ``(batch, 1)``.
        train_dataloader: yields dicts with keys ``'input'``, ``'lengths'``, ``'labels'``.
        valid_dataloader: same format, or ``None`` to skip validation,
            checkpointing and early stopping.
        num_epochs: maximum number of epochs.
        loss_fn: loss name; only ``'bce'`` is supported.
        optimizer_name: optimiser name; only ``'adam'`` is supported.
        check_point_name: path the best ``state_dict`` (lowest validation loss)
            is saved to; its directory is created if missing.
        tensorboard_name: run name, used as ``runs/<name>`` and as tag prefix.
        learning_rate: optimiser learning rate.
        device_train: device the model and the inputs are moved to.
        use_rnn: when true, gradients are clipped to a max norm of 5.

    Raises:
        ValueError: for an unsupported ``loss_fn`` / ``optimizer_name`` or a
            dataloader that yields no samples.

    Validation runs every ``VALIDATION_LOSS_COMPUTE_STEP`` epochs; training stops
    after ``PATIENCE_PARAMETER`` consecutive validations without improvement.
    """
    import os  # local import: only needed to create the checkpoint directory

    # Fail fast instead of crashing later on a None criterion / optimizer.
    if loss_fn != 'bce':
        raise ValueError("Unsupported loss_fn: " + repr(loss_fn))
    if optimizer_name != 'adam':
        raise ValueError("Unsupported optimizer_name: " + repr(optimizer_name))

    model = model.to(device_train)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(),lr = learning_rate)
    clip = 5  # max gradient norm, applied only when use_rnn is set

    checkpoint_dir = os.path.dirname(check_point_name)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Train and validation losses share one tag so they land on the same chart.
    loss_tag = tensorboard_name + '; Loss vs Epoch'
    best_validation_loss = float('inf')
    valdiation_loss_not_decreased_steps = 0

    writer = SummaryWriter('runs/' + tensorboard_name)
    try:
        for e in range(num_epochs):

            model.train()
            training_set_size = 0
            training_loss = 0.0

            for data in tqdm(train_dataloader):

                optimizer.zero_grad()
                input_reviews,inp_lengths,output_labels = data['input'], data['lengths'],data['labels']
                batch_size = input_reviews.shape[0]
                output = model(input_reviews.to(device_train),inp_lengths)
                output = output.to(device_cpu)
                loss = criterion(output,output_labels.float())
                # Weight the batch-mean loss by the batch size (see _evaluate).
                training_loss += loss.item() * batch_size
                training_set_size += batch_size
                loss.backward()
                if use_rnn:
                    nn.utils.clip_grad_norm_(model.parameters(),clip)
                optimizer.step()

            if training_set_size == 0:
                raise ValueError("train_dataloader yielded no samples")

            current_training_loss = training_loss / training_set_size
            print("Epoch " + str(e) + " Average Training Loss = " +  str(current_training_loss))
            writer.add_scalars(loss_tag,{'train' : current_training_loss},e)

            if valid_dataloader is None:
                continue

            # Validate on every VALIDATION_LOSS_COMPUTE_STEP-th epoch. For the
            # configured step of 2 this is the same schedule as before (epochs
            # 1, 3, 5, ...), and unlike `e % step` it also works for a step of 1.
            if (e + 1) % VALIDATION_LOSS_COMPUTE_STEP != 0:
                continue

            current_validation_loss, validation_accuracy = _evaluate(
                model, valid_dataloader, criterion, device_train
            )
            print("Epoch " + str(e) + " " +  "Validation Loss = " + str(current_validation_loss) )
            print("Validation Set Accuracy = " + str(validation_accuracy) )

            writer.add_scalar(tensorboard_name + ' Validation Accuracy vs Epoch ',validation_accuracy,e)
            writer.add_scalars(loss_tag,{'valid' : current_validation_loss},e)

            if current_validation_loss < best_validation_loss:
                valdiation_loss_not_decreased_steps = 0
                torch.save(model.state_dict(),check_point_name)
                best_validation_loss = current_validation_loss
            else:
                valdiation_loss_not_decreased_steps +=1
                if valdiation_loss_not_decreased_steps >= PATIENCE_PARAMETER:
                    break
    finally:
        # Flush pending events even when training is interrupted.
        writer.close()

In [None]:
# Shared training settings; the training cells also embed them in the checkpoint file names.
EPOCHS = 50
LOSS_FN = 'bce'
OPTIMIZER = 'adam'
LR = 0.01

In [None]:

# Train the averaging model. The same constants feed both the checkpoint name
# and the training call, so the file name always describes the actual run.
checkpoint_name = 'checkpoints/' + 'dan_' + str(EPOCHS)+"_"+ str(LOSS_FN)  +"_"+ str(OPTIMIZER) + "_" + str(LR)
train(dan,train_dataloader,valid_dataloader,EPOCHS,LOSS_FN,OPTIMIZER,checkpoint_name,'DAN',LR,device_cpu,use_rnn=False)

In [None]:
# Train the recurrent model on the CPU with gradient clipping enabled.
LR = 0.01
checkpoint_name = 'checkpoints/rnn_gru_' + '_'.join(
    str(part) for part in (EPOCHS, LOSS_FN, OPTIMIZER, LR)
)
train(
    sent,
    train_dataloader,
    valid_dataloader,
    EPOCHS,
    LOSS_FN,
    OPTIMIZER,
    checkpoint_name,
    'Simple GRU',
    learning_rate=LR,
    device_train=device_cpu,
    use_rnn=True,
)

In [None]:
# Train the convolutional model on the CPU, without gradient clipping.
checkpoint_name = 'checkpoints/cnn_' + '_'.join(
    str(part) for part in (EPOCHS, LOSS_FN, OPTIMIZER, LR)
)
train(
    cnnmodel,
    train_dataloader,
    valid_dataloader,
    EPOCHS,
    LOSS_FN,
    OPTIMIZER,
    checkpoint_name,
    'cnnmodeltrial',
    learning_rate=LR,
    device_train=device_cpu,
    use_rnn=False,
)

In [None]:
# The loader yields the dict built by collate_function. Unpacking that dict
# directly would bind the three key strings, not the tensors, so index it.
sample_batch = next(iter(train_dataloader))
batch,length,sentiments = sample_batch['input'],sample_batch['lengths'],sample_batch['labels']

In [None]:
# One optimisation step of `sent` on a single real batch.
criterion = nn.BCELoss()
optimizer = optim.Adam(sent.parameters(),lr= 0.001)
sent.train()
# Gradients left on the parameters by earlier backward() calls would be added
# to this step's gradients, so clear them first.
optimizer.zero_grad()
output = sent(batch,length)
l = criterion(output,sentiments.float())
print(l.item())
l.backward()
optimizer.step()

In [None]:
def test(checkpoint_name,test_data,test_lengths,test_labels):
    """Load a checkpoint and print its accuracy on the given samples.

    The model type is the part of the checkpoint file name before the first
    underscore (e.g. ``dan`` in ``dan_50_bce_adam_0.01``); only ``dan`` is
    supported.

    Args:
        checkpoint_name: path to a saved ``state_dict``.
        test_data: sequence of input tensors, one sample each, already shaped
            as a batch of one.
        test_lengths: per-sample sequence lengths.
        test_labels: per-sample expected labels (0.0 or 1.0).

    Raises:
        ValueError: for an unsupported model type or empty ``test_data``.
    """
    import os  # local import: only needed to parse the checkpoint path

    # basename() works for any directory prefix; the previous fixed slice
    # [14:] only matched paths starting with './checkpoints/'.
    model_name = os.path.basename(checkpoint_name)
    model_type = model_name.split('_')[0]
    if model_type == 'dan':
        model = DAN()
    else:
        raise ValueError("Unsupported model type in checkpoint name: " + repr(model_type))

    if len(test_data) == 0:
        raise ValueError("test_data is empty")

    # map_location lets a checkpoint saved from another device load on CPU.
    model.load_state_dict(torch.load(checkpoint_name, map_location='cpu'))
    model.eval()

    count = 0
    with torch.no_grad():  # inference only
        for sample, sample_length, label in zip(test_data, test_lengths, test_labels):
            ans = torch.round(model(sample,[sample_length]))
            if ans[0][0].item() == label:
                count+=1

    print("Accuracy = " + str(count/len(test_data)))





In [None]:
# Build the evaluation inputs: cleaned text and labels, then per-review
# embeddings shaped as a batch of one.
# NOTE(review): this reads "./Train dataset.csv", the same file used for
# training, so the reported accuracy is measured on training data -- confirm
# whether a separate test file was intended.
test_dataset_labels = []  
test_processed_text = []
with open("./Train dataset.csv") as csvfile:
    csvFile = csv.reader(csvfile)
    next(csvFile)  # skip the first row (presumably the header)
    for line in csvFile:
        processed_text = preprocess_text(line[0])
        # Any label other than 'positive' is treated as the negative class.
        label = 1.0 if line[1] == 'positive' else 0.0
        test_dataset_labels.append(label)
        test_processed_text.append(processed_text)



test_word_embeddings = []
test_sentence_lengths = []
for text in test_processed_text:
    embeddings,length = getWordEmbeddingforText(text)
    test_sentence_lengths.append(length)
    # Add a leading batch dimension so each sample can be fed to the model alone.
    test_word_embeddings.append(embeddings.unsqueeze(0))



In [None]:
# Evaluate the saved averaging-model checkpoint on the prepared samples.
test('./checkpoints/dan_50_bce_adam_0.01',test_word_embeddings,test_sentence_lengths,test_dataset_labels)    

In [None]:
# Score one hand-written review with the saved averaging-model checkpoint.
sample_review = "The movie is so awesome that i want to run away"
# The training text is lower-cased by preprocess_text, so lower-case the
# input here too; otherwise capitalised words map to different vectors than
# the ones the model was trained on.
embeddings,length = getWordEmbeddingforText(sample_review.lower())
dan = DAN()
# map_location lets a checkpoint saved from another device load on CPU.
dan.load_state_dict(torch.load('./checkpoints/dan_50_bce_adam_0.01', map_location='cpu'))
dan.eval()
with torch.no_grad():  # inference only
    prediction = dan(embeddings.unsqueeze(0),[length])
prediction