In [1]:
%%capture 
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

EMBED_DIM = 300
HIDDEN_DIM = 128

USE_TRAINABLE_EMBEDDINGS = True

PATIENCE_PARAMETER = 4
VALIDATION_LOSS_COMPUTE_STEP = 1


FILTER_SIZES = [3,4,5,6,7]
NUM_FILTERS = 50

device_cpu = torch.device('cpu')
device_fast = torch.device('cpu')

if torch.has_mps:
    device_fast = torch.device('mps')
elif torch.has_cuda:
    device_fast = torch.device('cuda')

#torch.manual_seed(0)
#np.random.seed(0)
nlp = spacy.load('en_core_web_sm')
glove = GloVe()

In [2]:

#punctuation_words = open('punct.txt','w')
#punct_in_file = set()
#final_words = open('words.txt','w')
#punctuations = set(list(string.punctuation))
#vocab_words = set()

def preprocess_text(text):
        
    text = re.sub(r'<br /><br />',".",text)
    text = BeautifulSoup(text,'lxml').get_text().strip()
    text = text.lower()

    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = ' '.join(re.findall(r"[\w']+|[.,!;/\"]", text))
    
    new_text = []
    for word in text.split():
        if word == '':
            continue
        new_text.append(word)
        #if '.' in word and len(word)>1:
            #if 'http' in word:
            #    words = word.split('.')
            #    words[1] = ''.join(words[1:])
            #    words = [words[0],words[1]]
            #else:
            #    words = word.split('.')
           
        #    words = word.split('.')
        #    for w in words:
        #        new_text.append(w)
        #else:
    
    text = ' '.join(new_text)
    words = nlp(text)
    text =  " ".join([token.text for token in words if not token.is_punct or token.text=='/' or token.text=="\"" or token.text=="."]).strip()
    #review =  " ".join([re.sub(r'^https?:\/\/.*[\r\n]*', '', token.text, flags=re.MULTILINE) for token in words])
    #review  = " ".join(new_text)
    new_words = []
    for word in text.split(" "):
        
        #vocab_words.add(word)
        if word == 'n\'t':
            if len(new_words) > 1:
                new_words[-1] = new_words[-1] + word
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    text = " ".join(new_words)
    return text


In [3]:
preprocessed_dataset = []
train_dataset_labels = []
with open("./Train dataset.csv") as csvfile:
    csvFile = csv.reader(csvfile)
    next(csvFile)
    json_writer = jsonlines.open('processed_dataset.jsonl','w')

    for line in csvFile:
        processed_text = preprocess_text(line[0])
        label = 1.0 if line[1] == 'positive' else 0.0
        train_dataset_labels.append(label)
        json_writer.write({"text":processed_text,"label":label})
        preprocessed_dataset.append({"text":processed_text,"label":label})
    
    json_writer.close()

train_dataset_labels = np.array(train_dataset_labels)
#final_words.write(str(vocab_words))
#final_words.close()
#file.close()
#punctuation_words.close()




In [3]:
preprocessed_dataset = []
train_dataset_labels = []
with open('processed_dataset.jsonl') as f:
    for line in f:
        sample = json.loads(line)
        train_dataset_labels.append(sample['label'])
        preprocessed_dataset.append(sample)
train_dataset_labels = np.array(train_dataset_labels)


In [4]:
def getWordEmbeddingforText(text,glove=glove):
    
    length = 0
    words = []
    for word in text.split(' '):
        length+=1
        word_embedding = glove[word]
        words.append(word_embedding)

    if length < max(FILTER_SIZES):
        while length < max(FILTER_SIZES):
            length+=1
            words.append(torch.zeros(300))

    return torch.stack(words),length

In [5]:
processed_dataset = []

for review in preprocessed_dataset:
    embedding,length = getWordEmbeddingforText(review['text'])
    processed_dataset.append({'text': embedding,'length': length,'label' : review['label']})
 

In [6]:
class ReviewDataSet(Dataset):
    def __init__(self,data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

dataset = ReviewDataSet(processed_dataset)

In [7]:
# Train and Validation split and an equal distriubition of classes
train_idx,valid_idx = train_test_split(np.arange(train_dataset_labels.shape[0]), 
    test_size=0.2,
    shuffle= True,
    stratify= train_dataset_labels,
    random_state=0
)

def collate_function(batch_data):
    inputs = [b['text'] for b in batch_data]
    lengths = [b['length'] for b in batch_data]
    labels = torch.tensor([b['label'] for b in batch_data])

    labels = labels.unsqueeze(1)
    inputs = pad_sequence(inputs,batch_first=True)
    return  {'input' : inputs , 'lengths': lengths , 'labels' : labels }

    

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
train_dataloader = DataLoader(dataset,64,sampler=train_sampler,collate_fn=collate_function)
valid_dataloader = DataLoader(dataset,64,sampler=valid_sampler,collate_fn=collate_function)

In [8]:
class DAN(nn.Module):
    
    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM, droput_prob = 0.3, train_device = device_cpu):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.embed_dim,self.hidden_dim)
        #self.fc = nn.Linear(self.hidden_dim,self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid  = nn.Sigmoid()
        self.word_dropout_prob = droput_prob

        self.train_device = train_device


    def forward(self,inp,inp_len):    
        inp_mask = torch.ones((inp.shape[0],inp.shape[1]))
        for i  in range(inp.shape[0]):
            inp_mask[i,inp_len[i]:] = 0.0

        inp_mask = inp_mask.to(self.train_device)
        inp_lengths = torch.sum(inp_mask,-1,keepdim=True).int()
        inp_lengths = inp_lengths.to(self.train_device)
        total = torch.sum(inp*(inp_mask.unsqueeze(2)),axis=1)
        vector_average = total / inp_lengths
        ans = F.relu(self.fc1(vector_average))
        #ans = F.relu(self.fc(ans))
        ans = self.sigmoid(self.fc2(ans))
        return ans

dan = DAN()


In [9]:
class RNNModel(nn.Module):

    def __init__(self,
            embed_dim=EMBED_DIM,hidden_dim =HIDDEN_DIM,bidirectional=False,
            rnn_type = 'gru',num_layers=1,rnn_dropout = 0.4,fc_dropout = 0.3):
        
        super().__init__()
        
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = rnn_dropout
        self.fc_dropout = nn.Dropout(fc_dropout)

        self.rnn = None
        
        if rnn_type == 'gru':
            self.rnn  = nn.GRU(input_size = self.embed_dim,
                hidden_size = self.hidden_dim,
                num_layers = self.num_layers,
                batch_first = True,
                dropout = self.dropout,
                bidirectional = self.bidirectional
            )
        elif rnn_type == 'rnn':
            self.rnn = nn.RNN(input_size = self.embed_dim,
                hidden_size = self.hidden_dim,
                num_layers = self.num_layers,
                batch_first = True,
                dropout = self.dropout,
                bidirectional = self.bidirectional)
        

        self.bidirectional_factor = 2 if self.bidirectional else 1
        self.dnn_input_size= self.bidirectional_factor * self.num_layers * self.hidden_dim
        self.fc_list = []

        log_base_2 = np.log2(self.dnn_input_size)
        nearest_power_2 = int(log_base_2)

        if(float(nearest_power_2) != log_base_2):
            self.fc_list.append(nn.Linear(self.dnn_input_size,2**nearest_power_2))

        while nearest_power_2 > 7 :
            self.fc_list.append(nn.Linear(2**(nearest_power_2),2**(nearest_power_2-1)))
            nearest_power_2-=1
        
        self.fc_list.append(nn.Linear(128,1))
        self.fc = nn.ModuleList(self.fc_list)

        #self.fc1 = nn.Linear(self.dnn_input_size,128)
        #self.fc2 = nn.Linear(128,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x,x_len):

        packed_input = pack_padded_sequence(x,x_len,batch_first=True,enforce_sorted=False)
        packed_output,hidden = self.rnn(packed_input)
        output,output_lengths = pad_packed_sequence(packed_output)
        #hidden = hidden.squeeze()
        hidden = torch.permute(hidden,(1,0,2))
        hidden = hidden.contiguous().view((hidden.shape[0],-1))
        out = hidden
        for i,l in enumerate(self.fc):
            if i!=(len(self.fc_list)-1):
                out = self.fc_dropout(l(out))
            
        #out = self.fc_dropout(self.fc1(hidden))
        ans = self.sigmoid(self.fc_list[len(self.fc_list)-1](out))
        return ans



In [10]:
# ADD Dropout Term

class CNNModel(nn.Module):

    def __init__(self, embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM,filter_sizes = FILTER_SIZES, n_filters = NUM_FILTERS,dropout = 0.2):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.filter_sizes = filter_sizes
        self.num_filters = n_filters
        self.dropout = nn.Dropout(p=dropout)
        
        self.modulelist = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
            out_channels=self.num_filters,
            kernel_size= self.filter_sizes[i]
            )
    
        for i in range(len(self.filter_sizes))])
        
        self.fc1 = nn.Linear(self.num_filters*len(self.filter_sizes),hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x : torch.Tensor,xlen = None):
        
        permuted_x = x.permute(0,2,1)
        x_conv_list = [F.relu(conv(permuted_x)) for conv in self.modulelist]
        x_max_pool_list = [F.max_pool1d(x_conv,kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
        
        feature = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_max_pool_list],dim=1)
        
        out = self.dropout(F.relu(self.fc1(feature)))
        out = self.fc2(out)
        return self.sigmoid(out)



cnnmodel = CNNModel(10,2,FILTER_SIZES,NUM_FILTERS)    

In [14]:
def length_after_1d_convolution(x,filter_size,max_pool_size):

    def fn(sent_len,a=filter_size,pool_size = max_pool_size):
        conv_out = sent_len - a +1
        return int(conv_out/pool_size)

    ans = list(map(fn,x))
    return ans

class EnsembleModel(nn.Module):

    def __init__(self,embed_dim = EMBED_DIM,hidden_dim = HIDDEN_DIM,filter_size = 3,num_filters=256,max_pool_size = 3,dropout = 0.3,
        rnn_bidirectional = True,
        rnn_num_layers = 1,
        rnn_dropout = 0.0
    ):
        super().__init__()
        
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.filter_size = filter_size
        self.max_pool_size = max_pool_size
        self.dropout = dropout

        self.conv = nn.Conv1d(self.embed_dim,num_filters,self.filter_size)
        self.pool = nn.MaxPool1d(max_pool_size)

        self.RNNModel = RNNModel(num_filters,hidden_dim,rnn_bidirectional,num_layers=rnn_num_layers,rnn_dropout=rnn_dropout,fc_dropout=dropout)


    def forward(self,x,x_len):
        
        x = x.permute(0,2,1)
        conv_x = F.relu(self.conv(x))
        pooled_x = self.pool(conv_x)

        new_x = pooled_x.permute(0,2,1)
        #print(new_x.shape)
        new_x_len = length_after_1d_convolution(x_len,self.filter_size,self.max_pool_size)
        #print(new_x_len)

        ans = self.RNNModel(new_x,new_x_len)
        return ans
ensemble_model = EnsembleModel()


In [12]:
import os
from torch.utils.tensorboard import SummaryWriter
from datetime import  datetime

def train(model,train_dataloader,valid_dataloader,num_epochs,criterion,optimizer,check_point_name,tensorboard_name,device_train = device_fast,use_rnn = False,log=True):
    
    if log == True:
        current_datetime = datetime.now().strftime("%d/%m/%Y_%H:%M:%S")
        tensorboard_name = tensorboard_name + "_" + current_datetime
        writer = SummaryWriter('runs/' + tensorboard_name)
    
    model = model.to(device_train)


    clip = 0
    if use_rnn:
        clip = 5

    best_validation_loss = 1000.0
    valdiation_loss_not_decreased_steps = 0
    
    model.train()
    for e in range(num_epochs):
        
        training_set_size = 0
        training_loss = 0.0
        model.train()

        for data in tqdm(train_dataloader):
            
            optimizer.zero_grad()
            input_reviews,inp_lengths,output_labels = data['input'], data['lengths'],data['labels']
            input_reviews = input_reviews.to(device_train)
            training_set_size += input_reviews.shape[0]
            output = model(input_reviews,inp_lengths)
            output = output.to(device_cpu)
            loss = criterion(output,output_labels.float())
            training_loss += loss.item()
            loss.backward()
            if use_rnn:
                nn.utils.clip_grad_norm_(model.parameters(),clip)
            optimizer.step()
        
        current_training_loss = training_loss / training_set_size
        if log==True:
            print("Epoch " + str(e) + " Average Training Loss = " +  str(current_training_loss))
            writer.add_scalars(tensorboard_name + '; Loss vs Epoch',{'train' : current_training_loss},e)
        
        model.eval()
        
        if valid_dataloader is None:
            continue
        
        validation_set_size  = 0 
        if e% VALIDATION_LOSS_COMPUTE_STEP==0:
            correct_count = 0
            validation_loss = 0

            for i,data in enumerate(valid_dataloader,0):
                input_reviews,inp_lengths,output_labels = data['input'], data['lengths'],data['labels']
                input_reviews = input_reviews.to(device_train)
                validation_set_size += input_reviews.shape[0]
                output = model(input_reviews,inp_lengths)
                output = output.to(device_cpu)
                loss = criterion(output,output_labels.float())
                validation_loss += loss.item()
                nearest_class = torch.round(output)

                correct = (nearest_class == output_labels.float()).float()
                correct_count += correct.sum()
            correct_count = int(correct_count)
            current_validation_accuracy = (correct_count/validation_set_size)*100
            current_validation_loss = (1.0* validation_loss)/validation_set_size
            if log == True:
                print("Epoch " + str(e) + " " +  "Validation Loss = " + str(current_validation_loss) )
                print("Validation Set Accuracy = " + str((correct_count/validation_set_size)*100) )

                writer.add_scalar(tensorboard_name + ' Validation Accuracy vs Epoch ',int((correct_count/validation_set_size)*100),e)
                writer.add_scalars('Loss vs Epoch',{'valid' : current_validation_loss},e)
            
            if log==True:
                if current_validation_loss < best_validation_loss:
                    valdiation_loss_not_decreased_steps = 0
                    torch.save(model.state_dict(),check_point_name)
                    best_validation_loss = current_validation_loss
                else:
                    valdiation_loss_not_decreased_steps +=1
            else:   
                with tune.checkpoint_dir(e) as checkpoint_dir:
                    g = check_point_name[14:].split('_')
                    checkpoint_name = 'tune_checkpoints/' + '_'.join(g)
                    path = os.path.join(checkpoint_dir,check_point_name)
                    torch.save((model.state_dict(),optimizer.state_dict()),path)
                tune.report(loss = current_validation_loss,accuracy = current_validation_accuracy)   

        if log == True:
            if valdiation_loss_not_decreased_steps >= PATIENCE_PARAMETER:
                break

In [None]:
EPOCHS = 50
dan = DAN(train_device=device_cpu)
criterion = nn.BCELoss()
LR = 0.01
optimizer = optim.Adam(dan.parameters(),lr=LR)

model_name = type(dan).__name__
optimizer_name = type(optimizer).__name__
loss_fn_name = type(criterion).__name__

checkpoint_name = 'checkpoints/'+model_name+ '_' + str(EPOCHS)+"_"+ loss_fn_name  +"_"+ optimizer_name + "_" + str(LR)+'_.pth'  
train(dan,train_dataloader,valid_dataloader,EPOCHS,criterion,optimizer,checkpoint_name,model_name,device_cpu,use_rnn=False)

In [None]:
EPOCHS = 50
BIDIRECTIONAL = 'F'
NUM_LAYERS = 1
bidirectional = True if BIDIRECTIONAL=='T' else False
RNN_TYPE = 'gru'
RNN_DROPOUT = 0.3 if NUM_LAYERS == 1 else 0.0

model = RNNModel(rnn_type=RNN_TYPE,bidirectional=bidirectional,num_layers=NUM_LAYERS,rnn_dropout=RNN_DROPOUT)
criterion = nn.BCELoss()
LR = 0.001
optimizer = optim.Adam(model.parameters(),lr=LR)

model_name = type(model).__name__
optimizer_name = type(optimizer).__name__
loss_fn_name = type(criterion).__name__

checkpoint_name = 'checkpoints/'+model_name+ '_' +RNN_TYPE + '_'+ str(EPOCHS)+"_"+ loss_fn_name  +"_"+ optimizer_name + "_" + str(LR) + "_" + str(NUM_LAYERS) +'_'+ BIDIRECTIONAL +'_' + str(RNN_DROPOUT)+'_.pth' 
train(model,train_dataloader,valid_dataloader,EPOCHS,criterion,optimizer,checkpoint_name,model_name,device_cpu,use_rnn=False)

In [None]:
EPOCHS = 50
model = CNNModel()
criterion = nn.BCELoss()
LR = 0.01
optimizer = optim.Adam(model.parameters(),lr=LR)

model_name = type(model).__name__
optimizer_name = type(optimizer).__name__
loss_fn_name = type(criterion).__name__

checkpoint_name = 'checkpoints/'+model_name+ '_' + str(EPOCHS)+"_"+ loss_fn_name  +"_"+ optimizer_name + "_" + str(LR) +'_.pth' 
train(model,train_dataloader,valid_dataloader,EPOCHS,criterion,optimizer,checkpoint_name,model_name,device_cpu,use_rnn=False)

In [None]:
EPOCHS = 50
BIDIRECTIONAL = 'T'
NUM_LAYERS = 1
bidirectional = True if BIDIRECTIONAL=='T' else False
RNN_DROPOUT = 0.3 if NUM_LAYERS == 1 else 0.0


model = EnsembleModel(rnn_bidirectional=bidirectional,rnn_num_layers=NUM_LAYERS,rnn_dropout=RNN_DROPOUT)
criterion = nn.BCELoss()
LR = 0.001
optimizer = optim.Adam(model.parameters(),lr=LR)

model_name = type(model).__name__
optimizer_name = type(optimizer).__name__
loss_fn_name = type(criterion).__name__

checkpoint_name = 'checkpoints/' + model_name +'_' + str(EPOCHS)+'_'+ loss_fn_name  +'_'+ optimizer_name + '_' + str(LR) + '_' + str(NUM_LAYERS)+ '_' + BIDIRECTIONAL +'_' + str(RNN_DROPOUT) +'_.pth' 
train(model,train_dataloader,valid_dataloader,EPOCHS,criterion,optimizer,checkpoint_name,model_name,device_cpu,use_rnn=False)

### Test phase

In [15]:
def test(checkpoint_name,test_data,test_lengths,test_labels):
    model_name = checkpoint_name[14:]
    parameters =  model_name.split('_')
    count = 0
    model = None
    print(parameters)
    if parameters[0]=='DAN':
        model = DAN()
    elif parameters[0] == 'CNNModel':
        model = CNNModel()
    elif parameters[0] == 'RNNModel':
        bidir = True if parameters[7]=='T' else False
        model = RNNModel(num_layers=int(parameters[6]),rnn_type=parameters[1],bidirectional=bidir)
    elif parameters[0] == 'EnsembleModel':
        bidir = True if parameters[6] =='T' else False
        num_layers = int(parameters[5])
        model = EnsembleModel(rnn_bidirectional=bidir,rnn_num_layers=num_layers,rnn_dropout=float(parameters[7]))
    
    model.load_state_dict(torch.load(checkpoint_name,map_location=device_cpu))
    model.eval()
    for i in range(len(test_data)):
        ans = model(test_data[i],[test_lengths[i]])
        ans = torch.round(ans)
        if ans[0][0] == test_labels[i]:
            count+=1
    
    print("Accuracy = " + str((count/len(test_data)*100)))


In [16]:
test_dataset_labels = []  
test_processed_text = []
with open("./E0334 Assignment2 Test Dataset.csv") as csvfile:
    csvFile = csv.reader(csvfile)
    next(csvFile)
    for line in csvFile:
        processed_text = preprocess_text(line[0])
        label = 1.0 if line[1] == 'positive' else 0.0
        test_dataset_labels.append(label)
        test_processed_text.append(processed_text)


test_word_embeddings = []
test_sentence_lengths = []
for text in test_processed_text:
    embeddings,length = getWordEmbeddingforText(text)
    test_sentence_lengths.append(length)
    test_word_embeddings.append(embeddings.unsqueeze(0))





In [24]:
test('./checkpoints/RNNModel_gru_50_BCELoss_Adam_0.001_2_F_0.3_.pth',test_word_embeddings,test_sentence_lengths,test_dataset_labels)    

['RNNModel', 'gru', '50', 'BCELoss', 'Adam', '0.001', '2', 'F', '0.3', '.pth']
Accuracy = 91.53915391539154


In [9]:
embeddings,length = getWordEmbeddingforText("the movie is great.")
#dan = DAN()
#dan.load_state_dict(torch.load('./checkpoints/DAN_50_BCELoss_Adam_0.01_.pth'))
dan_model.eval()
dan_model(embeddings.unsqueeze(0),[length])

tensor([[0.9883]], grad_fn=<SigmoidBackward0>)