In [2]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm
import io

# SENTENCE_SPLITTING_USED; whether to use the splitting of reviews into sentences.
EMBED_DIM = 300
HIDDEN_DIM = 128
ATTENTION_DIM = 128
NUM_FILTERS = 86

PATIENCE_PARAMETER = 7
VALIDATION_LOSS_COMPUTE_STEP = 1

device_cpu = torch.device('cpu')
device_fast = torch.device('cpu')



if torch.has_mps:
    device_fast = torch.device('mps')
elif torch.has_cuda:
    device_fast = torch.device('cuda')

#torch.manual_seed(0)
#np.random.seed(0)
nlp = spacy.load('en_core_web_sm')
glove = GloVe()


torch.cuda.empty_cache()

print(torch.cuda.is_available())

False


In [3]:
def preprocess_text(text):    
    text = re.sub(r'<br /><br />',".",text)
    text = BeautifulSoup(text,'lxml').get_text().strip()
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = ' '.join(re.findall(r"[\w']+|[.,!;/\"]", text))
    
    new_text = []
    for word in text.split():
        if word == '':
            continue
        new_text.append(word)
    
    text = ' '.join(new_text)
    words = nlp(text)
    text =  " ".join([token.text for token in words if not token.is_punct or token.text=='/' or token.text=="\"" or token.text=="."]).strip()
    new_words = []
    for word in text.split(" "):
        if word == 'n\'t':
            if len(new_words) > 1:
                new_words[-1] = new_words[-1] + word
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    
    text = " ".join(new_words)
    return text

In [4]:
# preprocess the training data which was given for Assignment 2
def process_assignment2_training_data():
    preprocessed_dataset = []
    train_dataset_labels = []
    with open("./Train dataset.csv") as csvfile:
        csvFile = csv.reader(csvfile)
        next(csvFile)
        json_writer = jsonlines.open('processed_dataset.jsonl','w')

        for line in csvFile:
            processed_text = preprocess_text(line[0])
            label = 1.0 if line[1] == 'positive' else 0.0
            train_dataset_labels.append(label)
            json_writer.write({"text":processed_text,"label":label})
            preprocessed_dataset.append({"text":processed_text,"label":label})
    
        json_writer.close()


#process_assignment2_training_data()

In [5]:
preprocessed_dataset = []
train_dataset_labels = []


TRAIN_FILE_NAME = './processed_dataset.jsonl'

with open(TRAIN_FILE_NAME ,encoding='utf-8') as f:
#with open('processed_dataset.jsonl',encoding='utf-8') as f:
    for line in f:
        sample = json.loads(line)
        train_dataset_labels.append(sample['label'])
        preprocessed_dataset.append(sample)
      
train_dataset_labels = np.array(train_dataset_labels)

In [6]:
def getWordEmbeddingforText(text,glove=glove):
    length = 0
    words = []
    text = text.strip()
    for word in text.split(' '):
        w = word.strip()
        if w=='':
            continue
        length+=1
        word_embedding = glove[w]
        words.append(word_embedding)
    
    return torch.stack(words),length

In [7]:
# Sentences, word
def review_to_embed(review,glove=glove): 
    sentences = review.split(".")
    sentence_lengths = []
    review_embeddings = []
    num_sentences = 0
    for sentence in sentences:
        s = sentence.strip()
        if s == '':
            continue
        num_sentences += 1
        sentence_word_embeddings,sentence_length = getWordEmbeddingforText(s,glove)
        sentence_lengths.append(sentence_length)
        review_embeddings.append(sentence_word_embeddings)

    return torch.nn.utils.rnn.pad_sequence(review_embeddings,batch_first=True),sentence_lengths,num_sentences

In [8]:
class ReviewDataSet(Dataset):
    
    def __init__(self,reviews):
        super().__init__()
        self.reviews = reviews
        
    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        return self.reviews[index]

In [9]:
processed_dataset = []
for review in preprocessed_dataset:
        embeddings, sent_length ,n_sents = review_to_embed(review['text'])
        processed_dataset.append({'review': embeddings,'sent_lengths': sent_length,'length' : n_sents,'label' : review['label']})

In [10]:
def collate_function(batch_data):   
    
    inputs = [b['review'] for b in batch_data]
    sent_lengths = [ b['sent_lengths'] for b in batch_data ]
    n_sentences = [ b['length'] for b in batch_data ]
    labels = torch.tensor([b['label'] for b in batch_data])


    labels = labels.unsqueeze(1)
    max_n_sentences = max([i.shape[0] for i in inputs] )
    max_n_words = max([i.shape[1] for i in inputs])

 
    processed_inputs = []
    for inp in inputs:

        t1 = torch.permute(inp,(2,1,0))
        t1 = torch.nn.functional.pad(t1,(0,max_n_sentences-inp.shape[0],0,max_n_words-inp.shape[1]))
        t1 = torch.permute(t1,(2,1,0))
        processed_inputs.append(t1)

    final_inp = torch.stack(processed_inputs)
    #inputs = pad_sequence(inputs,batch_first=True)
    return  {'input' : final_inp , 'sent_lengths': sent_lengths , 'lengths' : n_sentences ,'labels' : labels }

In [11]:
train_idx,valid_idx = train_test_split(np.arange(train_dataset_labels.shape[0]), 
    test_size=0.2,
    shuffle= True,
    stratify= train_dataset_labels,
    random_state=0
)

dataset = ReviewDataSet(processed_dataset)
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
train_dataloader = DataLoader(dataset,16,sampler=train_sampler,collate_fn=collate_function)
valid_dataloader = DataLoader(dataset,16,sampler=valid_sampler,collate_fn=collate_function)


In [13]:
batch_data = next(iter(train_dataloader))

In [84]:
import torch.nn.functional as F

class WordAttention(nn.Module):

    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM,attention_dim = ATTENTION_DIM,num_layers=1,bidirectional=True):
        super().__init__()
        self.rnn = nn.GRU(embed_dim,hidden_dim,num_layers=num_layers,batch_first=True,bidirectional=bidirectional)
        bidirectional_factor = 2 if bidirectional else 1
        self.word_attention = nn.Linear(bidirectional_factor*hidden_dim,attention_dim)
        self.u_w = nn.Linear(attention_dim,1)
    
    def forward(self,inp,inp_len):
        
        # inp = 1 review  = [num_sentences , num_words , embed_dim]
        # inp_len = length = num_sentences , each element number of words in  sentence.

        packed_embedding = nn.utils.rnn.pack_padded_sequence(inp,inp_len,batch_first=True,enforce_sorted=False)
        packed_output,hidden = self.rnn(packed_embedding)
        outputs,_ = nn.utils.rnn.pad_packed_sequence(packed_output,batch_first=True)
        
        attention_outs = torch.tanh(self.word_attention(outputs))
        attention_scores = self.u_w(attention_outs)
        attention_scores = attention_scores.squeeze(2)
        attention_probs = F.softmax(attention_scores,dim=1)
        attention_probs = attention_scores.unsqueeze(2)
        weighted_embeddings = attention_probs * outputs
        output = torch.sum(weighted_embeddings,dim=1)
        return output


In [85]:
class SentenceAttention(nn.Module):
    
    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM,attention_dim=ATTENTION_DIM,num_layers=1,bidirectional=True):
        super().__init__()
        self.rnn = nn.GRU(embed_dim,hidden_dim,num_layers=num_layers,batch_first=True,bidirectional=bidirectional)
        bidirectional_factor = 2 if bidirectional else 1
        self.sentence_attention = nn.Linear(bidirectional_factor*hidden_dim,attention_dim)
        self.u_s = nn.Linear(attention_dim,1)

    def forward(self,sents,sent_len):
        
        packed_embedding = nn.utils.rnn.pack_padded_sequence(sents,sent_len)
        packed_output,hidden = self.rnn(packed_embedding)
        outputs,_ = nn.utils.rnn.pad_packed_sequence(packed_output,batch_first=True)

        attention_outs = torch.tanh(self.sentence_attention(outputs))
        attetnion_scores = self.u_s(attention_outs)
        attention_scores = attention_scores.squeeze(2)
        attention_probs = F.softmax(attention_scores,dim=1)
        attention_probs = attention_probs.unsqueeze(2)
        weighted_embeddings = attention_probs*outputs
        output = torch.sum(weighted_embeddings,dim=1)
        return output

In [20]:
class HierarchialAttention(nn.Module):

    def __init__(self,
                
                input_embed_dim = EMBED_DIM,
                word_encoder_hidden_dim = HIDDEN_DIM,
                word_encoder_num_layers = 1,
                word_encoder_bidirectional = True,
                word_encoder_attention_dim = HIDDEN_DIM,

                sentence_encoder_hidden_dim = HIDDEN_DIM,
                sentence_encoder_num_layers = 1,
                sentence_encoder_bidirectional= True,
                sentence_encoder_attention_dim = HIDDEN_DIM,
                rnn_dropout = 0.0,
                fc_dropout = 0.3,
            ):
      
        super().__init__()

        self.word_encoder = WordAttention(input_embed_dim,word_encoder_hidden_dim,word_encoder_attention_dim,word_encoder_num_layers,word_encoder_bidirectional)
        bidirectional_factor = 2 if word_encoder_bidirectional else 1
        self.sentence_encoder = SentenceAttention(bidirectional_factor*word_encoder_hidden_dim,sentence_encoder_hidden_dim,sentence_encoder_attention_dim,sentence_encoder_num_layers,sentence_encoder_bidirectional)
        self.fc_list = [
                nn.Linear(bidirectional_factor*sentence_encoder_hidden_dim,sentence_encoder_hidden_dim),
        ] 
        
        self.fc = nn.ModuleList(self.fc_list)
        self.fc_out = nn.Linear(sentence_encoder_hidden_dim,1)

    def forward(self,inp,inp_sentence_lengths,inp_words_lengths):
        
        sentence_embeddings = []
        for i in range(inp.shape[0]):
            sentence_embeddings.append(self.word_encoder(inp[i],inp_words_lengths[i]))
        
        batch_sentences = torch.stack(sentence_embeddings)
        doc_embedding = self.sentence_encoder(batch_sentences,inp_sentence_lengths)

        out = doc_embedding
        for i,l in enumerate(self.fc_list):
            out = l(out)

        




In [79]:
inp = torch.tensor([  [[2,3,4],[5,6,7]],[[7,8,9],[1,4,5]]])
inp_scores = torch.tensor([[[1],[2]],[[3],[4]]])


In [66]:
inp_scores.shape

torch.Size([2, 2, 1])

In [67]:
inp.shape

torch.Size([2, 2, 3])

In [69]:
inp_scores * inp

tensor([[[ 2,  3,  4],
         [10, 12, 14]],

        [[21, 24, 27],
         [ 4, 16, 20]]])

In [68]:
inp

tensor([[[2, 3, 4],
         [5, 6, 7]],

        [[7, 8, 9],
         [1, 4, 5]]])

In [80]:
word_model = WordAttention()

In [82]:
inp = torch.randn((5,14,300))
inp_lengths = torch.randint(3,14,(5,))
out = word_model(inp,inp_lengths)

In [83]:
out

tensor([[ 0.0341, -0.0069, -0.0314,  ...,  0.0415, -0.0498,  0.1286],
        [ 0.0623,  0.1266, -0.1210,  ...,  0.1599,  0.0732, -0.0543],
        [ 0.0805, -0.4734,  0.1516,  ..., -0.0677,  0.1505,  0.1741],
        [-0.0646,  0.1689, -0.1075,  ...,  0.2273, -0.0997, -0.0022],
        [-0.1843,  0.3853,  0.0317,  ...,  0.0818, -0.1666,  0.1818]],
       grad_fn=<SumBackward1>)

In [None]:
word_model()

In [None]:

class HierarchialAttentionModel(nn.Module):
    
    def __init__(self):
        super().__init__()



In [None]:
import torch.nn.functional as F
class EnsembleModel(nn.Module):
    
    def __init__(self,EMBED_DIM,CNN_DIM,HIDDEN_DIM):
        super().__init__()
        self.rnn = nn.GRU(input_size = CNN_DIM,hidden_size = HIDDEN_DIM, batch_first = True)
        self.cnn = nn.Conv1d(in_channels=EMBED_DIM,out_channels=CNN_DIM,kernel_size=3)
        self.fc = nn.Linear(HIDDEN_DIM,1)


    def forward(self,inp : torch.Tensor,n_sents=None):

        ## inp  = (batch_size,max_sent_length,max_word_length,embed_dim)

        outputs = []
      
        
        for i in range(inp.shape[1]):
            current_inp = inp[:,i,:,:]
            current_inp = torch.permute(current_inp,(0,2,1))
            current_output = self.cnn(current_inp)
            current_output = F.max_pool1d(current_output,kernel_size = current_output.shape[2]).squeeze(dim=2)
            outputs.append(current_output)
        
        #print(len(outputs))
        #print(outputs[0].shape)
        lstm_in = torch.stack(outputs,dim=1)
   
        packed_input = pack_padded_sequence(lstm_in,n_sents,batch_first=True,enforce_sorted=False)
        packed_output,hidden = self.rnn(packed_input)
        output,output_lengths = pad_packed_sequence(packed_output,batch_first=True)

        hidden = torch.permute(hidden,(1,0,2))
        hidden = hidden.contiguous().view((hidden.shape[0],-1))

        out = self.fc(hidden)
        return nn.Sigmoid()(out)
        #out = self.cnn(inp)
        #return out

batch_data = next(iter(train_dataloader))
j = EnsembleModel(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
j(batch_data['input'],batch_data['n_sent'])

KeyError: 'n_sent'

In [13]:
import torch.nn.functional as F
class CNNLSTMAttention(nn.Module):
    
    def __init__(self,EMBED_DIM,CNN_DIM,HIDDEN_DIM,bidirectional_factor = 2,fc_dropout=0.3):
        super().__init__()

        bidirectional = False

        if bidirectional_factor==2:
            bidirectional = True

        self.rnn = nn.GRU(input_size = CNN_DIM,hidden_size = HIDDEN_DIM, bidirectional=bidirectional,batch_first = True)
        
        cnn_layers = [  
            nn.Conv1d(in_channels=EMBED_DIM,out_channels=NUM_FILTERS,kernel_size=FILTER_SIZES[i]) for i in range(len(FILTER_SIZES))
        ]
        self.cnn_list = nn.ModuleList(cnn_layers)
    
        #self.cnn = nn.Conv1d(in_channels=EMBED_DIM,out_channels=CNN_DIM,kernel_size=3)
        self.attention_layer = nn.Linear(bidirectional_factor* HIDDEN_DIM,1)
        self.fc = nn.Linear(bidirectional_factor*HIDDEN_DIM,HIDDEN_DIM)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.out_fc = nn.Linear(HIDDEN_DIM,1)
        self.batchnorm1d = nn.BatchNorm1d(CNN_DIM)

    def forward(self,inp : torch.Tensor,n_sents=None):

        ## inp  = (batch_size,max_sent_length,max_word_length,embed_dim)

        outputs = []
      
        
        for i in range(inp.shape[1]):
            current_inp = inp[:,i,:,:]
            current_inp = torch.permute(current_inp,(0,2,1))
            
            current_output = None
            for cnn in self.cnn_list:
                current_out = cnn(current_inp)
                current_out = F.max_pool1d(current_out,kernel_size=current_out.shape[2]).squeeze(dim=2)

                if current_output is None:
                    current_output = current_out
                else:
                    current_output = torch.cat([current_output,current_out],dim=1)
                #current_output = self.cnn(current_inp)
                #current_output = F.max_pool1d(current_output,kernel_size = current_output.shape[2]).squeeze(dim=2)

            outputs.append(current_output)
        
        #print(len(outputs))
        #print(outputs[0].shape)
        lstm_in = torch.stack(outputs,dim=2)
        lstm_in = self.batchnorm1d(lstm_in)
        lstm_in = torch.permute(lstm_in,(0,2,1))

        packed_input = pack_padded_sequence(lstm_in,n_sents,batch_first=True,enforce_sorted=False)
        packed_output,hidden = self.rnn(packed_input)
        output,output_lengths = pad_packed_sequence(packed_output,batch_first=True)
        attention_logs = self.attention_layer(output).squeeze(dim=2)
        attention_score = F.softmax(attention_logs,dim=1).unsqueeze(2)

        final_out = attention_score*output

        averaged_vector = torch.sum(final_out,dim=1,keepdim=False)

        #hidden = torch.permute(hidden,(1,0,2))
        #hidden = hidden.contiguous().view((hidden.shape[0],-1))
        out = self.fc_dropout(F.leaky_relu(self.fc(averaged_vector)))
        out = self.out_fc(out)
        return nn.Sigmoid()(out)
        #out = self.cnn(inp)
        #return out


In [14]:
batch_data = next(iter(train_dataloader))

In [19]:
print(batch_data['input'].shape)
print(batch_data['lengths'])

torch.Size([16, 58, 65, 300])
[14, 30, 36, 29, 8, 11, 55, 14, 5, 10, 58, 13, 14, 8, 15, 20]


In [20]:
cnn = CNNLSTMAttention(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
cnn(batch_data['input'],batch_data['lengths'])

tensor([[0.4688],
        [0.4460],
        [0.4683],
        [0.4364],
        [0.4712],
        [0.4773],
        [0.4318],
        [0.4664],
        [0.4781],
        [0.4691],
        [0.5031],
        [0.4612],
        [0.4626],
        [0.4737],
        [0.4584],
        [0.4693]], grad_fn=<SigmoidBackward0>)

: 

In [None]:
import os
from torch.utils.tensorboard import SummaryWriter
from datetime import  datetime

def train(model,train_dataloader,valid_dataloader,num_epochs,criterion,optimizer,
    checkpoint_name='best_model.pt',
    device_train = device_fast,use_rnn = False,log=True):

    tensorboard_name='Ensemble'
    if log == True:
        current_datetime = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
        tensorboard_name = tensorboard_name + "_" + current_datetime
        writer = SummaryWriter('runs/' + tensorboard_name)
    
    
    model = model.to(device_train)
    clip = 0
    if use_rnn:
        clip = 5

    best_validation_loss = 1000.0
    valdiation_loss_not_decreased_steps = 0
    
    model.train()
    for e in range(num_epochs):
        
        training_set_size = 0
        training_loss = 0.0
        model.train()

        for data in tqdm(train_dataloader):
            
            optimizer.zero_grad()
            if SENTENCE_SPLITTING_USED:
                input_reviews,sent_lengths,n_sents,output_labels = data['input'], data['sent_lengths'],data['lengths'],data['labels']
            else:
                input_reviews,n_sents,output_labels = data['input'],data['lengths'],data['labels']

            input_reviews = input_reviews.to(device_train)
            training_set_size += input_reviews.shape[0]
            output = model(input_reviews,n_sents)
            output = output.to(device_cpu)
            loss = criterion(output,output_labels.float())
            training_loss += loss.item()
            loss.backward()
            if use_rnn:
                nn.utils.clip_grad_norm_(model.parameters(),clip)
            optimizer.step()
        
        current_training_loss = training_loss
        if log==True:
            print("Epoch " + str(e) + " Average Training Loss = " +  str(current_training_loss))
            writer.add_scalars(tensorboard_name + 'Training Loss vs Epoch',{'train' : current_training_loss},e)

        
        model.eval()
        
        if valid_dataloader is None:
            continue
        
        validation_set_size  = 0 
        if e% VALIDATION_LOSS_COMPUTE_STEP==0:
            correct_count = 0
            validation_loss = 0

            for i,data in enumerate(valid_dataloader,0):
                if SENTENCE_SPLITTING_USED:
                    input_reviews,sent_lengths,n_sents,output_labels = data['input'], data['sent_lengths'],data['lengths'],data['labels']
                else:
                    input_reviews,n_sents,output_labels = data['input'],data['lengths'],data['labels']
                
                input_reviews = input_reviews.to(device_train)
                validation_set_size += input_reviews.shape[0]
                output = model(input_reviews,n_sents)
                output = output.to(device_cpu)
                loss = criterion(output,output_labels.float())
                validation_loss += loss.item()
                nearest_class = torch.round(output)

                correct = (nearest_class == output_labels.float()).float()
                correct_count += correct.sum()
            correct_count = int(correct_count)
            current_validation_accuracy = (correct_count/validation_set_size)*100
            current_validation_loss = (1.0* validation_loss)
            if log == True:
                print("Epoch " + str(e) + " " +  "Validation Loss = " + str(current_validation_loss) )
                print("Validation Set Accuracy = " + str((correct_count/validation_set_size)*100) )
                writer.add_scalar(tensorboard_name + ' Validation Accuracy vs Epoch ',(correct_count/validation_set_size*100),e)
                writer.add_scalars(tensorboard_name + 'Validation Loss vs Epoch',{'valid' : current_validation_loss},e)

            
            if log==True:
                if current_validation_loss < best_validation_loss:
                    valdiation_loss_not_decreased_steps = 0
                    torch.save(model.state_dict(),checkpoint_name)
                    best_validation_loss = current_validation_loss
                else:
                    valdiation_loss_not_decreased_steps +=1
        if log == True:
            if valdiation_loss_not_decreased_steps >= PATIENCE_PARAMETER:
                break

In [26]:
torch.cuda.empty_cache()

net = CNNLSTMAttention(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
optimizer= optim.SGD(net.parameters(),lr=0.0054,momentum=0.9,nesterov=True)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0054, max_lr=0.0072,step_size_up=10000)
train(net,train_dataloader,valid_dataloader,100,nn.BCELoss(),optimizer,'test_cnn_rnn_att_adam_batch_nrom_cyclelr_bidir_0.0054.pt',device_fast,True,True)

100%|██████████| 2000/2000 [09:00<00:00,  3.70it/s]


Epoch 0 Average Training Loss = 730.8767990171909
Epoch 0 Validation Loss = 135.62008828297257
Validation Set Accuracy = 89.1


100%|██████████| 2000/2000 [08:35<00:00,  3.88it/s]


Epoch 1 Average Training Loss = 480.47654472664
Epoch 1 Validation Loss = 126.14620087854564
Validation Set Accuracy = 89.6875


100%|██████████| 2000/2000 [08:41<00:00,  3.84it/s]


Epoch 2 Average Training Loss = 402.75145566184074
Epoch 2 Validation Loss = 125.95687280595303
Validation Set Accuracy = 90.14999999999999


100%|██████████| 2000/2000 [08:36<00:00,  3.87it/s]


Epoch 3 Average Training Loss = 320.07007088838145
Epoch 3 Validation Loss = 195.673126203008
Validation Set Accuracy = 84.95


  6%|▌         | 111/2000 [00:33<09:28,  3.33it/s]


RuntimeError: CUDA out of memory. Tried to allocate 1.64 GiB (GPU 0; 2.00 GiB total capacity; 157.35 MiB already allocated; 536.18 MiB free; 444.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
torch.cuda.empty_cache()
net = EnsembleModel(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
train(net,train_dataloader,valid_dataloader,50,nn.BCELoss(),optim.Adam(net.parameters(),0.001),'cnn_rnn_dnn_adam.pt',device_fast,True,True)

  0%|          | 0/2000 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 46.00 MiB (GPU 0; 2.00 GiB total capacity; 1.64 GiB already allocated; 0 bytes free; 1.70 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [27]:
def test(model_name,test_data,test_lengths,test_labels):
    model = CNNLSTMAttention(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
    model.load_state_dict(torch.load(model_name,map_location=device_cpu))
    model.eval()
    count = 0
    for i in range(len(test_data)):
        ans = model(test_data[i],[test_lengths[i]])
        ans = torch.round(ans)
        if ans[0][0] == test_labels[i]:
            count+=1
    
    print("Accuracy = " + str((count/len(test_data)*100)))


In [28]:
test_word_embeddings = [] 
test_sentence_lengths = []
test_dataset_labels = []  

def getAssignment2TestData():
    test_processed_text = []
    with open("./E0334 Assignment2 Test Dataset.csv",encoding='utf-8') as csvfile:
        csvFile = csv.reader(csvfile)
        next(csvFile)
        for line in csvFile:
            processed_text = preprocess_text(line[0])
            label = 1.0 if line[1] == 'positive' else 0.0
            test_dataset_labels.append(label)
            test_processed_text.append(processed_text)

    for i in range(len(test_processed_text)):
        if SENTENCE_SPLITTING_USED:
            current_embeddings,current_sent_lengths,current_n_sent = review_to_embed(test_processed_text[i]) 
        else:
            current_embeddings,current_n_sent = getWordEmbeddingforText(test_processed_text[i])

        test_word_embeddings.append(current_embeddings.clone().detach().unsqueeze(0))
        test_sentence_lengths.append(current_n_sent)


def getAssignment1TestData():
    
    correct_count = 0

    reviews = open('./TestData','r',encoding='latin-1').readlines()
    for i in range(len(reviews)):
        r = reviews[i]
        reviews[i] = preprocess_text(r)

        if SENTENCE_SPLITTING_USED:
            current_embeddings,current_sent_lengths,current_n_sent = review_to_embed(reviews[i]) 
        else:
            current_embeddings,current_n_sent = getWordEmbeddingforText(reviews[i])
      
        if (i<331):
            test_dataset_labels.append(1.0)
        else:
            test_dataset_labels.append(0.0)
        test_word_embeddings.append(current_embeddings.clone().detach().unsqueeze(0))
        test_sentence_lengths.append(current_n_sent)

getAssignment2TestData()



In [29]:
test('./test_cnn_rnn_att_adam_batch_nrom_cyclelr_bidir_0.0054.pt',test_word_embeddings,test_sentence_lengths,test_dataset_labels)

Accuracy = 89.6989698969897


In [None]:
test('./first_cnn_rnn_att_adam_batch_nrom_cyclelr_bidir_0.0054.pt',test_word_embeddings,test_sentence_lengths,test_dataset_labels)

Accuracy = 80.76923076923077
