In [3]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm

EMBED_DIM = 300
HIDDEN_DIM = 128

PATIENCE_PARAMETER = 4
VALIDATION_LOSS_COMPUTE_STEP = 2


device_cpu = torch.device('cpu')

device_fast = torch.device('cpu')

if torch.has_mps:
    device_fast = torch.device('mps')
elif torch.has_cuda:
    device_fast = torch.device('cuda')


#torch.manual_seed(0)
#np.random.seed(0)
nlp = spacy.load('en_core_web_sm')

In [None]:
fasttext_model = fasttext.load_model('./crawl-300d-2M-subword/crawl-300d-2M-subword.bin')

In [9]:
glove =   GloVe()

In [6]:

file = open('processed_text.txt','w')
punctuation_words = open('punct.txt','w')
break_file = open('break.txt','w')
final_words = open('words.txt','w')
punctuations = set(list(string.punctuation))

vocab_words = set()
def preprocess_text(text):
    
    
    text = re.sub(r'<br /><br />',"",text)
    text = BeautifulSoup(text,'lxml').get_text().strip()
    text = text.lower()

    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = ' '.join(re.findall(r"[\w']+|[.,!;/\"]", text))
    
    new_text = []
    for word in text.split(' '):
        if word == '':
            continue
        #if '.' in word and len(word)>1:
            #if 'http' in word:
            #    words = word.split('.')
            #    words[1] = ''.join(words[1:])
            #    words = [words[0],words[1]]
            #else:
            #    words = word.split('.')
           
        #    words = word.split('.')
        #    for w in words:
        #        new_text.append(w)
        #else:
        new_text.append(word)
    
    text = " ".join(new_text)

    words = nlp(text)
    review =  " ".join([token.text for token in words if not token.is_punct or token.text=='/' or token.text=="\"" ]).strip()
    #review =  " ".join([re.sub(r'^https?:\/\/.*[\r\n]*', '', token.text, flags=re.MULTILINE) for token in words])
    
    
    for word in review.split(" "):
        vocab_words.add(word)
        if any([punct in word for punct in punctuations]):
            punctuation_words.write(word + "\n")


    file.write(review + '\n')
    return review


In [None]:

preprocessed_dataset = []
train_dataset_labels = []
with open("./Train dataset.csv") as csvfile:
    csvFile = csv.reader(csvfile)
    next(csvFile)
    json_writer = jsonlines.open('processed_dataset.jsonl','w')

    for line in csvFile:
        processed_text = preprocess_text(line[0])
        label = 1.0 if line[1] == 'positive' else 0.0
        train_dataset_labels.append(label)
        json_writer.write({"text":processed_text,"label":label})
        preprocessed_dataset.append({"text":processed_text,"label":label})
    
    json_writer.close()

train_dataset_labels = np.array(train_dataset_labels)
final_words.write(str(vocab_words))
final_words.close()
file.close()
punctuation_words.close()
break_file.close()


In [20]:
preprocessed_dataset = []
train_dataset_labels = []
with open('processed_dataset.jsonl') as f:
    for line in f:
        sample = json.loads(line)
        train_dataset_labels.append(sample['label'])
        preprocessed_dataset.append(sample)
train_dataset_labels = np.array(train_dataset_labels)


In [11]:
def getWordEmbeddingforText(text,glove=glove):
    
    length = 0
    words = []
    for word in text.split(' '):
        length+=1
        word_embedding = glove[word]
        words.append(word_embedding)
    
    return torch.stack(words),length



In [None]:
processed_dataset = []

for review in preprocessed_dataset:
    embedding,length = getWordEmbeddingforText(review['text'])
    processed_dataset.append({'text': embedding,'length': length,'label' : review['label']})
 

In [22]:
class ReviewDataSet(Dataset):
    def __init__(self,data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

dataset = ReviewDataSet(processed_dataset)

In [23]:
# Train and Validation split and an equal distriubition of classes
train_idx,valid_idx = train_test_split(np.arange(train_dataset_labels.shape[0]), 
    test_size=0.2,
    shuffle= True,
    stratify= train_dataset_labels,
    random_state=0
)

def collate_function(batch_data):
    inputs = [b['text'] for b in batch_data]
    lengths = [b['length'] for b in batch_data]
    labels = torch.tensor([b['label'] for b in batch_data])

    labels = labels.unsqueeze(1)
    inputs = pad_sequence(inputs,batch_first=True)
    return  {'input' : inputs , 'lengths': lengths , 'labels' : labels }

    

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
train_dataloader = DataLoader(dataset,64,sampler=train_sampler,collate_fn=collate_function)
valid_dataloader = DataLoader(dataset,64,sampler=valid_sampler,collate_fn=collate_function)



In [4]:
class DAN(nn.Module):
    
    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM, droput_prob = 0.3, train_device = device_cpu):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.embed_dim,self.hidden_dim)
        #self.fc = nn.Linear(self.hidden_dim,self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid  = nn.Sigmoid()
        self.word_dropout_prob = droput_prob

        self.train_device = train_device

        #if(word_vector_types_count > 1) :
        #   self.importance_weights = nn.Linear(word_vector_types_count*EMBED_DIM,word_vector_types_count)
        

    def forward(self,inp,inp_len):
        
    
        inp_mask = torch.ones((inp.shape[0],inp.shape[1]))
        for i  in range(inp.shape[0]):
            inp_mask[i,inp_len[i]:] = 0.0

        inp_mask = inp_mask.to(self.train_device)

        inp_lengths = torch.sum(inp_mask,-1,keepdim=True).int()
        inp_lengths = inp_lengths.to(self.train_device)
        #if word_vector_types_count > 1:
        #    input = inp.view((inp.shape[0],inp.shape[1],-1))
        #    out = self.importance_weights(input)
        #    weights = F.softmax(out,dim=2).unsqueeze(3)
        #    weights_multiplied_vector = weights * inp
        #    inp = torch.sum(weights_multiplied_vector,dim=2)

        total = torch.sum(inp*(inp_mask.unsqueeze(2)),axis=1)
        vector_average = total / inp_lengths
        ans = F.relu(self.fc1(vector_average))
        #ans = F.relu(self.fc(ans))
        ans = self.sigmoid(self.fc2(ans))
        return ans
dan = DAN(train_device=device_cpu)


In [32]:
# bidirectional
# rnn_type
# use_cnn
# num_layers


class SentimentModel(nn.Module):

    def __init__(self,
            embed_dim=EMBED_DIM,hidden_dim =HIDDEN_DIM, bidirectional=False,
            rnn_type = 'gru', num_layers=1):
        
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn = None

        if rnn_type == 'gru':
            self.rnn  = nn.GRU(input_size = self.embed_dim,
                hidden_size = self.hidden_dim,
                num_layers = self.num_layers,
                batch_first = True,
                dropout = 0,
                bidirectional = bidirectional
            )

        self.fc1 = nn.Linear(HIDDEN_DIM,128)
        self.fc2 = nn.Linear(128,1)
        self.sigmoid  = nn.Sigmoid()



    def forward(self,x,x_len):

        packed_input = pack_padded_sequence(x,x_len,batch_first=True,enforce_sorted=False)
        packed_output,hidden = self.rnn(packed_input)
        output,output_lengths = pad_packed_sequence(packed_output)
        hidden = hidden.squeeze()
        
        out =F.relu(self.fc1(hidden))
        ans = self.sigmoid(self.fc2(out))
        return ans

sent = SentimentModel()

In [28]:
def train(model,train_dataloader,valid_dataloader,num_epochs,loss_fn,optimizer_name,check_point_name,tensorboard_name,learning_rate=0.01,device_train = device_fast,use_rnn = False):

    model = model.to(device_train)
    criterion = None
    optimizer = None
    
    clip = 0
    if use_rnn:
        clip = 5

    
    if loss_fn == 'bce':
        criterion = nn.BCELoss()
    
    if optimizer_name == 'adam':
        optimizer = optim.Adam(model.parameters(),lr = learning_rate)
    
    best_validation_loss = 1000.0
    valdiation_loss_not_decreased_steps = 0
    
    model.train()
    for e in range(num_epochs):
        
        training_set_size = 0
        training_loss = 0.0
        model.train()

        for data in tqdm(train_dataloader):
            
            optimizer.zero_grad()
            input_reviews,inp_lengths,output_labels = data['input'], data['lengths'],data['labels']
            input_reviews = input_reviews.to(device_train)
            training_set_size += input_reviews.shape[0]
            output = model(input_reviews,inp_lengths)
            output = output.to(device_cpu)
            loss = criterion(output,output_labels.float())
            training_loss += loss.item()
            loss.backward()
            if use_rnn:
                nn.utils.clip_grad_norm_(model.parameters(),clip)
            optimizer.step()
        
        current_training_loss = training_loss / training_set_size
        print("Epoch " + str(e) + " Average Training Loss = " +  str(current_training_loss))
        writer.add_scalars(tensorboard_name + '; Loss vs Epoch',{'train' : current_training_loss},e)
        
        model.eval()
        
        if valid_dataloader is None:
            continue
        
        validation_set_size  = 0 
        if e% VALIDATION_LOSS_COMPUTE_STEP:
            correct_count = 0
            validation_loss = 0

            for i,data in enumerate(valid_dataloader,0):
                input_reviews,inp_lengths,output_labels = data['input'], data['lengths'],data['labels']
                input_reviews = input_reviews.to(device_train)
                validation_set_size += input_reviews.shape[0]
                output = model(input_reviews,inp_lengths)
                output = output.to(device_cpu)
                loss = criterion(output,output_labels.float())
                validation_loss += loss.item()
                nearest_class = torch.round(output)

                correct = (nearest_class == output_labels.float()).float()
                correct_count += correct.sum()
            correct_count = int(correct_count)
            current_validation_loss = (1.0* validation_loss)/validation_set_size
            print("Epoch " + str(e) + " " +  "Validation Loss = " + str(current_validation_loss) )
            print("Validation Set Accuracy = " + str((correct_count/validation_set_size)*100) )

            writer.add_scalar(tensorboard_name + ' Validation Accuracy vs Epoch ',int((correct_count/validation_set_size)*100),e)
            writer.add_scalars('Loss vs Epoch',{'valid' : current_validation_loss},e)
            if current_validation_loss < best_validation_loss:
                valdiation_loss_not_decreased_steps = 0
                torch.save(model.state_dict(),check_point_name)
                best_validation_loss = current_validation_loss
            else:
                valdiation_loss_not_decreased_steps +=1
        
        if valdiation_loss_not_decreased_steps >= PATIENCE_PARAMETER:
            break

In [30]:
EPOCHS = 50
LOSS_FN = 'bce'
OPTIMIZER = 'adam'
LR = 0.01
checkpoint_name = 'checkpoints/' + 'dan_' + str(EPOCHS)+"_"+ str(LOSS_FN)  +"_"+ str(OPTIMIZER) + "_" + str(LR)  
train(dan,train_dataloader,valid_dataloader,EPOCHS,'bce','adam',checkpoint_name,'DAN',0.01,device_cpu,use_rnn=False)

100%|██████████| 500/500 [00:31<00:00, 15.89it/s]


Epoch 0 Average Training Loss = 0.005489189109299332


100%|██████████| 500/500 [00:33<00:00, 14.83it/s]


Epoch 1 Average Training Loss = 0.005342737676110119
Epoch 1 Validation Loss = 0.005421101083979011
Validation Set Accuracy = 84.78750000000001


100%|██████████| 500/500 [00:38<00:00, 13.13it/s]


Epoch 2 Average Training Loss = 0.005260589673183858


100%|██████████| 500/500 [00:39<00:00, 12.78it/s]


Epoch 3 Average Training Loss = 0.00513732267729938
Epoch 3 Validation Loss = 0.005510073263198138
Validation Set Accuracy = 84.52499999999999


100%|██████████| 500/500 [00:38<00:00, 12.91it/s]


Epoch 4 Average Training Loss = 0.005090767510700971


100%|██████████| 500/500 [00:37<00:00, 13.21it/s]


Epoch 5 Average Training Loss = 0.0050639077164232734
Epoch 5 Validation Loss = 0.005793145537376404
Validation Set Accuracy = 82.9125


100%|██████████| 500/500 [00:36<00:00, 13.62it/s]


Epoch 6 Average Training Loss = 0.0050464350245893


100%|██████████| 500/500 [00:35<00:00, 14.13it/s]


Epoch 7 Average Training Loss = 0.0049222657410427926
Epoch 7 Validation Loss = 0.005542709659785032
Validation Set Accuracy = 84.8875


100%|██████████| 500/500 [00:35<00:00, 14.17it/s]


Epoch 8 Average Training Loss = 0.00494405214395374


100%|██████████| 500/500 [00:34<00:00, 14.58it/s]


Epoch 9 Average Training Loss = 0.004858871119562537
Epoch 9 Validation Loss = 0.005000349946320057
Validation Set Accuracy = 86.5375


100%|██████████| 500/500 [00:36<00:00, 13.56it/s]


Epoch 10 Average Training Loss = 0.004824411765206605


100%|██████████| 500/500 [00:38<00:00, 12.82it/s]


Epoch 11 Average Training Loss = 0.004799084085971117
Epoch 11 Validation Loss = 0.005154345031827688
Validation Set Accuracy = 86.675


100%|██████████| 500/500 [00:44<00:00, 11.19it/s]


Epoch 12 Average Training Loss = 0.004795242405030877


100%|██████████| 500/500 [00:42<00:00, 11.72it/s]


Epoch 13 Average Training Loss = 0.004712249295320362
Epoch 13 Validation Loss = 0.0049907771926373246
Validation Set Accuracy = 86.91250000000001


100%|██████████| 500/500 [00:44<00:00, 11.30it/s]


Epoch 14 Average Training Loss = 0.0047251073559746145


100%|██████████| 500/500 [00:36<00:00, 13.77it/s]


Epoch 15 Average Training Loss = 0.004670274353586137
Epoch 15 Validation Loss = 0.005054334500804543
Validation Set Accuracy = 86.7


100%|██████████| 500/500 [00:36<00:00, 13.72it/s]


Epoch 16 Average Training Loss = 0.00466220386675559


100%|██████████| 500/500 [00:36<00:00, 13.56it/s]


Epoch 17 Average Training Loss = 0.00464482789253816
Epoch 17 Validation Loss = 0.005035542137920857
Validation Set Accuracy = 86.45


100%|██████████| 500/500 [00:37<00:00, 13.23it/s]


Epoch 18 Average Training Loss = 0.0045610818143468354


100%|██████████| 500/500 [00:36<00:00, 13.60it/s]


Epoch 19 Average Training Loss = 0.004561093630967661
Epoch 19 Validation Loss = 0.005292854255065322
Validation Set Accuracy = 85.15


100%|██████████| 500/500 [00:37<00:00, 13.38it/s]


Epoch 20 Average Training Loss = 0.004546132667688653


100%|██████████| 500/500 [00:37<00:00, 13.25it/s]


Epoch 21 Average Training Loss = 0.004467940245755017
Epoch 21 Validation Loss = 0.005059155816212297
Validation Set Accuracy = 86.88749999999999


In [33]:
LR = 0.01
checkpoint_name = 'checkpoints/' + 'rnn_gru_' + str(EPOCHS)+"_"+ str(LOSS_FN)  +"_"+ str(OPTIMIZER) + "_" + str(LR)  
train(sent,train_dataloader,valid_dataloader,EPOCHS,LOSS_FN,OPTIMIZER,checkpoint_name,'Simple GRU',LR,device_cpu,use_rnn=True)

100%|██████████| 500/500 [33:02<00:00,  3.97s/it]   


Epoch 0 Average Training Loss = 0.004801629320718348


100%|██████████| 500/500 [26:30<00:00,  3.18s/it]


Epoch 1 Average Training Loss = 0.0036596210626885293
Epoch 1 Validation Loss = 0.004340207109227777
Validation Set Accuracy = 88.75


100%|██████████| 500/500 [26:29<00:00,  3.18s/it]


Epoch 2 Average Training Loss = 0.004004586398834362


100%|██████████| 500/500 [26:45<00:00,  3.21s/it]


Epoch 3 Average Training Loss = 0.004308650922495872
Epoch 3 Validation Loss = 0.0047455248348414894
Validation Set Accuracy = 87.1875


100%|██████████| 500/500 [27:04<00:00,  3.25s/it]


Epoch 4 Average Training Loss = 0.005550439618062228


100%|██████████| 500/500 [26:25<00:00,  3.17s/it]


Epoch 5 Average Training Loss = 0.006867284463252872
Epoch 5 Validation Loss = 0.00599404564127326
Validation Set Accuracy = 82.78750000000001


 25%|██▌       | 126/500 [07:41<22:50,  3.66s/it]  


KeyboardInterrupt: 

In [None]:
batch,length,sentiments = next(iter(train_dataloader))

In [None]:
criterion = nn.BCELoss()
optimizer = optim.Adam(sent.parameters(),lr= 0.001)
sent.train()
output = sent(batch,length)
l = criterion(output,sentiments.float())
print(l.item())
l.backward()
optimizer.step()

In [14]:
def test(checkpoint_name,test_data,test_lengths,test_labels):
    model_name = checkpoint_name[14:]
    parameters =  model_name.split('_')
    count = 0
    model = None
    if parameters[0]=='dan':
        model = DAN()
    model.load_state_dict(torch.load(checkpoint_name))
    model.eval()
    for i in range(len(test_data)):
        ans = model(test_data[i],[test_lengths[i]])
        ans = torch.round(ans)
        if ans[0][0] == test_labels[i]:
            count+=1
    
    print("Accuracy = " + str(count/len(test_data)))


test_dataset_labels = []  
test_processed_text = []
with open("./Train dataset.csv") as csvfile:
    csvFile = csv.reader(csvfile)
    next(csvFile)
    for line in csvFile:
        processed_text = preprocess_text(line[0])
        label = 1.0 if line[1] == 'positive' else 0.0
        test_dataset_labels.append(label)
        test_processed_text.append(processed_text)



In [17]:
test_word_embeddings = []
test_sentence_lengths = []
for text in test_processed_text:
    embeddings,length = getWordEmbeddingforText(text)
    test_sentence_lengths.append(length)
    test_word_embeddings.append(embeddings.unsqueeze(0))

test('./checkpoints/dan_50_bce_adam_0.01',test_word_embeddings,test_sentence_lengths,test_dataset_labels)

    

Accuracy = 0.87555


In [21]:
embeddings,length = getWordEmbeddingforText("The movie is so awesome that i want to run away")
dan = DAN()
dan.load_state_dict(torch.load('./checkpoints/dan_50_bce_adam_0.01'))
dan.eval()
dan(embeddings.unsqueeze(0),[length])

tensor([[0.9984]], grad_fn=<SigmoidBackward0>)