In [2]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm

# Word-vector dimensionality; used as the default `embed_dim` of DAN and SentimentModel.
EMBED_DIM = 300
# Default hidden-layer width (`hidden_dim`) of DAN and SentimentModel.
HIDDEN_DIM = 128

# Early stopping: train() stops once this many validation checks in a row fail to improve the best loss.
PATIENCE_PARAMETER = 4
# train() decides whether to run a validation pass from the epoch index modulo this value.
VALIDATION_LOSS_COMPUTE_STEP = 2


# CPU device: used for loss computation and as the fallback accelerator.
device_cpu = torch.device('cpu')

# Pick the fastest backend that is actually usable at runtime.
# `torch.has_mps` / `torch.has_cuda` are deprecated build-time flags: they are True
# whenever PyTorch was *compiled* with that backend, even on a machine without the
# hardware, which would make later `.to(device_fast)` calls fail.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older PyTorch builds
if _mps_backend is not None and _mps_backend.is_available():
    device_fast = torch.device('mps')
elif torch.cuda.is_available():
    device_fast = torch.device('cuda')
else:
    device_fast = torch.device('cpu')


# Seed the PyTorch and NumPy global RNGs so repeated runs draw the same random numbers.
torch.manual_seed(0)
np.random.seed(0)
# spaCy English pipeline; preprocess_text() uses it to tokenize each review.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the pretrained fastText binary from the local crawl-300d-2M-subword directory.
# NOTE(review): `fasttext_model` is not referenced by any other cell visible in this notebook.
fasttext_model = fasttext.load_model('./crawl-300d-2M-subword/crawl-300d-2M-subword.bin')

In [3]:
# Pretrained GloVe vectors with torchtext's default settings; looked up per token in getWordEmbeddings().
# NOTE(review): the models assume vectors of size EMBED_DIM — confirm the default GloVe variant matches.
glove =   GloVe()

In [None]:
# Debug/log outputs written while preprocessing; they are closed once the CSV has been processed.
# UTF-8 is requested explicitly so non-ASCII review text cannot fail on platforms whose
# default text encoding is narrower (e.g. cp1252).
file = open('processed_text.txt','w',encoding='utf-8')
punctuation_words = open('punct.txt','w',encoding='utf-8')
break_file = open('break.txt','w',encoding='utf-8')

# Set of single ASCII punctuation characters, used to spot tokens that still contain punctuation.
punctuations = set(string.punctuation)


def preprocess_text(text):
    """Clean one raw review and return it as a single-space-separated string.

    Steps: replace `<br /><br />` breaks with a space, strip remaining HTML,
    lower-case, drop URLs, then split every whitespace-delimited word on '.'
    and re-join the non-empty pieces with single spaces.

    Side effects: appends the cleaned text to the module-level `file` log and
    writes every spaCy token that still contains punctuation to
    `punctuation_words`.

    Args:
        text: raw review text, possibly containing HTML markup.

    Returns:
        The cleaned review text.
    """
    # Replace the line-break pair with a space; replacing it with "" glued the
    # last word of one paragraph to the first word of the next ("movieI").
    text = re.sub(r'<br /><br />'," ",text)
    text = BeautifulSoup(text,'lxml').get_text().strip()
    text = text.lower()
    text = re.sub(r"http\S+", "", text)

    new_text = []
    # str.split() with no argument splits on any whitespace run and never yields
    # empty strings, so tabs/newlines no longer end up inside a "word".
    for word in text.split():
        if '.' in word and len(word) > 1:
            # Keep only non-empty pieces: "end." used to contribute an empty
            # fragment, producing double spaces and empty tokens downstream.
            new_text.extend(piece for piece in word.split('.') if piece)
        else:
            new_text.append(word)

    text = ' '.join(new_text)

    # NOTE(review): the spaCy tokenization is only used for the punctuation log;
    # the value written to `file` and returned is the whitespace-joined text.
    review = ' '.join(token.text for token in nlp(text))
    for word in review.split(' '):
        if any(punct in word for punct in punctuations):
            punctuation_words.write(word + "\n")

    file.write(text + '\n')
    return text


# Convert the raw CSV into cleaned samples, kept in memory and mirrored to a JSONL file
# so later runs can skip preprocessing.
preprocessed_dataset = []
train_dataset_labels = []

try:
    # newline='' is what the csv module expects so quoted fields containing line
    # breaks are parsed correctly.
    with open("./Train dataset.csv", newline='') as csvfile:
        csvFile = csv.reader(csvfile)
        next(csvFile, None)  # skip the header row (tolerates an empty file)

        # The context manager closes the JSONL writer even if a row fails to process.
        with jsonlines.open('processed_dataset.jsonl','w') as json_writer:
            for line in csvFile:
                processed_text = preprocess_text(line[0])
                # Binary target: 1.0 for 'positive', 0.0 for anything else.
                label = 1.0 if line[1] == 'positive' else 0.0
                train_dataset_labels.append(label)
                sample = {"text":processed_text,"label":label}
                json_writer.write(sample)
                preprocessed_dataset.append(sample)
finally:
    # Always release the log files opened for preprocessing, even on failure.
    file.close()
    punctuation_words.close()
    break_file.close()

train_dataset_labels = np.array(train_dataset_labels)

In [4]:
# Reload the cached, already-preprocessed samples (one JSON object per line).
preprocessed_dataset = []
train_dataset_labels = []
# Read as UTF-8 explicitly rather than relying on the platform default encoding,
# which may not be able to decode non-ASCII review text.
with open('processed_dataset.jsonl', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        sample = json.loads(line)
        train_dataset_labels.append(sample['label'])
        preprocessed_dataset.append(sample)
# Labels as an array so they can drive the stratified split.
train_dataset_labels = np.array(train_dataset_labels)

In [5]:
class ReviewDataSet(Dataset):
    """Map-style dataset that exposes an in-memory sequence of samples unchanged."""

    def __init__(self, data):
        """Keep a reference to `data`, the indexable collection of samples."""
        super().__init__()
        self.data = data

    def __getitem__(self, index):
        """Return the sample stored at position `index`."""
        sample = self.data[index]
        return sample

    def __len__(self):
        """Return how many samples the dataset holds."""
        n_samples = len(self.data)
        return n_samples

# Wrap the loaded samples so they can be served in batches through a DataLoader.
dataset = ReviewDataSet(preprocessed_dataset)

In [6]:
# Stratified train/validation split (20% held out) so both parts keep the same class distribution.
train_idx, valid_idx = train_test_split(
    np.arange(len(train_dataset_labels)),
    stratify=train_dataset_labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Both loaders read from the same dataset; each sampler restricts its loader to one
# index subset and draws those indices in random order, 64 samples per batch.
train_sampler = SubsetRandomSampler(train_idx)
train_dataloader = DataLoader(dataset, batch_size=64, sampler=train_sampler)

valid_sampler = SubsetRandomSampler(valid_idx)
valid_dataloader = DataLoader(dataset, batch_size=64, sampler=valid_sampler)



In [7]:
def getWordEmbeddings(batch_data,glove : "GloVe"):
    """Turn a batch of review strings into a zero-padded tensor of word vectors.

    Args:
        batch_data: sequence whose first item is an iterable of review strings
            and whose optional second item is a tensor of labels.
        glove: mapping-like object returning a 1-D vector for `glove[word]`.

    Returns:
        Tuple `(padded, lengths, sentiments)`:
        - padded: `pad_sequence(..., batch_first=True)` of the per-review
          stacked word vectors;
        - lengths: list with the number of space-separated tokens per review;
        - sentiments: the labels with a trailing dimension added via
          `unsqueeze(1)`, or None when `batch_data` carries no labels.
    """
    reviews = batch_data[0]
    sentiments = batch_data[1] if len(batch_data) == 2 else None

    # Only reshape labels when they exist; previously an unlabeled batch
    # crashed with AttributeError on `None.unsqueeze`.
    if sentiments is not None:
        sentiments = sentiments.unsqueeze(1)

    batch_tensor = []
    inp_lengths = []
    for review in reviews:
        words = review.split(' ')
        inp_lengths.append(len(words))
        # One vector per token, stacked into a (num_tokens, dim) tensor.
        batch_tensor.append(torch.stack([glove[word] for word in words]))

    return (pad_sequence(batch_tensor,batch_first=True),inp_lengths,sentiments)

# Pull one batch from the training loader and embed it, to have concrete tensors
# available for interactive checks.
random_batch_data = next(iter(train_dataloader))
batch, length, sentiments = getWordEmbeddings(
    [random_batch_data['text'], random_batch_data['label']],
    glove,
)

In [None]:
# Display the shape of the padded embedding tensor returned by getWordEmbeddings().
batch.shape

In [None]:
# Display the per-review token counts returned by getWordEmbeddings() for the sampled batch.
length

In [None]:
class DAN(nn.Module):
    """Averaging classifier: mean of the non-padded word vectors -> two linear layers -> sigmoid.

    NOTE(review): the name presumably stands for "Deep Averaging Network".
    `droput_prob` is stored as `word_dropout_prob` but no dropout is applied
    anywhere in `forward` — confirm whether word dropout was intended.
    """

    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_DIM, droput_prob = 0.3, train_device = device_cpu):
        """
        Args:
            embed_dim: size of each input word vector.
            hidden_dim: width of the hidden linear layer.
            droput_prob: stored as `word_dropout_prob`; currently unused.
            train_device: kept for backward compatibility; `forward` now
                follows the device of its input instead of this value.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.embed_dim,self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid  = nn.Sigmoid()
        self.word_dropout_prob = droput_prob

        self.train_device = train_device

    def forward(self,inp,inp_len):
        """Return one probability per sequence.

        Args:
            inp: padded word vectors, indexed as (batch, position, feature).
            inp_len: per-sequence count of valid (non-padding) positions.

        Returns:
            Tensor with one sigmoid output per batch row (trailing dim of 1).
        """
        # Build the padding mask on the input's own device. Using the stored
        # `train_device` broke as soon as the module was moved with `.to(...)`
        # to a device other than the one given at construction.
        lengths = torch.as_tensor(inp_len).to(inp.device).view(-1, 1)
        positions = torch.arange(inp.shape[1], device=inp.device).unsqueeze(0)
        inp_mask = (positions < lengths).to(inp.dtype)

        # Number of real tokens per row; clamped so an empty row yields a zero
        # average instead of a division by zero (NaN).
        token_counts = inp_mask.sum(dim=-1, keepdim=True).clamp(min=1)

        total = torch.sum(inp * inp_mask.unsqueeze(2), dim=1)
        vector_average = total / token_counts
        ans = F.relu(self.fc1(vector_average))
        ans = self.sigmoid(self.fc2(ans))
        return ans


In [None]:
# Averaging model configured for the fast device.
dan = DAN(train_device=device_fast)

# Loss and optimizer kept for interactive experiments.
# NOTE(review): train() builds its own criterion and optimizer, so these two are not used by it.
crit = nn.BCELoss()
optimizer = optim.Adam(dan.parameters(),lr=0.01)
# Put the module in training mode (the original cell made this call twice; once is enough).
dan.train()

In [8]:
# bidirectional
# rnn_type
# use_cnn
# num_layers


class SentimentModel(nn.Module):
    """GRU encoder followed by two linear layers and a sigmoid output."""

    def __init__(self,
            embed_dim=EMBED_DIM,hidden_dim =HIDDEN_DIM, bidirectional=False,
            rnn_type = 'gru', num_layers=1):
        """
        Args:
            embed_dim: size of each input word vector.
            hidden_dim: GRU hidden size.
            bidirectional: run the GRU in both directions when True.
            rnn_type: recurrent cell type; only 'gru' is implemented.
            num_layers: number of stacked GRU layers.

        Raises:
            ValueError: if `rnn_type` is not 'gru'. Previously this silently
                left `self.rnn` as None and failed later inside `forward`.
        """
        super().__init__()
        if rnn_type != 'gru':
            raise ValueError("Unsupported rnn_type: " + repr(rnn_type) + " (only 'gru' is implemented)")

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        self.rnn  = nn.GRU(input_size = self.embed_dim,
            hidden_size = self.hidden_dim,
            num_layers = self.num_layers,
            batch_first = True,
            dropout = 0,
            bidirectional = bidirectional
        )

        # Size the classifier from this instance's settings rather than the
        # global HIDDEN_DIM, so a custom hidden_dim or a bidirectional GRU works.
        rnn_out_dim = self.hidden_dim * (2 if bidirectional else 1)
        self.fc1 = nn.Linear(rnn_out_dim,128)
        self.fc2 = nn.Linear(128,1)
        self.sigmoid  = nn.Sigmoid()

    def forward(self,x,x_len):
        """Return one probability per sequence.

        Args:
            x: padded batch, batch dimension first.
            x_len: per-sequence count of valid positions (any order).

        Returns:
            Tensor with one sigmoid output per batch row (trailing dim of 1).
        """
        packed_input = pack_padded_sequence(x,x_len,batch_first=True,enforce_sorted=False)
        # Only the final hidden state is needed; the per-step outputs are discarded.
        _, hidden = self.rnn(packed_input)

        # `hidden` stacks (layers * directions) along dim 0. Select the top
        # layer explicitly: a bare `squeeze()` also dropped a batch dimension
        # of size 1 and left extra leading dims for multi-layer or
        # bidirectional GRUs.
        if self.bidirectional:
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        out = F.relu(self.fc1(final_state))
        ans = self.sigmoid(self.fc2(out))
        return ans



# Default-configured SentimentModel instance for quick interactive forward-pass checks.
model = SentimentModel()



In [None]:
# Smoke test: forward pass of the default model on the sampled, embedded batch.
model(batch,length)

In [None]:
# Synthetic batch: 2 sequences of 4 steps with random features. Every step past a
# sequence's valid length is zeroed so it looks like padding, then the batch is
# moved to the fast device.
inp_lengths = [2,3]
inp = torch.randn((2,4,EMBED_DIM))
for row, valid_steps in enumerate(inp_lengths):
    inp[row, valid_steps:, :] = 0.0
inp = inp.to(device_fast)

In [None]:
# `inp` was moved to device_fast, while `model` stays on the device it was created on.
# Send the input to wherever the model's parameters live so the forward pass
# cannot fail with a device mismatch.
model(inp.to(next(model.parameters()).device),inp_lengths)

In [12]:
def train(model,train_dataloader,valid_dataloader,num_epochs,loss_fn,optimizer_name,check_point_name,tensorbaord_name,learning_rate=0.01,device_train = device_fast):
    """Train `model` with periodic validation, checkpointing and early stopping.

    Batches are dicts with 'text' and 'label' entries; they are embedded with
    `getWordEmbeddings` using the module-level `glove`. Per-epoch metrics are
    printed and logged to the module-level TensorBoard `writer`.

    Args:
        model: module called as `model(inputs, lengths)` and returning probabilities.
        train_dataloader: loader yielding training batches.
        valid_dataloader: loader yielding validation batches, or None to skip validation.
        num_epochs: maximum number of epochs.
        loss_fn: loss identifier; only 'bce' is implemented.
        optimizer_name: optimizer identifier; only 'adam' is implemented.
        check_point_name: path where the best `state_dict` is saved.
        tensorbaord_name: accepted for interface compatibility; currently unused.
        learning_rate: optimizer learning rate.
        device_train: device the model and inputs are moved to.

    Raises:
        ValueError: if `loss_fn` or `optimizer_name` is not supported
            (previously this surfaced later as a call on None).
    """
    if loss_fn != 'bce':
        raise ValueError("Unsupported loss_fn: " + repr(loss_fn) + " (only 'bce' is implemented)")
    if optimizer_name != 'adam':
        raise ValueError("Unsupported optimizer_name: " + repr(optimizer_name) + " (only 'adam' is implemented)")

    model = model.to(device_train)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(),lr = learning_rate)

    best_validation_loss = float('inf')
    valdiation_loss_not_decreased_steps = 0

    for e in range(num_epochs):

        # ---- training pass ----
        model.train()
        training_set_size = 0
        training_loss = 0.0

        for data in tqdm(train_dataloader):
            optimizer.zero_grad()

            input_reviews,inp_lengths,output_labels = getWordEmbeddings([data['text'],data['label']],glove)
            input_reviews = input_reviews.to(device_train)
            batch_size = input_reviews.shape[0]

            # Loss is computed on CPU, where the labels live.
            output = model(input_reviews,inp_lengths).to(device_cpu)
            loss = criterion(output,output_labels.float())
            loss.backward()
            optimizer.step()

            # BCELoss returns the batch *mean*; weight it by the batch size so
            # dividing by the sample count below yields a true per-sample average.
            training_loss += loss.item() * batch_size
            training_set_size += batch_size

        current_training_loss = training_loss / max(training_set_size, 1)
        print("Epoch " + str(e) + " Average Training Loss = " +  str(current_training_loss))
        writer.add_scalars('Loss vs Epoch',{'train' : current_training_loss},e)

        model.eval()

        # Validate once every VALIDATION_LOSS_COMPUTE_STEP epochs. The previous
        # `if e % STEP:` test was true on every epoch *except* multiples of the
        # step, which is only correct when the step equals 2.
        run_validation = (e + 1) % VALIDATION_LOSS_COMPUTE_STEP == 0
        if valid_dataloader is None or not run_validation:
            continue

        # ---- validation pass ----
        validation_set_size = 0
        correct_count = 0
        validation_loss = 0.0

        # No gradients are needed for evaluation; skipping them saves memory and time.
        with torch.no_grad():
            for data in valid_dataloader:
                input_reviews,inp_lengths,output_labels = getWordEmbeddings([data['text'],data['label']],glove)
                input_reviews = input_reviews.to(device_train)
                batch_size = input_reviews.shape[0]

                output = model(input_reviews,inp_lengths).to(device_cpu)
                targets = output_labels.float()
                validation_loss += criterion(output,targets).item() * batch_size
                validation_set_size += batch_size

                # Round probabilities to the nearest class and count matches.
                correct_count += int((torch.round(output) == targets).sum().item())

        current_validation_loss = validation_loss / max(validation_set_size, 1)
        validation_accuracy = (correct_count / max(validation_set_size, 1)) * 100
        print("Epoch " + str(e) + " " +  "Validation Loss = " + str(current_validation_loss) )
        print("Validation Set Accuracy = " + str(validation_accuracy) )

        # Log the accuracy as a float; truncating to int discarded sub-percent changes.
        writer.add_scalar('Validation Accuracy vs Epoch',validation_accuracy,e)
        writer.add_scalars('Loss vs Epoch',{'valid' : current_validation_loss},e)

        if current_validation_loss < best_validation_loss:
            # New best: reset patience and checkpoint the weights.
            valdiation_loss_not_decreased_steps = 0
            torch.save(model.state_dict(),check_point_name)
            best_validation_loss = current_validation_loss
        else:
            valdiation_loss_not_decreased_steps +=1

        # Early stopping once validation has failed to improve too many times in a row.
        if valdiation_loss_not_decreased_steps >= PATIENCE_PARAMETER:
            break

In [13]:
# Fresh default-configured SentimentModel instance, to be trained with train().
sent = SentimentModel()


In [None]:
# Train the averaging model. The 8th positional parameter of train() is the TensorBoard
# name, so the original trailing 0.01 was bound to it instead of `learning_rate`;
# pass both explicitly.
train(dan,train_dataloader,valid_dataloader,10,'bce','adam','chk.pt','Loss',learning_rate=0.01)

In [16]:
# Train the GRU classifier for up to 10 epochs on the fast device.
train(
    sent,
    train_dataloader,
    valid_dataloader,
    num_epochs=10,
    loss_fn='bce',
    optimizer_name='adam',
    check_point_name='rnnchkpt.pth',
    tensorbaord_name='Loss',
    learning_rate=0.001,
    device_train=device_fast,
)

  0%|          | 0/500 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Manual single optimisation step on the sampled batch, for debugging.
criterion = nn.BCELoss()
optimizer = optim.Adam(sent.parameters(),lr= 0.001)
sent.train()
# Clear gradients left over from earlier runs/training so this step uses only the
# current batch (without this, re-running the cell accumulates gradients).
optimizer.zero_grad()
# `sent` may have been moved to another device by train(); feed the batch on the
# model's device and bring the prediction back to CPU where the labels live.
sent_device = next(sent.parameters()).device
output = sent(batch.to(sent_device),length).to(device_cpu)
l = criterion(output,sentiments.float())
print(l.item())
l.backward()
optimizer.step()
