In [2]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# Model hyper-parameters (passed positionally to Network further down).
EMBED_DIM = 300    # Conv1d in_channels, i.e. the size of one word vector
HIDDEN_DIM = 128   # GRU hidden size
CNN_DIM = 256      # Conv1d out_channels, which is also the GRU input size

# Early-stopping configuration read by train().
PATIENCE_PARAMETER = 4            # validation checks without improvement before stopping
VALIDATION_LOSS_COMPUTE_STEP = 1  # run validation every N epochs


NUM_FILTERS = 50  # NOTE(review): not referenced by any cell visible here

device_cpu = torch.device('cpu')
device_fast = torch.device('cpu')

# Ask the backends whether an accelerator is usable at runtime.
# torch.has_mps / torch.has_cuda are deprecated build-time flags: they are True
# for any build compiled with that support, even when no such device is present,
# which would select a device that every later .to(device_fast) call fails on.
if torch.backends.mps.is_available():
    device_fast = torch.device('mps')
elif torch.cuda.is_available():
    device_fast = torch.device('cuda')

# Seeding is currently disabled, so weight initialisation and batch order vary
# between runs; uncomment both lines to pin the torch and NumPy RNG state.
#torch.manual_seed(0)
#np.random.seed(0)
# spaCy English pipeline; preprocess_text() uses it to tokenise and flag punctuation.
nlp = spacy.load('en_core_web_sm')
# GloVe vectors built with torchtext's default arguments; the embedding helpers
# below use this object as their default lookup table.
glove = GloVe()

In [3]:

#punctuation_words = open('punct.txt','w')
#punct_in_file = set()
#final_words = open('words.txt','w')
#punctuations = set(list(string.punctuation))
#vocab_words = set()

def preprocess_text(text):
    """Normalise a raw review into a string of space-separated tokens.

    Steps, in order:
      1. replace the literal ``<br /><br />`` marker with ``.``;
      2. strip remaining markup with BeautifulSoup (``lxml`` parser), trim
         surrounding whitespace and lowercase;
      3. drop URLs (``http`` followed by any run of non-whitespace);
      4. keep only runs of word characters/apostrophes and the single
         characters ``. , ! ; / "``;
      5. tokenise with the module-level spaCy pipeline ``nlp`` and drop every
         punctuation token except ``/``, ``"`` and ``.``;
      6. glue a stand-alone ``n't`` token back onto the token before it.

    Args:
        text: raw review text.

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    text = re.sub(r'<br /><br />', ".", text)
    text = BeautifulSoup(text, 'lxml').get_text().strip()
    text = text.lower()

    text = re.sub(r"http\S+", "", text)
    # findall never yields empty or whitespace-containing tokens, so the joined
    # string is already single-space separated and needs no further filtering.
    text = ' '.join(re.findall(r"[\w']+|[.,!;/\"]", text))

    kept_punctuation = ('/', '"', '.')
    text = " ".join(
        token.text
        for token in nlp(text)
        if not token.is_punct or token.text in kept_punctuation
    ).strip()

    new_words = []
    for word in text.split(" "):
        # Merge whenever ANY previous token exists. The former test
        # `len(new_words) > 1` skipped the merge when exactly one token
        # preceded "n't" (e.g. a review starting with "do n't").
        if word == "n't" and new_words:
            new_words[-1] += word
        else:
            new_words.append(word)
    return " ".join(new_words)


In [4]:
# Read the JSON-lines corpus: every line is one JSON object describing a review.
with open('processed_dataset.jsonl', encoding='utf-8') as f:
    preprocessed_dataset = [json.loads(line) for line in f]

# Labels in file order, as a NumPy array (consumed by the stratified split).
train_dataset_labels = np.array([sample['label'] for sample in preprocessed_dataset])


In [5]:
def getWordEmbeddingforText(text,glove=glove):
    """Look up one embedding per space-separated token of ``text``.

    Args:
        text: token string; surrounding whitespace is stripped and the rest is
            split on single spaces.
        glove: lookup table indexed as ``glove[token]``; defaults to the
            module-level ``glove`` object.

    Returns:
        ``(embeddings, length)``: the token vectors combined with
        ``torch.stack`` (one row per token) and the number of tokens.
    """
    tokens = text.strip().split(' ')
    vectors = [glove[token] for token in tokens]
    return torch.stack(vectors), len(vectors)

In [6]:
def review_to_embed(review,glove=glove):
    """Embed a preprocessed review sentence by sentence.

    The review is split on ``"."``; each non-blank segment is embedded with
    ``getWordEmbeddingforText`` and the per-sentence tensors are zero-padded
    to a common length with ``pad_sequence(batch_first=True)``.

    Args:
        review: preprocessed review text in which ``.`` separates sentences.
        glove: lookup table forwarded to ``getWordEmbeddingforText``.

    Returns:
        ``(embeddings, sentence_lengths, num_sentences)`` where ``embeddings``
        is the padded tensor with one leading entry per sentence,
        ``sentence_lengths`` lists the unpadded token count of each sentence
        and ``num_sentences == len(sentence_lengths)``.

    Note:
        A review with no non-blank sentence hands ``pad_sequence`` an empty
        list — presumably an error; callers should filter such reviews out.
    """
    sentence_lengths = []
    review_embeddings = []
    for sentence in review.split("."):
        s = sentence.strip()
        # Test for emptiness AFTER stripping. Consecutive periods (". . .")
        # leave whitespace-only segments that the old pre-strip `== ''` check
        # let through, producing bogus one-token sentences built from the
        # empty string.
        if not s:
            continue
        sentence_word_embeddings, sentence_length = getWordEmbeddingforText(s, glove)
        sentence_lengths.append(sentence_length)
        review_embeddings.append(sentence_word_embeddings)

    num_sentences = len(review_embeddings)
    return torch.nn.utils.rnn.pad_sequence(review_embeddings,batch_first=True),sentence_lengths,num_sentences


In [7]:
class ReviewDataSet(Dataset):
    """Map-style dataset that simply wraps an indexable collection of reviews."""

    def __init__(self,reviews):
        """Keep a reference to ``reviews``; no copy and no transformation."""
        super().__init__()
        self.reviews = reviews

    def __getitem__(self, index):
        """Return the stored item at ``index`` unchanged."""
        item = self.reviews[index]
        return item

    def __len__(self):
        """Number of stored reviews."""
        size = len(self.reviews)
        return size

In [11]:
# Embed every review once up front and keep the resulting tensors in memory.
processed_dataset = []

for review in preprocessed_dataset:
    embeddings, sent_length, n_sents = review_to_embed(review['text'])
    # One value per line: tensor shape, longest sentence, number of sentences.
    print(embeddings.shape, max(sent_length), n_sents, sep='\n')
    processed_dataset.append(
        dict(review=embeddings, sent_lengths=sent_length, n_sent=n_sents, label=review['label'])
    )
 

torch.Size([29, 57, 300])
57
29
torch.Size([9, 58, 300])
58
9
torch.Size([8, 39, 300])
39
8
torch.Size([13, 33, 300])
33
13
torch.Size([19, 33, 300])
33
19
torch.Size([6, 24, 300])
24
6
torch.Size([8, 37, 300])
37
8
torch.Size([10, 38, 300])
38
10
torch.Size([8, 43, 300])
43
8
torch.Size([4, 18, 300])
18
4
torch.Size([9, 24, 300])
24
9
torch.Size([13, 52, 300])
52
13
torch.Size([34, 40, 300])
40
34
torch.Size([12, 46, 300])
46
12
torch.Size([5, 19, 300])
19
5
torch.Size([13, 42, 300])
42
13
torch.Size([8, 43, 300])
43
8
torch.Size([40, 23, 300])
23
40
torch.Size([4, 46, 300])
46
4
torch.Size([5, 44, 300])
44
5
torch.Size([19, 64, 300])
64
19
torch.Size([16, 81, 300])
81
16
torch.Size([5, 35, 300])
35
5
torch.Size([19, 63, 300])
63
19
torch.Size([15, 38, 300])
38
15
torch.Size([12, 19, 300])
19
12
torch.Size([34, 43, 300])
43
34
torch.Size([12, 42, 300])
42
12
torch.Size([14, 18, 300])
18
14
torch.Size([26, 47, 300])
47
26
torch.Size([25, 40, 300])
40
25
torch.Size([30, 28, 300])
28
30


In [14]:
# Wrap the list of embedded reviews so the DataLoaders below can index it.
dataset = ReviewDataSet(processed_dataset)

In [17]:
def collate_function(batch_data):
    """Collate a list of review samples into one zero-padded batch.

    Each sample is a dict with keys ``'review'`` (3-D tensor indexed as
    sentence, word, embedding), ``'sent_lengths'``, ``'n_sent'`` and
    ``'label'``. Every review is zero-padded at the end of its first two axes
    up to the largest sentence count and word count in the batch, then the
    reviews are stacked.

    Returns:
        dict with ``'input'`` (stacked padded reviews), ``'sent_lengths'`` and
        ``'n_sent'`` (plain lists, one entry per sample) and ``'labels'``
        (tensor with one column).
    """
    reviews = [sample['review'] for sample in batch_data]
    target_sentences = max(r.shape[0] for r in reviews)
    target_words = max(r.shape[1] for r in reviews)

    # The pad tuple is read from the last axis backwards:
    # (embedding: 0, 0), (words: 0, extra), (sentences: 0, extra).
    padded_reviews = [
        torch.nn.functional.pad(
            r,
            (0, 0, 0, target_words - r.shape[1], 0, target_sentences - r.shape[0]),
        )
        for r in reviews
    ]

    return {
        'input': torch.stack(padded_reviews),
        'sent_lengths': [sample['sent_lengths'] for sample in batch_data],
        'n_sent': [sample['n_sent'] for sample in batch_data],
        'labels': torch.tensor([sample['label'] for sample in batch_data]).unsqueeze(1),
    }

In [18]:
# Stratified 80/20 split over sample indices; random_state pins the partition.
train_idx, valid_idx = train_test_split(
    np.arange(train_dataset_labels.shape[0]),
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=train_dataset_labels,
)

# Both loaders read the same `dataset`; each sampler restricts its loader to
# one index subset and draws from it in random order.
train_sampler, valid_sampler = SubsetRandomSampler(train_idx), SubsetRandomSampler(valid_idx)
train_dataloader = DataLoader(dataset, batch_size=32, sampler=train_sampler, collate_fn=collate_function)
valid_dataloader = DataLoader(dataset, batch_size=32, sampler=valid_sampler, collate_fn=collate_function)

In [24]:
import torch.nn.functional as F
class Network(nn.Module):
    """Hierarchical review classifier: sentence CNN -> GRU over sentences -> sigmoid.

    Each sentence is encoded by a width-3 ``Conv1d`` over its word vectors
    followed by max-over-time pooling; the sentence vectors are fed to a GRU
    and its final hidden state is mapped to a single probability.

    Args:
        EMBED_DIM: size of one word vector (``Conv1d`` input channels).
        CNN_DIM: number of convolution filters (sentence vector size).
        HIDDEN_DIM: GRU hidden size.
    """

    def __init__(self,EMBED_DIM,CNN_DIM,HIDDEN_DIM):
        super().__init__()
        self.rnn = nn.GRU(input_size = CNN_DIM,hidden_size = HIDDEN_DIM, batch_first = True)
        self.cnn = nn.Conv1d(in_channels=EMBED_DIM,out_channels=CNN_DIM,kernel_size=3)
        self.fc = nn.Linear(HIDDEN_DIM,1)

    def forward(self,inp : torch.Tensor,n_sents=None):
        """Score a batch of padded reviews.

        Args:
            inp: tensor of shape (batch, n_sentences, n_words, embed_dim).
                ``n_words`` must be at least 3, the convolution kernel size.
            n_sents: true sentence count per review (sequence of ints or a CPU
                tensor). When given, padded sentences are hidden from the GRU
                via packing. When ``None``, every review is treated as having
                ``n_sentences`` sentences.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        sentence_vectors = []
        for i in range(inp.shape[1]):
            # (batch, n_words, embed) -> (batch, embed, n_words) for Conv1d.
            words = inp[:, i, :, :].permute(0, 2, 1)
            conv_out = self.cnn(words)
            # Max over time. Squeeze ONLY the pooled axis: a bare .squeeze()
            # also removes the batch axis when the batch holds one review.
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.shape[2]).squeeze(2)
            sentence_vectors.append(pooled)

        # (batch, n_sentences, CNN_DIM)
        rnn_in = torch.stack(sentence_vectors, dim=1)

        if n_sents is None:
            _, hidden = self.rnn(rnn_in)
        else:
            packed_input = pack_padded_sequence(rnn_in, n_sents, batch_first=True, enforce_sorted=False)
            _, hidden = self.rnn(packed_input)

        # (layers, batch, hidden) -> (batch, layers * hidden)
        hidden = hidden.permute(1, 0, 2).reshape(hidden.shape[1], -1)

        return torch.sigmoid(self.fc(hidden))

# Smoke test: push one training batch through a freshly initialised network.
batch_data = next(iter(train_dataloader))
j = Network(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
j(batch_data['input'],batch_data['n_sent'])

In [21]:
import os
from torch.utils.tensorboard import SummaryWriter
from datetime import  datetime

def _accumulated_batch_loss(criterion, loss, batch_size):
    """Return this batch's contribution to a per-sample loss sum.

    A criterion whose ``reduction`` attribute is ``'sum'`` already returns a
    summed loss. Anything else is treated as a per-batch mean and rescaled by
    the batch size.
    """
    if getattr(criterion, 'reduction', 'mean') == 'sum':
        return loss.item()
    return loss.item() * batch_size


def train(model,train_dataloader,valid_dataloader,num_epochs,criterion,optimizer,device_train = device_fast,use_rnn = False,log=True):
    """Train ``model`` with optional validation and early stopping.

    Args:
        model: module called as ``model(inputs, n_sents)``.
        train_dataloader: yields dicts with keys ``'input'``, ``'n_sent'`` and
            ``'labels'``.
        valid_dataloader: same format, or ``None`` to skip validation.
        num_epochs: maximum number of epochs.
        criterion: loss called as ``criterion(output, labels.float())``.
        optimizer: optimizer over ``model``'s parameters.
        device_train: device for the model and its inputs.
        use_rnn: when true, gradients are clipped to a norm of 5.
        log: when true, print per-epoch metrics. Early stopping is also gated
            by this flag.

    Returns:
        None. The model is updated in place.
    """
    # NOTE(review): early stopping only runs when `log` is true, as in the
    # original code; confirm whether log=False callers should also stop early.
    model = model.to(device_train)
    clip = 5  # max gradient norm, applied only when use_rnn is set

    best_validation_loss = float('inf')
    steps_without_improvement = 0

    for e in range(num_epochs):

        training_set_size = 0
        training_loss = 0.0
        model.train()

        for data in tqdm(train_dataloader):

            optimizer.zero_grad()
            input_reviews, n_sents, output_labels = data['input'], data['n_sent'], data['labels']
            input_reviews = input_reviews.to(device_train)
            batch_size = input_reviews.shape[0]
            training_set_size += batch_size
            # Labels are never moved to device_train, so the loss is computed on the CPU.
            output = model(input_reviews, n_sents).to(device_cpu)
            loss = criterion(output, output_labels.float())
            # Weight by batch size so the epoch figure is a true per-sample
            # average. Summing batch means and dividing by the sample count
            # under-reported it by roughly the batch size.
            training_loss += _accumulated_batch_loss(criterion, loss, batch_size)
            loss.backward()
            if use_rnn:
                nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

        current_training_loss = training_loss / training_set_size
        if log:
            print("Epoch " + str(e) + " Average Training Loss = " +  str(current_training_loss))

        model.eval()

        if valid_dataloader is None:
            continue

        if e % VALIDATION_LOSS_COMPUTE_STEP == 0:
            validation_set_size = 0
            correct_count = 0
            validation_loss = 0.0

            # No gradients are needed for evaluation; skipping the autograd
            # graph saves memory and time.
            with torch.no_grad():
                for data in valid_dataloader:
                    input_reviews, n_sents, output_labels = data['input'], data['n_sent'], data['labels']
                    input_reviews = input_reviews.to(device_train)
                    batch_size = input_reviews.shape[0]
                    validation_set_size += batch_size
                    output = model(input_reviews, n_sents).to(device_cpu)
                    targets = output_labels.float()
                    loss = criterion(output, targets)
                    validation_loss += _accumulated_batch_loss(criterion, loss, batch_size)
                    # Round the probability to the nearest class (0 or 1).
                    correct_count += int((torch.round(output) == targets).sum().item())

            current_validation_loss = validation_loss / validation_set_size
            if log:
                print("Epoch " + str(e) + " " +  "Validation Loss = " + str(current_validation_loss) )
                print("Validation Set Accuracy = " + str((correct_count/validation_set_size)*100) )

                if current_validation_loss < best_validation_loss:
                    steps_without_improvement = 0
                    #torch.save(model.state_dict(),check_point_name)
                    best_validation_loss = current_validation_loss
                else:
                    steps_without_improvement += 1

        if log and steps_without_improvement >= PATIENCE_PARAMETER:
            break

In [22]:
# Train a fresh model for at most 50 epochs on the CPU with BCE loss and Adam (lr=0.01).
# The two trailing True arguments are use_rnn (enables gradient clipping) and log.
net = Network(EMBED_DIM,CNN_DIM,HIDDEN_DIM)
train(net,train_dataloader,valid_dataloader,50,nn.BCELoss(),optim.Adam(net.parameters(),0.01),device_cpu,True,True)

  0%|          | 0/1000 [00:00<?, ?it/s]

torch.Size([32, 61, 91, 300])
61
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])
torch.Size([32, 256, 89])


  0%|          | 0/1000 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:



def KFoldCrossValidation():
    """Placeholder for k-fold cross-validation; not implemented yet (the body is ``pass``)."""
    pass




In [40]:
# Inspect the shape of the collated input tensor held in `batch_data`.
print(batch_data['input'].shape)

torch.Size([64, 77, 70, 300])


In [41]:
# Draw 64 random 0/1 targets of shape (64, 1) from a Bernoulli(0.5); the
# scratch loop further down uses them as the targets of `crit`.
# No seed is set, so the values differ on every run.
random_prob = 0.5*torch.ones((64,1))
true_labels = torch.bernoulli(random_prob)
print(true_labels)

tensor([[0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.]])


In [42]:
import torch.optim as optim


# Loss and optimiser for the single-batch experiment on the scratch model `j`.
crit = nn.BCELoss()
optimizer =optim.Adam(j.parameters(),lr = 0.01)
# Put `j` in training mode; as the last expression, its repr is displayed.
j.train()

Network(
  (rnn): GRU(256, 128, batch_first=True)
  (cnn): Conv1d(300, 256, kernel_size=(3,), stride=(1,))
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [46]:
# Fit-one-batch experiment: 20 Adam steps on the cached batch against the random targets.
# NOTE(review): `j` is called without n_sents here, unlike train(), which passes
# data['n_sent'] — confirm this is intended.
# NOTE(review): `true_labels` has 64 rows, so `batch_data` must also hold 64
# reviews for the loss to be well-formed — verify against the DataLoader batch size.
for i in range(20):

    optimizer.zero_grad()
    pred_out = j(batch_data['input'])
    loss = crit(pred_out,true_labels)
    print(loss.item())
    loss.backward()
    optimizer.step()

torch.Size([64, 77, 256])
0.6655454635620117
torch.Size([64, 77, 256])
0.6645953059196472
torch.Size([64, 77, 256])
0.66114342212677
torch.Size([64, 77, 256])
0.6593537926673889
torch.Size([64, 77, 256])
0.6578088402748108
torch.Size([64, 77, 256])
0.6835843920707703
torch.Size([64, 77, 256])
0.6554070711135864
torch.Size([64, 77, 256])
0.6443350315093994
torch.Size([64, 77, 256])
0.6433554291725159
torch.Size([64, 77, 256])
0.7229409217834473
torch.Size([64, 77, 256])
0.6723809242248535


KeyboardInterrupt: 

In [47]:
# Run the scratch model on the cached batch and display its predictions.
# NOTE(review): called without n_sents, unlike train() — confirm this is intended.
j(batch_data['input'])


torch.Size([64, 77, 256])


tensor([[0.4484],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4865],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4403],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4494],
        [0.4491],
        [0.0278],
        [0.4491],
        [0.4362],
        [0.4490],
        [0.4520],
        [0.4361],
        [0.4491],
        [0.4491],
        [0.4494],
        [0.4491],
        [0.4491],
        [0.4503],
        [0.4491],
        [0.4491],
        [0.4513],
        [0.4494],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4494],
        [0.4486],
        [0.4488],
        [0.4361],
        [0.4491],
        [0.4491],
        [0.4491],
        [0.4494],
        [0.4491],
        [0.4491],
        [0.4494],
        [0.4491],
        [0.4485],
        [0.4491],
        [0.4491],
        [0

In [64]:

# Scratch experiment: try to pad two tensors of different sizes with pad_sequence.
# NOTE(review): `i1` is not defined in this cell — presumably left over from an
# earlier kernel state; this cell does not run on a fresh kernel.
pad = nn.ConstantPad2d(())
i2 = torch.randn((11,27,50))


# The recorded error reports a size mismatch at dimension 1 (30 vs 27).
final_out = torch.nn.utils.rnn.pad_sequence([i1,i2],batch_first=True)

RuntimeError: The size of tensor a (30) must match the size of tensor b (27) at non-singleton dimension 1

In [75]:
# Padding experiment: start from a random (2, 3, 5) tensor and move its last
# axis to the front, so `q` has shape (5, 2, 3).
g = torch.randn((2,3,5))
q = torch.permute(g,(2,0,1))
# Display the original tensor for comparison.
g

tensor([[[ 0.2576, -1.4110, -3.0002,  1.0355,  1.4575],
         [ 1.9863, -0.6973,  0.2124,  0.2277,  0.0504],
         [-0.6911,  0.6065,  1.6002, -1.1386,  0.5833]],

        [[-0.1150, -0.1750,  0.2525,  0.1081, -0.0693],
         [ 0.3150,  0.8368, -0.0991, -0.0759, -0.4504],
         [ 0.6217, -0.9792, -1.6855,  1.4562,  0.3847]]])

In [76]:
# Display the permuted tensor.
q

tensor([[[ 0.2576,  1.9863, -0.6911],
         [-0.1150,  0.3150,  0.6217]],

        [[-1.4110, -0.6973,  0.6065],
         [-0.1750,  0.8368, -0.9792]],

        [[-3.0002,  0.2124,  1.6002],
         [ 0.2525, -0.0991, -1.6855]],

        [[ 1.0355,  0.2277, -1.1386],
         [ 0.1081, -0.0759,  1.4562]],

        [[ 1.4575,  0.0504,  0.5833],
         [-0.0693, -0.4504,  0.3847]]])

In [77]:
# Zero-pad the end of q's last axis by 3 and of its second-to-last axis by 2
# (the pad tuple is read from the last axis backwards).
i = torch.nn.functional.pad(q,(0,3,0,2))

In [78]:
# Move the first axis back to the end, undoing the (2,0,1) permutation applied to g.
# NOTE: this rebinds `i`, so re-running the cell permutes the result again.
i = torch.permute(i,(1,2,0))

In [79]:
# Display the padded tensor: the original values of g followed by zero rows and blocks.
i

tensor([[[ 0.2576, -1.4110, -3.0002,  1.0355,  1.4575],
         [ 1.9863, -0.6973,  0.2124,  0.2277,  0.0504],
         [-0.6911,  0.6065,  1.6002, -1.1386,  0.5833],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.1150, -0.1750,  0.2525,  0.1081, -0.0693],
         [ 0.3150,  0.8368, -0.0991, -0.0759, -0.4504],
         [ 0.6217, -0.9792, -1.6855,  1.4562,  0.3847],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000

In [74]:
# Display q. NOTE(review): this cell's execution count (74) is lower than the
# cells above it, so the recorded output presumably belongs to an earlier
# definition of q.
q

tensor([[[ 4.8514e-02, -8.9418e-01, -7.8548e-01,  3.5751e-01],
         [-1.2053e+00, -8.1523e-01,  8.9067e-01, -1.4337e+00],
         [ 8.9685e-01,  1.6162e+00,  3.1155e-01, -2.1370e-01]],

        [[-1.1456e+00,  1.3498e+00,  4.8107e-01,  3.2546e-01],
         [ 7.3913e-01, -5.7459e-01, -1.0138e+00,  1.2696e+00],
         [-1.0810e+00, -2.3811e+00,  9.7491e-01, -2.0539e+00]],

        [[-1.1427e+00, -2.8126e-02, -4.5579e-01,  1.4867e+00],
         [-3.9454e-01, -5.1207e-01,  4.7273e-01,  2.9859e-01],
         [-9.2391e-02,  1.0780e+00, -1.2114e+00,  2.4218e+00]],

        [[-1.3947e-01,  4.9323e-01, -3.3063e-01,  4.8126e-01],
         [ 6.0514e-02, -3.6486e-01,  1.6370e+00, -4.9131e-01],
         [ 1.8614e-01,  8.1679e-01,  2.5825e-01, -1.1861e+00]],

        [[ 9.3769e-01, -3.0563e-01,  9.5881e-01, -2.6005e+00],
         [ 6.4705e-01, -8.1550e-01,  4.2356e-01,  6.0810e-01],
         [-4.9011e-01,  3.7321e-01,  1.4586e+00, -5.9765e-01]],

        [[ 1.0451e+00,  1.1214e-01, -3.8421e-