In [14]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import jsonlines
import json
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader,Subset
from torchtext.vocab import GloVe
from tqdm import tqdm
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# --- Model dimensions -------------------------------------------------------
# NOTE(review): EMBED_DIM presumably matches the vector width of the GloVe table
# loaded below — confirm against `glove.dim`.
EMBED_DIM = 300
HIDDEN_DIM = 128
CNN_DIM = 256

# --- Training-loop knobs ----------------------------------------------------
# Not referenced in the visible cells; names suggest early-stopping patience and
# how often validation loss is computed — TODO confirm against the training loop.
PATIENCE_PARAMETER = 4
VALIDATION_LOSS_COMPUTE_STEP = 1


NUM_FILTERS = 50

# --- Devices ----------------------------------------------------------------
device_cpu = torch.device('cpu')
device_fast = torch.device('cpu')

# `torch.has_mps` / `torch.has_cuda` are deprecated and only report whether the
# installed build was *compiled* with that backend. Ask for runtime availability
# instead, so a CUDA build on a GPU-less machine still falls back to the CPU.
if torch.backends.mps.is_available():
    device_fast = torch.device('mps')
elif torch.cuda.is_available():
    device_fast = torch.device('cuda')

# NOTE(review): seeding is commented out, so the random draws in later cells
# (torch.randn / torch.bernoulli / weight init) differ from run to run.
#torch.manual_seed(0)
#np.random.seed(0)
# spaCy pipeline used by preprocess_text for tokenisation / punctuation flags.
nlp = spacy.load('en_core_web_sm')
# GloVe vectors with torchtext's default arguments; used as the default lookup
# table by getWordEmbeddingforText and review_to_embed.
glove = GloVe()

In [2]:

#punctuation_words = open('punct.txt','w')
#punct_in_file = set()
#final_words = open('words.txt','w')
#punctuations = set(list(string.punctuation))
#vocab_words = set()

def preprocess_text(text):
    """Normalise one raw review into a single space-separated token string.

    Steps, in order:
      1. Replace each ``<br /><br />`` pair with ``"."`` so paragraph breaks act
         as sentence boundaries, then strip remaining HTML and lowercase.
      2. Drop anything starting with ``http`` up to the next whitespace.
      3. Keep only word-like runs (``[\\w']+``) and the characters ``. , ! ; / "``.
      4. Tokenise with the module-level spaCy pipeline ``nlp`` and discard
         punctuation tokens except ``/``, ``"`` and ``.``.
      5. Glue a standalone ``n't`` token back onto the token before it.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The cleaned review as a single string with tokens separated by one space.
    """
    text = re.sub(r'<br /><br />', ".", text)
    text = BeautifulSoup(text, 'lxml').get_text().strip()
    text = text.lower()

    text = re.sub(r"http\S+", "", text)
    # The matched pieces contain no whitespace, so joining on a single space
    # already yields a normalised string (no further split/re-join is needed).
    text = ' '.join(re.findall(r"[\w']+|[.,!;/\"]", text))

    doc = nlp(text)
    kept_tokens = [
        token.text
        for token in doc
        if not token.is_punct or token.text in ('/', '"', '.')
    ]
    text = " ".join(kept_tokens).strip()

    merged = []
    for word in text.split(" "):
        # Re-attach the clitic whenever there is *any* preceding token. The old
        # check (`len(...) > 1`) skipped the merge when "n't" was the second
        # token, e.g. a review starting with "don't".
        if word == 'n\'t' and merged:
            merged[-1] = merged[-1] + word
        else:
            merged.append(word)
    return " ".join(merged)


In [3]:
# Read the already-preprocessed corpus: one JSON object per line.
with open('processed_dataset.jsonl',encoding='utf-8') as f:
    preprocessed_dataset = [json.loads(record) for record in f]

# Labels pulled out into an array, in the same order as the samples.
train_dataset_labels = np.array([entry['label'] for entry in preprocessed_dataset])


In [4]:
def getWordEmbeddingforText(text,glove=glove):
    """Look up an embedding for every single-space-separated token in ``text``.

    Args:
        text: Token string; it is split on a single space character, so an
            empty string yields one empty token.
        glove: Object indexed by token string that returns a tensor per token
            (defaults to the module-level ``glove``).

    Returns:
        A tuple ``(embeddings, length)`` where ``embeddings`` is the per-token
        tensors stacked along a new first dimension and ``length`` is the
        number of tokens.
    """
    tokens = text.split(' ')
    vectors = [glove[token] for token in tokens]
    return torch.stack(vectors),len(tokens)

In [13]:
def review_to_embed(review,glove=glove):
    """Embed a review sentence by sentence and pad the sentences to equal length.

    The review is cut at every ``"."``; each fragment is stripped and handed to
    ``getWordEmbeddingforText``. Empty fragments (for example the one after a
    trailing ``"."``) are not filtered out and are embedded like any other.

    Args:
        review: Preprocessed review text.
        glove: Embedding lookup forwarded to ``getWordEmbeddingforText``.

    Returns:
        A tuple ``(padded, sentence_lengths, num_sentences)``: the per-sentence
        embedding tensors padded with ``pad_sequence(batch_first=True)``, the
        token count of each sentence, and the number of sentences.
    """
    fragments = [piece.strip() for piece in review.split(".")]
    embedded = [getWordEmbeddingforText(fragment,glove) for fragment in fragments]

    review_embeddings = [vectors for vectors, _ in embedded]
    sentence_lengths = [count for _, count in embedded]

    padded = torch.nn.utils.rnn.pad_sequence(review_embeddings,batch_first=True)
    return padded,sentence_lengths,len(fragments)


torch.Size([2, 6, 300])

In [16]:
class DataSet(Dataset):
    """Map-style dataset that serves items straight from an indexable container."""

    def __init__(self, reviews) -> None:
        """Keep a reference to ``reviews`` (no copy is made)."""
        super().__init__()
        self.reviews = reviews

    def __len__(self) -> int:
        """Number of stored samples."""
        return len(self.reviews)

    def __getitem__(self, index):
        """Return the sample stored at ``index``, unchanged."""
        sample = self.reviews[index]
        return sample

In [None]:
def collate_function(batch_data):
    """Collate a list of sample dicts into one zero-padded batch.

    Each sample must provide the keys ``'text'`` (a tensor), ``'length'`` and
    ``'label'``. The ``'text'`` tensors are zero-padded on the right along
    *every* dimension up to the largest size found in the batch and stacked
    along a new leading batch dimension.

    ``pad_sequence`` only pads the first dimension, so it raises as soon as two
    samples differ in a later dimension. Whenever ``pad_sequence`` would have
    succeeded, the result here is identical.

    Args:
        batch_data: Non-empty list of sample dicts.

    Returns:
        dict with
          ``'input'``   – padded tensor of shape ``(batch, *max_shape)``,
          ``'lengths'`` – list of the samples' ``'length'`` entries, unchanged,
          ``'labels'``  – tensor of shape ``(batch, 1)``.

    Raises:
        ValueError: if the batch is empty or the ``'text'`` tensors do not all
            have the same number of dimensions.
    """
    if not batch_data:
        raise ValueError("collate_function received an empty batch")

    inputs = [sample['text'] for sample in batch_data]
    lengths = [sample['length'] for sample in batch_data]
    labels = torch.tensor([sample['label'] for sample in batch_data]).unsqueeze(1)

    rank = inputs[0].dim()
    if any(tensor.dim() != rank for tensor in inputs):
        raise ValueError("all 'text' tensors in a batch must have the same number of dimensions")

    # Largest extent seen along each dimension across the batch.
    max_shape = [max(sizes) for sizes in zip(*(tensor.shape for tensor in inputs))]

    # Zero-filled buffer with the dtype/device of the first sample; every sample
    # is copied into the leading corner of its own slot.
    padded = inputs[0].new_zeros((len(inputs), *max_shape))
    for row, tensor in enumerate(inputs):
        region = (row,) + tuple(slice(0, size) for size in tensor.shape)
        padded[region] = tensor

    return  {'input' : padded , 'lengths': lengths , 'labels' : labels }

In [27]:



import torch.nn.functional as F
class Network(nn.Module):
    """Sentence-level CNN followed by a review-level GRU and a sigmoid output.

    Every sentence is encoded by a width-3 1-D convolution over its word
    vectors followed by max-over-time pooling. The resulting sequence of
    sentence vectors is run through a GRU, and the GRU's final hidden state is
    mapped to a single probability.

    Args:
        EMBED_DIM: Size of each word vector (conv input channels).
        CNN_DIM: Number of convolution filters (GRU input size).
        HIDDEN_DIM: GRU hidden size.
    """

    def __init__(self,EMBED_DIM,CNN_DIM,HIDDEN_DIM):
        super().__init__()
        self.rnn = nn.GRU(input_size = CNN_DIM,hidden_size = HIDDEN_DIM, batch_first = True)
        self.cnn = nn.Conv1d(in_channels=EMBED_DIM,out_channels=CNN_DIM,kernel_size=3)
        self.fc = nn.Linear(HIDDEN_DIM,1)


    def forward(self,inp : torch.Tensor):
        """Score a batch of reviews.

        Args:
            inp: Tensor of shape
                ``(batch_size, max_sent_length, max_word_length, embed_dim)``.
                ``max_word_length`` must be at least 3 (the conv kernel size).

        Returns:
            Tensor of shape ``(batch_size, 1)`` with values in ``(0, 1)``.
        """
        batch_size, num_sentences, num_words, embed_dim = inp.shape

        # Fold sentences into the batch dimension so all of them go through the
        # CNN in one call; Conv1d wants channels (embed_dim) before length.
        words = inp.reshape(batch_size * num_sentences, num_words, embed_dim)
        words = torch.permute(words,(0,2,1))
        features = self.cnn(words)

        # Max over time. Squeeze ONLY the pooled length dimension: a bare
        # .squeeze() would also drop the batch dimension when batch_size == 1.
        pooled = F.max_pool1d(features,kernel_size = features.shape[2]).squeeze(2)
        sentence_vectors = pooled.reshape(batch_size, num_sentences, -1)

        # nn.GRU returns (per-step outputs, final hidden state); only the final
        # hidden state, shaped (1, batch, hidden), is used.
        _, final_hidden = self.rnn(sentence_vectors)
        final_hidden = final_hidden.squeeze(0)

        out = self.fc(final_hidden)
        return torch.sigmoid(out)


# Smoke test on random input. forward() reads its argument as
# (batch, sentence, word, embed), so h is 12 reviews x 14 sentences x 40 words
# x 60-dim word vectors.
h = torch.randn((12,14,40,60))
# Positional arguments map to (EMBED_DIM, CNN_DIM, HIDDEN_DIM).
j = Network(60,40,30)
# Bare call so the notebook displays the predicted probabilities.
j(h)


tensor([[0.5052],
        [0.4717],
        [0.5159],
        [0.5036],
        [0.5055],
        [0.4910],
        [0.4892],
        [0.5035],
        [0.5073],
        [0.5042],
        [0.5065],
        [0.5107]], grad_fn=<SigmoidBackward0>)

In [28]:
# Random binary targets for the smoke test: a (12, 1) tensor of Bernoulli(0.5)
# draws, matching the (12, 1) output produced by j(h).
# NOTE(review): no seed is set in the visible cells, so the drawn labels change
# on every run.
random_prob = 0.5*torch.ones((12,1))
true_labels = torch.bernoulli(random_prob)
print(true_labels)

tensor([[1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])


In [29]:
import torch.optim as optim


# BCELoss takes probabilities, which fits Network.forward ending in a sigmoid.
crit = nn.BCELoss()
# Adam over all parameters of the smoke-test model `j`.
optimizer =optim.Adam(j.parameters(),lr = 0.01)
# Put the model in training mode (the call returns the module, hence the repr
# shown as this cell's output).
j.train()

Network(
  (rnn): GRU(40, 30, batch_first=True)
  (cnn): Conv1d(60, 40, kernel_size=(3,), stride=(1,))
  (fc): Linear(in_features=30, out_features=1, bias=True)
)

In [62]:
# One optimisation step on the fixed random batch (h, true_labels).
# Each execution of this cell applies exactly one gradient update, so the
# printed loss depends on how many times the cell has been run before.
optimizer.zero_grad()
pred_out = j(h)
loss = crit(pred_out,true_labels)
print(loss.item())
loss.backward()
optimizer.step()

0.008538435213267803


In [64]:

# NOTE(review): the PyTorch docs give ConstantPad2d the signature
# (padding, value); only an empty tuple is passed here and `pad` is not used
# below — TODO confirm this line runs at all, or drop it.
pad = nn.ConstantPad2d(())
i2 = torch.randn((11,27,50))


# NOTE(review): `i1` is not defined in any visible cell (hidden kernel state).
# pad_sequence is being asked to pad two 3-D tensors here; the recorded error
# below reports a size mismatch (30 vs 27) at dimension 1.
final_out = torch.nn.utils.rnn.pad_sequence([i1,i2],batch_first=True)

RuntimeError: The size of tensor a (30) must match the size of tensor b (27) at non-singleton dimension 1

In [75]:
# Padding experiment: g has shape (2, 3, 5).
g = torch.randn((2,3,5))
# Move g's last axis to the front -> q has shape (5, 2, 3).
q = torch.permute(g,(2,0,1))
g

tensor([[[ 0.2576, -1.4110, -3.0002,  1.0355,  1.4575],
         [ 1.9863, -0.6973,  0.2124,  0.2277,  0.0504],
         [-0.6911,  0.6065,  1.6002, -1.1386,  0.5833]],

        [[-0.1150, -0.1750,  0.2525,  0.1081, -0.0693],
         [ 0.3150,  0.8368, -0.0991, -0.0759, -0.4504],
         [ 0.6217, -0.9792, -1.6855,  1.4562,  0.3847]]])

In [76]:
# Display the permuted view, shape (5, 2, 3).
q

tensor([[[ 0.2576,  1.9863, -0.6911],
         [-0.1150,  0.3150,  0.6217]],

        [[-1.4110, -0.6973,  0.6065],
         [-0.1750,  0.8368, -0.9792]],

        [[-3.0002,  0.2124,  1.6002],
         [ 0.2525, -0.0991, -1.6855]],

        [[ 1.0355,  0.2277, -1.1386],
         [ 0.1081, -0.0759,  1.4562]],

        [[ 1.4575,  0.0504,  0.5833],
         [-0.0693, -0.4504,  0.3847]]])

In [77]:
# F.pad consumes (left, right) pairs starting from the LAST dimension:
# (0, 3) appends 3 zeros to the last dim (3 -> 6), (0, 2) appends 2 zeros to the
# second-to-last dim (2 -> 4). The first dim (5) is untouched -> (5, 4, 6).
i = torch.nn.functional.pad(q,(0,3,0,2))

In [78]:
# Undo the earlier (2, 0, 1) permutation: (5, 4, 6) -> (4, 6, 5), i.e. g's
# layout with its first two dims zero-padded.
# NOTE(review): this rebinds `i` to a permutation of itself, so re-running the
# cell without re-running the previous one permutes the axes again.
i = torch.permute(i,(1,2,0))

In [79]:
# Display the padded tensor; the zero rows/blocks are the added padding.
i

tensor([[[ 0.2576, -1.4110, -3.0002,  1.0355,  1.4575],
         [ 1.9863, -0.6973,  0.2124,  0.2277,  0.0504],
         [-0.6911,  0.6065,  1.6002, -1.1386,  0.5833],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.1150, -0.1750,  0.2525,  0.1081, -0.0693],
         [ 0.3150,  0.8368, -0.0991, -0.0759, -0.4504],
         [ 0.6217, -0.9792, -1.6855,  1.4562,  0.3847],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000

In [74]:
# NOTE(review): this cell's execution count (74) predates the cell that defines
# `q` above (75), and the output below shows 3x4 inner blocks rather than 2x3,
# so it reflects an earlier `q` — out-of-order execution; re-run or remove.
q

tensor([[[ 4.8514e-02, -8.9418e-01, -7.8548e-01,  3.5751e-01],
         [-1.2053e+00, -8.1523e-01,  8.9067e-01, -1.4337e+00],
         [ 8.9685e-01,  1.6162e+00,  3.1155e-01, -2.1370e-01]],

        [[-1.1456e+00,  1.3498e+00,  4.8107e-01,  3.2546e-01],
         [ 7.3913e-01, -5.7459e-01, -1.0138e+00,  1.2696e+00],
         [-1.0810e+00, -2.3811e+00,  9.7491e-01, -2.0539e+00]],

        [[-1.1427e+00, -2.8126e-02, -4.5579e-01,  1.4867e+00],
         [-3.9454e-01, -5.1207e-01,  4.7273e-01,  2.9859e-01],
         [-9.2391e-02,  1.0780e+00, -1.2114e+00,  2.4218e+00]],

        [[-1.3947e-01,  4.9323e-01, -3.3063e-01,  4.8126e-01],
         [ 6.0514e-02, -3.6486e-01,  1.6370e+00, -4.9131e-01],
         [ 1.8614e-01,  8.1679e-01,  2.5825e-01, -1.1861e+00]],

        [[ 9.3769e-01, -3.0563e-01,  9.5881e-01, -2.6005e+00],
         [ 6.4705e-01, -8.1550e-01,  4.2356e-01,  6.0810e-01],
         [-4.9011e-01,  3.7321e-01,  1.4586e+00, -5.9765e-01]],

        [[ 1.0451e+00,  1.1214e-01, -3.8421e-