In [38]:
import csv
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from bs4 import BeautifulSoup
import string
import spacy
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence,pack_padded_sequence,pad_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
import fasttext
from torch.utils.data import SubsetRandomSampler,DataLoader
from torchtext.vocab import GloVe

EMBED_DIM = 5
HIDDEN_DIM = 2

PATIENCE_PARAMETER = 4
VALIDATION_LOSS_COMPUTE_STEP = 2


device_cpu = torch.device('cpu')

device_fast = torch.device('cpu')

if torch.has_mps:
    device_fast = torch.device('mps')
elif torch.has_cuda:
    device_fast = torch.device('cuda')


torch.manual_seed(0)
np.random.seed(0)
nlp = spacy.load('en_core_web_sm')

In [7]:
fasttext_model = fasttext.load_model('./crawl-300d-2M-subword/crawl-300d-2M-subword.bin')



In [39]:
glove =   GloVe()

In [25]:
file = open('processed_text.txt','w')
punctuation_words = open('punct.txt','w')
break_file = open('break.txt','w')

punctuations = set(list(string.punctuation))


def preprocess_text(text):
    

    text = re.sub(r'<br /><br />',"",text)
    text = BeautifulSoup(text,'lxml').get_text().strip()
    text = text.lower()

    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    new_text = []
    for word in text.split(' '):
        if word == '':
            continue
        if '.' in word and len(word)>1:
            #if 'http' in word:
            #    words = word.split('.')
            #    words[1] = ''.join(words[1:])
            #    words = [words[0],words[1]]
            #else:
            #    words = word.split('.')
           
            words = word.split('.')
            for w in words:
                new_text.append(w)
        else:
            new_text.append(word)
    
    text = ' '.join(new_text)

    words = nlp(text)
    review =  ' '.join(token.text for token in words)
    #review =  " ".join([re.sub(r'^https?:\/\/.*[\r\n]*', '', token.text, flags=re.MULTILINE) for token in words])
    for word in review.split(' '):
        if any([punct in word for punct in punctuations]):
            punctuation_words.write(word + "\n")


    file.write(text + '\n')
    return text


preprocessed_dataset = []
train_dataset_labels = []


with open("./Train dataset.csv") as csvfile:
    csvFile = csv.reader(csvfile)
    next(csvFile)

    for line in csvFile:
        processed_text = preprocess_text(line[0])
        label = 1 if line[1] == 'positive' else 0
        train_dataset_labels.append(label)
        preprocessed_dataset.append({"text":processed_text,"label":label})

train_dataset_labels = np.array(train_dataset_labels)
file.close()
punctuation_words.close()
break_file.close()





In [28]:
class ReviewDataSet(Dataset):
    def __init__(self,data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

dataset = ReviewDataSet(preprocessed_dataset)

In [32]:
# Train and Validation split and an equal distriubition of classes
train_idx,valid_idx = train_test_split(np.arange(train_dataset_labels.shape[0]), 
    test_size=0.2,
    shuffle= True,
    stratify= train_dataset_labels,
    random_state=0
)

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_dataloader = DataLoader(dataset,32,sampler=train_sampler)
valid_dataloader = DataLoader(dataset,32,sampler=valid_sampler)


In [37]:
next(iter(train_dataloader))

{'text': ['it\'s not hard to imagine what the main problem for a screenwriter is who wants to have 18 equally well written characters with about the same amount of screen time in a movie that last around 90 minutes  it\'s almost impossible not to fall back on stereotypes and that is also what writer-director ralf westhoff does here  very few of the characters can be recognized as people that you and me know in real life, many of them are just characterized with two or three attributes and stay vague  i am aware of that but still think that "shoppen" is successful, namely that it accomplishes just what it wants to  it is a film with very well written dialogue, extremely good acting and a film that made me laugh out loud really often  i don\'t think that this film wants to make a deep going analysis of loneliness in our modern society, or that it wants to be moral commentary on speed-dating  it\'s a movie about something that exists and people and their motivation to use it  funny and en

In [49]:
def getWordEmbeddings(batch_data,glove : GloVe):

    reviews = None
    sentiments = None

    if len(batch_data)==2:
        reviews = batch_data[0]
        sentiments = batch_data[1]
    else:
        reviews = batch_data[0]


    batch_tensor = []
    inp_lengths = []
    for review in reviews:
        review_tensor = []
        words = review.split(' ')
        inp_lengths.append(len(words))
        for word in words:
            word_vector = glove[word]
            review_tensor.append(word_vector)
        review_tensor = torch.stack(review_tensor)
        batch_tensor.append(review_tensor)
    
    return (pad_sequence(batch_tensor,batch_first=True),inp_lengths,sentiments)

random_batch_data = next(iter(train_dataloader))
batch,length,senitments = getWordEmbeddings([random_batch_data['text'],random_batch_data['label']],glove)

In [50]:
batch.shape

torch.Size([32, 732, 300])

In [None]:
# bidirectional
# rnn_type
# use_cnn
# num_layers

class SentimentModel(nn.Module):

    def __init__(self,
            embed_dim=EMBED_DIM,hidden_dim =HIDDEN_DIM,
            rnn_type = 'gru',num_layers=1):
        
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.rnn = None

        if rnn_type == 'gru':
            self.rnn  = nn.GRU(input_size = self.embed_dim,
                hidden_size = self.hidden_dim,
                num_layers = num_layers,
                batch_first = True,
                dropout = 0,
                bidirectional = True
            )
        self.fc1 = nn.Linear(HIDDEN_DIM,128)
        self.fc2 = nn.Linear(128,1)
        self.sigmoid  = nn.Sigmoid()



    def forward(self,x,x_len):

        packed_input = pack_padded_sequence(x,x_len,batch_first=True,enforce_sorted=False)
        print(packed_input)
        packed_output,hidden = self.rnn(packed_input)
        output,output_lengths = pad_packed_sequence(packed_output)
        hidden = hidden.squeeze()
        
        out =F.relu(self.fc1(hidden))
        ans = self.sigmoid(self.fc2(out))
        return ans



model = SentimentModel()
model = model.to(device=device_fast)


In [None]:
inp = torch.randn((2,4,EMBED_DIM))
inp_lengths = [2,3]

inp[0,2:,:] = 0.0
inp[1,3:,:] = 0.0
inp = inp.to(device_fast)

In [None]:
model(inp,inp_lengths)

In [None]:
def train(model,train_dataloader,valid_dataloader,num_epochs,loss_fn,optimizer_name,check_point_name,learning_rate=0.01):

    criterion = None
    optimizer = None
    
    if loss_fn == 'bce':
        criterion = nn.BCELoss()
    
    if optimizer_name == 'adam':
        optimizer = optim.Adam(model.parameters(),lr = learning_rate)
    
    best_validation_loss = 1000.0
    valdiation_loss_not_decreased_steps = 0

    model.train()
    for e in range(num_epochs):
        
        training_loss = 0.0
        model.train()

        for i,data in enumerate(train_dataloader):
            pass

    



    
    



