In [131]:
import re
import nltk
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from wordcloud import WordCloud

from tqdm import tqdm
tqdm.pandas()
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from matplotlib import pyplot as plt

import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext
from torchtext.vocab import GloVe

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [8]:
df = pd.read_csv('./dataset/movie.csv')
df = df.loc[:5000]
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.strip()
print(df['label'].value_counts())
df.head()

label
0    2582
1    2419
Name: count, dtype: int64


Unnamed: 0,text,label
0,i grew up (b. 1965) watching and loving the th...,0
1,"when i put this movie in my dvd player, and sa...",0
2,why do people who do not know what a particula...,0
3,even though i have great interest in biblical ...,0
4,im a die hard dads army fan and nothing will e...,1


In [12]:
df['text'] = df['text'].apply(lambda x: re.sub('\d+','',x))
df['tokens_gensim'] = df['text'].progress_apply(simple_preprocess)

stop_words = set(stopwords.words('english'))
a = list(stop_words)
a.append('br')
a.append('movie')
a.append('watch')

100%|██████████| 5001/5001 [00:01<00:00, 3651.66it/s]


In [135]:
df['sentence_gensim'] = df['tokens_gensim'].apply(lambda x: ' '.join(x))

In [136]:
class_0 = df.loc[df.label == 0, :]
class_1 = df.loc[df.label == 1, :]

test_0 = class_0.iloc[:1000, :]
test_1 = class_1.iloc[:1000, :]
valid_0 = class_0.iloc[1000:2000, :]
valid_1 = class_1.iloc[1000:2000, :]
train_0 = class_0.iloc[2000:, :]
train_1 = class_1.iloc[2000:, :]

train_df = pd.concat([train_0, train_1], axis=0).reset_index(drop=True)
print(train_df.shape)

val_df = pd.concat([valid_0, valid_1], axis=0).reset_index(drop=True)
print(val_df.shape)

test_df = pd.concat([test_0, test_1], axis=0).reset_index(drop=True)
print(test_df.shape)

(1001, 4)
(2000, 4)
(2000, 4)


In [146]:
count_vec = CountVectorizer(analyzer='word', ngram_range=(2, 2))
tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))

In [147]:
train_count_vec = count_vec.fit_transform(train_df['sentence_gensim'])
test_count_vec = count_vec.transform(test_df['sentence_gensim'])

train_tfidf_vec = tfidf_vec.fit_transform(train_df['sentence_gensim'])
test_tfidf_vec = tfidf_vec.transform(test_df['sentence_gensim'])

In [148]:
train_count_vec.toarray()[0].shape

(114151,)

In [149]:
train_count_vec.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [152]:
train_tfidf_vec.toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

# Glove

In [18]:
torch.cuda.is_available()

True

In [39]:
def load_glove_embeddings(glove_file, embedding_dim):
    # Step 1: Read GloVe file and create word to index mapping and embedding matrix
    word_to_idx = {}
    embeddings = []

    with open(glove_file, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            # Split line into word and embedding vector
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            
            # Add word to vocab and corresponding embedding
            word_to_idx[word] = idx
            embeddings.append(vector)

    # Step 2: Convert list of embeddings to a numpy array
    embedding_matrix = np.stack(embeddings)

    # Step 3: Create a PyTorch Embedding layer
    # embedding_layer = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix), freeze=True)
    
    return embedding_matrix, word_to_idx

In [126]:
class GloveModel(nn.Module):
    def __init__(self, embed_dim, hidden_dim, output_dim):
        super(GloveModel, self).__init__()
        glove_weights,self.weight_idx = load_glove_embeddings(f'./pre-trained/glove.6B.{embed_dim}d.txt',embed_dim)
        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(glove_weights),freeze=True)
        self.rnn = nn.RNN(embed_dim, hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)
        self.sig = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        output, x = self.rnn(x)
        x = self.dropout(x.squeeze(0))
        out = self.linear1(x)
        return sig(out)
    
    def get_embeddings(self, x):
        return self.embedding(x)

    def get_weight_idx(self):
        return self.weight_idx


In [127]:
model = GloveModel(100,32,1)

In [115]:
model

GloveModel(
  (embedding): Embedding(400000, 100)
  (rnn): RNN(100, 32)
  (linear1): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [53]:
train_df.loc[0,'text']

'even if you could get past the idea that these boring characters personally witnessed every significant moment of the s (ok, so katie didn\'t join the manson family, and nobody died at altamont), this movie was still unbelievably awful. i got the impression that the "writers" just locked themselves in a room and watched "forrest gump," "the wonder years," and oliver stone\'s s films over and over again and called it research. a canadian television critic called the conclusion of the first episode "head spinning". he was right.'

In [60]:
word_to_idx = model.get_weight_idx()
word_to_idx

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [103]:
new_tensor = []

for word in train_df.loc[0,'tokens_gensim']:
    if word in word_to_idx:
        new_tensor.append(word_to_idx[word])
    else:
        new_tensor.append(-1)

new_tensor = torch.Tensor(np.array(new_tensor).astype('int')).int()
new_tensor

tensor([  151,    83,    81,    94,   169,   341,     0,  1159,    12,   158,
        12217,  2153,  4674,  6323,   359,  1209,  1600,     3,     0,  4862,
          100, 12327, 73330,  1429,     0, 20416,   213,     5,  3291,   431,
           22, 71395,    37,  1005,    15,   149, 34284,  9956,   405,     0,
         6383,    12,     0,  2601,   120,  5304,  1000,     6,   927,     5,
         3136, 16083, 27507,     0,  4284,    82,     5,  6192,  2155,  1588,
           74,     5,    74,   378,     5,   175,    20,   520,  1202,   458,
         4524,   175,     0,  4483,     3,     0,    58,  1942,   362, 11019,
           18,    15,   248], dtype=torch.int32)

In [116]:
x = model.get_embeddings(new_tensor)
x

In [128]:
model(new_tensor)

tensor([0.6045], grad_fn=<SigmoidBackward0>)

In [125]:
sig = nn.Sigmoid()
sig(out)

tensor([0.5261], grad_fn=<SigmoidBackward0>)