<a href="https://colab.research.google.com/github/KarissaChan1/rocket-nuggets/blob/main/NLP_DataPreprocessing_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A summary tutorial of data preprocessing methods for NLP applications, including LSTM architecture for sentiment analysis.

In [129]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [130]:
device = "cuda" if torch.cuda.is_available() else "cpu"

Load dataset

In [131]:
base_csv = '/content/IMDB Dataset.csv'
df = pd.read_csv(base_csv)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Split into train/test sets and encode positive/negative labels

In [132]:
X,y = df['review'].values,df['sentiment'].values
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')


encoded_train = [1 if label =='positive' else 0 for label in y_train]  
encoded_test = [1 if label =='positive' else 0 for label in y_test] 
print(x_train)

shape of train data is (37500,)
shape of test data is (12500,)
['I am not a fan of Sean Penn, but in contrast to my German colleague whose review appears here, I think he was perfectly cast as the neurotic, druggy character in this film. He has every nuance perfected and reminded me of several acquaintances who had similar tastes in "recreational chemistry." I saw this film but once, 10-15 years ago and this is the only part of the film that was etched indelibly on my mind. I don\'t say it very often, but in this case I will: Bravo, Sean Penn! As for the story line, well, it\'s based on fact, and as such, it is a tragedy that people would sell their country\'s secrets to the then enemy. Again, Penn has shown what you can do if you disagree with the administration. Use the freedoms you have, paid for in blood; don\'t break the law.'
 '"Chairman of the Board" is a ridiculously stupid film from the popular comic Carrot Top (also seen on the 1-800-COLLECT commercials). He plays a surfing i

Normalize text

In [133]:
import nltk
nltk.download('stopwords')

import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from torchtext.data.utils import get_tokenizer

# Function for text normalization
def normalize_text(text_sample):
  
  tokenized = []
  for i in range(len(text_sample)):
    text = text_sample[i]
    # Convert to lowercase
    normalized_text = text.lower()

    # Remove all non-word characters (everything except numbers and letters)
    normalized_text = re.sub(r"[^\w\s]", '', normalized_text)
  
    # Tokenize
    tokenizer = get_tokenizer('basic_english')
    tokens = tokenizer(normalized_text)

    # Perform stemming
    # stemmer = PorterStemmer()
    # stemmed_words = [stemmer.stem(word) for word in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop_words]

    tokenized.append(words)
  return tokenized

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
x_train_normalized = normalize_text(x_train)
x_test_normalized = normalize_text(x_test)

In [135]:
print(x_train_normalized[0])

['fan', 'sean', 'penn', 'contrast', 'german', 'colleague', 'whose', 'review', 'appears', 'think', 'perfectly', 'cast', 'neurotic', 'druggy', 'character', 'film', 'every', 'nuance', 'perfected', 'reminded', 'several', 'acquaintances', 'similar', 'tastes', 'recreational', 'chemistry', 'saw', 'film', '1015', 'years', 'ago', 'part', 'film', 'etched', 'indelibly', 'mind', 'dont', 'say', 'often', 'case', 'bravo', 'sean', 'penn', 'story', 'line', 'well', 'based', 'fact', 'tragedy', 'people', 'would', 'sell', 'countrys', 'secrets', 'enemy', 'penn', 'shown', 'disagree', 'administration', 'use', 'freedoms', 'paid', 'blood', 'dont', 'break', 'law']


Perform padding based on maximum sequence length

In [136]:
def find_max_list(list):
    list_len = [len(i) for i in list]
    print(max(list_len))
    return max(list_len)

def padding_(sentences, seq_len):
  for i,sent in enumerate(sentences):
    if len(sent)<seq_len:
      pad = seq_len - len(sent)
      sentences[i] = (sentences[i] + [0] * pad)[:pad]

  return sentences

In [137]:
max_len_train = find_max_list(x_train_normalized)
max_len_test = find_max_list(x_test_normalized)

padded_xtrain = np.asarray(padding_(x_train_normalized,max_len_train))
padded_xtest = np.asarray(padding_(x_test_normalized,max_len_train))

1191
1449


  padded_xtrain = np.asarray(padding_(x_train_normalized,max_len_train))
  padded_xtest = np.asarray(padding_(x_test_normalized,max_len_train))


In [None]:
# You may want to add a DataLoader code section here to batch the training data before giving it to word2vec
# 1. create tensor dataset (combined xtrain and ytrain, xtest and ytest), print to make sure the tensor encodings are correct
# 2. create DataLoaders
# 3. train_dataloader is the new word2vec input

Train word embedding model (word2vec) to use as embedding layer in LSTM architecture

In [138]:
import gensim
from gensim.models import Word2Vec

# Initialize and train the Word2Vec model
model = Word2Vec(padded_xtrain, vector_size=50, window=5, min_count=1, workers=1)

# Retrieve word embeddings
word_vector = model.wv['student']

# Find most similar words
similar_words = model.wv.most_similar('student')

print("Word Embedding for 'student':", word_vector)
print("Most similar words to 'student':", similar_words)


Word Embedding for 'student': [-1.3708901  -0.43507367  0.18664095  0.6343911  -1.1630728   2.304918
 -0.29693615  0.24211895 -0.30435854  0.1929767  -0.47227374  0.14566927
  0.7573182   0.6662409  -0.51274943 -0.02596606  1.614102    0.79449666
  1.8662318   0.9794432   1.054361   -1.1728565   2.0218368  -0.7832905
  1.7774361   0.22244166 -0.63765186 -0.8793735  -0.6945737   1.3244457
 -1.8467313   0.3278806  -0.33175245  0.28065422 -0.39990792 -0.08560283
  0.3618493   0.07644077  0.3875066  -1.1224761   0.6221601   0.98940927
 -0.39729217 -0.7359131   0.45047677  1.6809639   0.6723763   0.05493571
  1.681981   -0.82995003]
Most similar words to 'student': [('teacher', 0.7724525928497314), ('students', 0.7599766850471497), ('class', 0.753632664680481), ('freshman', 0.7532499432563782), ('classes', 0.7451688051223755), ('graduate', 0.7425984144210815), ('college', 0.7266989946365356), ('boarding', 0.7167869210243225), ('studying', 0.7118719816207886), ('indian', 0.6949463486671448)]

Save weights of model to be used in pretrained nn.Embedding layer

In [139]:
weights = model.wv

LSTM architecture

Embedding layer uses pre-trained weights from above. If not using word2vec or any other pretrained embedding model, use nn.Embedding(*args) layer which will be trained with the entire LSTM model.

In [None]:
class SentimentRNN(nn.Module):
    def __init__(self,weights,no_layers,vocab_size,hidden_dim,embedding_dim,output_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
 
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
 
        self.no_layers = no_layers
        self.vocab_size = vocab_size

        self.weights = weights
    
        # using word2vec as embedding layer: take model weights and set freeze = True
        # or else self.embedding = nn.Embedding(*args) to train embedding layer while training entire model
        self.embedding = nn.Embedding.from_pretrained(self.weights,freeze=True) 
        
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)
        
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
    
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
        
    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True

        lstm_out, hidden = self.lstm(embeds, hidden)
        
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)

        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
        
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden