In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
import multiprocessing
import re


In [2]:
num_cores = multiprocessing.cpu_count()
torch.set_num_threads(num_cores)
torch.backends.mkldnn.enabled = True

In [3]:
df_full = pd.read_csv("IMDB Dataset.csv")

In [4]:
df = df_full.iloc[:10000].copy()
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Preprossessing

In [5]:
# Convert into lowercase
df['review'] = df['review'].str.lower()

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [6]:
# Remove tag
def remove_html_tag(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)
df['review'] = df['review'].apply(remove_html_tag)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [7]:
# Remove url
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('r', text)

df['review'] = df['review'].apply(remove_url)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [8]:
# Remove punctuation
import string

def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['review'] = df['review'].apply(remove_punc)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [9]:
# Handling stopwords
stopword = stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    
    return " ".join(x)

df['review'] = df['review'].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz e...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend time hot s...,positive
3,basically theres family little boy jake thi...,negative
4,petter matteis love time money visually s...,positive


In [10]:
# Stemming
stemmer = PorterStemmer()

def stem_word(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

print(stem_word("walking walk walked"))

df['review'] = df['review'].apply(stem_word)
df.head()

walk walk walk


Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive


In [11]:
from collections import defaultdict
train_texts = df['review'].tolist()
train_labels = df['sentiment'].tolist()

MAX_SIZE = 30000
print(train_texts[:10])

def tokenize(text):
    return text.split()

# Build Vocabulary Function
def build_vocab(texts, max_size=MAX_SIZE):
    vocab = defaultdict(int)
    for text in texts:
        tokens = tokenize(text)
        for token in tokens:
            vocab[token] += 1  # Count occurrences of each token
    
    # Sort the vocabulary by frequency and limit its size
    sorted_vocab = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
    
    # Create the final vocabulary dictionary (word -> index)
    if max_size:
        sorted_vocab = sorted_vocab[:max_size]  # Limit vocab size if max_size is specified
    
    vocab_dict = {word: idx + 1 for idx, (word, _) in enumerate(sorted_vocab)}  # Starting index from 1
    return vocab_dict

# Initialize Random Embeddings
def initialize_embeddings(vocab_dict, embedding_dim):
    # Create a tensor of shape (vocab_size + 1, embedding_dim) for embeddings
    embeddings = torch.randn(len(vocab_dict) + 1, embedding_dim)  # +1 for padding index 0
    return embeddings

vocab = build_vocab(train_texts)
embedding_dim = 100
embeddings = initialize_embeddings(vocab, embedding_dim)

print("Vocabulary:", vocab)
print("Number of unique tokens:", len(vocab))
print("Sample Embeddings for first 5 words:")
for word in list(vocab.keys())[:5]:
    print(f"{word}: {embeddings[vocab[word]].tolist()}")
    

['one review mention watch 1 oz episod youll hook right exactli happen meth first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom 

In [12]:
sentiment_map = {'negative': 0, 'positive': 1}
df['encoded_sentiment'] = df['sentiment'].map(sentiment_map)
df.head()

Unnamed: 0,review,sentiment,encoded_sentiment
0,one review mention watch 1 oz episod youll hoo...,positive,1
1,wonder littl product film techniqu unassum old...,positive,1
2,thought wonder way spend time hot summer weeke...,positive,1
3,basic there famili littl boy jake think there ...,negative,0
4,petter mattei love time money visual stun film...,positive,1


In [13]:
# Encode by the number
def encode_reviews(texts, vocab):
    encoded_reviews = []
    for text in texts:
        tokens = tokenize(text)
        indices = [vocab.get(token, 0) for token in tokens]
        encoded_reviews.append(indices)
    return encoded_reviews

df['encoded_reviews'] = encode_reviews(df['review'], vocab)

print("Encoded Reviews:")
print(df[['review', 'encoded_reviews']])

Encoded Reviews:
                                                 review  \
0     one review mention watch 1 oz episod youll hoo...   
1     wonder littl product film techniqu unassum old...   
2     thought wonder way spend time hot summer weeke...   
3     basic there famili littl boy jake think there ...   
4     petter mattei love time money visual stun film...   
...                                                 ...   
9995  fun entertain movi wwii german spi juli andrew...   
9996  give break anyon say good hockey movi know mov...   
9997  movi bad movi watch endless seri bad horror mo...   
9998  movi probabl made entertain middl school earli...   
9999  smash film filmmak show intens strang relation...   

                                        encoded_reviews  
0     [3, 248, 356, 11, 485, 2623, 191, 368, 1437, 1...  
1     [101, 43, 221, 2, 1585, 13835, 25391, 1091, 56...  
2     [111, 101, 32, 651, 5, 824, 1280, 2080, 432, 6...  
3     [389, 155, 139, 43, 267, 4022, 28, 1

In [14]:
# Prepare data for training
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, index):
        return torch.tensor(self.reviews[index]), torch.tensor(self.labels[index])

reviews = df['encoded_reviews'].tolist()
labels = df['encoded_sentiment'].tolist()

print(len(reviews))

10000


In [15]:
def pad_sequences(sequences, max_length):
    padded = []
    for seq in sequences:
        # Pad sequences with zeros (or any padding index you prefer)
        padded_seq = seq + [0] * (max_length - len(seq))  # Padding with 0s
        padded.append(padded_seq)
    return padded

max_length = max(len(review) for review in reviews)
reviews = pad_sequences(reviews, max_length)

In [16]:
split_index = int(len(reviews) * 0.9)

train_review_dataset, train_label_dataset = reviews[:split_index], labels[:split_index]
validation_review_dataset, validation_label_dataset = reviews[split_index:], labels[split_index:]

train_dataset = SentimentDataset(train_review_dataset, train_label_dataset)
validation_dataset = SentimentDataset(validation_review_dataset, validation_label_dataset)

batch_size = 120

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle= True)
validation_data_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle= False)

In [17]:
class CNN_Sentiment_Classification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters, dropout_rate):
        super(CNN_Sentiment_Classification, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # nn.Conv2d(in_channels, out_channels, kernel_size)
        
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (ks, embedding_dim)) for ks in kernel_sizes
        ])
        
        # Fully connected layer
        
        # nn.Linear(in_features, out_features)
        
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate) 
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Add a channel dimension: (batch_size, 1, sequence_length, embedding_dim)
        
        # Apply convolution and pooling
        convs = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # Shape: (batch_size, num_filters, W)
            
        pooled = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in convs]  # Shape: (batch_size, num_filters)
        
        # Concatenate pooled outputs
        x = torch.cat(pooled, 1)  # Shape: (batch_size, num_filters * len(kernel_sizes))
        x = self.dropout(x)  # Apply dropout
        x = self.fc(x)  # Shape: (batch_size, num_classes)
        
        return x

In [18]:
vocab_size = len(vocab) + 1
embedding_dim = 100
num_classes = 2
kernel_sizes = [1, 2, 3, 4, 5]
num_filters = 100
dropout_rate = 0.5

model = CNN_Sentiment_Classification(vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters, dropout_rate)

In [19]:
def train_model(model, train_data_loader, validation_data_loader, epochs, learning_rate):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr= learning_rate)
    
    accuracy_log = []
    
    for epoch in range(epochs):
        total_train_loss = 0.0
        model.train()
        
        for texts, labels in train_data_loader:
            output = model(texts)
            loss = criterion(output, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_train_loss += loss.item()
        
        avg_train_loss = total_train_loss / len(train_data_loader)
        
        model.eval()
        total_validation_loss = 0.0
        
        with torch.inference_mode():
            for texts, labels in validation_data_loader:
                output = model(texts)
                loss = criterion(output, labels)
                
                total_validation_loss += loss.item()
        
        avg_validation_loss = total_validation_loss / len(validation_data_loader)
        
        total_correct = 0

        with torch.no_grad():  # Disable gradient computation
            for texts, labels in validation_data_loader: 
                
                outputs = model(texts)
                _, predicted = torch.max(outputs.data, 1)  # Get the class with the highest probability
                total_correct += (predicted == labels).sum().item()

        accuracy = total_correct / len(validation_data_loader.dataset)
        print(f'Epoch = {epoch}, train_loss = {avg_train_loss}, validation_loss = {avg_validation_loss}')
        print(f'Accuracy: {accuracy:.4f}')
        
        accuracy_log.append(accuracy)
    
    return accuracy_log
# 87.5 percent accuracy

In [20]:
accuracy_log = train_model(model, train_data_loader, validation_data_loader, 20, 0.001)

Epoch = 0, train_loss = 1.629292987982432, validation_loss = 0.6272262136141459
Accuracy: 0.6510
Epoch = 1, train_loss = 0.8319573267300924, validation_loss = 0.5575226545333862
Accuracy: 0.7120
Epoch = 2, train_loss = 0.6119374612967173, validation_loss = 0.49614935451083714
Accuracy: 0.7800
Epoch = 3, train_loss = 0.5340100475152334, validation_loss = 0.4633116622765859
Accuracy: 0.7870
Epoch = 4, train_loss = 0.4846636180082957, validation_loss = 0.42203882667753434
Accuracy: 0.8210
Epoch = 5, train_loss = 0.45729422132174175, validation_loss = 0.3961670796076457
Accuracy: 0.8430
Epoch = 6, train_loss = 0.4049822998046875, validation_loss = 0.3702918125523461
Accuracy: 0.8430
Epoch = 7, train_loss = 0.37698890566825866, validation_loss = 0.3562551670604282
Accuracy: 0.8570
Epoch = 8, train_loss = 0.35767780939737953, validation_loss = 0.34422662523057723
Accuracy: 0.8600
Epoch = 9, train_loss = 0.31329700231552127, validation_loss = 0.32780394289228654
Accuracy: 0.8620
Epoch = 10, t