In [1]:
# Import necessary libraries 
import pandas as pd
import re
import spacy
from collections import Counter
from nltk import ngrams
import pickle
import numpy as np
import gensim.downloader as api

In [2]:
# Download necessary data
import spacy.cli
spacy.cli.download('en_core_web_sm') # Download English model

# Load the IMDB dataset
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
(50000, 2)
review       0
sentiment    0
dtype: int64


In [3]:
# Clean and preprocess the raw data
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # Load spacy without unnecessary components (for this project) for efficiency

# Preprocessing function to clean reviews
def preprocess_dataset(texts):
    cleaned_texts = []
    docs = list(nlp.pipe(texts, batch_size=1000))
    for doc in docs:
        # Collect lemmas for valid tokens in this doc
        cleaned = [token.lemma_ for token in doc if not token.is_stop and (token.is_alpha or token.is_digit)]
        # Join into a single string per review
        cleaned_texts.append(' '.join(cleaned))
    return cleaned_texts

# Apply preprocessing to the reviews
df['cleaned_review'] = preprocess_dataset(df['review'])
print(df[['review', 'cleaned_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  reviewer mention watch 1 Oz episode hook right...  
1  wonderful little production br filming techniq...  
2  think wonderful way spend time hot summer week...  
3  basically family little boy Jake think zombie ...  
4  Petter Mattei love Time money visually stunnin...  


In [4]:
# Generate n-grams from cleaned text
def generate_ngrams(text, n=2):
    words = text.split()
    return list(ngrams(words, n))

# Apply n-gram generation (bigrams)
df['bigrams'] = df['cleaned_review'].apply(lambda x: generate_ngrams(x, 2))
print(df[['cleaned_review', 'bigrams']].head())

                                      cleaned_review  \
0  reviewer mention watch 1 Oz episode hook right...   
1  wonderful little production br filming techniq...   
2  think wonderful way spend time hot summer week...   
3  basically family little boy Jake think zombie ...   
4  Petter Mattei love Time money visually stunnin...   

                                             bigrams  
0  [(reviewer, mention), (mention, watch), (watch...  
1  [(wonderful, little), (little, production), (p...  
2  [(think, wonderful), (wonderful, way), (way, s...  
3  [(basically, family), (family, little), (littl...  
4  [(Petter, Mattei), (Mattei, love), (love, Time...  


In [5]:
# Load pre-trained GloVe word embeddings
model = api.load("glove-wiki-gigaword-100")

# Prepare data for neural network using pretrained GloVe embeddings

# Use cleaned_review without special tokens
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Define special tokens for sequence modeling
special_tokens = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']

# Get all unique words
all_words = set()
for review in df['cleaned_review']:
    words = review.split()
    all_words.update(words)

# The vocabulary contains all the necessary words and tokens for the embeddings
vocab = special_tokens + list(all_words)
vocab_size = len(vocab)

# Every word should be assigned with an index for vectorization 
word_to_index = {word: i for i, word in enumerate(vocab)}

# Create embedding matrix
embedding_dim = 100
# vocab size (x) x embedding dimensions (y) 
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# The embedding matrix has all the words that exist in both the vocabulary of the dataset and pre-trained model
for word, i in word_to_index.items():
    if word in model and word not in special_tokens:
        # Returns vector with 100 values
        embedding_matrix[i] = model[word]

# Convert to sequences with <SOS> and <EOS>
X = []
for review in df['cleaned_review']:
    # Assigns index of SOS to begin and if a word doesnt exist it assigns index of UNK and in the end EOS 
    seq = [word_to_index['<SOS>']] + [word_to_index.get(word, word_to_index['<UNK>']) for word in review.split()] + [word_to_index['<EOS>']]
    X.append(seq)

# Pad sequences with <PAD> (index 0)
max_len = max(len(seq) for seq in X)
X = np.array([seq + [word_to_index['<PAD>']] * (max_len - len(seq)) for seq in X])

y = df['label'].values

# Save preprocessing artifacts
with open('neural_preprocessing.pkl', 'wb') as f:
    pickle.dump({
        'word_to_index': word_to_index,
        'embedding_matrix': embedding_matrix,
        'max_len': max_len,
        'X': X,
        'y': df['label'].values
    }, f)

print("Neural preprocessing complete. Data saved for model training.")

Neural preprocessing complete. Data saved for model training.


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, stratify=df['label'], random_state=42)

print("Data split complete.")
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Input shape: {X_train.shape}")

Data split complete.
Training set size: 40000
Test set size: 10000
Input shape: (40000, 1298)
