In [1]:
# Import necessary libraries 
import pandas as pd
import re
import spacy
from collections import Counter
from nltk import ngrams
import pickle

In [53]:
# Download necessary data
import spacy.cli
spacy.cli.download('en_core_web_sm') # Download English model

# Load the IMDB dataset
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
(50000, 2)
review       0
sentiment    0
dtype: int64


In [None]:
# Clean and preprocess the raw data
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # Load spacy without unnecessary components (for this project) for efficiency

# Preprocessing function to clean reviews
def preprocess_dataset(texts):
    cleaned_texts = []
    docs = list(nlp.pipe(texts, batch_size=1000))
    for doc in docs:
        # Collect lemmas for valid tokens in this doc
        cleaned = [token.lemma_ for token in doc if not token.is_stop and (token.is_alpha or token.is_digit)]
        # Join into a single string per review
        cleaned_texts.append(' '.join(cleaned))
    return cleaned_texts

# Apply preprocessing to the reviews
df['cleaned_review'] = preprocess_dataset(df['review'])
print(df[['review', 'cleaned_review']].head())

In [55]:
# Generate n-grams from cleaned text
def generate_ngrams(text, n=2):
    words = text.split()
    return list(ngrams(words, n))

# Apply n-gram generation (bigrams)
df['bigrams'] = df['cleaned_review'].apply(lambda x: generate_ngrams(x, 2))
print(df[['cleaned_review', 'bigrams']].head())

                                      cleaned_review  \
0  reviewer mention watch 1 Oz episode hook right...   
1  wonderful little production br filming techniq...   
2  think wonderful way spend time hot summer week...   
3  basically family little boy Jake think zombie ...   
4  Petter Mattei love Time money visually stunnin...   

                                             bigrams  
0  [(reviewer, mention), (mention, watch), (watch...  
1  [(wonderful, little), (little, production), (p...  
2  [(think, wonderful), (wonderful, way), (way, s...  
3  [(basically, family), (family, little), (littl...  
4  [(Petter, Mattei), (Mattei, love), (love, Time...  


In [None]:
# Build vocabulary with special tokens and indexing for neural network input
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#Add special tokens for start and end of sequence <UKN> and <PAD> aren't needed here
df['processed_review'] = df['cleaned_review'].apply(lambda x: f"<SOS> {x} <EOS>") # Add start and end tokens for sequence modeling

#Initialize vectorizers with n-gram range and max features to limit vocabulary size for neural network input
bow_vectorizer = CountVectorizer(ngram_range=(1,2), max_features=1000) # Unigrams and bigrams, limit to top 1k features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)

# Sparse matrices for BoW and TF-IDF features
bow_features = bow_vectorizer.fit_transform(df['processed_review'])
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_review'])

# Save vectorizers for later use in model training
with open('vectorizers.pkl', 'wb') as f:
    pickle.dump({
        'bow_vectorizer': bow_vectorizer,
        'tfidf_vectorizer': tfidf_vectorizer,
        'bow_features': bow_features,
        'tfidf_features': tfidf_features
    }, f)

print("Vectorizers and features saved to 'vectorizers.pkl'.")
print(f"BoW feature shape: {bow_features.shape}")
print(f"TF-IDF feature shape: {tfidf_features.shape}")

