In [52]:
# Import necessary libraries 
import pandas as pd
import re
import spacy
from collections import Counter
from nltk import ngrams
import pickle

In [53]:
# Download necessary data
import spacy.cli
spacy.cli.download('en_core_web_sm') # Download English model

# Load the IMDB dataset
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
(50000, 2)
review       0
sentiment    0
dtype: int64


In [54]:
# Clean and preprocess the raw data
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # Load spacy without unnecessary components (for this project) for efficiency

# Apply preprocessing using batch processing for speed
docs = list(nlp.pipe(df['review'], batch_size=1000))

# Create cleaned reviews by lemmatizing and removing stopwords and punctuation
df['cleaned_review'] = [' '.join([token.lemma_ for token in doc if not token.is_stop and (token.is_alpha or token.is_digit)]) for doc in docs]

In [55]:
# Generate n-grams from cleaned text
def generate_ngrams(text, n=2):
    words = text.split()
    return list(ngrams(words, n))

# Apply n-gram generation (bigrams)
df['bigrams'] = df['cleaned_review'].apply(lambda x: generate_ngrams(x, 2))
print(df[['cleaned_review', 'bigrams']].head())

                                      cleaned_review  \
0  reviewer mention watch 1 Oz episode hook right...   
1  wonderful little production br filming techniq...   
2  think wonderful way spend time hot summer week...   
3  basically family little boy Jake think zombie ...   
4  Petter Mattei love Time money visually stunnin...   

                                             bigrams  
0  [(reviewer, mention), (mention, watch), (watch...  
1  [(wonderful, little), (little, production), (p...  
2  [(think, wonderful), (wonderful, way), (way, s...  
3  [(basically, family), (family, little), (littl...  
4  [(Petter, Mattei), (Mattei, love), (love, Time...  


In [56]:
# Build advanced vocabulary with special tokens and indexing
def build_vocabulary(reviews):
    special_tokens = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']  # Padding, Unknown, Start of Sequence, End of Sequence
    all_words = []
    for review in reviews:
        all_words.extend(review.split()) # Split reviews into words and collect them
    vocabulary = special_tokens + sorted(set(all_words)) # Create vocabulary list
    
    # Create word-to-index and index-to-word mappings (using dictionary comprehensions)
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)} # Map words to indices (encode)
    index_to_word = {idx: word for word, idx in word_to_index.items()} # Map indices to words (decode)
    
    return vocabulary, word_to_index, index_to_word

vocabulary, word_to_index, index_to_word = build_vocabulary(df['cleaned_review'])

# Save vocabulary and mappings to a pickle file
with open('vocabulary.pkl', 'wb') as f:
    pickle.dump({
        'vocabulary': vocabulary,
        'word_to_index': word_to_index,
        'index_to_word': index_to_word
    }, f)

print("Vocabulary saved to 'vocabulary.pkl'")

print(f'Vocabulary Size (including special tokens): {len(vocabulary)}')
print(f'First 10 words in vocabulary: {vocabulary[:10]}')
print(f'Last 10 words: {vocabulary[-10:]}')
print(f'Example mapping: "the" -> {word_to_index.get("the", word_to_index["<UNK>"])}')

Vocabulary saved to 'vocabulary.pkl'
Vocabulary Size (including special tokens): 106999
First 10 words in vocabulary: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', '0', '00', '000', '001', '006', '0069']
Last 10 words: ['úber', 'über', 'übermensch', 'übermenschlich', 'überwoman', 'ünfaithful', 'ý', 'ýs', 'יגאל', 'כרמון']
Example mapping: "the" -> 100562


In [57]:
# Convert cleaned reviews to sequences (vectors) using word-to-index mapping
df['sequences'] = df['cleaned_review'].apply(lambda x: [word_to_index.get(word, word_to_index['<UNK>']) for word in x.split()])

# Display sample sequences
print(df[['cleaned_review', 'sequences']].head())

# Save sequences and labels to a pickle file for model training
sequences_data = {
    'sequences': df['sequences'].tolist(),
    'labels': df['sentiment'].map({'positive': 1, 'negative': 0}).tolist()  # Assuming 'sentiment' column has 'positive'/'negative'
}
with open('sequences.pkl', 'wb') as f:
    pickle.dump(sequences_data, f)

print("Sequences saved to 'sequences.pkl'.")

                                      cleaned_review  \
0  reviewer mention watch 1 Oz episode hook right...   
1  wonderful little production br filming techniq...   
2  think wonderful way spend time hot summer week...   
3  basically family little boy Jake think zombie ...   
4  Petter Mattei love Time money visually stunnin...   

                                           sequences  
0  [93201, 83816, 105321, 34, 38095, 71281, 77552...  
1  [106175, 82146, 90597, 61928, 73016, 100191, 8...  
2  [100691, 106175, 105401, 97539, 100951, 77703,...  
3  [60331, 72418, 82146, 61912, 25856, 100691, 10...  
4  [39567, 32996, 82479, 51418, 84734, 104876, 98...  
Sequences saved to 'sequences.pkl'.
