In [23]:
import subprocess
import sys

# Third-party packages this notebook imports, listed by import name.
# `pickle` ships with the standard library and never needs installing, so it is
# not listed; `spacy` and `nltk` are imported in the next cell and are included
# so the check covers every external dependency.
# NOTE: the spaCy model "en_core_web_md" loaded further down is a separate
# download (`python -m spacy download en_core_web_md`) and is not handled here.
required_packages = [
    "pandas", "sklearn", "numpy", "spacy", "nltk"
]

def install_package(package):
    """Ensure `package` is importable, installing it with pip when it is not.

    Args:
        package: Import name of the module to check (e.g. "sklearn").

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    # Import names whose pip distribution name differs. `pip install sklearn`
    # does not install scikit-learn, so the real distribution name is used.
    pip_names = {"sklearn": "scikit-learn"}
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        pip_name = pip_names.get(package, package)
        print(f"Installing {pip_name}...")
        # sys.executable targets the interpreter that runs this kernel
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])

# Make sure every required package is importable, installing any that is missing
for package_name in required_packages:
    install_package(package_name)

pickle is already installed.
pandas is already installed.
sklearn is already installed.
numpy is already installed.


In [24]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [25]:
# Location of the pickled "others" review dataset
file_path = "../Dataset/others_reviews.pkl"

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# Read the file in binary mode and deserialize its content
with open(file_path, "rb") as pickle_file:
    others_reviews_df = pickle.load(pickle_file)

# Preview the first rows to check the structure and content
others_preview = others_reviews_df.head()
display(others_preview)



Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes
0,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0
1,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0
2,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0
3,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0
4,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0


In [26]:
# Location of the pickled Star Wars review dataset
file_path = "../Dataset/sw_reviews.pkl"

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# Read the file in binary mode and deserialize its content
with open(file_path, "rb") as pickle_file:
    sw_reviews_df = pickle.load(pickle_file)

# Preview the first rows to check the content
sw_preview = sw_reviews_df.head()
display(sw_preview)


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes
0,2221293,tt0076759,Star Wars: Episode IV - A New Hope,,15 March 2010,Impossible to watch with fresh eyes,It was a long time ago when I first saw Star W...,0.0,0.0
1,4756672,tt0076759,Star Wars: Episode IV - A New Hope,10.0,1 April 2019,It's Still Just Star Wars to Me,While I will acknowledge its faults this is st...,0.0,0.0
2,156096,tt0076759,Star Wars: Episode IV - A New Hope,10.0,19 January 1999,A modern myth that can't be beat,Star Wars is a modern myth that has a story li...,0.0,0.0
3,155657,tt0076759,Star Wars: Episode IV - A New Hope,,28 August 1999,There is a God and his name is George Lucas,I saw for the first time when I was six years ...,0.0,0.0
4,155649,tt0076759,Star Wars: Episode IV - A New Hope,1.0,31 August 1999,Good but over-rated.,"Frankly, I think ""Star wars"" is a great movie....",7.0,53.0


In [27]:
# Stack the Star Wars reviews and the other reviews into a single DataFrame;
# ignore_index=True gives the result a fresh 0..n-1 index
review_frames = [sw_reviews_df, others_reviews_df]
complete_df = pd.concat(review_frames, ignore_index=True)

# Preview the first rows of the combined DataFrame
complete_df.head()

Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes
0,2221293,tt0076759,Star Wars: Episode IV - A New Hope,,15 March 2010,Impossible to watch with fresh eyes,It was a long time ago when I first saw Star W...,0.0,0.0
1,4756672,tt0076759,Star Wars: Episode IV - A New Hope,10.0,1 April 2019,It's Still Just Star Wars to Me,While I will acknowledge its faults this is st...,0.0,0.0
2,156096,tt0076759,Star Wars: Episode IV - A New Hope,10.0,19 January 1999,A modern myth that can't be beat,Star Wars is a modern myth that has a story li...,0.0,0.0
3,155657,tt0076759,Star Wars: Episode IV - A New Hope,,28 August 1999,There is a God and his name is George Lucas,I saw for the first time when I was six years ...,0.0,0.0
4,155649,tt0076759,Star Wars: Episode IV - A New Hope,1.0,31 August 1999,Good but over-rated.,"Frankly, I think ""Star wars"" is a great movie....",7.0,53.0


### Word Frequency Extraction from Reviews

This cell performs **basic preprocessing and token frequency analysis** on the review texts:

1. **Stopwords Removal**  
   It uses Scikit-learn's `ENGLISH_STOP_WORDS` to filter out common, uninformative words like "the", "is", etc.

2. **Text Merging and Cleaning**  
   The review texts from both datasets (`others_reviews_df` and `sw_reviews_df`) are merged and cleaned. Each text is:
   - Lowercased
   - Stripped of every character other than the letters `a–z` and whitespace using regex (this removes punctuation, numbers, symbols, and accented letters)
   - Tokenized into individual words
   - Filtered to exclude stopwords and very short words (length ≤ 2)

3. **Frequency Counting**  
   All tokens are collected into a list and passed to Python’s `Counter` to count word frequencies.

4. **Output**  
   The top 100 most frequent words are computed, and the top 20 are displayed in a DataFrame for quick inspection.

This step helps understand the dominant vocabulary in the dataset and can be useful for further filtering or domain-specific keyword analysis.


In [28]:
# Use Scikit-learn's built-in English stopword list
stopwords = ENGLISH_STOP_WORDS

# Gather the review texts of both datasets into one list of strings,
# dropping missing values along the way
review_text_columns = [
    others_reviews_df['Review_Text'],
    sw_reviews_df['Review_Text'],
]
texts = pd.concat(review_text_columns).dropna().astype(str).tolist()

def tokenize(text):
    """Split a review into lowercase word tokens.

    The text is lowercased, every character other than a-z and whitespace is
    deleted, and the result is split on whitespace. Tokens of length 2 or less
    and tokens found in the module-level `stopwords` collection are dropped.

    Args:
        text: The review text to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    kept_tokens = []
    for candidate in letters_only.split():
        if len(candidate) > 2 and candidate not in stopwords:
            kept_tokens.append(candidate)
    return kept_tokens

# Tokenize every review and flatten the results into one list of words
all_tokens = [token for review in texts for token in tokenize(review)]

# Tally how often each word occurs
freq_dist = Counter(all_tokens)

# Keep the 100 most frequent words as (word, frequency) pairs
common_words = freq_dist.most_common(100)

# Show the 20 most frequent words as a DataFrame
common_df = pd.DataFrame(common_words, columns=["word", "word_frequency"])
top_words_preview = common_df.head(20)
display(top_words_preview)


Unnamed: 0,word,word_frequency
0,movie,103294
1,film,69598
2,star,62314
3,wars,56288
4,like,45447
5,just,43348
6,good,34327
7,story,31479
8,time,26044
9,great,25726


### Identifying Cinema-Domain Words with spaCy Embeddings

This cell uses **spaCy's semantic similarity** between word vectors to identify words that are conceptually related to the **cinema domain**.

#### Method:
1. Load the `en_core_web_md` model, which includes **pre-trained word embeddings**.
2. Each word is mapped to a **semantic vector** that captures its meaning based on distributional context.
3. The semantic vector of the reference word **"movie"** is extracted.
4. For each word in the list of most frequent terms, the **cosine similarity** with the vector for "movie" is computed.
5. If the similarity is above a threshold (here: **0.3**), the word is considered semantically related to "movie", and thus potentially **cinema-related**.

#### Limitation:
Words like **"good"** or **"great"** often co-occur with movie-related terms in text corpora. Therefore, even if they express **general sentiment** rather than being **specific to cinema**, spaCy may assign them high similarity scores to **"movie"**. This can lead to **false positives** in domain filtering.


In [29]:
# Load the spaCy model with pre-trained word vectors
nlp = spacy.load("en_core_web_md")

# Parse the reference word "movie"; every candidate word is compared against it
cinema_ref = nlp("movie")

# Plain list of the most frequent words (the "word" column of common_df)
common_words = common_df["word"].tolist()

# Keep the words that are semantically close to "movie".
# Each word is run through the pipeline once and the parsed result is reused
# for both the vector check and the similarity score, instead of parsing the
# same word twice.
related_words = []
for word in common_words:
    if word in ENGLISH_STOP_WORDS or len(word) <= 2:
        continue
    word_doc = nlp(word)
    # Require a vector representation and a similarity above the 0.3 threshold
    if word_doc.has_vector and cinema_ref.similarity(word_doc) > 0.3:
        related_words.append(word)

# Display words considered semantically related to the cinema domain
display(related_words)


['movie',
 'film',
 'wars',
 'just',
 'good',
 'story',
 'time',
 'really',
 'characters',
 'jedi',
 'movies',
 'character',
 'did',
 'episode',
 'films',
 'way',
 'trilogy',
 'bad',
 'plot',
 'better',
 'scenes',
 'know',
 'end',
 'watch',
 'does',
 'seen',
 'little',
 'old',
 'going',
 'lot',
 'things',
 'darth',
 'vader',
 'empire',
 'thing',
 'far',
 'watching',
 'thought',
 'series',
 'actually',
 'come',
 'look',
 'real',
 'bit',
 'actors',
 'saga',
 'times',
 'battle',
 'point',
 'right']

### Extracting Cinema-Related but Sentiment-Neutral Words

This cell combines **semantic similarity analysis with sentiment filtering** to extract words that are conceptually related to cinema, while excluding emotionally charged terms.

#### Approach:
- **spaCy's `en_core_web_md`** is used to compute semantic similarity between each word and the reference word **"movie"**.
- Words with a **cosine similarity > 0.3** are considered semantically related to the cinema domain.

#### Sentiment Filtering:
To avoid including general opinion words like "good" or "great", we use **VADER** (Valence Aware Dictionary and sEntiment Reasoner), a sentiment analysis tool from **NLTK**:
- For each word, VADER returns a **compound sentiment score**.
- If the **absolute value** of the compound score is **greater than 0.3**, the word is considered sentiment-laden and is excluded from the cinema-related list.

#### Final Output:
The result is a DataFrame containing **frequent, cinema-related words** that are **not emotionally biased**, useful for building cleaner domain-specific keyword lists or filters.


In [52]:
# Setup
nltk.download('vader_lexicon')  # VADER lexicon used by the analyzer below
sid = SentimentIntensityAnalyzer()  # VADER sentiment analyzer
nlp = spacy.load("en_core_web_md")  # spaCy model with word vectors

# Scikit-learn's English stopword list
stopwords = ENGLISH_STOP_WORDS

# Collect the non-null review texts of both datasets as a list of strings
review_text_columns = [
    others_reviews_df['Review_Text'],
    sw_reviews_df['Review_Text'],
]
texts = pd.concat(review_text_columns).dropna().astype(str).tolist()

# NOTE: this redefines the `tokenize` helper from the word-frequency cell with the same logic.
def tokenize(text):
    """Return the lowercase word tokens of `text`.

    Characters other than a-z and whitespace are deleted after lowercasing,
    the text is split on whitespace, and tokens that are in the module-level
    `stopwords` collection or have length 2 or less are discarded.
    """
    cleaned = re.sub(r"[^a-z\s]", "", text.lower())

    def keep(candidate):
        return candidate not in stopwords and len(candidate) > 2

    return list(filter(keep, cleaned.split()))

# Tokenize all reviews and count how often each word occurs
all_tokens = [token for review in texts for token in tokenize(review)]
freq_dist = Counter(all_tokens)

# Table of the 10,000 most frequent words with their counts
common_words = freq_dist.most_common(10000)
common_df = pd.DataFrame(common_words, columns=["word", "count"])

# Parsed reference word "movie" and the similarity a word must exceed
cinema_ref = nlp("movie")
similarity_threshold = 0.3

def is_sentiment_word(word, threshold=0.3):
    """Tell whether VADER gives `word` a strong sentiment score.

    Args:
        word: The text to score with the module-level `sid` analyzer.
        threshold: Cut-off applied to the absolute compound score.

    Returns:
        True when the absolute value of the compound score exceeds `threshold`.
    """
    scores = sid.polarity_scores(word)
    magnitude = abs(scores['compound'])
    return magnitude > threshold

# Keep the words that are close to "movie" but carry no strong sentiment.
# Only word vectors are needed here, so each word is turned into a Doc with
# `nlp.make_doc` (tokenizer only) instead of being run through every pipeline
# component, which is much slower over thousands of single-word inputs.
filtered_words = []
for word in common_df["word"]:
    word_doc = nlp.make_doc(word)
    if word_doc.has_vector and word_doc.similarity(cinema_ref) > similarity_threshold:
        if not is_sentiment_word(word):
            filtered_words.append(word)

# Create final DataFrame with cinema-related (but neutral) words
cinema_related_df = common_df[common_df["word"].isin(filtered_words)].reset_index(drop=True)
cinema_related_df.head(500)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bianca/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,word,count
0,movie,103294
1,film,69598
2,just,43348
3,story,31479
4,time,26044
...,...,...
495,wacky,39
496,gunslinger,39
497,sidekicks,39
498,monologue,38


In [53]:
# Cinema-related words to remove, in alphabetical order (one initial letter per line)
words_to_remove = [
    "actor", "actress", "artist", "author",
    "cast", "character", "cinema", "cinematography",
    "director",
    "editing", "episode",
    "film", "filmmaker",
    "genre",
    "maker", "movie",
    "opera",
    "producer", "production",
    "review", "reviewer",
    "saga", "scene", "screen",
    "trilogy",
    "video", "visual", "voice",
    "writer",
]

In [56]:
# Words kept in the cinema-related DataFrame
cinema_words = cinema_related_df["word"].tolist()

# Target words that do not appear among them (order of words_to_remove is kept)
missing_words = []
for word in words_to_remove:
    if word not in cinema_words:
        missing_words.append(word)

# Report the outcome
if missing_words:
    print("The following words are missing from 'cinema_related_df':")
    print(missing_words)
else:
    print("All words from 'words_to_remove' are present in 'cinema_related_df'.")

The following words are missing from 'cinema_related_df':
['actress', 'artist', 'author', 'cinema', 'cinematography', 'director', 'editing', 'filmmaker', 'maker', 'opera', 'producer', 'production', 'review', 'reviewer', 'scene', 'screen', 'visual', 'voice', 'writer']


In [57]:
# Target words that also appear in the cinema-related list (order of words_to_remove is kept)
present_words = list(filter(lambda word: word in cinema_words, words_to_remove))

# Report the outcome
print("Words from 'words_to_remove' that are present in 'cinema_related_df':")
print(present_words)

Words from 'words_to_remove' that are present in 'cinema_related_df':
['actor', 'cast', 'character', 'episode', 'film', 'genre', 'movie', 'saga', 'trilogy', 'video']
