In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import plotly.express as px

# Data Preprocess

In [3]:
# 1. Load Data into DataFrames
def _read_json_records(path):
    """Read a JSON file that is either JSON Lines or a standard JSON document.

    JSON Lines (one object per line) is tried first; if pandas raises
    ValueError the same file is re-read as a standard JSON document.
    """
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        # Fallback for standard JSON list format
        return pd.read_json(path)


# The fallback is decided per file, so a JSON Lines file paired with a
# plain JSON list still loads (a shared try/except would re-read both
# files in the second format and fail on the JSON Lines one).
details_df = _read_json_records('archive/IMDB_movie_details.json')
reviews_df = _read_json_records('archive/IMDB_reviews.json')

print("Data Loaded Successfully.")
reviews_df[['review_text', 'is_spoiler']].head()

Data Loaded Successfully.


Unnamed: 0,review_text,is_spoiler
0,"In its Oscar year, Shawshank Redemption (writt...",True
1,The Shawshank Redemption is without a doubt on...,True
2,I believe that this film is the best story eve...,True
3,"**Yes, there are SPOILERS here**This film has ...",True
4,At the heart of this extraordinary movie is a ...,True


#### Data Checks:

In [4]:
print('\n--- Basic Info Checks ---\n')

total_num_of_reviews = reviews_df.shape[0]
print(f'1. reviews_df contains {total_num_of_reviews} number of reviews')

total_num_of_unique_movies_in_reviews = reviews_df['movie_id'].nunique()
print(f'2. reviews_df contains {total_num_of_unique_movies_in_reviews} number of different movies')

total_num_of_movies = details_df.shape[0]
print(f'3. details_df contains {total_num_of_movies} number of movies')

total_num_of_unique_movies_in_details = details_df['movie_id'].nunique()
print(f'4. details_df contains information about {total_num_of_unique_movies_in_details} number of different movies')

print('\n--- Data Quality Checks ---\n')

# 1. Check for NaN in movie_id (reviews_df)
nan_reviews_movie_id_count = reviews_df['movie_id'].isna().sum()
print(f'1. Number of NaN/Missing values in reviews_df["movie_id"]: {nan_reviews_movie_id_count}')

# 2. Check how many reviews contain actual text (not just symbols/garbage)
# Temporary NaN-free copy of the text; it is dropped again by the relationship-check cells.
reviews_df['review_text_safe'] = reviews_df['review_text'].fillna('')
# Regex: a review counts as valid when it contains at least one ASCII letter,
# so reviews made only of punctuation, digits or whitespace are flagged.
valid_text_count = reviews_df['review_text_safe'].str.contains(r'[A-Za-z]').sum()
invalid_text_count = total_num_of_reviews - valid_text_count
# The label states what the regex really tests (one letter, not "min 10 letters").
print(f'2. Reviews with "actual" text content (at least one letter): {valid_text_count} ({valid_text_count / total_num_of_reviews:.2%})')
print(f'   Reviews flagged as junk/short/empty: {invalid_text_count}')

# 3. Check for NaN in review_text (reviews_df)
nan_review_text_count = reviews_df['review_text'].isna().sum()
print(f'3. Number of NaN/Missing values in reviews_df["review_text"]: {nan_review_text_count}')

# 4. Check for NaN in plot_synopsis (details_df)
nan_synopsis_count = details_df['plot_synopsis'].isna().sum()
print(f'4. Number of NaN/Missing values in details_df["plot_synopsis"]: {nan_synopsis_count}')

# 5. Check how many plot_synopsis contain actual text (not just symbols/garbage)
valid_synopsis_count = details_df['plot_synopsis'].str.contains(r'[A-Za-z]').sum()
invalid_synopsis_count = total_num_of_movies - valid_synopsis_count
print(f'5. Synopsis with "actual" synopsis content: {valid_synopsis_count} ({valid_synopsis_count / total_num_of_movies:.2%})')
print(f'   Synopsis flagged as junk/short/empty: {invalid_synopsis_count}')

# 6. Check for NaN in plot_summary (details_df)
nan_summary_count = details_df['plot_summary'].isna().sum()
print(f'6. Number of NaN/Missing values in details_df["plot_summary"]: {nan_summary_count}')

# 7. Check for NaN in movie_id (details_df)
nan_details_movie_id_count = details_df['movie_id'].isna().sum()
print(f'7. Number of NaN/Missing values in details_df["movie_id"]: {nan_details_movie_id_count}')

# 8. Check how many plot_summary contain actual text (not just symbols/garbage)
valid_summary_count = details_df['plot_summary'].str.contains(r'[A-Za-z]').sum()
invalid_summary_count = total_num_of_movies - valid_summary_count
# Numbered 8 to match the check above (it was mislabelled as a second "5.").
print(f'8. Summary with "actual" summary content: {valid_summary_count} ({valid_summary_count / total_num_of_movies:.2%})')
print(f'   Summary flagged as junk/short/empty: {invalid_summary_count}')


--- Basic Info Checks ---

1. reviews_df contains 573913 number of reviews
2. reviews_df contains 1572 number of different movies
3. details_df contains 1572 number of movies
4. details_df contains information about 1572 number of different movies

--- Data Quality Checks ---

1. Number of NaN/Missing values in reviews_df["movie_id"]: 0
2. Reviews with "actual" text content (min 10 letters): 573913 (100.00%)
   Reviews flagged as junk/short/empty: 0
3. Number of NaN/Missing values in reviews_df["review_text"]: 0
4. Number of NaN/Missing values in details_df["plot_synopsis"]: 0
5. Synopsis with "actual" synopsis content: 1339 (85.18%)
   Synopsis flagged as junk/short/empty: 233
6. Number of NaN/Missing values in details_df["plot_summary"]: 0
7. Number of NaN/Missing values in details_df["movie_id"]: 0
5. Summary with "actual" summary content: 1572 (100.00%)
   Summary flagged as junk/short/empty: 0


**"Synopsis flagged as junk/short/empty: 233"** suggests that some movies have an invalid synopsis. Sample a few to confirm this.

In [5]:
# Show five movies whose synopsis contains no ASCII letter.
# A fixed seed (the same 42 used for the train/test splits) keeps the sample stable across re-runs.
details_df[~details_df['plot_synopsis'].str.contains(r'[A-Za-z]')].sample(5, random_state=42)

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis
1566,tt4047038,Centers on the titular holistic detective who ...,1h,"[Comedy, Mystery, Sci-Fi]",8.4,2017-04-01,
724,tt0119190,Baby George got into a plane crash in a jungle...,1h 32min,"[Action, Adventure, Comedy]",5.4,1997-07-16,
1156,tt0201265,In what has to be one of the worst ideas in Ch...,1h 35min,[Comedy],6.8,1999-11-26,
807,tt0157503,"In a small Minnesota town, the annual beauty p...",1h 37min,"[Comedy, Romance, Thriller]",6.6,1999-07-23,
1003,tt0100944,"A young boy, recently orphaned, is taken to En...",1h 31min,"[Adventure, Comedy, Family]",6.8,1990-08-24,


It seems that some movies are missing `plot_synopsis` values. But we can ignore this because it does not affect our prediction models. So we are good with just setting all "junk" synopsis to empty strings `''`

In [6]:
# Keep every synopsis that contains at least one ASCII letter; replace the rest with ''.
details_df['plot_synopsis'] = details_df['plot_synopsis'].where(
    details_df['plot_synopsis'].str.contains(r'[A-Za-z]'),
    '',
)

#### More data checks

In [7]:
print('\n--- Relationship Checks ---\n')

# Distinct movie ids on each side of the future join
reviews_movies = reviews_df['movie_id'].unique()
details_movies = details_df['movie_id'].unique()

# 1. Ids that only the reviews know about
movies_in_reviews_only = set(reviews_movies).difference(details_movies)
print(
    f'1. Number of unique movies in reviews_df NOT found in details_df: {len(movies_in_reviews_only)}',
    f'   Movie IDs in reviews_df only: {movies_in_reviews_only}',
    sep='\n',
)

# 2. Ids that only the details know about
movies_in_details_only = set(details_movies).difference(reviews_movies)
print(
    f'2. Number of unique movies in details_df NOT found in reviews_df: {len(movies_in_details_only)}',
    f'   Movie IDs in details_df only: {movies_in_details_only}',
    sep='\n',
)

# Remove the helper column created by the data-quality checks (no-op when it is absent)
reviews_df.drop(columns=['review_text_safe'], errors='ignore', inplace=True)


--- Relationship Checks ---

1. Number of unique movies in reviews_df NOT found in details_df: 2
   Movie IDs in reviews_df only: {'tt0114142', 'tt0104014'}
2. Number of unique movies in details_df NOT found in reviews_df: 2
   Movie IDs in details_df only: {'tt0114142/', 'tt0104014/'}


## Analysis of Movie ID Conflict (`tt0104014` vs. `tt0104014/`)

The data strongly indicates that the movie IDs `tt0104014` (Reviews) and `tt0104014/` (Details) refer to the **exact same movie**. The trailing slash (`/`) in the details record is a data artifact that prevents a clean join.

The movie is the 1992 Italian erotic film, **`Così fan tutte`**, directed by **Tinto Brass**.

### Proof of Identity (tt0104014)

The proof comes from the consistency between the specific film titles, director names, and plot themes mentioned across both records.

#### 1. Review Record (`tt0104014`)

The review text provides the film's title directly:

```json
{
  "review_date": "16 July 2012",
  "movie_id": "tt0104014",
  "user_id": "ur5358902",
  "is_spoiler": false,
  "review_text": "**Tinto Brass** is usually referred to as either a misunderstood genius or a talentless hack. **Cosi fan tutte (\"All Ladies Do It\")** proves that he's neither one. When he doesn't take himself too seriously (which, unfortunately, he does quite often) the man is perfectly capable of creating fun, well-polished, strict entertainment. **Cosi fan tutte** certainly slips into the realm of pornography on occasion, but it has a sense of lightness and fun that breaks the ice and avoids real discomfort...",
  "rating": "7",
  "review_summary": "Silly and Sexy"
}
```

* **Key Proof:** The explicit naming of the director (**Tinto Brass**) and the Italian title with its English translation (**Cosi fan tutte ("All Ladies Do It")**).

#### 2. Detail Record (`tt0104014/`)

The detail record provides a plot summary that is perfectly consistent with a **Tinto Brass** film titled **`Così fan tutte`**:

```json
{
  "plot_summary": "For a while now, beautiful 24-year-old **Diana Bruni** who's been happily married for five years, has been feeling distressed, experiencing an inexplicable, rather restless craving to finally live her life to the full and to break free from what society forbids. As this urge grows stronger by the day, Diana will ultimately yield to her **carnal longings**, and through a parade of particularly explicit nocturnal **sensual adventures**, she will utterly embrace passion, even if this comes by way of **transgression**. However, before long, the unaware husband, **Paolo**, will find all about his headstrong and disobedient wife's **extra-marital escapades**...",
  "movie_id": "tt0104014/",
  "duration": "1h 33min",
  "genre": ["Comedy", "Drama"],
  "rating": "5.3",
  "release_date": "1992-02-21",
  "plot_synopsis": ""
}
```

* **Key Proof:** The description of the protagonist (**Diana Bruni**) engaging in "transgression," "carnal longings," and "extra-marital escapades" perfectly matches the plot of **`Così fan tutte`** (All Ladies Do It).

---

## Analysis of Movie ID Conflict (`tt0114142` vs. `tt0114142/`)

The two IDs, `tt0114142` (Reviews) and `tt0114142/` (Details), refer to the **exact same movie**. The content confirms the movie is the 1999 erotic thriller **`Deadly Betrayal`** (also known as `Inner Action`).

### Proof of Identity (tt0114142)

The proof is established through the consistency in genre, the specific actors mentioned in the reviews, and the plot summary's themes of marriage strain and corporate intrigue.

#### 1. Review Record (`tt0114142`)

The reviews identify the film's genre and key cast members:

```json
{
  "review_date": "27 December 2000", 
  "movie_id": "tt0114142", 
  "user_id": "ur0982623", 
  "is_spoiler": false, 
  "review_text": "Just a **standard softcore flick** from the **playboy factory** - so you know what to expect: Bad script, sexy women; **Tawny Kitaen** is gorgeous and prevents the whole movie from drifting into oblivion (imho).", 
  "rating": "8", 
  "review_summary": "Standard playboy production - bad script, sexy women"
}
```

* **Key Proof:** Explicitly identified as a **"standard softcore flick"** and names the actress **Tawny Kitaen**. (The other review also mentions Kitaen and **Shannon Whirry**).

#### 2. Detail Record (`tt0114142/`)

The plot summary describes a high-stakes, suggestive plot that perfectly matches the "erotic thriller/softcore" genre identified in the reviews:

```json
{
  "plot_summary": "The marriage of **David Burgess**, a senior executive, and his beautiful wife, **Sara Burgess**, an interior designer, is under **great strain** because David is in the process of putting together the biggest telecommunications merger... Sara suggests that they attend a **kinky sex club** to revitalize their marriage... David's problems are exacerbated by a **huge power struggle** at his office between **Karen Stone**... To further her aims she hires the services of sleazy divorce private detective **Ernie Fontenot** to spy on the Burgess's. At the date of the merger draws near and all the **blackmail and dirty tricks** start to play themselves out...",
  "movie_id": "tt0114142/",
  "duration": "1h 32min",
  "genre": ["Drama", "Thriller"],
  "rating": "4.0",
  "release_date": "1999-01-29",
  "plot_synopsis": ""
}
```

* **Key Proof:** The plot summary details the **strained marriage**, the suggestion of a **"kinky sex club,"** and the use of a private detective for **"blackmail and dirty tricks,"** confirming the subject matter of the "erotic thriller" reviews.

### Conclusion

The content match is definitive for both pairs of IDs. The two IDs should be considered one and the same after correcting the trailing slash in the `details_df`.

We need to correct the dataset

In [8]:
# Strip the stray trailing "/" from movie ids (e.g. 'tt0104014/' -> 'tt0104014')
# so that they match the ids used in reviews_df. Stripping the suffix from the
# whole column covers the two ids found above without hard-coding them.
details_df['movie_id'] = details_df['movie_id'].str.rstrip('/')

In [9]:
print('\n--- Relationship Re-Checks ---\n')

# Distinct, non-missing movie ids on each side
reviews_movies = reviews_df['movie_id'].dropna().unique()
details_movies = details_df['movie_id'].dropna().unique()

# 1. Ids that only the reviews know about
movies_in_reviews_only = set(reviews_movies).difference(details_movies)
print(
    f'1. Number of unique movies in reviews_df NOT found in details_df: {len(movies_in_reviews_only)}',
    f'   Movie IDs in reviews_df only: {movies_in_reviews_only}',
    sep='\n',
)

# 2. Ids that only the details know about
movies_in_details_only = set(details_movies).difference(reviews_movies)
print(
    f'2. Number of unique movies in details_df NOT found in reviews_df: {len(movies_in_details_only)}',
    f'   Movie IDs in details_df only: {movies_in_details_only}',
    sep='\n',
)

# Remove the helper column if it is still around (no-op when it is absent)
reviews_df.drop(columns=['review_text_safe'], errors='ignore', inplace=True)


--- Relationship Re-Checks ---

1. Number of unique movies in reviews_df NOT found in details_df: 0
   Movie IDs in reviews_df only: set()
2. Number of unique movies in details_df NOT found in reviews_df: 0
   Movie IDs in details_df only: set()


#### More Relationship Analysis

In [10]:

# Number of reviews per movie, plus summary statistics with extra percentiles
review_counts = reviews_df.groupby('movie_id').size()
stats = review_counts.describe(percentiles=[.1, .25, .5, .75, .9, .95])

# Assemble the report line by line; the empty first and last entries
# produce the leading and trailing newline of the printed block.
report_lines = [
    "",
    "--- More Relationship Analysis ---",
    "",
    f"Total Unique Movies: {review_counts.count():,.0f}",
    f"Total Reviews:       {review_counts.sum():,.0f}",
    "",
    "Descriptive Statistics for Reviews per Movie:",
    "--------------------------------------------",
    f"Mean (Average Reviews):   {stats['mean']:.2f}",
    f"Median (50th Percentile): {stats['50%']:.0f}",
    f"Standard Deviation (Std): {stats['std']:.2f}",
    "",
    f"Min Reviews:              {stats['min']:.0f}",
    f"Max Reviews:              {stats['max']:.0f}",
    "",
    "Percentiles (Distribution Breakdown):",
    "-----------------------------------",
]
report_lines += [
    f"{pct}th Percentile ({pct}% of movies have this many or fewer reviews): {stats[f'{pct}%']:.0f}"
    for pct in (10, 25, 75, 90, 95)
]
report_lines.append("")
output = "\n".join(report_lines)

print(output)


--- More Relationship Analysis ---

Total Unique Movies: 1,572
Total Reviews:       573,913

Descriptive Statistics for Reviews per Movie:
--------------------------------------------
Mean (Average Reviews):   365.08
Median (50th Percentile): 326
Standard Deviation (Std): 283.09

Min Reviews:              2
Max Reviews:              4845

Percentiles (Distribution Breakdown):
-----------------------------------
10th Percentile (10% of movies have this many or fewer reviews): 88
25th Percentile (25% of movies have this many or fewer reviews): 165
75th Percentile (75% of movies have this many or fewer reviews): 529
90th Percentile (90% of movies have this many or fewer reviews): 712
95th Percentile (95% of movies have this many or fewer reviews): 728



### Interpretation Note:

The Mean (365.08) is noticeably higher than the Median (326). This difference is a sign of a right-skewed distribution.

<img src="rightside_skewed.jpg" width="300">

## Visualizations

Plot out the distribution of reviews per movie

In [11]:
# One row per movie with its number of reviews
review_counts = (
    reviews_df
    .groupby('movie_id')
    .size()
    .reset_index(name='review_count')
)

# Histogram of reviews-per-movie (50 bins), styled in the same chained expression
fig = px.histogram(
    review_counts,
    x='review_count',
    nbins=50,
    title='Distribution of Review Counts Per Movie',
    labels={'review_count': 'Number of Reviews (Bin)'},
    height=600,
).update_layout(
    xaxis_title="Number of Reviews for a Single Movie",
    yaxis_title="Number of Movies (Frequency)",
    bargap=0.05,
    template="plotly_white",
)

fig.show()

In [12]:
# Count of reviews per is_spoiler value, one colour per class, legend hidden
fig_spoiler = px.histogram(
    reviews_df,
    x='is_spoiler',
    color='is_spoiler',
    title='Distribution of Spoilers in Reviews',
    labels={'is_spoiler': 'Is Spoiler (True/False)', 'count': 'Review Count'},
    template="plotly_white",
).update_layout(
    xaxis_title="Is Spoiler",
    yaxis_title="Total Count of Reviews",
    showlegend=False,
)

fig_spoiler.show()

This shows that our data is very unbalanced, so we need to set `class_weight='balanced'`.

# Bag of Words

In [5]:
# 3. Bag of Words Prediction
# X: raw review text, y: spoiler flag
X, y = reviews_df['review_text'], reviews_df['is_spoiler']

In [6]:
# Hold out 10% of the reviews for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# Bag-of-words counts with English stop words removed;
# the vocabulary is learned from the training split only
vectorizer = CountVectorizer(stop_words='english')
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Fit the classifier, then predict the held-out reviews
model = LogisticRegression(max_iter=5000).fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)

In [7]:
# Report accuracy and the per-class metrics for the bag-of-words model
print(
    "\n--- Bag of Words Model Performance ---",
    f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


--- Bag of Words Model Performance ---
Accuracy: 0.7658

Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.91      0.85     42365
        True       0.59      0.36      0.45     15027

    accuracy                           0.77     57392
   macro avg       0.69      0.64      0.65     57392
weighted avg       0.74      0.77      0.75     57392



# TFIDF

In [13]:
## 3. Prepare and Merge DataFrames
# Attach each movie's synopsis and summary to its reviews.
# validate='many_to_one' makes pandas raise if details_df has duplicate
# movie_id values, which would otherwise silently duplicate review rows.
merged_df = pd.merge(
    reviews_df,
    details_df[['movie_id', 'plot_synopsis', 'plot_summary']],
    on='movie_id',
    how='left',
    validate='many_to_one',
)

# The ids were cleaned earlier, so every review should find its movie.
# Check that explicitly instead of filling NaNs.
assert merged_df[['plot_synopsis', 'plot_summary']].notna().all().all(), \
    "some reviews have no matching row in details_df"

print("--- Merged DataFrame Head (Features) ---")
merged_df[['review_text', 'plot_summary', 'plot_synopsis', 'is_spoiler']].head(2)

--- Merged DataFrame Head (Features) ---


Unnamed: 0,review_text,plot_summary,plot_synopsis,is_spoiler
0,"In its Oscar year, Shawshank Redemption (writt...",Chronicles the experiences of a formerly succe...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",True
1,The Shawshank Redemption is without a doubt on...,Chronicles the experiences of a formerly succe...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",True


#### But

Remember that some movies have no plot synopsis, so we may also need a dataset in which all three text features (review text, plot summary and plot synopsis) are available. Let's build one just in case.

In [18]:
# Keep only movies whose synopsis contains at least one ASCII letter
details_df_with_plot_synopsis = details_df[
    details_df['plot_synopsis'].str.contains(r'[A-Za-z]')
]

# Inner join: reviews of movies without a usable synopsis are dropped
triple_features_merged_df = reviews_df.merge(
    details_df_with_plot_synopsis[['movie_id', 'plot_synopsis', 'plot_summary']],
    on='movie_id',
    how='inner',
)

print("--- triple_features Merged DataFrame Head (Features) ---")
triple_features_merged_df[['review_text', 'plot_summary', 'plot_synopsis', 'is_spoiler']].head(2)

--- triple_features Merged DataFrame Head (Features) ---


Unnamed: 0,review_text,plot_summary,plot_synopsis,is_spoiler
0,"In its Oscar year, Shawshank Redemption (writt...",Chronicles the experiences of a formerly succe...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",True
1,The Shawshank Redemption is without a doubt on...,Chronicles the experiences of a formerly succe...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",True


In [12]:
## 4. Separate Features and Target
# Three text columns plus the spoiler label, all taken from merged_df
X_review, X_synopsis, X_summary, y = (
    merged_df[column]
    for column in ('review_text', 'plot_synopsis', 'plot_summary', 'is_spoiler')
)

In [13]:
# 90/10 train/test split; passing all arrays in one call keeps their rows aligned
(
    X_train_review, X_test_review,
    X_train_synopsis, X_test_synopsis,
    X_train_summary, X_test_summary,
    y_train, y_test,
) = train_test_split(X_review, X_synopsis, X_summary, y, random_state=42, test_size=0.1)

In [14]:
## 5. Triple TF-IDF Vectorization
# Three independent, identically configured vectorizers (review, synopsis, summary):
# English stop words removed, unigrams + bigrams, at most 50,000 features each
vectorizer_review, vectorizer_synopsis, vectorizer_summary = (
    TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=50000)
    for _ in range(3)
)

In [15]:
# Learn each vocabulary / IDF weighting from the training split only
X_train_review_tfidf = vectorizer_review.fit_transform(X_train_review)
X_train_synopsis_tfidf = vectorizer_synopsis.fit_transform(X_train_synopsis)
X_train_summary_tfidf = vectorizer_summary.fit_transform(X_train_summary)

# Project the test split with the already-fitted vectorizers
X_test_review_tfidf = vectorizer_review.transform(X_test_review)
X_test_synopsis_tfidf = vectorizer_synopsis.transform(X_test_synopsis)
X_test_summary_tfidf = vectorizer_summary.transform(X_test_summary)

## TFIDF vanilla

In [16]:
## 6. Features
# The vanilla model uses the review TF-IDF matrices on their own
X_train, X_test = X_train_review_tfidf, X_test_review_tfidf

print(f"\nTraining Feature Shape (Reviews): {X_train.shape}")


Training Feature Shape (Reviews): (516521, 50000)


In [17]:
## 7. Model Training and Evaluation
# Logistic regression (liblinear solver, max_iter raised to 5000) fitted on
# the current X_train / y_train
model = LogisticRegression(solver='liblinear', max_iter=5000).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Vanilla TFIDF Model Performance ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Vanilla TFIDF Model Performance ---
Accuracy: 0.7827

Classification Report:
               precision    recall  f1-score   support

       False       0.80      0.94      0.86     42365
        True       0.66      0.34      0.45     15027

    accuracy                           0.78     57392
   macro avg       0.73      0.64      0.66     57392
weighted avg       0.77      0.78      0.76     57392



## TFIDF vanilla with Balanced Class Weight

In [14]:
## 7. Model Training and Evaluation
# Same liblinear logistic regression as the unweighted run, but with
# class_weight='balanced', fitted on the current X_train / y_train
model = LogisticRegression(
    class_weight='balanced', solver='liblinear', max_iter=5000
).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Vanilla TFIDF Model Performance Balanced Class Weight---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Vanilla TFIDF Model Performance ---
Accuracy: 0.7223

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.75      0.80     42365
        True       0.48      0.65      0.55     15027

    accuracy                           0.72     57392
   macro avg       0.67      0.70      0.67     57392
weighted avg       0.76      0.72      0.73     57392



## TFIDF with plot_synopsis

In [15]:
## 6. Combine Features
# Place the review and synopsis TF-IDF columns side by side (sparse hstack)
X_train_combined = hstack((X_train_review_tfidf, X_train_synopsis_tfidf))
X_test_combined = hstack((X_test_review_tfidf, X_test_synopsis_tfidf))

print(f"\nTraining Feature Shape (Reviews + Synopses): {X_train_combined.shape}")


Training Feature Shape (Reviews + Synopses): (516521, 100000)


In [16]:
## 7. Model Training and Evaluation
# Logistic regression (liblinear solver, max_iter raised to 5000) fitted on
# the current X_train_combined
model = LogisticRegression(solver='liblinear', max_iter=5000).fit(X_train_combined, y_train)

# Score the held-out split
y_pred = model.predict(X_test_combined)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Plot Synopsis Model Performance ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Plot Synopsis Model Performance ---
Accuracy: 0.7843

Classification Report:
               precision    recall  f1-score   support

       False       0.80      0.94      0.87     42365
        True       0.66      0.36      0.46     15027

    accuracy                           0.78     57392
   macro avg       0.73      0.65      0.66     57392
weighted avg       0.77      0.78      0.76     57392



## TFIDF with plot_synopsis with balanced class weight

In [17]:
## 7. Model Training and Evaluation
# liblinear logistic regression with class_weight='balanced', fitted on
# the current X_train_combined
model = LogisticRegression(
    class_weight='balanced', solver='liblinear', max_iter=5000
).fit(X_train_combined, y_train)

# Score the held-out split
y_pred = model.predict(X_test_combined)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Plot Synopsis Model Performance Balanced Class ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Plot Synopsis Model Performance Balanced Class ---
Accuracy: 0.7245

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.75      0.80     42365
        True       0.48      0.66      0.56     15027

    accuracy                           0.72     57392
   macro avg       0.67      0.70      0.68     57392
weighted avg       0.76      0.72      0.74     57392



## TFIDF with plot_synopsis and plot_summary

In [18]:
## 6. Combine ALL Features
# Column-wise concatenation of the review, synopsis and summary TF-IDF matrices
X_train_combined = hstack(
    (X_train_review_tfidf, X_train_synopsis_tfidf, X_train_summary_tfidf)
)
X_test_combined = hstack(
    (X_test_review_tfidf, X_test_synopsis_tfidf, X_test_summary_tfidf)
)

print(f"\nTraining Feature Shape (Reviews + Synopses + Summaries): {X_train_combined.shape}")


Training Feature Shape (Reviews + Synopses + Summaries): (516521, 150000)


In [19]:
## 7. Model Training and Evaluation
# Logistic regression (liblinear solver, max_iter=5000) fitted on
# the current X_train_combined
model = LogisticRegression(solver='liblinear', max_iter=5000).fit(X_train_combined, y_train)

# Score the held-out split
y_pred = model.predict(X_test_combined)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Triple Feature Model Performance ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Triple Feature Model Performance ---
Accuracy: 0.7844

Classification Report:
               precision    recall  f1-score   support

       False       0.80      0.94      0.87     42365
        True       0.66      0.36      0.47     15027

    accuracy                           0.78     57392
   macro avg       0.73      0.65      0.67     57392
weighted avg       0.77      0.78      0.76     57392



## TFIDF with plot_synopsis and plot_summary with balanced class weight

In [20]:
## 7. Model Training and Evaluation
# liblinear logistic regression with class_weight='balanced', fitted on
# the current X_train_combined
model = LogisticRegression(
    class_weight='balanced', solver='liblinear', max_iter=5000
).fit(X_train_combined, y_train)

# Score the held-out split
y_pred = model.predict(X_test_combined)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Triple Feature Model Performance Balanced Class Weight ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Triple Feature Model Performance Balanced Class Weight ---
Accuracy: 0.7245

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.75      0.80     42365
        True       0.48      0.66      0.56     15027

    accuracy                           0.72     57392
   macro avg       0.67      0.71      0.68     57392
weighted avg       0.76      0.72      0.74     57392



In [19]:
## 4. Separate Features and Target
# Same four columns as before, now taken from the synopsis-complete dataset
X_review, X_synopsis, X_summary, y = (
    triple_features_merged_df[column]
    for column in ('review_text', 'plot_synopsis', 'plot_summary', 'is_spoiler')
)

In [20]:
# 90/10 train/test split; passing all arrays in one call keeps their rows aligned
(
    X_train_review, X_test_review,
    X_train_synopsis, X_test_synopsis,
    X_train_summary, X_test_summary,
    y_train, y_test,
) = train_test_split(X_review, X_synopsis, X_summary, y, random_state=42, test_size=0.1)

In [21]:
## 5. Triple TF-IDF Vectorization
# Fresh vectorizers for this dataset, one per text field, identically configured:
# English stop words removed, unigrams + bigrams, at most 50,000 features each
vectorizer_review, vectorizer_synopsis, vectorizer_summary = (
    TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=50000)
    for _ in range(3)
)

In [22]:
# Learn each vocabulary / IDF weighting from the training split only
X_train_review_tfidf = vectorizer_review.fit_transform(X_train_review)
X_train_synopsis_tfidf = vectorizer_synopsis.fit_transform(X_train_synopsis)
X_train_summary_tfidf = vectorizer_summary.fit_transform(X_train_summary)

# Project the test split with the already-fitted vectorizers
X_test_review_tfidf = vectorizer_review.transform(X_test_review)
X_test_synopsis_tfidf = vectorizer_synopsis.transform(X_test_synopsis)
X_test_summary_tfidf = vectorizer_summary.transform(X_test_summary)

In [23]:
## 6. Combine ALL Features
# Column-wise concatenation of the review, synopsis and summary TF-IDF matrices
X_train_combined = hstack(
    (X_train_review_tfidf, X_train_synopsis_tfidf, X_train_summary_tfidf)
)
X_test_combined = hstack(
    (X_test_review_tfidf, X_test_synopsis_tfidf, X_test_summary_tfidf)
)

print(f"\nTraining Feature Shape (Reviews + Synopses + Summaries for only data with all three features): {X_train_combined.shape}")


Training Feature Shape (Reviews + Synopses + Summaries for only data with all three features): (484945, 150000)


In [24]:
## 7. Model Training and Evaluation
# Logistic regression (liblinear solver, max_iter=5000) fitted on
# the current X_train_combined
model = LogisticRegression(solver='liblinear', max_iter=5000).fit(X_train_combined, y_train)

# Score the held-out split
y_pred = model.predict(X_test_combined)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Triple Feature Model Performance ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Triple Feature Model Performance ---
Accuracy: 0.7818

Classification Report:
               precision    recall  f1-score   support

       False       0.80      0.93      0.86     39550
        True       0.66      0.37      0.47     14333

    accuracy                           0.78     53883
   macro avg       0.73      0.65      0.67     53883
weighted avg       0.77      0.78      0.76     53883



In [25]:
## 7. Model Training and Evaluation
# liblinear logistic regression with class_weight='balanced', fitted on
# the current X_train_combined
model = LogisticRegression(
    class_weight='balanced', solver='liblinear', max_iter=5000
).fit(X_train_combined, y_train)

# Score the held-out split
y_pred = model.predict(X_test_combined)
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

print("\n--- Triple Feature Model Performance Balanced Class Weight ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)


--- Triple Feature Model Performance Balanced Class Weight ---
Accuracy: 0.7251

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.75      0.80     39550
        True       0.49      0.67      0.56     14333

    accuracy                           0.73     53883
   macro avg       0.67      0.71      0.68     53883
weighted avg       0.76      0.73      0.74     53883

