# Sentiment Analysis

In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources this notebook relies on; packages that are already
# present are reported as up-to-date instead of being downloaded again.
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer
nltk.download('punkt')  # tokenizer data, presumably what word_tokenize needs -- confirm for your NLTK version
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhaoqianxue/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhaoqianxue/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhaoqianxue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhaoqianxue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Data Cleaning

In [43]:
import pandas as pd

def process_oscar_winning_reviews(data_path, oscar_winning_movies=None):
    """
    Load a reviews CSV and return the cleaned rows for the selected Oscar-winning movies.

    Processing steps, in order:
        1. Read the CSV at ``data_path``.
        2. Drop exact duplicate rows, then every row with a missing value in any column.
        3. Drop the 'Date', 'Oscar Won' and 'year' columns when they are present.
        4. Rename 'title' / 'Movie Name' to 'Name' and 'reviewText' to 'Review'.
        5. When a 'Review' column exists, keep only the 'Name' and 'Review' columns.
        6. Keep only the rows whose 'Name' is listed in ``oscar_winning_movies``.

    Parameters:
        data_path (str): The path to the dataset file.
        oscar_winning_movies (list of str, optional): Titles of Oscar-winning movies to keep.
            If None, a default list of five titles is used.

    Returns:
        pd.DataFrame: An independent copy of the filtered rows (original row labels are
        kept), so callers can add or modify columns without touching a view of the
        intermediate frame.

    Raises:
        KeyError: If the data has no 'Name' column after renaming.
    """
    # Default list of specific Oscar-winning movies if not provided
    if oscar_winning_movies is None:
        oscar_winning_movies = [
            "Oppenheimer",
            "Everything Everywhere All at Once",
            "CODA",
            "Nomadland",
            "Parasite"
        ]

    # NOTE: missing values are dropped *before* the unneeded columns are removed,
    # so a row with a gap in e.g. 'Date' is discarded as well.
    data = (
        pd.read_csv(data_path)
        .drop_duplicates()
        .dropna()
        .drop(columns=['Date', 'Oscar Won', 'year'], errors='ignore')
        .rename(columns={'title': 'Name', 'Movie Name': 'Name', 'reviewText': 'Review'})
    )

    # Narrow to the two columns of interest only when the review text is available
    if 'Review' in data.columns:
        data = data[['Name', 'Review']]

    # .copy() detaches the result from `data`; without it, later column assignments
    # on the returned frame operate on a chained slice (SettingWithCopyWarning).
    return data[data['Name'].isin(oscar_winning_movies)].copy()


### 1.1 Rotten Tomatoes Audiences

In [61]:
# Rotten Tomatoes audience reviews, restricted to the default Oscar-winning titles
rt_audiences_path = "../../../data/audiences_reviews.csv"
df_rt_audiences = process_oscar_winning_reviews(data_path=rt_audiences_path)
df_rt_audiences.head()

Unnamed: 0,Name,Review
0,Parasite,"""Parasite"" stands as a cinematic tour de force..."
1,Parasite,"pretty good movie, not my kind of thing but i ..."
2,Parasite,I love everything about this movie. Definitely...
3,Parasite,Mid. This movie has bland characters that are ...
4,Parasite,"This is a heavy story, with conflicting people..."


### 1.2 Rotten Tomatoes Critics

In [45]:
# Rotten Tomatoes critic reviews, restricted to the default Oscar-winning titles
rt_critics_path = '../../../data/critics_reviews.csv'
df_rt_critics = process_oscar_winning_reviews(data_path=rt_critics_path)
df_rt_critics.head()

Unnamed: 0,Name,Review
0,Parasite,Parasite is the movie we will look back on as ...
1,Parasite,"It is sadistic, angry and dark and has a lot t..."
2,Parasite,"""Parasite"" has already made history for South ..."
3,Parasite,"Cinematography, score, editing… everything’s a..."
4,Parasite,"Radically different films such as Knives Out, ..."


### 1.3 IMDB Audiences

In [46]:
# IMDB audience reviews, restricted to the default Oscar-winning titles
imdb_path = '../../../data/imdb_reviews.csv'
df_imdb = process_oscar_winning_reviews(data_path=imdb_path)
df_imdb.head()

Unnamed: 0,Name,Review
0,Oppenheimer,one anticipated film year many people included...
1,Oppenheimer,youll wit brain fully switched watching oppenh...
2,Oppenheimer,im big fan nolans work really looking forward ...
3,Oppenheimer,oppenheimer biographical thriller film written...
4,Oppenheimer,movie wow dont think ever felt like watching m...


### 1.4 Combine

In [47]:
def combine_movie_reviews(df_rt_audiences, df_rt_critics, df_imdb):
    """
    Combine movie review data from three different sources and add a source label to each.

    The input DataFrames are not modified: each one is labelled on a copy
    (via ``DataFrame.assign``) before concatenation.

    Parameters:
    - df_rt_audiences: pd.DataFrame, DataFrame containing reviews from Rotten Tomatoes Audiences.
    - df_rt_critics: pd.DataFrame, DataFrame containing reviews from Rotten Tomatoes Critics.
    - df_imdb: pd.DataFrame, DataFrame containing reviews from IMDb Audiences.

    Returns:
    - pd.DataFrame: A combined DataFrame with reviews from all three sources, a 'Source'
      column identifying the origin of each row, and a fresh 0..n-1 index.
    """
    # Pair each frame with its label; order here is the row order of the result
    labelled_frames = [
        df_rt_audiences.assign(Source='Rotten Tomatoes Audiences'),
        df_rt_critics.assign(Source='Rotten Tomatoes Critics'),
        df_imdb.assign(Source='IMDB Audiences'),
    ]

    # Stack the three labelled frames and renumber the rows
    return pd.concat(labelled_frames, ignore_index=True)

# Merge the three per-source frames into a single labelled DataFrame
df_combined = combine_movie_reviews(
    df_rt_audiences=df_rt_audiences,
    df_rt_critics=df_rt_critics,
    df_imdb=df_imdb,
)
df_combined.head()

Unnamed: 0,Name,Review,Source
0,Parasite,"""Parasite"" stands as a cinematic tour de force...",Rotten Tomatoes Audiences
1,Parasite,"pretty good movie, not my kind of thing but i ...",Rotten Tomatoes Audiences
2,Parasite,I love everything about this movie. Definitely...,Rotten Tomatoes Audiences
3,Parasite,Mid. This movie has bland characters that are ...,Rotten Tomatoes Audiences
4,Parasite,"This is a heavy story, with conflicting people...",Rotten Tomatoes Audiences


## 2. Data Processing

In [48]:
# Lazily-built resources shared by every clean_text call (filled on first use).
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached stop-word set, lemmatizer and punctuation table, building them once."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build everything first so a failure cannot leave a half-filled cache
        built = {
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
            'punctuation_table': str.maketrans('', '', string.punctuation),
        }
        _CLEAN_TEXT_RESOURCES.update(built)
    return _CLEAN_TEXT_RESOURCES


def clean_text(text):
    """
    Standardizes and cleans the text by:
    - Converting to lowercase.
    - Removing punctuation (the ASCII characters in ``string.punctuation``).
    - Tokenizing with ``word_tokenize``.
    - Removing English stopwords.
    - Lemmatizing words.

    The stop-word set, lemmatizer and punctuation table are created on the first
    call and reused afterwards, so applying this function to a whole column does
    not rebuild them for every row.

    Parameters:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text, with the remaining tokens joined by single spaces.
    """
    resources = _clean_text_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Lowercase first so the stop-word comparison below is case-insensitive
    normalized = text.lower().translate(resources['punctuation_table'])

    # Tokenize, drop stopwords, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(normalized)
        if word not in stop_words
    ]

    # Join tokens back to string
    return ' '.join(tokens)


In [49]:
# Replace each raw review with its cleaned form (this overwrites the 'Review' column)
cleaned_reviews = df_combined['Review'].apply(clean_text)
df_combined['Review'] = cleaned_reviews
df_combined['Review'].head()

0    parasite stand cinematic tour de force searing...
1             pretty good movie kind thing still liked
2    love everything movie definitely number 1 favo...
3    mid movie bland character archetype story quit...
4    heavy story conflicting people spectrum societ...
Name: Review, dtype: object

## 3. Sentiment Analysis

Define a function to apply sentiment analysis to each review in the DataFrame. This function uses the VADER sentiment analyzer to compute the compound sentiment score.

In [51]:
# One shared analyzer instance, created once and reused for every review
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the 'compound' polarity score that the module-level ``sia`` analyzer gives ``text``."""
    return sia.polarity_scores(text)['compound']

# Score every review and store the result in a new 'Sentiment' column.
# NOTE(review): 'Review' holds the cleaned text at this point (lowercased, punctuation
# and stopwords removed); VADER is usually run on raw text -- confirm this is intended.
sentiment_scores = df_combined['Review'].apply(analyze_sentiment)
df_combined['Sentiment'] = sentiment_scores

In [54]:
def average_movie_score(movie_name, data=None):
    """
    Calculates and returns the average sentiment score per source for the given movie name.

    Parameters:
        movie_name (str): The name of the movie (matched exactly against the 'Name' column).
        data (pd.DataFrame, optional): Frame with 'Name', 'Source' and 'Sentiment' columns.
            If None, the notebook-level ``df_combined`` is used, which keeps existing
            single-argument calls working unchanged.

    Returns:
        dict: Maps each source (in order of first appearance) to the mean 'Sentiment'
        of that movie's reviews from that source.
        str: A message instead of a dict when ``movie_name`` is not a string, when no
        row matches the movie, or when none of the matching rows has a source label.
    """
    # Ensure the movie name is a string
    if not isinstance(movie_name, str):
        return "Invalid input: movie name must be a string."

    # Fall back to the notebook's combined frame only when no frame was passed in
    if data is None:
        data = df_combined

    # Filter data for the specified movie; no matching row means the movie is unknown
    movie_data = data[data['Name'] == movie_name]
    if movie_data.empty:
        return f"No data available for movie: {movie_name}"

    # One pass over the rows; sort=False keeps sources in order of first appearance.
    # Rows without a source label are left out by groupby.
    average_scores = (
        movie_data.groupby('Source', sort=False)['Sentiment'].mean().to_dict()
    )

    if not average_scores:
        return "No reviews available for this movie."

    return average_scores


In [55]:
# Sample titles for exercising average_movie_score; "The Batman" and "Unknown Movie"
# are not in the default Oscar-winning list, so they exercise the no-data path
test_movies = [
    "Oppenheimer",
    "Everything Everywhere All at Once",
    "The Batman",
    "Nomadland",
    "Unknown Movie",
]

# Print one result line per title
for movie in test_movies:
    print(f"Average score for '{movie}': {average_movie_score(movie)}")

Average score for 'Oppenheimer': {'Rotten Tomatoes Audiences': 0.3932152272727273, 'Rotten Tomatoes Critics': 0.22901643564356436, 'IMDB Audiences': 0.6057082917082918}
Average score for 'Everything Everywhere All at Once': {'Rotten Tomatoes Audiences': 0.3740295454545454, 'Rotten Tomatoes Critics': 0.3163227722772277, 'IMDB Audiences': 0.734269191049914}
Average score for 'The Batman': No data available for movie: The Batman
Average score for 'Nomadland': {'Rotten Tomatoes Audiences': 0.4324973863636364, 'Rotten Tomatoes Critics': 0.3384633867276888, 'IMDB Audiences': 0.5317552304964539}
Average score for 'Unknown Movie': No data available for movie: Unknown Movie
