In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:


# Load the raw review exports for both films into DataFrames.
BARBIE_REVIEWS_CSV = 'barbie_Cleaned.csv'
OPPENHEIMER_REVIEWS_CSV = 'Oppenheimer_IMDb_reviews.csv'

barbie_df = pd.read_csv(BARBIE_REVIEWS_CSV)
oppenheimer_df = pd.read_csv(OPPENHEIMER_REVIEWS_CSV)

In [None]:
# Preview the first rows of the Barbie reviews as loaded from the CSV.
barbie_df.head()

In [None]:
# Preview the first rows of the Oppenheimer reviews as loaded from the CSV.
oppenheimer_df.head()

In [None]:
# Tag every Barbie row with its film name so the rows stay identifiable
# once both datasets are combined.
barbie_df = barbie_df.assign(Source='Barbie')
barbie_df.head()

In [None]:
# Tag every Oppenheimer row with its film name so the rows stay identifiable
# once both datasets are combined.
oppenheimer_df = oppenheimer_df.assign(Source='Oppenheimer')
oppenheimer_df.head()

In [None]:
# Keep only the columns the analysis uses; everything else is discarded.
columns_to_keep = ['rating', 'review', 'Source']
oppenheimer_df = oppenheimer_df[columns_to_keep]
oppenheimer_df.head()

In [None]:
# Inspecting the distinct values in the 'rating' column of barbie_df
# (no conversion happens here; the cast to integer is done in a later cell)
barbie_df['rating'].unique()

No rating of 10 appears among the unique values above, which points to a potential error in the data; this is examined in the analysis further below.

In [None]:
# Identifying non-numeric values in the 'rating' column of barbie_df.
# The column is cast to str before the check so that missing values (NaN)
# are reported as non-numeric; without the cast, .str.isnumeric() yields NaN
# for them and the `~` negation of the mask fails.
rating_as_text = barbie_df['rating'].astype(str)
non_numeric_ratings = barbie_df.loc[~rating_as_text.str.isnumeric(), 'rating'].unique()
non_numeric_ratings

In [None]:
# Removing rows with non-numeric ratings from barbie_df.
# Casting to str first makes missing values (NaN) count as non-numeric instead
# of producing a NaN entry in the mask, which boolean indexing rejects.
is_numeric_rating = barbie_df['rating'].astype(str).str.isnumeric()

# .copy() gives an independent frame, so the column assignment below writes to
# barbie_df itself rather than to a slice (avoids SettingWithCopyWarning).
barbie_df = barbie_df[is_numeric_rating].copy()

# Converting the 'rating' column to integer type
barbie_df['rating'] = barbie_df['rating'].astype(int)
barbie_df.head()

In [None]:
# Show column dtypes and non-null counts for oppenheimer_df.
oppenheimer_df.info()

In [None]:
# Converting the 'rating' column of oppenheimer_df to integer type.
# oppenheimer_df is a column subset of the loaded frame, so assigning a column
# in place triggers pandas' SettingWithCopyWarning; .assign() builds a new
# frame with the converted column instead.
oppenheimer_df = oppenheimer_df.assign(rating=oppenheimer_df['rating'].astype(int))
oppenheimer_df['rating'].unique()

In [None]:
# Give the Barbie text column the same name ('review') that oppenheimer_df uses.
barbie_df = barbie_df.rename({'text': 'review'}, axis='columns')
barbie_df.head()

In [None]:
import re

# Function to clean the review text
def clean_review(text):
    """Remove everything up to and including the first date found in a review.

    A date is matched as ``<1-2 digits> <word> <4 digits>`` (e.g. ``21 July 2023``).
    The text after the date is returned with leading whitespace stripped, so a
    later ``startswith`` check sees the first real character of the review.

    Parameters
    ----------
    text : str or any
        The raw review cell. Non-string values (e.g. NaN from an empty CSV
        cell) are returned unchanged instead of raising inside ``re.search``.

    Returns
    -------
    The cleaned review, or ``text`` itself when it is not a string or
    contains no date.
    """
    if not isinstance(text, str):
        return text

    # Using regex to match the date pattern and remove everything before it
    match = re.search(r'\d{1,2} [A-Za-z]+ \d{4}', text)
    if match is None:
        return text
    return text[match.end():].lstrip()

# Run clean_review over every entry of barbie_df's 'review' column.
cleaned_reviews = barbie_df['review'].map(clean_review)
barbie_df['review'] = cleaned_reviews
barbie_df.head()

In [None]:
# Dropping rows from barbie_df where the 'review' column starts with 'Warning:'.
# na=False keeps missing reviews from putting NaN into the mask (which would
# make the `~` negation fail); rows with a missing review are dropped as well,
# since the text-processing steps later in the notebook need a string.
barbie_review_text = barbie_df['review']
barbie_starts_with_warning = barbie_review_text.str.startswith('Warning:', na=False)
barbie_df = barbie_df[barbie_review_text.notna() & ~barbie_starts_with_warning]
barbie_df.head()

In [None]:
# Dropping rows from oppenheimer_df where the 'review' column starts with 'Warning:'.
# na=False keeps missing reviews from putting NaN into the mask (which would
# make the `~` negation fail); rows with a missing review are dropped as well,
# since the text-processing steps later in the notebook need a string.
oppenheimer_review_text = oppenheimer_df['review']
oppenheimer_starts_with_warning = oppenheimer_review_text.str.startswith('Warning:', na=False)
oppenheimer_df = oppenheimer_df[oppenheimer_review_text.notna() & ~oppenheimer_starts_with_warning]

oppenheimer_df.head()

In [None]:
# Keep the first 500 reviews of each film, then stack them into a single
# frame with a fresh 0..n-1 index.
barbie_df = barbie_df.iloc[:500]
oppenheimer_df = oppenheimer_df.iloc[:500]
frames_to_stack = [barbie_df, oppenheimer_df]
Barbenheimer = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# Preview the combined frame (the Barbie rows come first).
Barbenheimer.head()

In [None]:
# Write the combined reviews to CSV; index=False leaves the row index out of the file.
Barbenheimer.to_csv('Barbenheimer_Sentiment_Analysis.csv', index=False)

# BARBENHEIMER Sentiment Analysis

---

In [None]:
# Preview the combined frame before the text-processing steps.
Barbenheimer.head()

In [None]:
!pip install -q nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; without it word_tokenize raises LookupError. On older releases that
# do not know this resource the call only reports an error and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Tokenization: split every review into a list of word/punctuation tokens
Barbenheimer['tokens'] = Barbenheimer['review'].apply(word_tokenize)

# Remove stop words (compared case-insensitively; the kept tokens retain their original case)
stop_words = set(stopwords.words('english'))
Barbenheimer['filtered_tokens'] = Barbenheimer['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Lemmatization of the stop-word-filtered tokens
lemmatizer = WordNetLemmatizer()
Barbenheimer['lemmatized_tokens'] = Barbenheimer['filtered_tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
Barbenheimer.head()

In [None]:
def _keep_alpha_words(tokens):
    """Return only the purely alphabetic tokens that are longer than one character."""
    return [token for token in tokens if token.isalpha() and len(token) > 1]


# Apply the same clean-up to both token columns: punctuation, numbers and
# single-character tokens are removed.
for token_column in ('filtered_tokens', 'lemmatized_tokens'):
    Barbenheimer[token_column] = Barbenheimer[token_column].apply(_keep_alpha_words)

Barbenheimer[['review', 'rating', 'Source', 'filtered_tokens', 'lemmatized_tokens']].head()

In [None]:
!pip install -q emoji
import emoji
import re
def extract_emojis(text):
    """Return every emoji found in ``text``, concatenated in order of appearance.

    Uses the ``emoji`` package (imported in this cell) instead of a raw
    code-point range: the range U+10000-U+10FFFF misses emojis in the Basic
    Multilingual Plane and matches any non-emoji character above U+FFFF.

    Parameters
    ----------
    text : str
        The review text to scan.

    Returns
    -------
    str
        The emojis joined into one string, or ``''`` when there are none.
    """
    # emoji_list() yields one dict per emoji found; the 'emoji' key holds its text.
    return ''.join(found['emoji'] for found in emoji.emoji_list(text))
# Store the emojis found in each review in a new 'emojis' column.
Barbenheimer['emojis'] = Barbenheimer['review'].map(extract_emojis)

# Keep only the reviews in which at least one emoji was found.
contains_emoji = Barbenheimer['emojis'] != ''
emoji_rows = Barbenheimer[contains_emoji]
emoji_rows[['review', 'emojis']]

In [None]:
!pip install -q textblob
from textblob import TextBlob
def _textblob_polarity(review):
    """Return the TextBlob polarity score of a single review text."""
    return TextBlob(review).sentiment.polarity


# Score every review with TextBlob and store the result in 'sentiment'.
Barbenheimer['sentiment'] = Barbenheimer['review'].apply(_textblob_polarity)

# Show the reviews next to their sentiment scores.
Barbenheimer[['review', 'sentiment']].head()

In [None]:
# Preview the frame with the token, emoji and TextBlob sentiment columns added.
Barbenheimer.head()

In [None]:
# Row-by-row comparison of the two token columns; the flag is True only when
# every row matches.
tokens_match = Barbenheimer['filtered_tokens'] == Barbenheimer['lemmatized_tokens']
are_columns_identical = tokens_match.all()
are_columns_identical

In [None]:
# NOTE(review): unlike nltk, emoji, textblob and wordcloud, no install step for
# vaderSentiment is visible in this notebook; presumably it has to be installed
# beforehand for this import to work on a fresh environment (confirm).
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER SentimentIntensityAnalyzer
# (one shared instance, used by get_vader_sentiment below)
analyzer = SentimentIntensityAnalyzer()

# Function to get the compound sentiment score using VADER
def get_vader_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``.

    Relies on the module-level ``analyzer`` instance.
    """
    scores = analyzer.polarity_scores(text)
    return scores['compound']

# Score every review with VADER and keep the compound score in its own column.
vader_scores = Barbenheimer['review'].map(get_vader_sentiment)
Barbenheimer['vader_sentiment'] = vader_scores

# Show each review next to its TextBlob and VADER scores.
comparison_columns = ['review', 'sentiment', 'vader_sentiment']
Barbenheimer[comparison_columns].head()

In [None]:
# Preview the frame with the VADER sentiment column added.
Barbenheimer.head()

# Analysis of a Potential Error in the Given CSV

In [None]:
# Mean VADER score and mean rating per film, with 'Source' returned as a
# regular column rather than the index.
per_source = Barbenheimer.groupby('Source')
mean_scores = per_source.agg(
    vader_sentiment=('vader_sentiment', 'mean'),
    rating=('rating', 'mean'),
)
average_sentiment_and_rating_by_source = mean_scores.reset_index()

# Display the per-film averages.
average_sentiment_and_rating_by_source

From the results:

Reviews from the "Barbie" source have a higher average sentiment score but a lower average rating compared to those from the "Oppenheimer" source.

This discrepancy indicates that while the sentiment in the text of the "Barbie" reviews is more positive, the numerical ratings given are lower. Conversely, the "Oppenheimer" reviews have a higher numerical rating but a lower sentiment score.

This kind of analysis can provide valuable insights into potential mismatches between textual sentiment and numerical ratings, which can be further explored.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two side-by-side panels: sentiment scores on the left, ratings on the right.
fig, (ax_sentiment, ax_rating) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: density of VADER scores per film. common_norm=False normalises
# each film separately.
sns.histplot(
    Barbenheimer,
    x='vader_sentiment',
    hue='Source',
    element='step',
    stat='density',
    common_norm=False,
    ax=ax_sentiment,
)
ax_sentiment.set_title('Distribution of Sentiment Scores by Source')
ax_sentiment.set_xlabel('Sentiment Score')
ax_sentiment.set_ylabel('Density')

# Right panel: density of ratings per film, normalised the same way.
sns.histplot(
    Barbenheimer,
    x='rating',
    hue='Source',
    element='step',
    stat='density',
    common_norm=False,
    ax=ax_rating,
)
ax_rating.set_title('Distribution of Ratings by Source')
ax_rating.set_xlabel('Rating')
ax_rating.set_ylabel('Density')

fig.tight_layout()
plt.show()

Left Plot (Distribution of Sentiment Scores by Source):

The sentiment scores for "Barbie" reviews are mostly clustered around the positive end, indicating a generally positive sentiment.
The sentiment scores for "Oppenheimer" reviews are more spread out, with a peak around the neutral to slightly positive range.

Right Plot (Distribution of Ratings by Source):

The ratings for "Barbie" reviews are clustered around the lower end, indicating lower numerical ratings.
The ratings for "Oppenheimer" reviews are spread out but have a peak in the higher rating range.

From the visualizations, we can observe a discrepancy between the sentiment scores and ratings, especially for the "Barbie" source. While the sentiment of the reviews is positive, the numerical ratings given are lower.

This discrepancy can arise due to various reasons:

The sentiment analysis captures the overall mood of the text, whereas the numerical rating may be driven by specific aspects of the movie. There may also be nuances in the reviews that the sentiment analysis does not capture but that still influence the rating.

In [None]:
# Shared building blocks for the two Barbie filters below.
barbie_rows = Barbenheimer['Source'] == 'Barbie'
barbie_preview_columns = ['review', 'vader_sentiment', 'rating']

# 'Barbie' reviews whose text scores clearly positive (VADER > 0.5) but whose rating is below 3
high_sentiment_low_rating = Barbenheimer[
    barbie_rows
    & (Barbenheimer['vader_sentiment'] > 0.5)
    & (Barbenheimer['rating'] < 3)
]

# 'Barbie' reviews whose text scores negative (VADER < 0) but whose rating is above 4
low_sentiment_high_rating = Barbenheimer[
    barbie_rows
    & (Barbenheimer['vader_sentiment'] < 0)
    & (Barbenheimer['rating'] > 4)
]

# Display a sample of each
high_sentiment_low_rating[barbie_preview_columns].head(), low_sentiment_high_rating[barbie_preview_columns].head()

Interestingly, there were no reviews with low sentiment scores but high ratings in the sample.

From the sample reviews with high sentiment scores but low ratings:

The reviews are overwhelmingly positive in their textual content, praising the movie's themes, storytelling, and production.
However, these reviews have a rating of 1, which is contradictory to the positive sentiment expressed in the text.
This discrepancy could be due to various reasons:

The rating might be influenced by factors not mentioned in the review text, such as personal biases, external influences, or technical issues during the rating process.
There might be sarcasm or irony in the reviews that the sentiment analysis tool failed to capture.

In [None]:
# Shared building blocks for the two Oppenheimer filters below.
oppenheimer_rows = Barbenheimer['Source'] == 'Oppenheimer'
oppenheimer_preview_columns = ['review', 'vader_sentiment', 'rating']

# 'Oppenheimer' reviews whose text scores clearly positive (VADER > 0.5) but whose rating is below 3
high_sentiment_low_rating_oppenheimer = Barbenheimer[
    oppenheimer_rows
    & (Barbenheimer['vader_sentiment'] > 0.5)
    & (Barbenheimer['rating'] < 3)
]

# 'Oppenheimer' reviews whose text scores negative (VADER < 0) but whose rating is above 4
low_sentiment_high_rating_oppenheimer = Barbenheimer[
    oppenheimer_rows
    & (Barbenheimer['vader_sentiment'] < 0)
    & (Barbenheimer['rating'] > 4)
]

# Display a sample of each
high_sentiment_low_rating_oppenheimer[oppenheimer_preview_columns].head(), low_sentiment_high_rating_oppenheimer[oppenheimer_preview_columns].head()

Again, there are instances where the sentiment score does not align with the given rating.

In [None]:
# One point per review: VADER score on x, rating on y, coloured by film.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=Barbenheimer,
    x='vader_sentiment',
    y='rating',
    hue='Source',
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Scatter Plot of Sentiment Scores vs. Ratings')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Rating')
# Anchor the legend outside the axes so it does not cover any points.
ax.legend(title='Source', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
fig.tight_layout()
plt.show()

Most of the reviews for both sources have positive sentiment scores, indicating generally positive feedback.

There's a noticeable cluster of "Barbie" reviews with high sentiment scores but low ratings, which we previously identified as potentially sarcastic or ironic reviews.

"Oppenheimer" reviews seem to have a more consistent relationship between sentiment scores and ratings, with higher ratings generally corresponding to higher sentiment scores.

This visualization provides a clear view of the distribution of sentiment scores and ratings for each source and highlights areas where sentiment analysis might not align with numerical ratings.

In [None]:
# Filter 'Oppenheimer' reviews with high sentiment scores but low ratings
o_high_sentiment_low_rating = Barbenheimer[(Barbenheimer['Source'] == 'Oppenheimer') & (Barbenheimer['vader_sentiment'] > 0.5) & (Barbenheimer['rating'] < 3)]

# Filter 'Oppenheimer' reviews with low sentiment scores but high ratings
o_low_sentiment_high_rating = Barbenheimer[(Barbenheimer['Source'] == 'Oppenheimer') & (Barbenheimer['vader_sentiment'] < 0) & (Barbenheimer['rating'] > 4)]

# Both samples sit in one parenthesised tuple so that the tuple is the cell's
# final expression and both are displayed. A trailing comma at the end of a
# line would end the statement there and only the last line would be shown.
(
    o_high_sentiment_low_rating[['review', 'vader_sentiment', 'rating']].head(),
    o_low_sentiment_high_rating[['review', 'vader_sentiment', 'rating']].head(),
)

In [None]:
# Count the reviews per (film, rating) pair and pivot the ratings into
# columns; combinations that never occur become 0.
rating_counts = (
    Barbenheimer
    .groupby(['Source', 'rating'])
    .size()
    .unstack()
    .fillna(0)
)

# Heatmap of those counts, one row per film and one column per rating value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(rating_counts, annot=True, cmap='YlGnBu', fmt='g', ax=ax)
ax.set_title('Frequency of Specific Rating Values for Each Source')
ax.set_xlabel('Rating')
ax.set_ylabel('Source')
plt.show()

# Sentiment Score Analysis

In [None]:
!pip install -q wordcloud
from wordcloud import WordCloud

def generate_wordcloud(text, title):
    """Render a 400x400 word cloud of ``text`` on a white background and show it.

    Parameters
    ----------
    text : str
        The text the word cloud is generated from.
    title : str
        Title drawn above the image.
    """
    cloud = WordCloud(
        width=400,
        height=400,
        background_color='white',
        colormap='viridis',
    ).generate(text)

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # the image carries no meaningful axes
    ax.set_title(title)
    plt.show()

# Join each film's reviews into one long string, separated by spaces.
barbie_text = ' '.join(Barbenheimer.loc[Barbenheimer['Source'] == 'Barbie', 'review'])
oppenheimer_text = ' '.join(Barbenheimer.loc[Barbenheimer['Source'] == 'Oppenheimer', 'review'])

# Draw one word cloud per film.
for cloud_text, cloud_title in (
    (barbie_text, 'Word Cloud for Barbie Reviews'),
    (oppenheimer_text, 'Word Cloud for Oppenheimer Reviews'),
):
    generate_wordcloud(cloud_text, cloud_title)

In [None]:
# Box plot of the VADER scores for each film.
plt.figure(figsize=(10, 6))

# seaborn 0.13+ deprecates passing `palette` without `hue`, so the x variable
# is also given as hue. dodge=False keeps each box centred on its category.
ax = sns.boxplot(
    x='Source',
    y='vader_sentiment',
    hue='Source',
    data=Barbenheimer,
    palette='viridis',
    dodge=False,
)

# The hue repeats the x axis, so a legend (drawn by some seaborn versions)
# adds nothing; remove it if present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.title('Distribution of Sentiment Scores for Each Source')
plt.xlabel('Source')
plt.ylabel('Sentiment Score')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
# Histogram (30 bins) of the VADER scores per film with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=Barbenheimer,
    x='vader_sentiment',
    hue='Source',
    bins=30,
    kde=True,
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Sentiment Scores for Each Source')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

In [None]:
# Categorize sentiment scores into 'Positive', 'Neutral', and 'Negative'
# using cut-offs of +/-0.5 on the VADER compound score.
conditions = [
    (Barbenheimer['vader_sentiment'] > 0.5),
    (Barbenheimer['vader_sentiment'] >= -0.5) & (Barbenheimer['vader_sentiment'] <= 0.5),
    (Barbenheimer['vader_sentiment'] < -0.5)
]
choices = ['Positive', 'Neutral', 'Negative']

# np.select's default is the integer 0, which has no common dtype with the
# string choices: NumPy 2 raises a TypeError, and older versions label any
# unmatched row '0'. An explicit string default avoids both; only rows whose
# score is NaN can fall through to it, since the three conditions cover every
# real number.
Barbenheimer['sentiment_category'] = np.select(conditions, choices, default='Unknown')

# Plotting the count of reviews in each sentiment category for each source
plt.figure(figsize=(10, 6))
sns.countplot(data=Barbenheimer, x='Source', hue='sentiment_category', palette='viridis')
plt.title('Count of Reviews by Sentiment Category for Each Source')
plt.xlabel('Source')
plt.ylabel('Count')
plt.legend(title='Sentiment Category')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()