In [3]:
import numpy as np
import pandas as pd
from scipy import stats

In [62]:
# Read the movie-rating survey CSV (path is relative to the notebook's working directory)
file_path = './movieReplicationSet.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Two independent copies of the raw frame:
#   df_original keeps the missing values untouched,
#   df_imputed  gets the missing movie ratings filled in below.
df_original = df.copy()
df_imputed = df.copy()

# Movie rating columns only (1-400 according to the assignment description)
movie_columns = df.columns[:400]

# Fill each movie's missing ratings with that movie's own median rating.
# Passing the Series of column medians to fillna aligns on column label,
# so every column is filled with its respective median.
movie_medians = df_imputed[movie_columns].median()
df_imputed[movie_columns] = df_imputed[movie_columns].fillna(movie_medians)

Q1: We use Welch's t-test here because it is safer to assume that the variances and sample sizes may be unequal between high-popularity and low-popularity movies. Given that the p-value is far below our significance level α = 0.005, we reject the null hypothesis. This suggests that high-popularity movies are rated higher than low-popularity movies (mean rating of about 2.87 vs. 2.40).

In [10]:
# Popularity of a movie = number of respondents who actually rated it
# (non-missing entries in the un-imputed frame)
num_ratings = df_original[movie_columns].count()

# Median split: strictly above the median count is 'high popularity',
# everything at or below the median is 'low popularity'
median_num_ratings = num_ratings.median()
high_popularity_movies = num_ratings.loc[num_ratings > median_num_ratings].index
low_popularity_movies = num_ratings.loc[num_ratings <= median_num_ratings].index

# Group average = mean of the per-movie mean ratings within each group
avg_rating_high_popularity = df_original.loc[:, high_popularity_movies].mean().mean()
avg_rating_low_popularity = df_original.loc[:, low_popularity_movies].mean().mean()

# Pool every individual rating of each group into one flat array and
# discard the missing entries before running the test
high_popularity_ratings = df_original.loc[:, high_popularity_movies].to_numpy().ravel()
high_popularity_ratings = high_popularity_ratings[np.logical_not(np.isnan(high_popularity_ratings))]

low_popularity_ratings = df_original.loc[:, low_popularity_movies].to_numpy().ravel()
low_popularity_ratings = low_popularity_ratings[np.logical_not(np.isnan(low_popularity_ratings))]

# Welch's t-test (equal_var=False) on the two pooled samples
t_stat, p_value = stats.ttest_ind(
    high_popularity_ratings,
    low_popularity_ratings,
    equal_var=False,
)

avg_rating_high_popularity, avg_rating_low_popularity, t_stat, p_value


(2.8683159282110724, 2.400922845130662, 56.74324476674133, 0.0)

Q2: For the same reason as in Q1, we use Welch's t-test here. Given that the p-value is far below our significance level α = 0.005, we reject the null hypothesis. This suggests that newer movies are rated differently than older movies, although the difference in average ratings is quite small.

In [11]:
import re

# Pull the first parenthesised four-digit number out of each movie title
# and treat it as the release year (NaN when the title has no such number)
movie_titles = df_imputed.columns[:400].to_series()
release_years = movie_titles.str.extract(r'\((\d{4})\)')[0].astype(float)

# Titles for which a year could be extracted
known_years = release_years.dropna()
valid_movie_columns = known_years.index

# Median split on release year: strictly later than the median is 'new',
# at or before the median is 'old'. Titles with a NaN year satisfy neither
# comparison and therefore land in neither group.
median_release_year = known_years.median()
new_movies = release_years.loc[release_years > median_release_year].index
old_movies = release_years.loc[release_years <= median_release_year].index

# Group average = mean of the per-movie mean ratings within each group
avg_rating_new_movies = df_imputed.loc[:, new_movies].mean().mean()
avg_rating_old_movies = df_imputed.loc[:, old_movies].mean().mean()

# Pool every individual rating of each group into one flat array.
# NOTE(review): this uses the median-imputed frame, so imputed values are
# counted as observations here, whereas the popularity comparison above
# drops missing ratings instead; confirm this difference is intended.
new_movies_ratings = df_imputed.loc[:, new_movies].to_numpy().ravel()
old_movies_ratings = df_imputed.loc[:, old_movies].to_numpy().ravel()

# Welch's t-test (equal_var=False) on the two pooled samples
t_stat_new_old, p_value_new_old = stats.ttest_ind(
    new_movies_ratings,
    old_movies_ratings,
    equal_var=False,
)

avg_rating_new_movies, avg_rating_old_movies, t_stat_new_old, p_value_new_old


(2.7882836157126545,
 2.748159703455119,
 20.106371466938946,
 7.210714168171186e-90)

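Q3: Using Welch's t-test on the observed (non-imputed) ratings, p ≈ 0.25 is far above α = 0.005, so we fail to reject the null hypothesis: there is no significant difference in the ratings of Shrek between male and female viewers.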

In [63]:
# Column holding the gender code (1 = female; 2 = male; 3 = self-described)
gender_column = 'Gender identity (1 = female; 2 = male; 3 = self-described)'

# Respondents who actually rated 'Shrek (2001)' in the un-imputed data
shrek_ratings = df_original['Shrek (2001)'].dropna()

# For those respondents, keep the gender code next to the rating
shrek_ratings_gender = df_original.loc[shrek_ratings.index, [gender_column, 'Shrek (2001)']]

# Split the ratings by gender code
shrek_gender_code = shrek_ratings_gender[gender_column]
shrek_ratings_female = shrek_ratings_gender.loc[shrek_gender_code.eq(1), 'Shrek (2001)']
shrek_ratings_male = shrek_ratings_gender.loc[shrek_gender_code.eq(2), 'Shrek (2001)']

# Group means
avg_rating_shrek_female = shrek_ratings_female.mean()
avg_rating_shrek_male = shrek_ratings_male.mean()

# Welch's t-test (equal_var=False) between female and male ratings
t_stat_shrek, p_value_shrek = stats.ttest_ind(
    shrek_ratings_female,
    shrek_ratings_male,
    equal_var=False,
    nan_policy='omit',
)

avg_rating_shrek_female, avg_rating_shrek_male, t_stat_shrek, p_value_shrek


(3.155450874831763, 3.08298755186722, 1.1558907155973421, 0.24834907946281018)

In [25]:
# Repeat the Shrek gender comparison on the median-imputed ratings
# (gender codes: 1 = female, 2 = male).
shrek_ratings_female = df_imputed.loc[df_imputed[gender_column] == 1, 'Shrek (2001)']
shrek_ratings_male = df_imputed.loc[df_imputed[gender_column] == 2, 'Shrek (2001)']

# Welch's t-test on the two Shrek groups built just above.
# The test must receive these Shrek series: passing the generic
# `ratings_female` / `ratings_male` names would fail with NameError on a
# fresh kernel (they are only created by a later per-movie loop) or, with a
# stale kernel, silently test whichever movie that loop processed last.
t_stat, p_value = stats.ttest_ind(shrek_ratings_female, shrek_ratings_male, equal_var=False, nan_policy='omit')

shrek_ratings_female.mean(), shrek_ratings_male.mean(), t_stat, p_value

(3.1827757125154896,
 3.1134615384615385,
 2.087516112530672,
 0.037573658853460555)

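Q4: Around 7% of the movies (27 of 400, i.e. 6.75%) are rated significantly differently by male and female viewers at α = 0.005.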

In [46]:
# Respondent masks depend only on the gender column (1 = female, 2 = male),
# so build them once instead of once per movie.
is_female = df_imputed[gender_column] == 1
is_male = df_imputed[gender_column] == 2

# Number of movies whose female/male ratings differ significantly
count_diff_ratings = 0

# One Welch t-test per movie, comparing female and male ratings
for movie in movie_columns:
    ratings_female = df_imputed.loc[is_female, movie]
    ratings_male = df_imputed.loc[is_male, movie]

    t_stat, p_value = stats.ttest_ind(ratings_female, ratings_male, equal_var=False, nan_policy='omit')

    # Significance threshold alpha = 0.005
    if p_value < 0.005:
        count_diff_ratings += 1

# Proportion of movies that are rated differently by gender
proportion_diff_ratings = count_diff_ratings / len(movie_columns)

count_diff_ratings, proportion_diff_ratings

(27, 0.0675)

Q5 

In [37]:
# Median-imputed ratings for 'The Lion King (1994)'
lion_king_ratings = df_imputed['The Lion King (1994)']

# Only-child answer per respondent (1: Yes; 0: No; -1: Did not respond)
only_child_answer = df_imputed['Are you an only child? (1: Yes; 0: No; -1: Did not respond)']

# Split the ratings into only children (1) and respondents with siblings (0);
# respondents who did not answer (-1) fall into neither group
ratings_only_child = lion_king_ratings[only_child_answer == 1]
ratings_with_siblings = lion_king_ratings[only_child_answer == 0]

# Group means
avg_rating_only_child = ratings_only_child.mean()
avg_rating_with_siblings = ratings_with_siblings.mean()

# Welch's t-test (equal_var=False) between the two groups
t_stat, p_value = stats.ttest_ind(
    ratings_only_child,
    ratings_with_siblings,
    equal_var=False,
    nan_policy='omit',
)

p_value

0.06246490747619668

Q6

In [40]:
# Respondent masks depend only on the only-child column
# (1: Yes; 0: No; -1: Did not respond), so build them once instead of
# once per movie. Non-responders (-1) fall into neither group.
only_child_answer = df_imputed['Are you an only child? (1: Yes; 0: No; -1: Did not respond)']
is_only_child = only_child_answer == 1
has_siblings = only_child_answer == 0

# Number of movies showing a significant "only child effect"
count_only_child_effect = 0

# One Welch t-test per movie: only children vs. respondents with siblings
for movie in movie_columns:
    ratings_only_child = df_imputed.loc[is_only_child, movie]
    ratings_with_siblings = df_imputed.loc[has_siblings, movie]

    t_stat, p_value = stats.ttest_ind(ratings_only_child, ratings_with_siblings, equal_var=False, nan_policy='omit')

    # Significance threshold alpha = 0.005 as per the project guidelines
    if p_value < 0.005:
        count_only_child_effect += 1

# Proportion of movies that exhibit an "only child effect"; the denominator
# is taken from the movie column list rather than a hard-coded 400
proportion_only_child_effect = count_only_child_effect / len(movie_columns)

proportion_only_child_effect

0.0075

In [42]:
# Look up the label of the column at 0-based position 476 in the raw frame
df.columns[476]

'Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)'

Q7

In [44]:
# Median-imputed ratings for 'The Wolf of Wall Street (2013)'
wolf_ratings = df_imputed['The Wolf of Wall Street (2013)']

# Answer to "Movies are best enjoyed alone" (1: Yes; 0: No; -1: Did not respond)
enjoyed_alone_answer = df_imputed['Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)']

# 0 -> prefers watching socially, 1 -> prefers watching alone;
# respondents who did not answer (-1) fall into neither group
ratings_social = wolf_ratings[enjoyed_alone_answer == 0]
ratings_alone = wolf_ratings[enjoyed_alone_answer == 1]

# Welch's t-test (equal_var=False) between the two groups
t_stat, p_value = stats.ttest_ind(
    ratings_social,
    ratings_alone,
    equal_var=False,
    nan_policy='omit',
)

p_value

0.33796312028022846

Q8

In [48]:
# Respondent masks depend only on the "enjoyed alone" column
# (1: Yes; 0: No; -1: Did not respond), so build them once instead of
# once per movie. Non-responders (-1) fall into neither group.
enjoyed_alone_answer = df_imputed['Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)']
prefers_social = enjoyed_alone_answer == 0
prefers_alone = enjoyed_alone_answer == 1

# Number of movies that exhibit a significant "social watching" effect
count_social_watching_effect = 0

# One Welch t-test per movie: social watchers vs. solo watchers
for movie in movie_columns:
    ratings_social = df_imputed.loc[prefers_social, movie]
    ratings_alone = df_imputed.loc[prefers_alone, movie]

    t_stat, p_value = stats.ttest_ind(ratings_social, ratings_alone, equal_var=False, nan_policy='omit')

    # Significance threshold alpha = 0.005
    if p_value < 0.005:
        count_social_watching_effect += 1

# Proportion of movies that exhibit a "social watching" effect; the
# denominator is taken from the movie column list rather than a hard-coded 400
proportion_social_watching_effect = count_social_watching_effect / len(movie_columns)

proportion_social_watching_effect

0.01

Q9: Here we use the two-sample Kolmogorov–Smirnov (KS) test to test for a difference between the two rating distributions.

In [50]:
# Median-imputed ratings for the two movies being compared
home_alone_ratings = df_imputed['Home Alone (1990)']
finding_nemo_ratings = df_imputed['Finding Nemo (2003)']

# Two-sample Kolmogorov-Smirnov test on the two rating distributions
ks_result = stats.ks_2samp(home_alone_ratings, finding_nemo_ratings)
ks_stat, p_value = ks_result.statistic, ks_result.pvalue
p_value

4.816411619080073e-12

Q10

In [58]:
# Keywords identifying each franchise: a movie is assigned to a franchise
# when the keyword appears anywhere in its column title
franchises = ['Star Wars', 'Harry Potter', 'The Matrix', 'Indiana Jones', 'Jurassic Park', 'Pirates of the Caribbean', 'Toy Story', 'Batman']

# Franchises whose movies differ significantly in rating (inconsistent quality)
inconsistent_franchises_anova = []

# One one-way ANOVA per franchise, with each movie's ratings as one group
for franchise in franchises:
    # Search only the movie rating columns, so a non-movie column whose
    # label happens to contain the keyword can never be treated as a movie
    franchise_movies = [col for col in movie_columns if franchise in col]

    # A one-way ANOVA needs at least two groups; report and skip the
    # franchise instead of letting f_oneway fail on zero or one movie
    if len(franchise_movies) < 2:
        print(f"Skipping {franchise}: fewer than two matching movies")
        continue

    # Median-imputed ratings, one Series per movie in the franchise
    franchise_ratings = [df_imputed[movie] for movie in franchise_movies]

    f_stat, p_value = stats.f_oneway(*franchise_ratings)

    # Significance threshold alpha = 0.005
    if p_value < 0.005:
        inconsistent_franchises_anova.append(franchise)
inconsistent_franchises_anova

['Star Wars',
 'The Matrix',
 'Indiana Jones',
 'Jurassic Park',
 'Pirates of the Caribbean',
 'Toy Story',
 'Batman']

In [59]:
# Look up the label of the column at 0-based position 405 in the raw frame
df.columns[405]

'I enjoy doing things without too much planning '

Extra bonus: we test whether people who enjoy doing things without too much planning rate movies differently from those who prefer planning.

In [61]:
# Respondent masks depend only on the planning item, so build them once
# instead of once per movie. Scores of 3 fall into neither group.
# NOTE(review): presumably a higher score means stronger agreement with
# "I enjoy doing things without too much planning" - confirm the scale
# direction; the groups are named by score to stay neutral.
planning_score = df_imputed['I enjoy doing things without too much planning ']
is_high_score = planning_score >= 4  # Adjust the threshold as needed
is_low_score = planning_score <= 2  # Adjust the threshold as needed

# Number of movies that are rated differently by the two groups
count_diff_ratings = 0

# One Welch t-test per movie: high-score vs. low-score respondents
for movie in movie_columns:
    ratings_high_score = df_imputed.loc[is_high_score, movie]
    ratings_low_score = df_imputed.loc[is_low_score, movie]

    t_stat, p_value = stats.ttest_ind(ratings_high_score, ratings_low_score, equal_var=False, nan_policy='omit')

    # Significance threshold alpha = 0.005
    if p_value < 0.005:
        count_diff_ratings += 1

# Proportion of movies that are rated differently by the two groups; the
# denominator is taken from the movie column list rather than a hard-coded 400
proportion_diff_ratings = count_diff_ratings / len(movie_columns)

print(f"Number of movies that are rated differently by the two groups: {count_diff_ratings}")
print(f"Proportion of movies that are rated differently by the two groups: {proportion_diff_ratings}")

Number of movies that are rated differently by the two groups: 2
Proportion of movies that are rated differently by the two groups: 0.005
