### Imports

In [33]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import chi2

### Load Movie Titles

In [6]:
# Read the movie metadata table; the path is relative to the working directory.
movies = pd.read_csv('movies.csv')


In [8]:
# Show column dtypes and non-null counts to spot missing values early.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


### Load Ratings


In [11]:
# Read the user ratings table; the path is relative to the working directory.
ratings = pd.read_csv('ratings.csv')


In [12]:
# Show column dtypes and non-null counts to spot missing values early.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [13]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### EDA

In [14]:
# Attach each rating's movie title and genres. The left join keeps every
# rating row; validate='many_to_one' makes pandas raise a MergeError if a
# movieId appears more than once in `movies`, which would otherwise silently
# duplicate rating rows and inflate the counts used later.
df = ratings.merge(movies, on='movieId', how='left', validate='many_to_one')
df.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [15]:
# Drop the rating timestamp column. errors='ignore' keeps this cell
# re-runnable: because `df` is reassigned, a second run would otherwise raise
# KeyError since the column is already gone.
df = df.drop(columns='timestamp', errors='ignore')


In [16]:
# Preview the merged frame to confirm the timestamp column has been removed.
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [24]:
# Count missing values per column. Because the merge was a left join, ratings
# whose movieId is absent from `movies` would show up here as NaN title/genres.
df.isna().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64

# Hypothesis Testing

### Chi-Squared

Is there a relationship between the movie genre and the rating it receives?

H0: There is no relationship between the movie genre and the rating it receives.

HA: There is a relationship between the movie genre and the rating it receives.

In [31]:
# Count, for every rating value, how often each `genres` string occurs.
# Each distinct pipe-delimited genres string is treated as its own category.
chi_sq = df.groupby('rating')['genres'].value_counts()
# Pivot the genres level into columns to form the rating x genres contingency
# table; combinations never observed at a rating are filled with 0.
chi_sq2 = chi_sq.unstack().fillna(0)
chi_sq2

genres,(no genres listed),Action,Action|Adventure,Action|Adventure|Animation,Action|Adventure|Animation|Children,Action|Adventure|Animation|Children|Comedy,Action|Adventure|Animation|Children|Comedy|Fantasy,Action|Adventure|Animation|Children|Comedy|IMAX,Action|Adventure|Animation|Children|Comedy|Romance,Action|Adventure|Animation|Children|Comedy|Sci-Fi,...,Romance|Thriller,Romance|War,Romance|Western,Sci-Fi,Sci-Fi|IMAX,Sci-Fi|Thriller,Sci-Fi|Thriller|IMAX,Thriller,War,Western
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.5,2.0,4.0,6.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,...,1.0,0.0,0.0,7.0,2.0,1.0,0.0,7.0,0.0,0.0
1.0,2.0,13.0,13.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,5.0,0.0,3.0,0.0,25.0,0.0,5.0
1.5,0.0,6.0,11.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,...,1.0,0.0,0.0,6.0,0.0,4.0,0.0,12.0,0.0,0.0
2.0,2.0,31.0,39.0,5.0,1.0,5.0,0.0,2.0,0.0,2.0,...,3.0,0.0,0.0,9.0,0.0,14.0,0.0,45.0,0.0,10.0
2.5,6.0,13.0,23.0,2.0,2.0,9.0,3.0,1.0,1.0,0.0,...,6.0,1.0,1.0,8.0,1.0,7.0,1.0,26.0,0.0,3.0
3.0,6.0,51.0,68.0,7.0,8.0,20.0,7.0,7.0,2.0,3.0,...,3.0,1.0,3.0,25.0,14.0,25.0,1.0,157.0,5.0,35.0
3.5,6.0,22.0,67.0,6.0,8.0,32.0,7.0,9.0,1.0,4.0,...,3.0,0.0,2.0,18.0,11.0,23.0,3.0,70.0,0.0,12.0
4.0,8.0,27.0,167.0,14.0,13.0,63.0,12.0,5.0,5.0,5.0,...,2.0,0.0,0.0,36.0,12.0,19.0,6.0,177.0,3.0,49.0
4.5,8.0,11.0,45.0,5.0,2.0,30.0,5.0,1.0,0.0,1.0,...,1.0,0.0,0.0,14.0,11.0,12.0,1.0,37.0,0.0,16.0
5.0,7.0,8.0,116.0,3.0,2.0,17.0,5.0,4.0,0.0,0.0,...,2.0,0.0,0.0,14.0,22.0,8.0,0.0,72.0,1.0,21.0


In [34]:
# Chi-squared test of independence on the rating x genres contingency table.
# chi2_contingency returns the test statistic, its p-value, the degrees of
# freedom, and the table of counts expected under independence (H0).
stat, p, dof, expected = chi2_contingency(chi_sq2)
print(expected)
# NOTE(review): with this many sparse genres categories some expected counts
# are far below 5, where the chi-squared approximation is unreliable; consider
# testing on coarser genre categories and confirm the conclusion still holds.

# Interpret via the test statistic: reject H0 when the statistic reaches the
# critical value of the chi-squared distribution at the chosen confidence level.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
# The chi-squared statistic is a sum of squared terms and so never negative;
# no abs() is needed before comparing it with the critical value.
if stat >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

# Interpret via the p-value: reject H0 when p does not exceed the significance
# level alpha = 1 - prob.
alpha = 1 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

[[6.38561625e-01 2.52707366e+00 7.54046174e+00 ... 8.53227022e+00
  1.22277758e-01 2.05154905e+00]
 [1.31021659e+00 5.18511246e+00 1.54717065e+01 ... 1.75067238e+01
  2.50892538e-01 4.20941926e+00]
 [8.34791146e-01 3.30364156e+00 9.85764013e+00 ... 1.11542306e+01
  1.59853624e-01 2.68198858e+00]
 ...
 [1.24999603e+01 4.94679281e+01 1.47605915e+02 ... 1.67020747e+02
  2.39360943e+00 4.01594470e+01]
 [3.98564997e+00 1.57729977e+01 4.70645900e+01 ... 5.32550676e+01
  7.63209568e-01 1.28049605e+01]
 [6.15769170e+00 2.43687374e+01 7.27131679e+01 ... 8.22772423e+01
  1.17913245e+00 1.97832223e+01]]
probability=0.950, critical=8766.224, stat=21897.886)
Dependent (reject H0)
signigicance=0.050, p=0.000,
Dependent (reject H0)
