## Importing Libraries

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy.types import *
from sqlalchemy_utils import create_database, database_exists
from sqlalchemy import create_engine

from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Importing Data

**Connecting to Mysql database**

In [2]:
# Create a connection string using credentials following this format:
# connection = "dialect+driver://root:root@host:port/database"
database_name = "Movies"
connection_str = f"mysql+pymysql://root:root@localhost/{database_name}"

**Creating the engine**

In [3]:
# Create an instance of the sqlalchemy create_engine class
engine = create_engine(connection_str)

**Assigning datatype to each column**

# Hypothesis Testing


Stakeholder Questions
For each question:

The stakeholder has requested statistical tests to obtain mathematically-supported answers to their questions.
They would like to know if a statistically significant difference exists for each hypothesis.
They would like to know the p-value of the test.
They would like a visualization that supports the findings of the test.

### Does the MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') affect how much revenue the movie generates?



**Null Hypothesis:**

he MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') DOES NOT affect how much revenue the movie generates?

**Alternative Hypothesis:**

The MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') DOES affect how much revenue the movie generates

   - **Type of Data:**   
        Numeric
        
        
   - **Number of samples:**   
   Multiple samples
   
   
   - **Test type:**
   
   If parametric: ANOVA and/or Tukey
   
   If nonparametric: Kruskal-Wallis
   
   
  
   


In [4]:
# Display the first 5 rows of the table using an SQL query
q = """
SELECT revenue, certification
FROM tmdb_data 
WHERE revenue > 0 AND certification IS NOT NULL
;"""
df =pd.read_sql(q, engine)

In [5]:
df.head()

Unnamed: 0,revenue,certification
0,76019000.0,PG-13
1,5271670.0,PG-13
2,12855000.0,PG
3,5227350.0,R
4,14904.0,R


## Assumption Tests

**Test for significant Outliers**

In [6]:
df['certification'].value_counts()

R         2303
PG-13     1716
PG         631
NR         269
G          128
NC-17       22
R            1
PG-13        1
Name: certification, dtype: int64

In [12]:
groups = {}
for certification in df['certification'].unique():
    temp_df = df.loc[df['certification'] == certification, 'revenue']
    groups[certification] = temp_df
groups.keys()

dict_keys(['PG-13', 'PG', 'R', 'G', 'NR', 'NC-17', 'R ', 'PG-13 '])

In [7]:
# groups = {}
# for certification in df["certification"].unique():
#     temp = df.loc[df["certification"]== certification,"revenue"]
#     groups[certification ]= temp
# groups.keys()

dict_keys(['PG-13', 'PG', 'R', 'G', 'NR', 'NC-17', 'R ', 'PG-13 '])

In [13]:
for certification, revenue in groups.items():
    # Calculate the number of outliers
    outliers = np.abs(stats.zscore(revenue)) > 3
    print(f"{outliers.sum()} outliers were removed from the {certification} group.")
    # Remove the outliers
    groups[certification] = revenue.loc[~outliers]

39 outliers were removed from the PG-13 group.
20 outliers were removed from the PG group.
52 outliers were removed from the R group.
4 outliers were removed from the G group.
6 outliers were removed from the NR group.
1 outliers were removed from the NC-17 group.
0 outliers were removed from the R  group.
0 outliers were removed from the PG-13  group.


In [8]:
# for certification, revenue in groups.items():
#     outliers = np.abs(stats.zscore(revenue))>3
#     print(f'Group {certification}: {outliers.sum()} outliers')
#     groups[certification] = revenue.loc[~outliers]
# print('All outliers have been removed')

Group PG-13: 39 outliers
Group PG: 20 outliers
Group R: 52 outliers
Group G: 4 outliers
Group NR: 6 outliers
Group NC-17: 1 outliers
Group R : 0 outliers
Group PG-13 : 0 outliers
All outliers have been removed


**The test for outliers is satisfied, since all outliers have been removed**

**Test for Normality**

In [14]:
# Loop through the groups and obtain
# group count and the pvalue of the Normal Test
norm_results = {}
for certification, revenue in groups.items():
    stat, p = stats.normaltest(revenue)
    norm_results[certification] = {'n': len(revenue),
                       'p':p}

ValueError: skewtest is not valid with less than 8 samples; 1 samples were given.

In [11]:
###### Using a loop to obtain group count and pvalues

results = {}
for certification, revenue in groups.items():
    stat, p = stats.normaltest(revenue)
    n_results[certification] = {"n":len(revenue), "p":p}

ValueError: skewtest is not valid with less than 8 samples; 1 samples were given.

In [9]:
# Display a dataframe created from the norm_results dictonary and transpose it 
pd.DataFrame(results).T

NameError: name 'norm_results' is not defined

In [None]:
# Create and display a dataframe created from the norm_results dictonary and transpose it 
norm_results_df = pd.DataFrame(norm_results).T
# Add a column to indicate if the group pvalue was significant or not
norm_results_df['sig?'] = norm_results_df['p'] < .05

# Display the dataframe
norm_results_df

- We have large enough groups (each n>20) that we can safely disregard the assumption of normality, even though:
- The groups do NOT come from normal distributions.

**Assumption Equal Variance**

In [None]:
# Run the test and display the results
statistic, pvalue = stats.levene(*groups.values())
if pvalue < alpha:
    print(f'The p-value for the test was {pvalue}')
    print(f'It was < the alpha value of {alpha}, so')
    print(ha_desc)
    print(ha)
else:
    print(f'The p-value for the test was {pvalue}')
    print(f'It was > the alpha value of {alpha}, so')
    print(ho_desc)
    print(ho)    