## Importing Libraries

In [None]:
# Import libraries
import json
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy.types import *
from sqlalchemy_utils import create_database, database_exists
from sqlalchemy import create_engine

from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Importing Data

**Connecting to the MySQL database**

In [None]:
# Build the SQLAlchemy connection URL, which follows this format:
#   "dialect+driver://username:password@host:port/database"
# Credentials are read from environment variables so real secrets stay out of
# the notebook; the fallbacks reproduce the original local-development values.
database_name = "Movies"
db_user = os.environ.get("MOVIES_DB_USER", "root")
db_password = os.environ.get("MOVIES_DB_PASSWORD", "root")
db_host = os.environ.get("MOVIES_DB_HOST", "localhost")
# quote_plus escapes characters such as "@" or "/" that would otherwise
# corrupt the URL when they appear in a username or password.
connection_str = (
    f"mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}/{database_name}"
)

**Creating the engine**

In [None]:
# Create the target database on first use so that the notebook also runs
# against a MySQL server where it does not exist yet.
if not database_exists(connection_str):
    create_database(connection_str)

# Create an instance of the sqlalchemy create_engine class
engine = create_engine(connection_str)

**Assigning datatype to each column**

In [None]:
# Longest string in each text column (missing values are counted as length 0).
# NOTE(review): tmdb_results_combined_final_df is not created in the cells
# shown above -- confirm it is loaded before this cell runs.
imdb_id_text = tmdb_results_combined_final_df['imdb_id'].fillna('')
imdb_id_len = imdb_id_text.map(len).max()

certification_text = tmdb_results_combined_final_df['certification'].fillna('')
certification_len = certification_text.map(len).max()

# Schema dictionary of SQLAlchemy datatype objects, keyed by column name.
# String columns are sized to the longest observed value plus one.
df_schema = dict(
    imdb_id=String(imdb_id_len + 1),
    budget=Float(),
    revenue=Float(),
    certification=String(certification_len + 1),
)

**Saving the dataframe to a database table**

In [None]:
# Write the dataframe to the `tmdb_data` table using the schema defined above.
# An existing table is replaced, and the dataframe index is not stored.
tmdb_results_combined_final_df.to_sql(
    name='tmdb_data',
    con=engine,
    if_exists='replace',
    index=False,
    dtype=df_schema,
)

In [None]:
# Update the table and set imdb_id as the primary key.
# Engine.execute() was removed in SQLAlchemy 2.0, so the statement is run on an
# explicit connection; engine.begin() commits on success and rolls back on error.
with engine.begin() as conn:
    conn.exec_driver_sql('ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);')

In [None]:
# Preview the stored table: fetch its first 5 rows with an SQL query
q = """
SELECT * 
FROM tmdb_data LIMIT 5
;"""
pd.read_sql(sql=q, con=engine)

# Hypothesis Testing


Stakeholder Questions
For each question:

The stakeholder has requested statistical tests to obtain mathematically-supported answers to their questions.
They would like to know if a statistically significant difference exists for each hypothesis.
They would like to know the p-value of the test.
They would like a visualization that supports the findings of the test.

### Does the MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') affect how much revenue the movie generates?



**Null Hypothesis:**

The MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') DOES NOT affect how much revenue the movie generates.

**Alternative Hypothesis:**

The MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') DOES affect how much revenue the movie generates

   - **Type of Data:**   
        Numeric
        
        
   - **Number of samples:**   
   Multiple samples
   
   
   - **Test type:**
   
   If parametric: ANOVA and/or Tukey
   
   If nonparametric: Kruskal-Wallis
   
   
  
   


In [None]:
# Load revenue and certification for every movie that reports a positive
# revenue and has a certification; these rows feed the hypothesis test.
q = """
SELECT revenue, certification
FROM tmdb_data 
WHERE revenue > 0 AND certification IS NOT NULL
;"""
df = pd.read_sql(sql=q, con=engine)

In [None]:
# Show the first 5 rows of the query result
df.head(5)

## Assumption Tests

**Test for significant Outliers**

In [None]:
# Number of movies in each certification group
df["certification"].value_counts()

In [None]:
# Split the revenue column into one Series per certification value,
# keyed by certification in order of first appearance.
groups = {
    rating: df.loc[df["certification"] == rating, "revenue"]
    for rating in df["certification"].unique()
}
groups.keys()

In [None]:
# For every certification group, flag revenues whose absolute z-score exceeds 3,
# report how many were flagged, and keep only the remaining values.
# Note: this overwrites `groups` in place, so re-running the cell trims again.
for certification, revenue in groups.items():
    z_scores = stats.zscore(revenue)
    outliers = np.abs(z_scores) > 3
    n_outliers = outliers.sum()
    print(f'Group {certification}: {n_outliers} outliers')
    groups[certification] = revenue.loc[~outliers]
print('All outliers have been removed')

**The assumption of no significant outliers is satisfied, since all values with an absolute z-score above 3 have been removed from each group.**

**Test for Normality**

###### Using a loop to obtain group count and pvalues

# Run scipy's normality test on each certification group and record the group
# size and p-value. The results are stored in `norm_results`, which is the name
# the cells below read (the old name `n_results` left them with a NameError).
norm_results = {}
for certification, revenue in groups.items():
    stat, p = stats.normaltest(revenue)
    norm_results[certification] = {"n": len(revenue), "p": p}

# Keep the previous name bound to the same dict for any code still using it.
n_results = norm_results

In [None]:
# Show the norm_results dictionary as a dataframe, transposed so that each
# certification group is a row
pd.DataFrame(norm_results).transpose()

In [None]:
# Build a dataframe from the norm_results dictionary, one row per group
norm_results_df = pd.DataFrame(norm_results).transpose()

# Flag the groups whose normality-test p-value falls below .05
norm_results_df['sig?'] = norm_results_df['p'].lt(.05)

# Display the dataframe
norm_results_df

- The groups do NOT come from normal distributions.
- However, the groups are large enough (each n > 20) that we can safely disregard the assumption of normality.

**Assumption Equal Variance**

In [None]:
# Run the test and display the results
# Levene's test is applied across all certification groups at once.
# NOTE(review): `alpha`, `ha_desc` and `ha` are not defined in the cells shown
# above -- confirm they are assigned before this cell runs.
statistic, pvalue = stats.levene(*groups.values())
if pvalue < alpha:
    # p-value below alpha: report it, then print the ha_desc / ha text
    print(f'The p-value for the test was {pvalue}')
    print(f'It was < the alpha value of {alpha}, so')
    print(ha_desc)
    print(ha)
else:
    # p-value not below alpha: only the comparison is reported here
    # NOTE(review): the message ends with "so" but no conclusion is printed
    # in the visible part of this branch -- confirm whether one is missing.
    print(f'The p-value for the test was {pvalue}')
    print(f'It was > the alpha value of {alpha}, so')
    
 