## Importing Libraries

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy.types import *
from sqlalchemy_utils import create_database, database_exists
from sqlalchemy import create_engine

from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Importing Data

In [2]:
# Import TMDB data csv file for the first year 2000 and 2001
tmdb_results_combined_final_df = pd.read_csv(r"Movies_files\tmdb_results_combined.csv.gz")

In [3]:
# Remove the initial row which has null data
tmdb_results_combined_final_df = tmdb_results_combined_final_df.loc[1:,:] 

In [4]:
tmdb_results_combined_final_df.head(2)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.4,21.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,4.4,7.0,


In [5]:
# Create a list of years to add to the already compiled year of 2000 1nd 2001
years = [*range(2002, 2022, 1)]

In [6]:
# Loop through the years and append each year to the combined dataframe
for year in years:
    # Import TMDB data csv file for the current year
    current_year_df = pd.read_csv(f'Movies_files/final_tmdb_data_{year}.csv.gz')
    # remove the initial row which has null data
    current_year_df = current_year_df.loc[1:,:] 
    # append TMDB data csv file for the current year to the combined dataframe
    tmdb_results_combined_final_df = tmdb_results_combined_final_df.append(current_year_df)


In [7]:
tmdb_results_combined_final_df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59692 entries, 1 to 3513
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                59692 non-null  object 
 1   adult                  59691 non-null  float64
 2   backdrop_path          36755 non-null  object 
 3   belongs_to_collection  3705 non-null   object 
 4   budget                 59691 non-null  float64
 5   genres                 59691 non-null  object 
 6   homepage               14734 non-null  object 
 7   id                     59691 non-null  float64
 8   original_language      59691 non-null  object 
 9   original_title         59691 non-null  object 
 10  overview               58318 non-null  object 
 11  popularity             59691 non-null  float64
 12  poster_path            53985 non-null  object 
 13  production_companies   59691 non-null  object 
 14  production_countries   59691 non-null  object 
 15  rel

## Saving the compiled Tmdb files as a single csv.gz file

In [8]:
# Save a final merged .csv.gz of all of the tmdb api data 
tmdb_results_combined_final_df.to_csv(r"Movies_files/tmdb_results_combined_final_df.csv.gz", 
                                compression="gzip", 
                                index=False)

## Filtering only the relevant columns

In [9]:
# Drop columns
tmdb_results_combined_final_df = tmdb_results_combined_final_df.loc[:,["imdb_id","revenue",'budget','certification']]

In [10]:
# Display the first (5) rows
tmdb_results_combined_final_df.head()

Unnamed: 0,imdb_id,revenue,budget,certification
1,tt0113026,0.0,10000000.0,
2,tt0113092,0.0,0.0,
3,tt0116391,0.0,0.0,
4,tt0118694,12854953.0,150000.0,PG
5,tt0118852,0.0,0.0,R


In [11]:
# Display the names, non-null values, and datatypes for the columns
tmdb_results_combined_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59692 entries, 1 to 3513
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        59692 non-null  object 
 1   revenue        59691 non-null  float64
 2   budget         59691 non-null  float64
 3   certification  14566 non-null  object 
dtypes: float64(2), object(2)
memory usage: 2.3+ MB


## Creating and saving the dataframe in a database table

**Connecting to Mysql database**

In [12]:
# Create a connection string using credentials following this format:
# connection = "dialect+driver://root:root@host:port/database"
database_name = "Movies"
connection_str = f"mysql+pymysql://root:root@localhost/{database_name}"

**Creating the engine**

In [13]:
# Create an instance of the sqlalchemy create_engine class
engine = create_engine(connection_str)

**Assigning datatype to each column**

In [14]:
# Calculate max string lengths for object columns
imdb_id_len = tmdb_results_combined_final_df['imdb_id'].fillna('').map(len).max()
certification_len = tmdb_results_combined_final_df['certification'].fillna('').map(len).max()


# Use 1 + the max_str_len for object columns
# Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    'imdb_id': String(imdb_id_len+1), 
    'budget':Float(),
    'revenue':Float(),
    'certification': String(certification_len+1)}

**Savig the dataframe to a database table**

In [15]:
# Save the dataframe to an sql table
# with appropriate datatypes and set index=False
tmdb_results_combined_final_df.to_sql('tmdb_data',
              engine, 
              dtype=df_schema,
              if_exists='replace',
              index=False)

In [16]:
# Use the sqlalchemy engine to update the table and set imdb_id as the primary key
engine.execute('ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1b898cd6b88>

In [17]:
# Display the first 5 rows of the table using an SQL query
q = """
SELECT * 
FROM tmdb_data LIMIT 5
;"""
pd.read_sql(q, engine)

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0035423,76019000.0,48000000.0,PG-13
2,tt0062336,0.0,0.0,
3,tt0069049,0.0,12000000.0,R
4,tt0088751,0.0,350000.0,


# Hypothesis Testing


Stakeholder Questions
For each question:

The stakeholder has requested statistical tests to obtain mathematically-supported answers to their questions.
They would like to know if a statistically significant difference exists for each hypothesis.
They would like to know the p-value of the test.
They would like a visualization that supports the findings of the test.

### Does the MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') affect how much revenue the movie generates?



**Null Hypothesis:**

he MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') DOES NOT affect how much revenue the movie generates?

**Alternative Hypothesis:**

The MPAA rating of a movie ('G', 'NC-17', 'NR', 'PG', 'PG-13', or 'R') DOES affect how much revenue the movie generates

   - **Type of Data:**   
        Numeric
        
        
   - **Number of samples:**   
   Multiple samples
   
   
   - **Test type:**
   
   If parametric: ANOVA and/or Tukey
   
   If nonparametric: Kruskal-Wallis
   
   
  
   


In [18]:
# Display the first 5 rows of the table using an SQL query
q = """
SELECT revenue, certification
FROM tmdb_data 
WHERE revenue > 0 AND certification IS NOT NULL
;"""
df =pd.read_sql(q, engine)

In [19]:
df.head()

Unnamed: 0,revenue,certification
0,76019000.0,PG-13
1,5271670.0,PG-13
2,12855000.0,PG
3,5227350.0,R
4,14904.0,R


## Assumption Tests

**Test for significant Outliers**

In [20]:
df['certification'].value_counts()

R         2303
PG-13     1716
PG         631
NR         269
G          128
NC-17       22
R            1
PG-13        1
Name: certification, dtype: int64

In [21]:
groups = {}

for certification in df["certification"].unique():
    temp = df.loc[df["certification"]== certification,"revenue"]
    groups[certification ]= temp
groups.keys()

dict_keys(['PG-13', 'PG', 'R', 'G', 'NR', 'NC-17', 'R ', 'PG-13 '])

In [22]:
for certification, revenue in groups.items():
    outliers = np.abs(stats.zscore(revenue))>3
    print(f'Group {certification}: {outliers.sum()} outliers')
    groups[certification] = revenue.loc[~outliers]
print('All outliers have been removed')

Group PG-13: 39 outliers
Group PG: 20 outliers
Group R: 52 outliers
Group G: 4 outliers
Group NR: 6 outliers
Group NC-17: 1 outliers
Group R : 0 outliers
Group PG-13 : 0 outliers
All outliers have been removed


**The test for outliers is satisfied, since all outliers have been removed**

In [30]:
# Create a connection string using credentials following this format:
# connection = "dialect+driver://root:root@host:port/database"
database_name = "Movies"
connection_str = f"mysql+pymysql://root:root@localhost/{database_name}"

**Test for Normality**

In [31]:
###### Using a loop to obtain group count and pvalues

n_results = {}
for certification, revenue in groups.items():
    stat, p = stats.normaltest(revenue)
    n_results[certification] = {"n":len(revenue), "p":p, "test stat": stat}
    
    ## convert to a dataframe
n_results_df = pd.DataFrame(n_results).T
n_results_df['significant?'] = n_results_df['p'] < .05
n_results_df

ValueError: skewtest is not valid with less than 8 samples; 1 samples were given.