In [None]:
import pandas as pd 
from sqlalchemy import create_engine 
from config import password 

# Extract CSVs into DataFrames

In [None]:
# Read the raw IMDB ratings CSV (path is relative to the working directory).
imdb_file = "Resources/IMDB ratings.csv"
imdb_df = pd.read_csv(filepath_or_buffer=imdb_file)

# Print the raw (rows, columns) shape so it can be referenced later.
print(imdb_df.shape)

# Preview the first five rows (head's default row count).
imdb_df.head()


In [None]:
# Read the raw Disney+ shows CSV (path is relative to the working directory).
disney_file = "Resources/disney_plus_shows.csv"
disney_df = pd.read_csv(filepath_or_buffer=disney_file)

# Print the raw (rows, columns) shape so it can be referenced later.
print(disney_df.shape)

# Preview the first five rows (head's default row count).
disney_df.head()


# Transform DataFrames

## IMDB Data Cleaning

In [None]:
# Columns kept from the raw IMDB frame: the title id, the overall rating and
# vote totals, the per-score vote counts, and the US / non-US breakdowns.
imdb_columns = [
    'imdb_title_id', 'weighted_average_vote', 'total_votes',
    'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6',
    'votes_5', 'votes_4', 'votes_3', 'votes_2', 'votes_1',
    'us_voters_rating', 'us_voters_votes',
    'non_us_voters_rating', 'non_us_voters_votes',
]

# Work on an independent copy so the raw imdb_df is never modified, and rename
# the title id to "imdb_id" (the column that later becomes the index).
imdb_df_transformed = (
    imdb_df[imdb_columns]
    .copy()
    .rename(columns={"imdb_title_id": "imdb_id"})
)
imdb_df_transformed.head()


In [None]:
# De-duplicate on the IMDB id, keeping the first row seen for each id
# (drop_duplicates' default), then display the resulting shape.
imdb_df_transformed = imdb_df_transformed.drop_duplicates(subset="imdb_id")
imdb_df_transformed.shape  # Original author's note: there were no duplicates.

In [None]:
# Check data types: display the dtype pandas assigned to each selected IMDB column.
imdb_df_transformed.dtypes # Original author's note: everything looked correct.

In [None]:
# Dry run: see how many rows would remain if every row containing at least one
# missing value were dropped. The result is stored in na_test only;
# imdb_df_transformed itself is left unchanged by this cell.
na_test = imdb_df_transformed.dropna(axis=0, how="any")
na_test.shape


In [None]:
# Promote the IMDB id column to the DataFrame index.
# NOTE: this cell is not re-runnable -- once "imdb_id" is the index it is no
# longer a column, so a second run raises KeyError.
imdb_df_transformed = imdb_df_transformed.set_index("imdb_id")
imdb_df_transformed.head()


## Disney+ Data Cleaning

In [None]:
# Columns kept from the raw Disney+ frame.
disney_columns = [
    'imdb_id', 'title', 'type', 'rated',
    'released_at', 'imdb_rating', 'imdb_votes',
]

# Work on an independent copy so the raw disney_df is never modified.
disney_df_transformed = disney_df.loc[:, disney_columns].copy()
disney_df_transformed.head()


In [None]:
# Fixing data types
# imdb_votes is handled as text here (the .str accessor is used), so strip the
# commas and convert the column to float.
# regex=False makes the comma a literal match on every pandas version: the
# default for `regex` changed between pandas releases, and pandas 1.x emits a
# FutureWarning for single-character patterns when it is left unspecified.
# NOTE: this cell is not re-runnable -- once the column is float, the .str
# accessor raises AttributeError.
disney_df_transformed['imdb_votes'] = (
    disney_df_transformed['imdb_votes']
    .str.replace(',', '', regex=False)
    .astype(float)
)



In [None]:
# Change N/A in release dates to make it blank
# Convert release dates column to datetime
# Both steps are done in one call: errors="coerce" turns any value that cannot
# be parsed as a date (including a literal "N/A" string, or a value pandas
# already read as missing) into NaT, the datetime equivalent of a blank.
# NOTE(review): the date format of released_at is not visible here; coercion is
# silent, so confirm the NaT count matches the number of blank/N/A dates.
disney_df_transformed['released_at'] = pd.to_datetime(
    disney_df_transformed['released_at'], errors='coerce'
)

# View all changes
disney_df_transformed.dtypes

In [None]:
# Remove rows that have no IMDB id, then display the remaining
# (rows, columns) shape.
disney_df_transformed = disney_df_transformed.dropna(subset=['imdb_id'])
disney_df_transformed.shape


In [None]:
# De-duplicate on the IMDB id, keeping the first row seen for each id
# (drop_duplicates' default), then display the resulting shape.
disney_df_transformed = disney_df_transformed.drop_duplicates(subset="imdb_id")
disney_df_transformed.shape  # Original author's note: there were no duplicates.


In [None]:
# Prefix the rating and vote columns with "disney_" so their source file stays
# clear once they sit alongside the IMDB ratings data.
disney_renames = {
    "imdb_rating": "disney_imdb_rating",
    "imdb_votes": "disney_imdb_votes",
}
disney_df_transformed = disney_df_transformed.rename(columns=disney_renames)
del disney_renames  # keep the notebook namespace unchanged
disney_df_transformed.head()


In [None]:
# Promote the IMDB id column to the DataFrame index.
# NOTE: this cell is not re-runnable -- once "imdb_id" is the index it is no
# longer a column, so a second run raises KeyError.
disney_df_transformed = disney_df_transformed.set_index("imdb_id")
disney_df_transformed.head()


# Create database connection

In [None]:
# Create the SQLAlchemy engine for the local PostgreSQL server (user "postgres",
# port 5432), using the password imported from config.
# TODO: "[insert]" is a placeholder left in the URL -- replace it with the real
# database name before running.
# NOTE(review): the password is interpolated into the URL as-is; a password
# containing characters such as "@" or "/" would need URL-escaping -- confirm.
engine = create_engine(
    f'postgresql://postgres:{password}@localhost:5432/[insert]'
)

# Open a connection. It is never closed in the cells visible here.
connection = engine.connect()


In [None]:
# Confirm tables


# Load DataFrames into database

In [None]:
# Use DataFrame.to_sql to load each transformed DataFrame into the database