In [136]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

import warnings
warnings.filterwarnings("ignore")

In [137]:
# 1. Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    return movie

In [138]:
# 2 Add the function that takes in three arguments;
# Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)

def extract_transform_load(wiki_file, kaggle_file, ratings_file):    
    
    file_dir = 'C://Users/KenAk/ETL/Movies-ETL/'
    
    # Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.

    kaggle_metadata = pd.read_csv(f'{file_dir}movies_metadata.csv', low_memory=False)
    ratings = pd.read_csv(f'{file_dir}ratings.csv')
    
    # Open and read the Wikipedia data JSON file.
 
    with open(f'{file_dir}/wikipedia-movies.json', mode='r') as file:
        wiki_movies_raw = json.load(file)
    
    # 3. Write a list comprehension to filter out TV shows. Code from near bottom of 8.3.3
    # The TV shows are filtered out, and the wiki_movies_df DataFrame is created (3 pt)
    
    wiki_movies = [movie for movie in wiki_movies_raw
                   if ('Director' in movie or 'Directed by' in movie)
                       and 'imdb_link' in movie
                       and 'No. of episodes' not in movie]
    
    # 4. Write a list comprehension to iterate through the cleaned wiki movies list and call the clean_movie function on each movie.
    # We can make a list of cleaned movies with a list comprehension
    
    clean_movies = [clean_movie(movie) for movie in wiki_movies]

    # 5. Read in the cleaned movies list from Step 4 as a DataFrame.

    wiki_movies_df = pd.DataFrame(clean_movies)
    
    # 6. Write a try-except block to catch errors while extracting the IMDb ID using a regular expression string and
    #  dropping any imdb_id duplicates. If there is an error, capture and print the exception.
    # A try-except block is used to catch errors while extracting the IMDb IDs with a regular expression and dropping duplicate IDs. (5 pt)
    try:
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
        print("Number of movies before dropping duplicates:", len(wiki_movies_df))
        wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
        print("Number of movies after dropping duplicates: ",len(wiki_movies_df))
    except:
        print("An exception occurred")

    #  7. Write a list comprehension to keep the columns that don't have null values from the wiki_movies_df DataFrame.
    # That will give us the columns that we want to keep, which we can select from our Pandas DataFrame
    # A list comprehension is used to keep columns with non-null values (3 pt)
    wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
    wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]
     
    # 8. Create a variable that will hold the non-null values from the “Box office” column.
    # The non-null box office data is converted to string values using the lambda and join functions (3 pt)

    Box_Office = wiki_movies_df['Box office'].dropna() #drop missing values
    
    # 9. Convert the box office data created in Step 8 to string values using the lambda and join functions.  
    # Lambda functions don't have a name and automatically return a variable
    Box_Office[Box_Office.map(lambda x: type(x) != str)]
    # Instead of creating a new function with a block of code and the def keyword, we can create an anonymous lambda function right inside the map() call

    # 10. Write a regular expression to match the six elements of "form_one" of the box office data.  
    # 8.3.10 Parse the Box Office Data; A regular expression is used to match the six elements of "form_one" of the box office data (2 pt)
    form_one = r'\$\d+\.?\d*\s*[mb]illi?on'
    Box_Office.str.contains(form_one, flags=re.IGNORECASE, na=False).sum()
    
    # 11. Write a regular expression to match the three elements of "form_two" of the box office data.
    # A regular expression is used to match the three elements of "form_two" of the box office data (2 pt)
    form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'
    Box_Office.str.contains(form_two, flags=re.IGNORECASE, na=False).sum()
    
    # 12. Add the parse_dollars function.
    def parse_dollars(s):
        # if s is not a string, return NaN
        if type(s) != str:
            return np.nan

        # if input is of the form $###.# million
        if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):

            # remove dollar sign and " million"
            s = re.sub('\$|\s|[a-zA-Z]','', s)

            # convert to float and multiply by a million
            value = float(s) * 10**6

            # return value
            return value

        # if input is of the form $###.# billion
        elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):

            # remove dollar sign and " billion"
            s = re.sub('\$|\s|[a-zA-Z]','', s)

            # convert to float and multiply by a billion
            value = float(s) * 10**9

            # return value
            return value

        # if input is of the form $###,###,###
        elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)', s, flags=re.IGNORECASE):

            # remove dollar sign and commas
            s = re.sub('\$|,','', s)

            # convert to float
            value = float(s)

            # return value
            return value

        # otherwise, return NaN
        else:
            return np.nan
    
    # The following columns are cleaned in the Wikipedia DataFrame: (8 pt)
    
    # The box office column
    # The budget column
    # The release date column
    # The running time column
    
    # 13. Clean the box office column in the wiki_movies_df DataFrame.

    wiki_movies_df['Box_Office'] = Box_Office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Box office', axis=1, inplace=True)
    
    # 14. Clean the budget column in the wiki_movies_df DataFrame.
    
    budget = wiki_movies_df['Budget'].dropna()
    budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
    matches_form_one = budget.str.contains(form_one, flags=re.IGNORECASE, na=False)
    matches_form_two = budget.str.contains(form_two, flags=re.IGNORECASE, na=False)
    budget[~matches_form_one & ~matches_form_two]
    # Remove the citation references with the following:
    budget = budget.str.replace(r'\[\d+\]\s*', '')
    budget[~matches_form_one & ~matches_form_two]
    # make a variable that holds the non-null values of Release date in the DataFrame, converting lists to strings
    wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    # We can also drop the original Budget column
    wiki_movies_df.drop('Budget', axis=1, inplace=True)
    
    # 15. Clean the release date column in the wiki_movies_df DataFrame.
    
    # make a variable that holds the non-null values of Release date in the DataFrame, converting lists to strings

    Release_Date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)
    
    # The forms we'll be parsing are:

    # 1. Full month name, one- to two-digit day, four-digit year (i.e., January 1, 2000)
    # 2. Four-digit year, two-digit month, two-digit day, with any separator (i.e., 2000-01-01)
    # 3. Full month name, four-digit year (i.e., January 2000)
    # 4. Four-digit year

    date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
    date_form_two = r'\d{4}.[01]\d.[0123]\d'
    date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
    date_form_four = r'\d{4}'
    
    # Extract the dates

    Release_Date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)
    wiki_movies_df['Release_Date'] = pd.to_datetime(Release_Date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], infer_datetime_format=True)
    # We can also drop the original Release date column
    wiki_movies_df.drop('Release date', axis=1, inplace=True)
    
    # 16. Clean the running time column in the wiki_movies_df DataFrame.
    # Parse running time
    Running_Time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)
    running_time_extract = Running_Time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')
    running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
    wiki_movies_df['Running_Time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)
    wiki_movies_df.drop('Running time', axis=1, inplace=True)
    
    # Return three variables. The first is the wiki_movies_df DataFrame  
    return wiki_movies_df, kaggle_metadata, ratings 

In [139]:
# 17. Create the path to your file directory and variables for the three files.
file_dir = 'C://Users/KenAk/ETL/Movies-ETL/'
# The Wikipedia data
wiki_file = f'{file_dir}/wikipedia_movies.json'
# The Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# The MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

In [130]:
ratings_file

'C://Users/KenAk/ETL/Movies-ETL//ratings.csv'

In [140]:
# 18. Set the three variables equal to the function created in D1.
wiki_file, kaggle_file, ratings_file = extract_transform_load(wiki_file, kaggle_file, ratings_file)

Number of movies before dropping duplicates: 7076
Number of movies after dropping duplicates:  7033


In [141]:
wiki_file

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Distributed by,Country,Language,Written by,Productioncompanies,imdb_id,Box_Office,budget,Release_Date,Running_Time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,20th Century Fox,United States,English,,,tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,Avenue Pictures,United States,English,,,tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,TriStar Pictures,United States,"[English, Lao]",,,tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,Orion Pictures,United States,English,Woody Allen,,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,Paramount Pictures,US,English,Paul Hogan,,tt0099018,6939946.0,25000000.0,1990-12-19,95.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7071,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018,https://www.imdb.com/title/tt1255919/,Holmes & Watson,Etan Cohen,"[Will Ferrell, Adam McKay, Jimmy Miller, Clayt...",Etan Cohen,,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",...,Sony Pictures Releasing,United States,English,,"[Columbia Pictures, Gary Sanchez Productions, ...",tt1255919,41900000.0,42000000.0,2018-12-25,90.0
7072,https://en.wikipedia.org/wiki/Vice_(2018_film),2018,https://www.imdb.com/title/tt6266538/,Vice,Adam McKay,"[Brad Pitt, Dede Gardner, Jeremy Kleiner, Kevi...",,,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",...,Mirror Releasing,United States,English,Adam McKay,,tt6266538,76100000.0,60000000.0,2018-12-11,132.0
7073,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,Mimi Leder,Robert W. Cort,,,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",...,Focus Features,United States,English,Daniel Stiepleman,"[Focus Features, [1], Participant Media, [1], ...",tt4669788,38400000.0,20000000.0,2018-11-08,120.0
7074,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018,https://www.imdb.com/title/tt7137380/,Destroyer,Karyn Kusama,"[Fred Berger, Phil Hay, Matt Manfredi]",,,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",...,Mirror Releasing,United States,English,"[Phil Hay, Matt Manfredi]",,tt7137380,5500000.0,9000000.0,2018-08-31,123.0


In [142]:
# 19. Set the wiki_movies_df equal to the wiki_file variable. 
wiki_movies_df = wiki_file

In [143]:
# 20. The cleaned Wikipedia data is converted to a Pandas DataFrame, 
# and the DataFrame is displayed in the ETL_clean_wiki_movies.ipynb file (4 pt)
wiki_movies_df.head(50)

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Distributed by,Country,Language,Written by,Productioncompanies,imdb_id,Box_Office,budget,Release_Date,Running_Time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,20th Century Fox,United States,English,,,tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,Avenue Pictures,United States,English,,,tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,TriStar Pictures,United States,"[English, Lao]",,,tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,Orion Pictures,United States,English,Woody Allen,,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,Paramount Pictures,US,English,Paul Hogan,,tt0099018,6939946.0,25000000.0,1990-12-19,95.0
5,https://en.wikipedia.org/wiki/The_Ambulance,1990,https://www.imdb.com/title/tt0099026/,The Ambulance,Larry Cohen,"[Larry Cohen, Moctesuma Esparza, Robert Katz]",,,,"[Eric Roberts, James Earl Jones, Red Buttons, ...",...,Triumph Releasing Corporation,United States,English,Larry Cohen,,tt0099026,,,1990-03-22,95.0
6,https://en.wikipedia.org/wiki/American_Dream_(...,1990,https://www.imdb.com/title/tt0099028/,American Dream,"[Barbara Kopple, Co-directors:, Cathy Caplan, ...","[Arthur Cohn, Barbara Kopple]",,,,,...,Prestige Films,"[United States, United Kingdom]",English,,,tt0099028,,,1990-10-06,100.0
7,https://en.wikipedia.org/wiki/American_Ninja_4...,1990,https://www.imdb.com/title/tt0101326/,American Ninja 4: The Annihilation,Cedric Sundstrom,Ovidio G Assonitis,,,,"[Michael Dudikoff, David Bradley, James Booth,...",...,Cannon Group,"[United States, Lesotho]",English,David Geeves,,tt0101326,,,1991-03-08,99.0
8,https://en.wikipedia.org/wiki/Andre%27s_Mother,1990,https://www.imdb.com/title/tt0099037/,Andre's Mother,Deborah Reinisch,"[Sarah Green, Deborah Reinisch]",,,,"[Richard Thomas, Sada Thompson, Sylvia Sidney]",...,Public Broadcasting Service,United States,English,Terrence McNally,,tt0099037,,,1990-03-07,50.0
9,https://en.wikipedia.org/wiki/Angel_Town_(film),1990,https://www.imdb.com/title/tt0099039/,Angel Town,Eric Karson,"[Ash R. Shah, Eric Karson]",,,,"[Olivier Gruner, Theresa Saldana, Frank Aragon...",...,Imperial Entertainment,United States,English,S. Warren,,tt0099039,855810.0,,1990-02-23,102.0


In [144]:
# 21. Check that wiki_movies_df DataFrame columns are correct. 
wiki_movies_df.columns.to_list()

['url',
 'year',
 'imdb_link',
 'title',
 'Directed by',
 'Produced by',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Music by',
 'Cinematography',
 'Edited by',
 'Productioncompany ',
 'Distributed by',
 'Country',
 'Language',
 'Written by',
 'Productioncompanies ',
 'imdb_id',
 'Box_Office',
 'budget',
 'Release_Date',
 'Running_Time']