In [12]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

In [13]:
file_dir = 'Input'
# Wikipedia data
wiki_file = f'{file_dir}/wikipedia_movies.json'
# Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

In [14]:
# 1. Add the clean movie function that takes in the argument, "movie".
def clean_movie(movies):
    movie = dict(movies)
    alt_titles = {}
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune–Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key]= movie[key]
            movie.pop(key)
        if len(alt_titles) > 0:
            movie['alt_titles'] = alt_titles
            
        def change_column_name(old_name, new_name):
            if old_name in movie:
                movie[new_name] = movie.pop(old_name)
                
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')
    
    return movie

In [15]:
def load_dataset(*file):
# 2. Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    for f in file:
        if "csv" in f:
            file_input = pd.read_csv(f, low_memory=False)
            df = pd.DataFrame(file_input)
        else:
            with open(f, mode='r') as x:
                file_input = json.load(x)
                file_input1 = [i for i in file_input
                if ('Director' in i or 'Directed by' in i)
                    and ('imdb_link' in i) and ('No. of episodes' not in i)]
                file_input2 = [clean_movie(movie) for movie in file_input1]
            df = pd.DataFrame(file_input2)
        return df

In [16]:
# execute functions to load data in DataFrames, and to perform the initial "clean" function
files_to_load = [wiki_file, kaggle_file, ratings_file]
file_names = ["wiki_movies_df", "kaggle_meta", "ratings"]
for x, y in zip(file_names, files_to_load):
    globals()[x] = load_dataset(y)  

In [17]:
# define function for parsing budget column data
def parse_dollars(x):
    if type(x) != str:
        return np.nan
    if re.match(r'\$\s*\d+\.?\d*\s*milli?on', x, flags=re.IGNORECASE):
        x = re.sub(r'\$|\s*|[a-zA-Z]', '', x)
        value = float(x) * 1000000
        return value
    elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', x, flags=re.IGNORECASE):
        x = re.sub(r'\$|\s*|[a-zA-Z]', '', x)
        value = float(x) * 1000000000
        return value
    elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)', x, flags=re.IGNORECASE):
        x = re.sub(r'\$|,', '', x)
        value = float(x)
        return value
    else: 
        return np.nan

In [26]:
    try:
# clean IMDB data
        # grab only the IMDB ID number from each entry
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
        # drop dupicated from the IMDB ID column
        wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
        # count the number of null values per column, and compare to the total. keep if sum is less than 90% of the total dataset
        columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
        wiki_movies_df = wiki_movies_df[columns_to_keep]
# clean box office data
        # drop null values from Box Office
        box_office = wiki_movies_df['Box office'].dropna()
        # convert to string values and filter based on regular expressions
            # if a list, join values with a space, else keep original
        box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)
            # define regex to capture the most common formats
        form_one = r'\$\s*\d+\.?\d*\s*[mb]illion'
        form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'
            # replace hypens from value ranges with '$'
        box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
            # execute capture of both regex statements for the box_office column and apply the Parse_Dollars function
        wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
        # drop original Box office column
        wiki_movies_df.drop('Box office', axis=1, inplace=True)
# clean budget data
        # drop null values
        budget = wiki_movies_df['Budget'].dropna()
        # join lists with a space if a list is present, else keep the original value 
        budget = budget.apply(lambda x: ' '.join(x) if type(x) == list else x)
        # replace hyphen value ranges with a '$'
        budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
        # remove bracketed references
        budget = budget.str.replace(r'\[\d+\]\s*', '', regex=True)
        # capture buget values based on the same regex used for box office values
        wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
        # drop original column
        wiki_movies_df.drop('Budget', axis=1, inplace=True)
# clean release date data
        # drop null values and join lists if they are present, else keep value
        release_date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)
        # define date formats as regex
            # Month DD YYYY
        date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},\d{4}'
            # YYYY (various separator) MM (various separator) DD
        date_form_two = r'\d{4}[,|.|-|\:|\\|\s|/]\d{2}[,|.|-|\:|\\|\s|/]\d{2}'
            # Month YYYY
        date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
            # YYYY
        date_form_four = r'\d{4}'
        # capture release dates that contain the above regex, convert to datetime and auto identify date format
        wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)[0], infer_datetime_format=True)
# clean running time data
        # drop null values and join lists if they exist, else keep original value
        running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)
        # capture only the number values for varying hour/min formats
        running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')
        # convert the captured values to numeric formats, based on columns H|M|minutes(separate from first pair)
        running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
        # calculate running time combining cols 1 and 2 (if not 0), else using 3
        wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: float(row[0]*60 + row[1]) if row[2] == 0 else float(row[2]), axis=1)
        # drop original column
        wiki_movies_df.drop('Running time', axis=1, inplace=True)
    except Exception as error:
        print(error)
    
    

'Box office'


In [27]:
# 20. Check that the wiki_movies_df DataFrame looks like this. 
wiki_movies_df.head()

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Country,Language,...,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",United States,English,...,Michael Tronick,"[Cliff Eidelman, Yello]","[Steve Perry, Joel Silver]",Silver Pictures,"[David Arnott, James Cappe]",tt0098987,21400000.0,20000000.0,1990-01-01,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",United States,English,...,Howard E. Smith,Maurice Jarre,"[Ric Kidney, Robert Redlin]",Avenue Pictures,"[James Foley, Robert Redlin]",tt0098994,2700000.0,6000000.0,1990-01-01,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",United States,"[English, Lao]",...,"[John Bloom, Lois Freeman-Fox]",Charles Gross,Daniel Melnick,"[Carolco Pictures, IndieProd Company]","[John Eskow, Richard Rush]",tt0099005,57718089.0,35000000.0,1990-01-01,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",United States,English,...,Susan E. Morse,,Robert Greenhut,,Woody Allen,tt0099012,7331647.0,12000000.0,1990-01-01,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,"December 19, 1990",US,English,...,David Stiven,Maurice Jarre,John Cornell,,Paul Hogan,tt0099018,6939946.0,25000000.0,1990-01-01,95.0


In [28]:
# 21. Check that wiki_movies_df DataFrame columns are correct. 
wiki_movies_df.columns.to_list()

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Release date',
 'Country',
 'Language',
 'Director',
 'Distributor',
 'Editor(s)',
 'Composer(s)',
 'Producer(s)',
 'Production company(s)',
 'Writer(s)',
 'imdb_id',
 'box_office',
 'budget',
 'release_date',
 'running_time']