In [1]:
import json
import pandas as pd
import numpy as np

import re

In [2]:
# Use the clean movie function 

def clean_movie(movie):
    """Return a cleaned copy of one scraped movie record.

    Two normalisations are applied to a shallow copy of ``movie``:

    1. Every alternative-title key that is present (e.g. ``'Japanese'``,
       ``'Also known as'``) is removed and collected into a single nested dict
       stored under ``'alternative_titles'``. The key is only added when at
       least one alternative title was found.
    2. Synonymous keys are renamed to one canonical key. Renames are applied in
       the order listed, so when several source keys map to the same target the
       last one present overwrites the earlier values.

    Parameters
    ----------
    movie : mapping
        Raw key/value record for one movie. It is not modified.

    Returns
    -------
    dict
        The cleaned record.
    """
    movie = dict(movie)  # shallow copy so the caller's record is left untouched

    alternative_title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
        'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally', 'Mandarin',
        'McCune–Reischauer', 'Original title', 'Polish', 'Revised Romanization',
        'Romanized', 'Russian', 'Simplified', 'Traditional', 'Yiddish',
    )

    # Move every alternative-title entry out of the record in one pass.
    alternative_titles = {
        key: movie.pop(key) for key in alternative_title_keys if key in movie
    }

    # Attach the collected titles once, after the scan, and only if any exist.
    if alternative_titles:
        movie['alternative_titles'] = alternative_titles

    # (old key, canonical key) pairs. Order matters: later entries overwrite
    # earlier ones that share a target, and 'Released' reaches 'Release date'
    # via the intermediate 'Release Date' rename. The trailing spaces in the
    # 'Productioncompan...' keys are intentional and must be kept.
    column_renames = (
        ('Adaptation by', 'Writer(s)'),
        ('Country of origin', 'Country'),
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Original release', 'Release date'),
        ('Music by', 'Composer(s)'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Released', 'Release Date'),
        ('Release Date', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
    )

    for old_name, new_name in column_renames:
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)

    return movie

In [None]:
# Function to read in csv and json files

def _parse_dollars(s):
    """Convert a dollar-amount string to a float, or NaN if it is not recognised.

    Three forms are recognised, each anchored at the start of ``s``
    (case-insensitive):

    * ``$###.# million`` -> number * 10**6 ("millon" is also accepted)
    * ``$###.# billion`` -> number * 10**9 ("billon" is also accepted)
    * ``$###,###,###``   -> the plain number; ``,`` or ``.`` may separate the
      three-digit groups

    Only the matched prefix is converted, so trailing text after the amount
    does not break the conversion. Non-string input returns NaN.
    """
    if not isinstance(s, str):
        return np.nan

    # Word forms: keep only the digits and decimal point, then scale.
    for pattern, multiplier in (
        (r'\$\s*\d+\.?\d*\s*milli?on', 10**6),
        (r'\$\s*\d+\.?\d*\s*billi?on', 10**9),
    ):
        matched = re.match(pattern, s, flags=re.IGNORECASE)
        if matched:
            number = re.sub(r'\$|\s|[a-zA-Z]', '', matched.group(0))
            return float(number) * multiplier

    # Grouped-digits form. The pattern only admits ',' or '.' directly before a
    # three-digit group, so both are group separators here and both are removed
    # (leaving a '.' in place would make float() fail on e.g. '$1.234.567').
    matched = re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)', s,
                       flags=re.IGNORECASE)
    if matched:
        return float(re.sub(r'[$,.\s]', '', matched.group(0)))

    return np.nan


def file_read_clean(wiki_path=None, kaggle_path=None, ratings_path=None):
    """Read the three raw data files and clean the Wikipedia movie data.

    Parameters
    ----------
    wiki_path, kaggle_path, ratings_path : str or path-like, optional
        Locations of the Wikipedia JSON file, the Kaggle metadata CSV and the
        ratings CSV. Any path left as ``None`` falls back to the notebook-level
        variable ``wiki_file`` / ``kaggle_file`` / ``ratings_file``, so calling
        the function with no arguments still works.

    Returns
    -------
    tuple
        ``(wiki_movies_df, kaggle_metadata, ratings)``. ``wiki_movies_df`` holds
        the cleaned Wikipedia records, de-duplicated on ``imdb_id`` and limited
        to columns that are less than 90% null. When a ``'Box office'`` column
        survives that filter, a numeric ``'box_office'`` column parsed from it
        is added (NaN where no dollar amount is recognised). The two CSV frames
        are returned as read.
    """
    # Fall back to the notebook-level path variables for arguments not supplied.
    wiki_path = wiki_file if wiki_path is None else wiki_path
    kaggle_path = kaggle_file if kaggle_path is None else kaggle_path
    ratings_path = ratings_file if ratings_path is None else ratings_path

    # Read in the three data files.
    kaggle_metadata = pd.read_csv(kaggle_path, low_memory=False)
    ratings = pd.read_csv(ratings_path)
    with open(wiki_path, mode='r', encoding='utf-8') as file:
        wiki_movies_raw = json.load(file)

    # Keep records that have a director and an IMDb link, and drop TV shows
    # (records carrying an episode count).
    wiki_movies = [
        movie for movie in wiki_movies_raw
        if ('Director' in movie or 'Directed by' in movie)
        and 'imdb_link' in movie
        and 'No. of episodes' not in movie
    ]

    clean_wiki_movies_df = pd.DataFrame(
        [clean_movie(movie) for movie in wiki_movies]
    )

    # Extract the IMDb id from the link and drop duplicated ids. Best effort:
    # on failure the error type and message are printed and processing continues.
    try:
        clean_wiki_movies_df['imdb_id'] = (
            clean_wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})', expand=False)
        )
        clean_wiki_movies_df = clean_wiki_movies_df.drop_duplicates(subset='imdb_id')
    except Exception as e:
        print(f'{type(e).__name__}: {e}')

    # Keep columns in which null values make up less than 90% of the rows.
    row_count = len(clean_wiki_movies_df)
    kept_columns = [
        column for column in clean_wiki_movies_df.columns
        if clean_wiki_movies_df[column].isnull().sum() < row_count * 0.9
    ]

    # .copy() so the column added below does not write into a slice.
    wiki_movies_df = clean_wiki_movies_df[kept_columns].copy()

    # Match elements from the first form of writing out box office data.
    form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'

    # Match elements from the second form of writing out box office data.
    form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'

    if 'Box office' in wiki_movies_df.columns:
        # Some entries are lists of strings; join them so the regular
        # expressions can be applied to a single string.
        box_office = wiki_movies_df['Box office'].dropna().apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        )

        # Pull the first recognised dollar amount out of each entry, then
        # convert it to a number. Rows without box office data stay NaN
        # through index alignment on assignment.
        extracted = box_office.astype(str).str.extract(
            f'({form_one}|{form_two})', flags=re.IGNORECASE, expand=False
        )
        wiki_movies_df['box_office'] = extracted.apply(_parse_dollars)

    return wiki_movies_df, kaggle_metadata, ratings



In [None]:
# Folder that holds the raw data files. This is a machine-specific absolute
# path; change it to wherever the files live in your environment.
file_dir = 'C:/Users/Ryan/Documents/BootCamp/SQL/Movies_ETL/Raw_Data_Files/'

kaggle_file = f'{file_dir}movies_metadata.csv'

ratings_file = f'{file_dir}ratings.csv'

wiki_file = f'{file_dir}wikipedia-movies.json'

# Read and clean the data with file_read_clean(), which picks up the three
# paths defined above. NOTE: this rebinds wiki_file / kaggle_file /
# ratings_file from path strings to the returned DataFrames, so re-run this
# whole cell (not just the last line) before calling the function again.
wiki_file, kaggle_file, ratings_file = file_read_clean()