In [34]:
import json
import pandas as pd
import numpy as np

import re

In [35]:
# Use the clean movie function 

def clean_movie(movie):

    movie = dict(movie) #Non-destructive copy of local variable

    alternative_titles = {} #Create dict of all alternative title keys/values

    #Loop through list of alternative title keys
    for key in ['Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
                'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally', 'Mandarin',
                'McCune–Reischauer', 'Original title', 'Polish', 'Revised Romanization',
                'Romanized', 'Russian', 'Simplified', 'Traditional', 'Yiddish']:
    
    #If key exists, remove key-value pair and add to alternative titles dict
        if key in movie:

            alternative_titles[key] = movie[key]

            movie.pop(key)

        if len(alternative_titles) > 0:

            movie['alternative_titles'] = alternative_titles

    #Merge column names

    def change_column_name(old_name, new_name):

        if old_name in movie:

            movie[new_name] = movie.pop(old_name)

    change_column_name('Adaptation by', 'Writer(s)')

    change_column_name('Country of origin', 'Country')

    change_column_name('Directed by', 'Director')

    change_column_name('Distributed by', 'Distributor')

    change_column_name('Edited by', 'Editor(s)')

    change_column_name('Length', 'Running time')

    change_column_name('Original release', 'Release date')

    change_column_name('Music by', 'Composer(s)')

    change_column_name('Produced by', 'Producer(s)')

    change_column_name('Producer', 'Producer(s)')

    change_column_name('Productioncompanies ', 'Production company(s)')

    change_column_name('Productioncompany ', 'Production company(s)')

    change_column_name('Released', 'Release Date')

    change_column_name('Release Date', 'Release date')

    change_column_name('Screen story by', 'Writer(s)')

    change_column_name('Screenplay by', 'Writer(s)')

    change_column_name('Story by', 'Writer(s)')

    change_column_name('Theme music composer', 'Composer(s)')

    change_column_name('Written by', 'Writer(s)')

    return movie

In [36]:
# Function to read in csv and json files

def file_read_clean():

    # Reads in the three data files

    kaggle_metadata = pd.read_csv(kaggle_file, low_memory=False)

    ratings = pd.read_csv(ratings_file)

    with open(wiki_file, mode='r') as file:

        wiki_movies_raw = json.load(file)

    # Write a list comprehension the filters out TV shows from wiki_movies_raw

    wiki_movies = [movie for movie in wiki_movies_raw

              if('Director' in movie or 'Directed by' in movie) 

                  and 'imdb_link' in movie
                  
                  and 'No. of episodes' not in movie]

    # Write another list comprehension to iterate through cleaned wiki movies

    clean_wiki_movies = [clean_movie(movie) for movie in wiki_movies]

    # Read in cleaned movies as a DataFrame
    
    clean_wiki_movies_df = pd.DataFrame(clean_wiki_movies)

    # Use a try/except block to extract imdb ids while dropping any duplicated ids. Print out error type if error occurs

    try:
        clean_wiki_movies_df['imdb_id'] = clean_wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')

        clean_wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)

    except Exception as e:
        print(e)

    # Keeping columns in which null values makes up less than 90% of the data for that column
    
    kept_columns = [column for column in clean_wiki_movies_df.columns if clean_wiki_movies_df[column].isnull().sum() < len(clean_wiki_movies_df) * 0.9]

    wiki_movies_df = clean_wiki_movies_df[kept_columns]

    wiki_movies_df.head()

    box_office = wiki_movies_df['Box office'].dropna()

    # Convert box office data to strings to apply regular expressions
    
    box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)

    # Match elements from first form of writing out box office data

    form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'

    # Match elements from second form of writing out box office data

    form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'

    # Create function to parse strings using regex, remove certain values, and convert value to a float

    def parse_dollars(s):

        #If s is not a string, return NaN
        if type(s) != str:
            return np.nan

        #If input is of the form $###.# million
        if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):

            #Remove dollar sign and ' million'
            s = re.sub('\$|\s|[a-zA-Z]', '', s)

            #Convert to float and multiply by a million
            value = float(s) * 10**6

            #Return value
            return value

        #If input is of the form $###.# billion
        elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):

            #Remove dollar sign and ' billion'
            s = re.sub('\$|\s|[a-zA-Z]', '', s)

            #Convert to float and multipy by a billion
            value = float(s) * 10**9

            #Return value
            return value

        #If input is of the form $###,###,###
        elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)', s, flags=re.IGNORECASE):

            #Remove dollar sign and commas
            s = re.sub('\$|,','', s)

            #Convert to float
            value = float(s)

            #Return value
            return value

        #Otherwise, return NaN
        else:
            
            return np.nan

    # Cleaning box office data 

    wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)

    wiki_movies_df.drop('Box office', axis=1, inplace=True)

    # Cleaning budget data

    budget = wiki_movies_df['Budget'].dropna()

    budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)

    budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

    budget = budget.str.replace(r'\[\d+\]s*', '')

    wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)

    wiki_movies_df.drop('Budget', axis=1, inplace=True)

    # Cleaning release date data

    release_date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

    date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'

    date_form_two = r'\d{4}.[01]\d.[0123]\d'

    date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'

    date_form_four = r'\d{4}'

    release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)

    wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], infer_datetime_format=True)

    wiki_movies_df.drop('Release date', axis=1, inplace=True)

    # Cleaning running time data

    running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

    running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')

    running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

    wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

    wiki_movies_df.drop('Running time', axis=1, inplace=True)

    return wiki_movies_df, kaggle_metadata, ratings



In [37]:
file_dir = 'C:/Users/Ryan/Documents/BootCamp/SQL/Movies_ETL/Raw_Data_Files/'

kaggle_file = f'{file_dir}movies_metadata.csv'

ratings_file = f'{file_dir}ratings.csv'

wiki_file = f'{file_dir}wikipedia-movies.json'

In [38]:
wiki_movies_df, kaggle_metadata, ratings = file_read_clean() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [39]:
wiki_movies_df.head()

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Country,Language,Director,...,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,United States,English,Renny Harlin,...,Michael Tronick,"[Cliff Eidelman, Yello]","[Steve Perry, Joel Silver]",Silver Pictures,"[David Arnott, James Cappe]",tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,United States,English,James Foley,...,Howard E. Smith,Maurice Jarre,"[Ric Kidney, Robert Redlin]",Avenue Pictures,"[James Foley, Robert Redlin]",tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,United States,"[English, Lao]",Roger Spottiswoode,...,"[John Bloom, Lois Freeman-Fox]",Charles Gross,Daniel Melnick,"[Carolco Pictures, IndieProd Company]","[John Eskow, Richard Rush]",tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,United States,English,Woody Allen,...,Susan E. Morse,,Robert Greenhut,,Woody Allen,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,US,English,John Cornell,...,David Stiven,Maurice Jarre,John Cornell,,Paul Hogan,tt0099018,6939946.0,25000000.0,1990-12-19,95.0


In [40]:
wiki_movies_df.columns.to_list()

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Country',
 'Language',
 'Director',
 'Distributor',
 'Editor(s)',
 'Composer(s)',
 'Producer(s)',
 'Production company(s)',
 'Writer(s)',
 'imdb_id',
 'box_office',
 'budget',
 'release_date',
 'running_time']