In [35]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

import warnings
warnings.filterwarnings("ignore")

In [36]:
# 1. The Kaggle metadata is cleaned (4 pt)
def clean_movie(movie):
    """Return a shallow copy of one movie record.

    Building a new dict means later changes to the returned record do not
    touch the caller's original object (nested values are still shared).
    """
    return dict(movie)

In [37]:
# 1 Add the function that takes in three arguments;
# Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)
# Before Step 2, add all the code written for Deliverable 2

# 2 Add the function that takes in three arguments; Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)
# The Wikipedia and Kaggle DataFrames are merged (3 pt)

def extract_transform_load(wiki_file, kaggle_file, ratings_file):
    """Extract, clean and merge the Wikipedia, Kaggle and MovieLens movie data.

    Parameters
    ----------
    wiki_file : str
        Path to the Wikipedia movies JSON file.
    kaggle_file : str
        Path to the Kaggle movie metadata CSV file.
    ratings_file : str
        Path to the MovieLens ratings CSV file.

    Returns
    -------
    tuple
        ``(wiki_movies_df, movies_with_ratings_df, movies_df)``: the cleaned
        Wikipedia DataFrame, the merged movies DataFrame extended with one
        ``rating_<value>`` count column per rating value, and the merged
        Wikipedia/Kaggle movies DataFrame.

    Notes
    -----
    Prints progress counts as a side effect and relies on ``clean_movie``
    being defined at module level.
    """

    def join_if_list(value):
        # Some scraped fields are lists of strings; flatten them so the
        # pandas .str accessors below can process every row.
        return ' '.join(value) if type(value) == list else value

    # ------------------------------------------------------------------
    # Extract: read the three inputs from the paths passed by the caller.
    # ------------------------------------------------------------------
    kaggle_metadata = pd.read_csv(kaggle_file, low_memory=False)
    ratings = pd.read_csv(ratings_file)

    with open(wiki_file, mode='r') as file:
        wiki_movies_raw = json.load(file)

    # 3. Filter out TV shows and entries without a director or IMDb link.
    wiki_movies = [movie for movie in wiki_movies_raw
                   if ('Director' in movie or 'Directed by' in movie)
                       and 'imdb_link' in movie
                       and 'No. of episodes' not in movie]

    # 4. Clean every remaining movie record.
    clean_movies = [clean_movie(movie) for movie in wiki_movies]

    # 5. Build the Wikipedia DataFrame.
    wiki_movies_df = pd.DataFrame(clean_movies)

    # 6. Extract the IMDb ID and drop duplicate IDs; report (rather than
    #    silently hide) whatever goes wrong.
    try:
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
        print("Number of movies before dropping duplicates:", len(wiki_movies_df))
        wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
        print("Number of movies after dropping duplicates: ", len(wiki_movies_df))
    except Exception as e:
        print("An exception occurred:", e)

    # 7. Keep only the columns that are less than 90% null. The explicit
    #    copy makes the column assignments below act on an independent frame.
    wiki_columns_to_keep = [column for column in wiki_movies_df.columns
                            if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
    wiki_movies_df = wiki_movies_df[wiki_columns_to_keep].copy()

    # 8./9. Non-null box office values, with list entries joined into strings.
    box_office = wiki_movies_df['Box office'].dropna().apply(join_if_list)

    # 10. "$123.4 million" / "$1.2 billion" style amounts.
    form_one = r'\$\d+\.?\d*\s*[mb]illi?on'

    # 11. "$123,456,789" style amounts (comma or period as thousands separator).
    form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'

    # 12. Convert an extracted dollar string to a float number of dollars.
    def parse_dollars(s):
        # Anything that is not a string (e.g. NaN from a failed extract) stays NaN.
        if not isinstance(s, str):
            return np.nan

        # $###.# million
        if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):
            return float(re.sub(r'\$|\s|[a-zA-Z]', '', s)) * 10**6

        # $###.# billion
        if re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):
            return float(re.sub(r'\$|\s|[a-zA-Z]', '', s)) * 10**9

        # $###,###,### -- the pattern accepts both ',' and '.' as thousands
        # separators, so both must be stripped before converting.
        if re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)', s, flags=re.IGNORECASE):
            return float(re.sub(r'[$,.]', '', s))

        return np.nan

    # 13. Clean the box office column.
    wiki_movies_df['Box_Office'] = box_office.str.extract(
        f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Box office', axis=1, inplace=True)

    # 14. Clean the budget column.
    budget = wiki_movies_df['Budget'].dropna().apply(join_if_list)
    # Collapse ranges such as "$10-12 million" to their upper bound.
    budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
    # Remove citation references such as "[4]"; this is a regex, so it must
    # be flagged as one or the pattern is treated as literal text.
    budget = budget.str.replace(r'\[\d+\]\s*', '', regex=True)
    wiki_movies_df['budget'] = budget.str.extract(
        f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Budget', axis=1, inplace=True)

    # 15. Clean the release date column.
    release_date = wiki_movies_df['Release date'].dropna().apply(join_if_list)

    # The forms parsed are:
    # 1. Full month name, one- to two-digit day, four-digit year (i.e., January 1, 2000)
    # 2. Four-digit year, two-digit month, two-digit day, with any separator (i.e., 2000-01-01)
    # 3. Full month name, four-digit year (i.e., January 2000)
    # 4. Four-digit year
    date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
    date_form_two = r'\d{4}.[01]\d.[0123]\d'
    date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
    date_form_four = r'\d{4}'

    wiki_movies_df['Release_Date'] = pd.to_datetime(
        release_date.str.extract(
            f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0],
        infer_datetime_format=True)
    wiki_movies_df.drop('Release date', axis=1, inplace=True)

    # 16. Clean the running time column: "1 h 30 min" style or "90 min" style.
    running_time = wiki_movies_df['Running time'].dropna().apply(join_if_list)
    running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')
    running_time_extract = running_time_extract.apply(
        lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
    # Group 2 holds plain minutes; when it is empty, combine hours and minutes.
    wiki_movies_df['Running_Time'] = running_time_extract.apply(
        lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)
    wiki_movies_df.drop('Running time', axis=1, inplace=True)

    # ------------------------------------------------------------------
    # 2. Clean the Kaggle metadata.
    # ------------------------------------------------------------------
    kaggle_metadata = kaggle_metadata[kaggle_metadata['adult'] == 'False'].drop('adult', axis='columns')
    kaggle_metadata['video'] = kaggle_metadata['video'] == 'True'
    kaggle_metadata['budget'] = kaggle_metadata['budget'].astype(int)
    kaggle_metadata['id'] = pd.to_numeric(kaggle_metadata['id'], errors='raise')
    kaggle_metadata['popularity'] = pd.to_numeric(kaggle_metadata['popularity'], errors='raise')
    kaggle_metadata['release_date'] = pd.to_datetime(kaggle_metadata['release_date'])

    # 3. Merge the Wikipedia and Kaggle DataFrames on the IMDb ID.
    movies_df = pd.merge(wiki_movies_df, kaggle_metadata, on='imdb_id', suffixes=['_wiki', '_kaggle'])

    # 4. Drop unnecessary columns from the merged DataFrame.
    movies_df.drop(columns=['title_wiki', 'Language'], inplace=True)

    # 5. Fill zero-valued Kaggle entries from the Wikipedia column, then drop
    #    the now-redundant Wikipedia column.
    def fill_missing_kaggle_data(df, kaggle_column, wiki_column):
        df[kaggle_column] = df.apply(
            lambda row: row[wiki_column] if row[kaggle_column] == 0 else row[kaggle_column], axis=1)
        df.drop(columns=wiki_column, inplace=True)

    # 6. Apply it to each Kaggle/Wikipedia column pair.
    fill_missing_kaggle_data(movies_df, 'runtime', 'Running_Time')
    fill_missing_kaggle_data(movies_df, 'budget_kaggle', 'budget_wiki')
    fill_missing_kaggle_data(movies_df, 'revenue', 'Box_Office')
    # The trailing space is part of the Wikipedia column name.
    fill_missing_kaggle_data(movies_df, 'production_companies', 'Productioncompanies ')
    fill_missing_kaggle_data(movies_df, 'release_date', 'Release_Date')

    # 7. Drop columns that hold a single value only; lists are converted to
    #    tuples because value_counts() needs hashable values.
    lists_to_tuples = lambda x: tuple(x) if type(x) == list else x
    for col in movies_df.columns:
        value_counts = movies_df[col].apply(lists_to_tuples).value_counts(dropna=False)
        if len(value_counts) == 1:
            movies_df.drop(columns=col, inplace=True)
            print("We have dropped the", col, "column because it only has one value.")

    # 8. Reorder, then rename the columns.
    movies_df = movies_df.loc[:, ['imdb_id','id','title_kaggle','original_title','tagline','belongs_to_collection','url','imdb_link',
                       'runtime','budget_kaggle','revenue','release_date','popularity','vote_average','vote_count','genres',
                       'original_language','overview','spoken_languages','Country','production_companies','production_countries',
                       'Distributed by','Produced by','Directed by','Starring','Cinematography','Edited by','Written by','Screenplay by',
                       'Music by','Based on','Productioncompany ','homepage'
                      ]]

    movies_df.rename({'id':'kaggle_id',
                   'title_kaggle':'title',
                   'url':'wikipedia_url',
                   'budget_kaggle':'budget',
                   'Productioncompany ':'production_company',
                   'Country':'country',
                   'Distributed by':'distributor',
                   'Produced by':'producer',
                   'Directed by':'director',
                   'Starring':'starring',
                   'Cinematography':'cinematography',
                   'Edited by':'editors',
                   'Written by':'writers',
                   'Screenplay by':'screenplay',
                   'Music by':'composers',
                   'Based on':'based_on'
                  }, axis='columns', inplace=True)

    # 9. Count ratings per movie and rating value (computed once), one
    #    column per rating value, prefixed with "rating_".
    rating_counts = (ratings.groupby(['movieId', 'rating'], as_index=False).count()
                     .rename({'userId': 'count'}, axis=1)
                     .pivot(index='movieId', columns='rating', values='count'))
    rating_counts.columns = ['rating_' + str(col) for col in rating_counts.columns]

    # Left merge keeps every movie, including those without any rating.
    movies_with_ratings_df = pd.merge(movies_df, rating_counts,
                                      left_on='kaggle_id', right_index=True, how='left')
    print("Number of movies with ratings:", len(movies_with_ratings_df))

    # Movies with no ratings get zero counts instead of NaN.
    movies_with_ratings_df[rating_counts.columns] = movies_with_ratings_df[rating_counts.columns].fillna(0)

    return wiki_movies_df, movies_with_ratings_df, movies_df

In [38]:
# 10. Create the path to your file directory and variables for the three files.
file_dir = 'C://Users/KenAk/ETL/Movies-ETL'
# The Wikipedia data (the file name uses a hyphen: wikipedia-movies.json)
wiki_file = f'{file_dir}/wikipedia-movies.json'
# The Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# The MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

In [39]:
# 11. Run the ETL function and capture its three returned DataFrames.
# Despite their names, the variables receive, in order: the cleaned Wikipedia
# DataFrame, the movies-with-ratings DataFrame, and the merged movies DataFrame.
(
    wiki_file_returned,
    kaggle_file_returned,
    ratings_file_returned,
) = extract_transform_load(wiki_file, kaggle_file, ratings_file)

Number of movies before dropping duplicates: 7076
Number of movies after dropping duplicates:  7033
We have dropped the video column because it only has one value.
Number of movies with ratings: 6052


In [40]:
# 12. Give the returned DataFrames names that describe their contents.
wiki_movies_df, movies_with_ratings_df, movies_df = (
    wiki_file_returned,
    kaggle_file_returned,
    ratings_file_returned,
)

In [41]:
# Test code: display the merged movies DataFrame that includes the rating-count columns.
movies_with_ratings_df

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
0,tt0098987,9548,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,104.0,49000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0098994,25501,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,114.0,6000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0099005,11856,Air America,Air America,The few. The proud. The totally insane.,,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,112.0,35000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0099012,8217,Alice,Alice,,,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,102.0,12000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0099018,25943,Almost an Angel,Almost an Angel,Who does he think he is?,,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,95.0,25000000.0,...,3.0,0.0,3.0,2.0,5.0,26.0,37.0,46.0,16.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6047,tt5639354,429191,A Fantastic Woman,Una mujer fantástica,,,https://en.wikipedia.org/wiki/A_Fantastic_Woman,https://www.imdb.com/title/tt5639354/,104.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6048,tt5390066,390059,Permission,Permission,,,https://en.wikipedia.org/wiki/Permission_(film),https://www.imdb.com/title/tt5390066/,96.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6049,tt6304162,429174,Loveless,Нелюбовь,,,https://en.wikipedia.org/wiki/Loveless_(film),https://www.imdb.com/title/tt6304162/,128.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6050,tt5795086,412302,Gemini,Gemini,,,https://en.wikipedia.org/wiki/Gemini_(2017_film),https://www.imdb.com/title/tt5795086/,92.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Test code: list the column names of movies_with_ratings_df.
movies_with_ratings_df.columns.to_list()

['imdb_id',
 'kaggle_id',
 'title',
 'original_title',
 'tagline',
 'belongs_to_collection',
 'wikipedia_url',
 'imdb_link',
 'runtime',
 'budget',
 'revenue',
 'release_date',
 'popularity',
 'vote_average',
 'vote_count',
 'genres',
 'original_language',
 'overview',
 'spoken_languages',
 'country',
 'production_companies',
 'production_countries',
 'distributor',
 'producer',
 'director',
 'starring',
 'cinematography',
 'editors',
 'writers',
 'screenplay',
 'composers',
 'based_on',
 'production_company',
 'homepage',
 'rating_0.5',
 'rating_1.0',
 'rating_1.5',
 'rating_2.0',
 'rating_2.5',
 'rating_3.0',
 'rating_3.5',
 'rating_4.0',
 'rating_4.5',
 'rating_5.0']

In [42]:
# Test code: list the column names of movies_df.
movies_df.columns.to_list()

['imdb_id',
 'kaggle_id',
 'title',
 'original_title',
 'tagline',
 'belongs_to_collection',
 'wikipedia_url',
 'imdb_link',
 'runtime',
 'budget',
 'revenue',
 'release_date',
 'popularity',
 'vote_average',
 'vote_count',
 'genres',
 'original_language',
 'overview',
 'spoken_languages',
 'country',
 'production_companies',
 'production_countries',
 'distributor',
 'producer',
 'director',
 'starring',
 'cinematography',
 'editors',
 'writers',
 'screenplay',
 'composers',
 'based_on',
 'production_company',
 'homepage']

In [43]:
# 13. Check the wiki_movies_df DataFrame by showing its first rows.
wiki_movies_df.head()

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Distributed by,Country,Language,Written by,Productioncompanies,imdb_id,Box_Office,budget,Release_Date,Running_Time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,20th Century Fox,United States,English,,,tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,Avenue Pictures,United States,English,,,tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,TriStar Pictures,United States,"[English, Lao]",,,tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,Orion Pictures,United States,English,Woody Allen,,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,Paramount Pictures,US,English,Paul Hogan,,tt0099018,6939946.0,25000000.0,1990-12-19,95.0


In [44]:
# 14. Check the movies_with_ratings_df DataFrame: rows with index labels 3513 through 3521.
movies_with_ratings_df.loc[3513:3521]

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
3513,tt0437232,1123,Catch a Fire,Catch a Fire,"The spark that ignites us, unites us.",,https://en.wikipedia.org/wiki/Catch_a_Fire_(film),https://www.imdb.com/title/tt0437232/,101.0,14000000.0,...,2.0,13.0,0.0,14.0,2.0,43.0,10.0,89.0,5.0,37.0
3514,tt0413895,9986,Charlotte's Web,Charlotte's Web,"Something unexpected, unbelievable, unforgetta...",,https://en.wikipedia.org/wiki/Charlotte%27s_We...,https://www.imdb.com/title/tt0413895/,97.0,85000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3515,tt0206634,9693,Children of Men,Children of Men,The future's a thing of the past.,,https://en.wikipedia.org/wiki/Children_of_Men,https://www.imdb.com/title/tt0206634/,109.0,76000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3516,tt0446298,14041,Civic Duty,Civic Duty,,,https://en.wikipedia.org/wiki/Civic_Duty_(film),https://www.imdb.com/title/tt0446298/,98.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3517,tt0424345,2295,Clerks II,Clerks II,With No Power Comes No Responsibility,"{'id': 182813, 'name': 'Clerks Collection', 'p...",https://en.wikipedia.org/wiki/Clerks_II,https://www.imdb.com/title/tt0424345/,97.0,5000000.0,...,2.0,29.0,2.0,62.0,19.0,130.0,26.0,184.0,15.0,77.0
3518,tt0389860,9339,Click,Click,What If You Had A Remote... That Controlled Yo...,,https://en.wikipedia.org/wiki/Click_(2006_film),https://www.imdb.com/title/tt0389860/,107.0,82500000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3519,tt0380268,14761,Cocaine Cowboys,Cocaine Cowboys,How Miami became the cocaine capital of the Un...,"{'id': 376970, 'name': 'Cocaine Cowboys', 'pos...",https://en.wikipedia.org/wiki/Cocaine_Cowboys,https://www.imdb.com/title/tt0380268/,118.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3520,tt0473174,40922,Coffee Date,Coffee Date,,,https://en.wikipedia.org/wiki/Coffee_Date,https://www.imdb.com/title/tt0473174/,94.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3521,tt0457308,7547,Come Early Morning,Come Early Morning,"Before you fall in love, you need to love your...",,https://en.wikipedia.org/wiki/Come_Early_Morning,https://www.imdb.com/title/tt0457308/,97.0,6000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# 15. Check the movies_df DataFrame: rows with index labels 3016 through 3036.
movies_df.loc[3016:3036]

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,director,starring,cinematography,editors,writers,screenplay,composers,based_on,production_company,homepage
3016,tt0290334,36658,X2,X2,The time has come for those who are different ...,"{'id': 748, 'name': 'X-Men Collection', 'poste...",https://en.wikipedia.org/wiki/X2_(film),https://www.imdb.com/title/tt0290334/,133.0,110000000.0,...,Bryan Singer,"[Patrick Stewart, Hugh Jackman, Ian McKellen, ...",Newton Thomas Sigel,John Ottman,,"[Michael Dougherty, Dan Harris, David Hayter]",John Ottman,"[X-Men, by, Stan Lee, Jack Kirby]",,
3017,tt0379593,3396,The Yes Men,The Yes Men,Changing the world one prank at a time.,"{'id': 475520, 'name': 'The Yes Men Collection...",https://en.wikipedia.org/wiki/The_Yes_Men_(film),https://www.imdb.com/title/tt0379593/,80.0,,...,"[Dan Ollman, Sarah Price, Chris Smith]","[Mike Bonanno, Andy Bichlbaum]",,Dan Ollman,,,Jon Solomon,,,http://www.theyesmen.org/
3018,tt0318850,42418,Young Black Stallion,Young Black Stallion,The greatest story of friendship ever told.,,https://en.wikipedia.org/wiki/The_Young_Black_...,https://www.imdb.com/title/tt0318850/,49.0,,...,Simon Wincer,"[Richard Romanus, Biana G. Tamimi, Patrick Elyas]",Reed Smoot,"[Bud Smith, Terry Blythe, M. Scott Smith]",,Jeanne Rosenberg,William Ross,"[The Young Black Stallion, by, Walter Farley, ...","[Walt Disney Pictures, The Kennedy/Marshall Co...",http://movies.disney.com/the-young-black-stallion
3019,tt0365960,27090,Zero Day,Zero Day,,,https://en.wikipedia.org/wiki/Zero_Day_(film),https://www.imdb.com/title/tt0365960/,92.0,20000.0,...,Ben Coccio,"[Andre Keuck, Cal Robertson]",,,"[Ben Coccio, Christopher Coccio]",,,,,
3020,tt0337563,10096,13 Going on 30,13 Going on 30,"For some, 13 feels like it was just yesterday....",,https://en.wikipedia.org/wiki/13_Going_on_30,https://www.imdb.com/title/tt0337563/,98.0,37000000.0,...,Gary Winick,"[Jennifer Garner, Mark Ruffalo, Judy Greer, An...",Don Burgess,Susan Littenberg,"[Josh Goldsmith, Cathy Yuspa]","[Cathy Yuspa, Josh Goldsmith, Niels Mueller]",Theodore Shapiro,,"[Revolution Studios, Columbia Pictures]",
3021,tt0343660,1824,50 First Dates,50 First Dates,Imagine having to win over the girl of your dr...,,https://en.wikipedia.org/wiki/50_First_Dates,https://www.imdb.com/title/tt0343660/,99.0,75000000.0,...,Peter Segal,"[Adam Sandler, Drew Barrymore, Rob Schneider, ...",Jack N. Green,Jeff Gourson,George Wing,,Teddy Castellucci,,,
3022,tt0367479,10589,After the Sunset,After the Sunset,Who will walk away?,,https://en.wikipedia.org/wiki/After_the_Sunset,https://www.imdb.com/title/tt0367479/,97.0,60000000.0,...,Brett Ratner,"[Pierce Brosnan, Salma Hayek, Woody Harrelson,...",Dante Spinotti,Mark Helfrich,,"[Paul Zbyszewski, Craig Rosenberg]",Lalo Schifrin,,,http://www.afterthesunset.com/
3023,tt0312329,8842,Against the Ropes,Against the Ropes,,,https://en.wikipedia.org/wiki/Against_the_Ropes,https://www.imdb.com/title/tt0312329/,106.0,39000000.0,...,Charles S. Dutton,"[Meg Ryan, Omar Epps, Tony Shalhoub, Tim Daly,...",Jack N. Green,Eric L. Beason,,,Michael Kamen,,Cort/Madden Productions,
3024,tt0358349,17047,Agent Cody Banks 2: Destination London,Agent Cody Banks 2: Destination London,Adventure is an attitude.,"{'id': 91427, 'name': 'Agent Cody Banks Collec...",https://en.wikipedia.org/wiki/Agent_Cody_Banks...,https://www.imdb.com/title/tt0358349/,100.0,26000000.0,...,Kevin Allen,"[Frankie Muniz, Hannah Spearritt, Anthony Ande...",Denis Crossan,Andrew MacRitchie,,Don Rhymer,Mark Thomas,"[Characters, by Jeffrey Jurgensen]","[Splendid Pictures, Maverick Films, Dylan Sell...",http://www.mgm.com/view/Movie/33/
3025,tt0318974,10733,The Alamo,The Alamo,You will never forget,,https://en.wikipedia.org/wiki/The_Alamo_(2004_...,https://www.imdb.com/title/tt0318974/,137.0,145000000.0,...,John Lee Hancock,"[Dennis Quaid, Billy Bob Thornton, Jason Patri...",Dean Semler,Eric L. Beason,"[John Lee Hancock, Leslie Bohem, Stephen Gaghan]",,Carter Burwell,,"[Touchstone Pictures, Imagine Entertainment]",
