In [1]:
#Import Dependencies
import pandas as pd
import numpy as np
import requests
import json
import time
import datetime
from config import tmdb_key
from pprint import pprint

In [7]:
# Load the Netflix Originals dataset into a DataFrame and preview its first rows
netflix_csv = '../Rutgers_DS_Project_2/NetflixOriginals.csv'
flix = pd.read_csv(netflix_csv)
flix.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [8]:
# Parse the textual 'Premiere' dates into pandas datetimes, write them back
# onto the same column, then preview the updated frame
flix['Premiere'] = pd.to_datetime(flix['Premiere'])
flix.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,2019-08-05,58,2.5,English/Japanese
1,Dark Forces,Thriller,2020-08-21,81,2.6,Spanish
2,The App,Science fiction/Drama,2019-12-26,79,2.6,Italian
3,The Open House,Horror thriller,2018-01-19,94,3.2,English
4,Kaali Khuhi,Mystery,2020-10-30,90,3.4,Hindi


In [3]:
# True if at least one cell anywhere in the frame is missing, False otherwise
flix.isna().to_numpy().any()

False

In [4]:
#Summarise every column of the df: non-null count and dtype
flix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


## Perform API Calls

In [5]:
#Base endpoint of the TMDB movie-search API; the API key and the movie title
#are supplied as request parameters when the search loop runs
url = "https://api.themoviedb.org/3/search/movie?"

In [6]:
#Create Movie ID list
#Parallel lists: entry i of each list describes the same TMDB search hit
flix_id_list = []
release_date = []
title = []
counter = 0
sets = 1
#Number of records logged per set before the set number advances
RECORDS_PER_SET = 50

#Movie list
movies = flix['Title']

#Begin Retrieval
print(f"Beginning Data Retrieval\n"
f"-----------------------------\n")

#Run loop through movie list
for movie in movies:
    try:
        #Pass the key and the title through `params` so requests URL-encodes
        #them; titles containing '&', '#' or '?' would otherwise corrupt the query.
        #The timeout stops one stalled connection from hanging the whole loop.
        response = requests.get(
            url,
            params={"api_key": tmdb_key, "query": movie},
            timeout=10,
        ).json()
        #Read all three fields BEFORE appending anything: if one of them is
        #missing, no list is touched and the three lists stay the same length
        top_hit = response['results'][0]
        hit_title = top_hit['title']
        hit_id = top_hit['id']
        hit_release = top_hit['release_date']
    #Should the movie not be found, print exception statement
    except (KeyError, IndexError):
        print(f"Movie not found. Skipping...")
    #Network failure or non-JSON body: skip this title instead of aborting.
    #Only the exception type is printed because the message can contain the
    #full request URL, which includes the API key.
    except (requests.exceptions.RequestException, ValueError) as err:
        print(f"Request failed for {movie} ({type(err).__name__}). Skipping...")
    else:
        #Add data to list
        title.append(hit_title)
        flix_id_list.append(hit_id)
        release_date.append(hit_release)
        #Populate counters: every stored record is logged and counted
        counter += 1
        print(f"Processing Record {counter} of Set {sets} | {movie}")
        if counter >= RECORDS_PER_SET:
            counter = 0
            sets += 1
    #Pause after every request (found or not) to stay gentle on the API
    time.sleep(1)
#Close of loop
print(f"-----------------------------\n"
f"Data Retrieval Complete\n"
f"-----------------------------\n")

Beginning Data Retrieval
-----------------------------

Processing Record 1 of Set 1 | Enter the Anime
Processing Record 2 of Set 1 | Dark Forces
Processing Record 3 of Set 1 | The App
Processing Record 4 of Set 1 | The Open House
Processing Record 5 of Set 1 | Kaali Khuhi
Processing Record 6 of Set 1 | Drive
Processing Record 7 of Set 1 | Leyla Everlasting
Processing Record 8 of Set 1 | The Last Days of American Crime
Processing Record 9 of Set 1 | Paradox
Processing Record 10 of Set 1 | Sardar Ka Grandson
Processing Record 11 of Set 1 | Searching for Sheela
Processing Record 12 of Set 1 | The Call
Processing Record 13 of Set 1 | Whipped
Processing Record 14 of Set 1 | All Because of You
Processing Record 15 of Set 1 | Mercy
Processing Record 16 of Set 1 | After the Raid
Processing Record 17 of Set 1 | Ghost Stories
Processing Record 18 of Set 1 | The Last Thing He Wanted
Processing Record 19 of Set 1 | What Happened to Mr. Cha?
Processing Record 20 of Set 1 | Death Note
Processing Re

KeyboardInterrupt: 

In [9]:
#Test
#Debug aid: display the parsed JSON of whichever API response was stored last
response

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/lOSdUkGQmbAl5JQ3QoHqBZUbZhC.jpg',
   'genre_ids': [53, 28, 878],
   'id': 775996,
   'original_language': 'en',
   'original_title': 'Outside the Wire',
   'overview': 'In the near future, a drone pilot is sent into a deadly militarized zone and must work with an android officer to locate a doomsday device.',
   'popularity': 394.591,
   'poster_path': '/6XYLiMxHAaCsoyrVo38LBWMw2p8.jpg',
   'release_date': '2021-01-15',
   'title': 'Outside the Wire',
   'video': False,
   'vote_average': 6.4,
   'vote_count': 1150},
  {'adult': False,
   'backdrop_path': None,
   'genre_ids': [10752, 18],
   'id': 787742,
   'original_language': 'en',
   'original_title': 'Outside the Wire',
   'overview': 'A female Australian soldier cut off from her team, lies trapped in a collapsed building behind enemy lines. With a broken leg and minimal supplies, she is on her own until the unlikely arrival of a frightened 10 year old boy wielding a

In [10]:
#Test
#Debug aid: pretty-print only the 'results' list of the last stored response
pprint(response['results'])

[{'adult': False,
  'backdrop_path': '/lOSdUkGQmbAl5JQ3QoHqBZUbZhC.jpg',
  'genre_ids': [53, 28, 878],
  'id': 775996,
  'original_language': 'en',
  'original_title': 'Outside the Wire',
  'overview': 'In the near future, a drone pilot is sent into a deadly '
              'militarized zone and must work with an android officer to '
              'locate a doomsday device.',
  'popularity': 394.591,
  'poster_path': '/6XYLiMxHAaCsoyrVo38LBWMw2p8.jpg',
  'release_date': '2021-01-15',
  'title': 'Outside the Wire',
  'video': False,
  'vote_average': 6.4,
  'vote_count': 1150},
 {'adult': False,
  'backdrop_path': None,
  'genre_ids': [10752, 18],
  'id': 787742,
  'original_language': 'en',
  'original_title': 'Outside the Wire',
  'overview': 'A female Australian soldier cut off from her team, lies trapped '
              'in a collapsed building behind enemy lines. With a broken leg '
              'and minimal supplies, she is on her own until the unlikely '
              'arrival o

In [None]:
#Test
result = []
for key in response['results']:
    if result 

In [None]:
#Test
# response['results']['title']

In [None]:
#Converting Raw data to df
#The three lists are expected to be the same length (one entry per search hit)
pulled_flix_df = pd.DataFrame({
    "Pulled_ID": flix_id_list,
    "Pulled_Movie_Name": title,
    "Pulled_Release_Date": release_date
})

#Add empty fields to df
#Use nullable-integer columns (missing shows as <NA>) instead of '' placeholders:
#a row that is never filled in would otherwise hold a string, and the later
#Revenue - Budget subtraction would raise a TypeError on it
for money_col in ('Budget', 'Revenue'):
    pulled_flix_df[money_col] = pd.array([None] * len(pulled_flix_df), dtype='Int64')

#Show df
pulled_flix_df

In [None]:
# Write the pulled TMDB frame to a CSV file in the working directory
pulled_csv_path = 'PulledTMDBData.csv'
pulled_flix_df.to_csv(pulled_csv_path)

In [None]:
#Base endpoint for per-movie detail lookups (this rebinds the name `url`)
url = "https://api.themoviedb.org/3/movie/"

#Created list
#NOTE: Budget and Revenue are not filled by this cell; the values are written
#straight into pulled_flix_df
Budget = []
Revenue = []
#Index labels of the rows whose financial data could not be retrieved
drop_index = []

#Start for loop to go through each row and retrieve movie financial data
for index, row in pulled_flix_df.iterrows():
    movie = row['Pulled_Movie_Name']
    movie_id = row['Pulled_ID']
    #Build partial query URL; the API key is passed through `params`
    query_url = f"{url}{movie_id}"
    #Default to "missing" so a failed lookup never leaves a '' string behind,
    #which would break the later Revenue - Budget arithmetic
    budget = np.nan
    revenue = np.nan
    try:
        #Response from API requests; the timeout keeps a stalled connection
        #from hanging the loop
        response = requests.get(
            query_url,
            params={"api_key": tmdb_key},
            timeout=10,
        ).json()
        print(f"The budget and revenue for {movie} is {response['budget']} and {response['revenue']}, respectively.")
        #Only trust the figures when the response describes a titled movie
        #NOTE(review): TMDB appears to report 0 when a figure is unknown - confirm
        if response['original_title'] != '':
            budget = response['budget']
            revenue = response['revenue']
    except (KeyError, IndexError):
        drop_index.append(index)
        print('Missing field/result...skipping.')
    #Network failure or non-JSON body: treat as missing rather than aborting.
    #Only the exception type is printed because the message can contain the
    #request URL, which includes the API key.
    except (requests.exceptions.RequestException, ValueError) as err:
        drop_index.append(index)
        print(f'Request failed ({type(err).__name__})...skipping.')
    pulled_flix_df.loc[index,'Budget'] = budget
    pulled_flix_df.loc[index,'Revenue'] = revenue
    print('-'*10)
    time.sleep(1)
print(f'-----End of Search-----')

In [None]:
#Create new "Profit" column and display df
#Work on a copy so this cell does not modify pulled_flix_df
clean_pulled = pulled_flix_df.copy()
#Coerce to numbers first: any row whose Budget/Revenue was never filled in
#(e.g. a '' placeholder) becomes missing instead of raising a TypeError
revenue_num = pd.to_numeric(clean_pulled['Revenue'], errors='coerce')
budget_num = pd.to_numeric(clean_pulled['Budget'], errors='coerce')
clean_pulled['Profit'] = revenue_num - budget_num
clean_pulled

In [None]:
# Overwrite the pulled-data CSV with the frame that now carries the Profit column
pulled_csv_path = 'PulledTMDBData.csv'
clean_pulled.to_csv(pulled_csv_path)

In [None]:
# Append blank placeholder columns for the performer name and gender, then preview
for placeholder_col in ('Preformer', 'Gender'):
    clean_pulled[placeholder_col] = ''
clean_pulled.head()

In [None]:
#Base endpoint for per-movie lookups (this rebinds the name `url`)
url = "https://api.themoviedb.org/3/movie/"

#Created list
#NOTE: gender and preformer are not filled by this cell; the values are written
#straight into clean_pulled
gender = []
preformer = []
#Index labels of the rows whose cast data could not be retrieved
drop_index = []

#Start for loop to go through each row and retrieve preformer data
#Iterate the same frame that is written to, so index labels always line up
for index, row in clean_pulled.iterrows():
    movie_id = row['Pulled_ID']
    movie = row['Pulled_Movie_Name']
    #Build partial query URL; the API key is passed through `params`
    query_url = f"{url}{movie_id}/credits"
    try:
        #Response from API requests; the timeout keeps a stalled connection
        #from hanging the loop
        response = requests.get(
            query_url,
            params={"api_key": tmdb_key},
            timeout=10,
        ).json()
        #First entry of the 'cast' list; look it up once and reuse it
        lead = response['cast'][0]
        lead_name = lead['name']
        lead_gender = lead['gender']
        print(f"The preformer in {movie}, is {lead_name}. Their gender is characterized as, {lead_gender}.")
        if lead_name != '':
            clean_pulled.loc[index,'Preformer'] = lead_name
            clean_pulled.loc[index,'Gender'] = lead_gender
    except (KeyError, IndexError):
        drop_index.append(index)
        print('Missing field/result...skipping.')
    #Network failure or non-JSON body: skip this movie rather than aborting.
    #Only the exception type is printed because the message can contain the
    #request URL, which includes the API key.
    except (requests.exceptions.RequestException, ValueError) as err:
        drop_index.append(index)
        print(f'Request failed ({type(err).__name__})...skipping.')
    print('-'*10)
    time.sleep(1)
print(f'-----End of Search-----')

In [None]:
#Show the first rows of the df; no column is deleted in this cell, so the
#'Preformer' field is still present
#Gender codes as noted by the author: 2 - Male, 1 - Female, 0 - Unknown
clean_pulled.head()

In [None]:
# Overwrite the pulled-data CSV with the current state of clean_pulled
pulled_csv_path = 'PulledTMDBData.csv'
clean_pulled.to_csv(pulled_csv_path)

In [None]:
# Bind a second name to the same DataFrame object (no copy is made, so a change
# made through either name is visible through the other)
new_clean_pulled = clean_pulled

# Start here guys!!!!

In [None]:
#Identify field dtypes and non-null counts of the pulled TMDB df
new_clean_pulled.info()

In [None]:
# Produce a renamed frame whose movie-name column is called 'Title'
title_rename = {"Pulled_Movie_Name": "Title"}
ncp_df = new_clean_pulled.rename(columns=title_rename)

In [None]:
#Display the first rows of ncp_df (the renamed copy of the pulled data)
ncp_df.head()

In [None]:
# Inner-join the Netflix frame with the pulled TMDB frame on 'Title' so that the
# movie names and release dates from both sources sit side by side; only titles
# present in both frames are kept
merged_df = pd.merge(flix, ncp_df, how='inner', on='Title')
merged_df

In [None]:
#Identify field dtypes and non-null counts of the merged df
merged_df.info()

In [None]:
#df of validated API pulls
#Parse both date columns to datetimes before comparing: 'Premiere' was converted
#to datetime earlier while 'Pulled_Release_Date' holds the raw values returned by
#the API, and a datetime column compared directly against date strings does not
#match even when the dates are the same. Unparseable values become NaT.
premiere_dates = pd.to_datetime(merged_df['Premiere'], errors='coerce')
pulled_dates = pd.to_datetime(merged_df['Pulled_Release_Date'], errors='coerce')
valid_df = merged_df[premiere_dates == pulled_dates]
valid_df

In [None]:
#df of inconsistent API pulls
#Parse both date columns to datetimes before comparing: 'Premiere' was converted
#to datetime earlier while 'Pulled_Release_Date' holds the raw values returned by
#the API, so a direct comparison would flag rows whose dates actually agree.
#Rows with a missing or unparseable date (NaT) count as inconsistent.
premiere_dates = pd.to_datetime(merged_df['Premiere'], errors='coerce')
pulled_dates = pd.to_datetime(merged_df['Pulled_Release_Date'], errors='coerce')
inconsist_df = merged_df[premiere_dates != pulled_dates]
inconsist_df