In [1]:
# Dependencies
import matplotlib.pyplot as plt
import requests
import pandas as pd
from config import api_key
import json
import numpy as np

In [2]:
# Check that the CSV was read properly
movie_list = pd.read_csv("movies.csv")
movie_list.head()

Unnamed: 0,Title
0,The Shawshank Redemption
1,The Godfather
2,The Dark Knight
3,The Godfather Part II
4,12 Angry Men


In [3]:
# Add id column to the DataFrame
movie_list['id'] = ''
movie_list

Unnamed: 0,Title,id
0,The Shawshank Redemption,
1,The Godfather,
2,The Dark Knight,
3,The Godfather Part II,
4,12 Angry Men,
...,...,...
245,Jai Bhim,
246,Aladdin,
247,Gandhi,
248,The Help,


In [4]:
# # TMDB API call
# movie=movie_list['Title'][100]
# base_url="https://api.themoviedb.org/3/search/movie?"
# build_url = f'https://api.themoviedb.org/3/search/movie?query={movie}&api_key={api_key}'
# response=requests.get(build_url).json()
# #print(response.url)
# print(json.dumps(response, indent=4, sort_keys=True))

In [5]:
#response['results'][0]['id']

In [6]:
# Loop through the CSV's Title column to call said movie from the API
# Create an empty list to store unfound movies in the TMDB database into
unfound_movies = []
for index, row in movie_list.iterrows():
    movie=movie_list['Title'][index]
    base_url="https://api.themoviedb.org/3/search/movie?"
    build_url = f'{base_url}query={movie}&api_key={api_key}'
    response=requests.get(build_url).json()
    #print(response.url)

    try:
        movie_list.loc[index, "id"] = response['results'][0]['id']
    except (KeyError, IndexError):
        # If no movie name is found, set the movie id as NaN .
        movie_list.loc[index, "id"] = np.nan
        unfound_movies.append(movie_list.loc[index, "Title"])

In [7]:
# View the movies not found in the database
unfound_movies

['Gisaengchung', 'Oldeuboi', 'Capharnaüm']

In [8]:
#Print the DataFrame along with it's shape
print(movie_list.shape)
movie_list

(250, 2)


Unnamed: 0,Title,id
0,The Shawshank Redemption,278
1,The Godfather,238
2,The Dark Knight,155
3,The Godfather Part II,240
4,12 Angry Men,389
...,...,...
245,Jai Bhim,855400
246,Aladdin,420817
247,Gandhi,783
248,The Help,50014


In [9]:
# Export the City_Data into a csv
#movie_list.to_csv("movie_list.csv")

In [10]:
# movie_list_df = movie_list.loc[movie_list['id']=='NaN']
# movie_list_df

In [11]:
#Drop NaN rows
movie_list.dropna(how='any',inplace=True)

In [12]:
#Check size of Data Frame
movie_list.shape

(247, 2)

In [13]:
#Add columns name to the DataFrame
movie_list[['imdb_id',
            'budget',
            'genres',
            'revenue',
            'runtime',
            'popularity',
            'release_date',
            'production_countries'
            ,'tmdb_rating']] =''
movie_list

Unnamed: 0,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,release_date,production_countries,tmdb_rating
0,The Shawshank Redemption,278,,,,,,,,,
1,The Godfather,238,,,,,,,,,
2,The Dark Knight,155,,,,,,,,,
3,The Godfather Part II,240,,,,,,,,,
4,12 Angry Men,389,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
245,Jai Bhim,855400,,,,,,,,,
246,Aladdin,420817,,,,,,,,,
247,Gandhi,783,,,,,,,,,
248,The Help,50014,,,,,,,,,


In [14]:
# # Defining the path for each column value

#imdb_id = url_byid_response['imdb_id']
#release_date = url_byid_response['release_date']
#url_byid_response['revenue']
#url_byid_response['runtime']

# #extracting genre names from list
# for i in range(0,len(url_byid_response['genres'])):
#     print(url_byid_response['genres'][i]['name'])

# #extracting genre names from list
# for i in range(0,len(url_byid_response['production_countries'])):
#     print(url_byid_response['production_countries'][i]['name'])

#url_byid_response['popularity']
#url_byid_response['budget']

#url_byid_response['vote_average']


In [15]:
# #API call, search movie by ID
# for i in range(0,len(movie_list['Title']))
#     movie_id=movie_list['id'][i]
#     url_ById = "https://api.themoviedb.org/3/movie/"
#     build_url_ById=f"{url_ById}{movie_id}?api_key={api_key}"
#     url_byid_response=requests.get(build_url_ById).json()
#     #print(json.dumps(url_byid_response, indent=4, sort_keys=True))

In [16]:
# Loop through each movie's title for the API call and store the data we want to analyze into our DataFrame

for index, row in movie_list.iterrows():
# Create empty lists to store multiple-value columns' data into 
    genres = []
    production_countries = []
    try:
        movie_id=movie_list['id'][index]
        base_url_ById = "https://api.themoviedb.org/3/movie/"
        build_url_ById=f"{base_url_ById}{movie_id}?api_key={api_key}"
        url_byid_response=requests.get(build_url_ById).json()
        movie_list.loc[index, "imdb_id"] = url_byid_response['imdb_id']
        movie_list.loc[index, "release_date"] = url_byid_response['release_date']
        movie_list.loc[index, "revenue"] = url_byid_response['revenue']
        movie_list.loc[index, "runtime"]= url_byid_response['runtime']
        movie_list.loc[index, "Title"]= url_byid_response['title']

        
        
        #extracting genre names from list
        for i in range(0,len(url_byid_response['genres'])):
            current_genre = url_byid_response['genres'][i]['name']
            genres.append(current_genre)
            

        #extracting production_countries names from list
        for i in range(0,len(url_byid_response['production_countries'])):
            current_prod_country = url_byid_response['production_countries'][i]['name']
            production_countries.append(current_prod_country)

        movie_list.loc[index, "popularity"] = url_byid_response['popularity']
        movie_list.loc[index, "budget"] = url_byid_response['budget']
        movie_list.loc[index, "tmdb_rating"] = url_byid_response['vote_average']
        movie_list.loc[index, "genres"] = ", ".join(genres)
        movie_list.loc[index, "production_countries"] = ", ".join(production_countries)
    except (KeyError, IndexError):
        # If no movie name is found, set the movie id as NaN .
        movie_list.loc[index, "imdb_id"] = np.nan
        movie_list.loc[index, "release_date"] = np.nan
        movie_list.loc[index, "revenue"] = np.nan
        movie_list.loc[index, "runtime"]= np.nan
        movie_list.loc[index, "popularity"] = np.nan
        movie_list.loc[index, "budget"] = np.nan
        movie_list.loc[index, "tmdb_rating"] = np.nan
        movie_list.loc[index, "production_countries"] = np.nan
        movie_list.loc[index, "genres"] = np.nan

        

In [17]:
movie_list.head()

Unnamed: 0,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,release_date,production_countries,tmdb_rating
0,The Shawshank Redemption,278,tt0111161,25000000,"Drama, Crime",28341469,142,98.667,1994-09-23,United States of America,8.704
1,The Godfather,238,tt0068646,6000000,"Drama, Crime",245066411,175,134.974,1972-03-14,United States of America,8.709
2,The Dark Knight,155,tt0468569,185000000,"Drama, Action, Crime, Thriller",1004558444,152,94.409,2008-07-16,"United Kingdom, United States of America",8.512
3,The Godfather Part II,240,tt0071562,13000000,"Drama, Crime",102600000,202,63.334,1974-12-20,United States of America,8.59
4,12 Angry Men,389,tt0050083,350000,Drama,1000000,97,46.095,1957-04-10,United States of America,8.539


In [18]:
movie_list.shape

(247, 11)

In [None]:
# # Export the City_Data into a csv
# movie_list.to_csv("movie_df.csv")

In [19]:
no_budget = movie_list["budget"]==0
# null_imdb_id = movie_list.loc[movie_list]
no_budget

0      False
1      False
2      False
3      False
4      False
       ...  
245     True
246    False
247    False
248    False
249    False
Name: budget, Length: 247, dtype: bool

In [20]:
no_budget_df = movie_list[no_budget]
# len(no_budget_df["budget"])
no_budget_df

Unnamed: 0,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,release_date,production_countries,tmdb_rating
46,Beautiful Swordswomen: Double Seppuku,76659,tt0440107,0,Horror,0,42,1.4,1990-01-01,Japan,4.6
48,Grave of the Fireflies,76826,tt0491652,0,Drama,0,148,10.804,2005-11-01,Japan,7.8
59,Ed Ruscha - Motorized Photographs of Sunset Bl...,664159,,0,Documentary,0,2,0.6,2019-12-19,United States of America,0.0
84,Your Name.,372058,tt5311514,0,"Romance, Animation, Drama",357986087,106,83.039,2016-08-26,Japan,8.515
91,Akira Kurosawa: It Is Wonderful to Create: 'Hi...,523625,,0,Documentary,0,37,0.928,2002-10-25,Japan,0.0
97,M,832,tt0022100,0,"Drama, Thriller, Crime",0,111,22.705,1931-05-11,Germany,8.093
101,Come and See,25237,tt0091251,0,"Drama, War",168817,142,26.951,1985-07-09,Soviet Union,8.306
106,Ikiru,3782,tt0044741,0,Drama,55240,143,15.679,1952-10-09,Japan,8.3
113,Hamilton,556574,tt8503618,0,"History, Drama",0,160,19.33,2020-07-03,United States of America,8.214
138,Ran,635736,tt8906624,0,,0,20,0.675,2018-06-06,,0.0


In [21]:
no_revenue = no_budget_df["revenue"]==0
no_revenue.value_counts()

revenue
True     18
False     6
Name: count, dtype: int64

In [22]:
# movie_list.loc["genres"].value_counts()

In [23]:
oscar_list = pd.read_csv("the_oscar_award.csv")
print(oscar_list.shape)
oscar_list.head(2)

(10765, 7)


Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True


In [24]:
oscar_list["film"].value_counts()

film
A Star Is Born          25
West Side Story         18
Titanic                 16
Mutiny on the Bounty    15
Moulin Rouge            15
                        ..
Mural on Our Street      1
Ouverture                1
Point of View            1
To Be Alive!             1
Welcome to Chechnya      1
Name: count, Length: 4991, dtype: int64

In [25]:
gglobe_list = pd.read_csv("golden_globe_awards.csv")
print(gglobe_list.shape)
gglobe_list.head(2)

(7991, 7)


Unnamed: 0,year_film,year_award,ceremony,category,nominee,film,win
0,1943,1944,1,Best Performance by an Actress in a Supporting...,Katina Paxinou,For Whom The Bell Tolls,True
1,1943,1944,1,Best Performance by an Actor in a Supporting R...,Akim Tamiroff,For Whom The Bell Tolls,True


In [26]:
gglobe_list["film"].value_counts()

film
L.A. Law                 26
Will & Grace             23
All In The Family        23
Cheers                   22
Taxi                     20
                         ..
Woman Called Golda, A     1
Making Love               1
Yes, Giorgio              1
Rocky III                 1
Living with Yourself      1
Name: count, Length: 3083, dtype: int64

In [27]:
# # This will split a cell with multiple values back into a list
# newvar = movie_list.loc[9, "production_countries"].split(", ")
# newvar[2]

In [28]:
checkexplode = movie_list["genres"].explode('drama')
checkexplode

0                             Drama, Crime
1                             Drama, Crime
2           Drama, Action, Crime, Thriller
3                             Drama, Crime
4                                    Drama
                      ...                 
242                  Crime, Drama, Mystery
243    Adventure, Fantasy, Romance, Family
244                         Drama, History
245                                  Drama
246    Romance, Family, Animation, Fantasy
Name: genres, Length: 247, dtype: object

In [30]:
def contains_value(val, search_val):
    return search_val in val.split(', ')

In [31]:
# Create a new "Year" column from the "release date" by making a new df converting to datetime
movie_list["release_date"] = pd.to_datetime(movie_list["release_date"])
movie_list["Year"] = movie_list["release_date"].dt.year
# movie_list.drop("release_date", axis=1, inplace=True)
movie_list[["Title", "Year", "id", "imdb_id", "genres", "runtime", "budget", "revenue", "production_countries",
           "popularity", "tmdb_rating"]]
movie_list

Unnamed: 0,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,production_countries,tmdb_rating,Year
0,The Shawshank Redemption,278,tt0111161,25000000,"Drama, Crime",28341469,142,98.667,United States of America,8.704,1994
1,The Godfather,238,tt0068646,6000000,"Drama, Crime",245066411,175,134.974,United States of America,8.709,1972
2,The Dark Knight,155,tt0468569,185000000,"Drama, Action, Crime, Thriller",1004558444,152,94.409,"United Kingdom, United States of America",8.512,2008
3,The Godfather Part II,240,tt0071562,13000000,"Drama, Crime",102600000,202,63.334,United States of America,8.59,1974
4,12 Angry Men,389,tt0050083,350000,Drama,1000000,97,46.095,United States of America,8.539,1957
...,...,...,...,...,...,...,...,...,...,...,...
245,Jai Bhim,855400,tt15097216,0,"Crime, Drama, Mystery",0,164,7.796,India,7.476,2021
246,Aladdin,420817,tt6139732,183000000,"Adventure, Fantasy, Romance, Family",1054304000,127,53.261,United States of America,7.113,2019
247,Gandhi,783,tt0083987,22000000,"Drama, History",77737889,191,21.865,"India, United Kingdom, United States of America",7.544,1982
248,The Help,50014,tt1454029,25000000,Drama,216639112,146,67.526,United States of America,8.202,2011


In [None]:
# crime_df = movie_list[movie_list['genres'].apply(lambda x: contains_value(x, 'Crime'))]
# drama_df = movie_list[movie_list['genres'].apply(lambda x: contains_value(x, 'Drama'))]


# # merge1_df = pd.merge(crime_df, drama_df, on=["Title", "Year", "id", "budget", "revenue", "popularity"])
# # merge1_df
# # merge1_df.groupby["Year"]
# # merge1_df

In [None]:
# # Creating empty lists to split multiple-value columns' values into
# genre_list = []
# prod_country_list = []

# for index, row in movie_list.iterrows():
#     genre_list.append(movie_list.loc[index, "genres"].split(", "))
#     prod_country_list.append(movie_list.loc[index, "production_countries"])

# genre_df = pd.DataFrame(genre_list)
# prod_country_df = pd.DataFrame(prod_country_list)
# genre_df