In [1]:
# Dependencies
import matplotlib.pyplot as plt
import requests
import pandas as pd
from scipy.stats import linregress
from config import tmdb_api_key
from config import omdb_api_key
import json
import numpy as np
import cpi
import time

In [2]:
# Check that the CSV was read properly
movie_import = pd.read_csv("movies_metadata.csv", low_memory=False)
print(movie_import.shape)
movie_import.head()

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
movie_import.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [4]:
# Check the dataset's budget and revenue values to see how many entries don't have said data
print("Budget value counts:")
print(movie_import["budget"].value_counts())
print("---------------------------------------")
print("Revenue value counts:")
print(movie_import["revenue"].value_counts())

Budget value counts:
budget
0           36573
5000000       286
10000000      259
20000000      243
2000000       242
            ...  
923             1
72500000        1
2160000         1
4439832         1
1254040         1
Name: count, Length: 1226, dtype: int64
---------------------------------------
Revenue value counts:
revenue
0.0           38052
12000000.0       20
11000000.0       19
10000000.0       19
2000000.0        18
              ...  
36565280.0        1
439564.0          1
35610100.0        1
10217873.0        1
1413000.0         1
Name: count, Length: 6863, dtype: int64


In [5]:
# Drop duplicate movies (movie entries with the same title and release date) from the imported data
movie_import.drop_duplicates(subset=["release_date", "title"], keep='first', inplace=True)

In [6]:
# View if a movie had 0 budget or not (true/false)
# First, convert the budget column from a string to numeric and write NaN values to non-numeric cells
movie_import["budget"] = pd.to_numeric(movie_import['budget'], errors='coerce', downcast='integer')
no_budget = movie_import["budget"]==0
no_budget

0        False
1        False
2         True
3        False
4         True
         ...  
45461     True
45462     True
45463     True
45464     True
45465     True
Name: budget, Length: 45434, dtype: bool

In [7]:
# Create a DataFrame from the last cell's result
no_budget_df = movie_import[no_budget]
# Remove the rows with zero budget movies
movie_import=movie_import[~no_budget]
movie_import

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
5,False,,60000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
6,False,,58000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,1995-12-15,0.0,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45402,False,,2000000.0,"[{'id': 35, 'name': 'Comedy'}]",,293654,tt3761706,ru,Корпоратив,"Igor, a furniture store manager, tries to figu...",...,2014-09-18,0.0,89.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Corporate Event,False,3.5,4.0
45408,False,,2000000.0,"[{'id': 35, 'name': 'Comedy'}]",,100152,tt0417949,ru,Марс,"Somewhere deep, deep in Russia there is a town...",...,2004-11-11,0.0,100.0,[],Released,,Mars,False,5.0,2.0
45409,False,,800000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,62757,tt0933361,en,Dikari,"The sea, August, interesting and simple people...",...,2006-11-23,1328612.0,100.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Savages,False,5.8,6.0
45412,False,,2000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,63281,tt1718881,en,Про любоff,"У девушки Даши, приехавшей с подругой «покорят...",...,2010-09-30,1268793.0,107.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Pro Lyuboff,False,4.0,3.0


In [8]:
# View if a movie had 0 budget or not (true/false)
# First, convert the budget column from a string to numeric and write NaN values to non-numeric cells
movie_import["revenue"] = pd.to_numeric(movie_import["revenue"], errors="coerce", downcast="float")
no_revenue = movie_import["revenue"]==0
no_revenue

0        False
1        False
3        False
5        False
6         True
         ...  
45402     True
45408     True
45409    False
45412    False
45422    False
Name: revenue, Length: 8883, dtype: bool

In [9]:
# Create a DataFrame from the last block's result
no_revenue_df = movie_import[no_revenue]
# Remove the rows with zero budget movies
movie_import=movie_import[~no_revenue]
print(movie_import.shape)
movie_import

(5378, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
5,False,,60000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
8,False,,35000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,1995-12-22,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45250,False,,12000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,24049,tt0479751,ta,சிவாஜி,Corrupt police and politicians target a comput...,...,2007-06-14,19000000.0,185.0,"[{'iso_639_1': 'ta', 'name': 'தமிழ்'}, {'iso_6...",Released,,Sivaji: The Boss,False,6.9,25.0
45399,False,,750000.0,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",,280422,tt3805180,ru,Все и сразу,,...,2014-06-05,3.0,0.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,All at Once,False,6.0,4.0
45409,False,,800000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,62757,tt0933361,en,Dikari,"The sea, August, interesting and simple people...",...,2006-11-23,1328612.0,100.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Savages,False,5.8,6.0
45412,False,,2000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,63281,tt1718881,en,Про любоff,"У девушки Даши, приехавшей с подругой «покорят...",...,2010-09-30,1268793.0,107.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Pro Lyuboff,False,4.0,3.0


In [10]:
# Check if there are any 0 values remaining for either budget or revenue
print(f"Budget check: ")
print(movie_import["budget"].min())
print(f"Revenue check: ")
print(movie_import["revenue"].min())

Budget check: 
1.0
Revenue check: 
1.0


In [11]:
# Further fitler down the original movie dataset by removing movies with budgets and revenues less than $1000
movie_prep1 = movie_import[movie_import["budget"]>=1000]
movie_prep1 = movie_import[movie_import["revenue"]>=1000]
movie_prep1

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
5,False,,60000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
8,False,,35000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,1995-12-22,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45167,False,,11000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,395834,tt5362988,en,Wind River,An FBI agent teams with the town's veteran gam...,...,2017-08-03,184770205.0,111.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Nothing is harder to track than the truth.,Wind River,False,7.4,181.0
45250,False,,12000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,24049,tt0479751,ta,சிவாஜி,Corrupt police and politicians target a comput...,...,2007-06-14,19000000.0,185.0,"[{'iso_639_1': 'ta', 'name': 'தமிழ்'}, {'iso_6...",Released,,Sivaji: The Boss,False,6.9,25.0
45409,False,,800000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,62757,tt0933361,en,Dikari,"The sea, August, interesting and simple people...",...,2006-11-23,1328612.0,100.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Savages,False,5.8,6.0
45412,False,,2000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,63281,tt1718881,en,Про любоff,"У девушки Даши, приехавшей с подругой «покорят...",...,2010-09-30,1268793.0,107.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Pro Lyuboff,False,4.0,3.0


In [12]:
# Make a new DataFrame and import the title column from the metadata and make a blank id column to be populate
movie_prep2 = pd.DataFrame(movie_prep1["original_title"])
movie_prep2.rename(columns={"original_title": "Title"}, inplace=True)
movie_prep2['id'] = ''
movie_prep2

Unnamed: 0,Title,id
0,Toy Story,
1,Jumanji,
3,Waiting to Exhale,
5,Heat,
8,Sudden Death,
...,...,...
45167,Wind River,
45250,சிவாஜி,
45409,Dikari,
45412,Про любоff,


In [13]:
movie_list = movie_prep2.sample(1000)
movie_list.reset_index(inplace=True, drop=True)
movie_list

Unnamed: 0,Title,id
0,Home Alone 2: Lost in New York,
1,Orca: The Killer Whale,
2,Martin Lawrence Live: Runteldat,
3,Daredevil,
4,L'avenir,
...,...,...
995,Race,
996,Hostel,
997,The Wizard,
998,Maniac,


In [14]:
# Test a TMDB API call based on an index of the movie list and printing the json response
movie=movie_list['Title'][100]
base_url="https://api.themoviedb.org/3/search/movie?"
build_url = f'https://api.themoviedb.org/3/search/movie?query={movie}&api_key={tmdb_api_key}'
response=requests.get(build_url).json()
#print(response.url)
print(json.dumps(response, indent=4, sort_keys=True))

{
    "page": 1,
    "results": [
        {
            "adult": false,
            "backdrop_path": "/lYkxECrpZuaDUixxze5cXpvEomf.jpg",
            "genre_ids": [
                18
            ],
            "id": 55720,
            "original_language": "en",
            "original_title": "A Better Life",
            "overview": "A gardener in East L.A. struggles to keep his son away from gangs and immigration agents while traveling across town to perform landscaping work for the city's wealthy landowners.",
            "popularity": 8.749,
            "poster_path": "/eY9wOw5PjrRXUTLgNmqWb60it10.jpg",
            "release_date": "2011-06-24",
            "title": "A Better Life",
            "video": false,
            "vote_average": 7.198,
            "vote_count": 200
        },
        {
            "adult": false,
            "backdrop_path": "/2yS8gTQdzy3eP9SakyAFX9yOLSN.jpg",
            "genre_ids": [
                18
            ],
            "id": 85546,
            "or

In [15]:
# Test an OMDB API call based on an index of the movie list and printing the json response
url = "http://www.omdbapi.com/?t="
api_key_url = "&apikey=" + omdb_api_key
movie_name=movie_list["Title"][250]
response = requests.get(url + movie_name + api_key_url).json()
print(json.dumps(response, indent=4, sort_keys=True))

{
    "Actors": "Michael Caine, Brendan Fraser, Do Thi Hai Yen",
    "Awards": "Nominated for 1 Oscar. 13 wins & 14 nominations total",
    "BoxOffice": "$12,988,801",
    "Country": "United Kingdom, Germany, United States, Vietnam, Australia, France, Canada, New Zealand",
    "DVD": "08 Oct 2016",
    "Director": "Phillip Noyce",
    "Genre": "Drama, Romance, Thriller",
    "Language": "English, French, Vietnamese",
    "Metascore": "84",
    "Plot": "An old British reporter vies with a young U.S. doctor for the affections of a beautiful Vietnamese woman.",
    "Poster": "https://m.media-amazon.com/images/M/MV5BNzk0ZjMwOTQtYzAwYS00MmY3LTg1MDAtYzAzMWM5ZjUxMGQyXkEyXkFqcGdeQXVyNjk1Njg5NTA@._V1_SX300.jpg",
    "Production": "N/A",
    "Rated": "R",
    "Ratings": [
        {
            "Source": "Internet Movie Database",
            "Value": "7.0/10"
        },
        {
            "Source": "Rotten Tomatoes",
            "Value": "87%"
        },
        {
            "Source": "Metac

In [16]:
# Loop through the CSV's Title column to call said movie from the TMDB API
# Create an empty list to store unfound movies in the TMDB database into
unfound_movies = []
for index, row in movie_list.iterrows():
    movie=movie_list['Title'][index]
    base_url="https://api.themoviedb.org/3/search/movie?"
    build_url = f'{base_url}query={movie}&api_key={tmdb_api_key}'
    response=requests.get(build_url).json()
    #print(response.url)

    try:
        movie_list.loc[index, "id"] = response['results'][0]['id']
    except (KeyError, IndexError):
        # If no movie name is found, set the movie id as NaN .
        movie_list.loc[index, "id"] = np.nan
        unfound_movies.append(movie_list.loc[index, "Title"])

# View the movies not found in the database
unfound_movies

['#Pellichoopulu']

In [17]:
# Cleaning: drop any NaN rows
movie_list.dropna(how='any',inplace=True)
# Reset the index to not run into any index errors while looping later in the cleaning
movie_list.reset_index(inplace=True)
#Re-check size of Data Frame
movie_list.shape

(999, 3)

In [18]:
#Add new columns to the DataFrame to be populated with the next TMDB API calls
movie_list[['imdb_id',
            'budget',
            'genres',
            'revenue',
            'runtime',
            'popularity',
            'release_date',
            'production_countries'
            ,'tmdb_rating']] =''
movie_list

Unnamed: 0,index,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,release_date,production_countries,tmdb_rating
0,0,Home Alone 2: Lost in New York,772,,,,,,,,,
1,1,Orca: The Killer Whale,12707,,,,,,,,,
2,2,Martin Lawrence Live: Runteldat,20337,,,,,,,,,
3,3,Daredevil,9480,,,,,,,,,
4,4,L'avenir,374465,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
994,995,Race,323677,,,,,,,,,
995,996,Hostel,1690,,,,,,,,,
996,997,The Wizard,183,,,,,,,,,
997,998,Maniac,103620,,,,,,,,,


In [19]:
# Loop through each movie's title for the TMDB API call and store the data we want to analyze into our DataFrame

for index, row in movie_list.iterrows():
# Create empty lists to store multiple-value columns' data into 
    genres = []
    production_countries = []
    try:
        movie_id=movie_list['id'][index]
        base_url_ById = "https://api.themoviedb.org/3/movie/"
        build_url_ById=f"{base_url_ById}{movie_id}?api_key={tmdb_api_key}"
        url_byid_response=requests.get(build_url_ById).json()
        movie_list.loc[index, "imdb_id"] = url_byid_response['imdb_id']
        movie_list.loc[index, "release_date"] = url_byid_response['release_date']
        movie_list.loc[index, "revenue"] = url_byid_response['revenue']
        movie_list.loc[index, "runtime"]= url_byid_response['runtime']
        movie_list.loc[index, "Title"]= url_byid_response['title']

        
        
        #extracting genre names from list
        for i in range(0,len(url_byid_response['genres'])):
            current_genre = url_byid_response['genres'][i]['name']
            genres.append(current_genre)
            

        #extracting production_countries names from list
        for i in range(0,len(url_byid_response['production_countries'])):
            current_prod_country = url_byid_response['production_countries'][i]['name']
            production_countries.append(current_prod_country)

        movie_list.loc[index, "popularity"] = url_byid_response['popularity']
        movie_list.loc[index, "budget"] = url_byid_response['budget']
        movie_list.loc[index, "tmdb_rating"] = url_byid_response['vote_average']
        movie_list.loc[index, "genres"] = ", ".join(genres)
        movie_list.loc[index, "production_countries"] = ", ".join(production_countries)
    except (KeyError, IndexError):
        # If no movie name is found, set the movie id as NaN .
        movie_list.loc[index, "imdb_id"] = np.nan
        movie_list.loc[index, "release_date"] = np.nan
        movie_list.loc[index, "revenue"] = np.nan
        movie_list.loc[index, "runtime"]= np.nan
        movie_list.loc[index, "popularity"] = np.nan
        movie_list.loc[index, "budget"] = np.nan
        movie_list.loc[index, "tmdb_rating"] = np.nan
        movie_list.loc[index, "production_countries"] = np.nan
        movie_list.loc[index, "genres"] = np.nan

        

In [20]:
# Make calls to the OMDB API to populate new rows with imdb ratings and their vote counts

for index, row in movie_list.iterrows():
    try:
        imdb_id=movie_list["imdb_id"][index]
        base_url_ById = "http://www.omdbapi.com/?i="
        build_url_ById=f"{base_url_ById}{imdb_id}&apikey={omdb_api_key}"
        url_byid_response=requests.get(build_url_ById).json()
        rating_url = url_byid_response['imdbRating']
        movie_list.loc[index, "imdb_rating"] = rating_url
        votes_url = url_byid_response['imdbVotes']
        movie_list.loc[index, "imdb_votes"] = votes_url
    except:
        movie_list.loc[index, "imdb_rating"] = np.nan
        movie_list.loc[index, "imdb_votes"] = np.nan
movie_list

Unnamed: 0,index,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,release_date,production_countries,tmdb_rating,imdb_rating,imdb_votes
0,0,Home Alone 2: Lost in New York,772,tt0104431,18000000,"Comedy, Family, Adventure",358994850,120,50.494,1992-11-19,United States of America,6.709,6.8,381425
1,1,Orca,12707,tt0076504,17500000,"Adventure, Horror, Thriller",14717854,95,14.683,1977-07-22,United States of America,6.132,5.7,12922
2,2,Martin Lawrence Live: Runteldat,20337,tt0327036,3000000,Comedy,19184015,113,2.248,2002-08-01,United States of America,5.1,5.4,1772
3,3,Daredevil,9480,tt0287978,78000000,"Fantasy, Action",179179718,103,24.325,2003-02-14,United States of America,5.251,5.3,226123
4,4,Things to Come,374465,tt4120176,2100000,Drama,282382,102,7.672,2016-04-06,"Germany, France",6.6,6.9,14456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,995,Race,323677,tt3499096,5000000,"Drama, History",24804129,134,14.754,2016-02-19,"Canada, Germany",7.257,7.1,39919
995,996,Hostel,1690,tt0450278,4800000,Horror,81979826,94,21.954,2006-01-06,"United States of America, Czech Republic",5.929,5.9,187504
996,997,The Wizard,183,tt0098663,6000000,"Adventure, Comedy, Drama, Family",14278900,100,9.758,1989-12-15,United States of America,5.986,6.1,16230
997,998,Maniac,103620,tt2103217,6000000,"Horror, Thriller",31081,89,12.288,2012-12-26,France,6.043,6.1,39565


In [21]:
print(movie_list.shape)
movie_list.head()

(999, 14)


Unnamed: 0,index,Title,id,imdb_id,budget,genres,revenue,runtime,popularity,release_date,production_countries,tmdb_rating,imdb_rating,imdb_votes
0,0,Home Alone 2: Lost in New York,772,tt0104431,18000000,"Comedy, Family, Adventure",358994850,120,50.494,1992-11-19,United States of America,6.709,6.8,381425
1,1,Orca,12707,tt0076504,17500000,"Adventure, Horror, Thriller",14717854,95,14.683,1977-07-22,United States of America,6.132,5.7,12922
2,2,Martin Lawrence Live: Runteldat,20337,tt0327036,3000000,Comedy,19184015,113,2.248,2002-08-01,United States of America,5.1,5.4,1772
3,3,Daredevil,9480,tt0287978,78000000,"Fantasy, Action",179179718,103,24.325,2003-02-14,United States of America,5.251,5.3,226123
4,4,Things to Come,374465,tt4120176,2100000,Drama,282382,102,7.672,2016-04-06,"Germany, France",6.6,6.9,14456


In [22]:
# Create a new "Year" column from the "release date" by making a new df converting to datetime

movie_list["release_date"] = pd.to_datetime(movie_list["release_date"])
movie_list["Year"] = movie_list["release_date"].dt.year

for i in range(len(movie_list["Year"])):
    if movie_list.loc[i, "Year"]*0 != 0:
        movie_list.loc[i, "Year"] = np.nan
        print(f"NaN value found at index: {i}")
#         print(movie_list.loc[i, "Year"])

movie_list["Year"] = movie_list["Year"].astype(int)
# for index, row in movie_list.iterrows():
#     try:
#         movie_list.loc[index, "Year"] = movie_list.loc[index, "release_date"].dt.year
#         movie_list.loc[index, "Year"] = movie_list.loc[index, "Year"].astype(int)
#     except:
#         movie_list.loc[index, "Year"] = movie_list.loc[index, "Year"] = np.nan
movie_list["Year"].dropna()

0      1992
1      1977
2      2002
3      2003
4      2016
       ... 
994    2016
995    2006
996    1989
997    2012
998    2008
Name: Year, Length: 999, dtype: int64

In [40]:
movie_list["tmdb_rating"] = pd.to_numeric(movie_list["tmdb_rating"], errors='coerce')
movie_list["imdb_rating"] = pd.to_numeric(movie_list["imdb_rating"], errors='coerce')
movie_list["imdb_votes"] = pd.to_numeric(movie_list["imdb_votes"], errors='coerce')
movie_list["imdb_votes"].dtype

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [39]:
movie_list["imdb_votes"].unique()

array([ nan,  13.,  42., 226.,  15., 350., 243., 195., 772., 890., 412.,
       273., 684., 336., 396., 535.,  23., 474., 482., 642., 109., 172.,
       166., 314., 536.])

In [36]:
# Do more cleaning of the data by looping through numeric columns and replacing any non-numeric values with NaN
# Then, drop those NaN rows

# Correct particular numeric columns to be the proper data type
movie_list["tmdb_rating"] = pd.to_numeric(movie_list["tmdb_rating"], errors='coerce')
movie_list["imdb_rating"] = pd.to_numeric(movie_list["imdb_rating"], errors='coerce')
movie_list["imdb_votes"] = pd.to_numeric(movie_list["imdb_votes"], errors='coerce')


for i in range(len(movie_list["Title"])):
    if movie_list.loc[i, "runtime"]*0 != 0:
        movie_list.loc[i, "runtime"] = np.nan
        print(f"NaN value found in the runtime column at index: {i}")
#         print(movie_list.loc[i, "runtime"])
    if movie_list.loc[i, "budget"]*0 != 0:
        movie_list.loc[i, "budget"] = np.nan
        print(f"NaN value found in the budget column at index: {i}")
#         print(movie_list.loc[i, "budget"])
    if movie_list.loc[i, "revenue"]*0 != 0:
        movie_list.loc[i, "revenue"] = np.nan
        print(f"NaN value found in the revenue column at index: {i}")
#         print(movie_list.loc[i, "revenue"])
    if movie_list.loc[i, "tmdb_rating"]*0 != 0:
        movie_list.loc[i, "tmdb_rating"] = np.nan
        print(f"NaN value found in the tmdb_rating column at index: {i}")
        print(movie_list.loc[i, "tmdb_rating"])
    if movie_list.loc[i, "imdb_rating"]*0 != 0:
        movie_list.loc[i, "imdb_rating"] = np.nan
        print(f"NaN value found in the imdb_rating column at index: {i}")
    if str_value.isdigit():
        int_value = int(str_value)
    else:
        print(f"NaN value found in the imdb_votes column at index: {i}")
#         print(movie_list.loc[i, "imdb_rating"])
#     if movie_list.loc[i, "imdb_votes"]*0 != 0:
#         movie_list.loc[i, "imdb_votes"] = np.nan
#         print(f"NaN value found in the imdb_votes column at index: {i}")
#         print(movie_list.loc[i, "imdb_votes"])



# movie_list.dropna(how='any',inplace=True)
# Reset the index to not run into any index errors while looping later in the cleaning
# movie_list.reset_index(inplace=True)

NaN value found in the imdb_votes column at index: 0
NaN value found in the imdb_votes column at index: 1
NaN value found in the imdb_votes column at index: 2
NaN value found in the imdb_votes column at index: 3
NaN value found in the imdb_votes column at index: 4
NaN value found in the imdb_votes column at index: 5
NaN value found in the imdb_votes column at index: 6
NaN value found in the imdb_votes column at index: 7
NaN value found in the imdb_votes column at index: 8
NaN value found in the imdb_votes column at index: 9
NaN value found in the imdb_votes column at index: 10
NaN value found in the imdb_votes column at index: 11
NaN value found in the imdb_votes column at index: 12
NaN value found in the imdb_votes column at index: 13
NaN value found in the imdb_votes column at index: 14
NaN value found in the imdb_votes column at index: 15
NaN value found in the imdb_votes column at index: 16
NaN value found in the imdb_votes column at index: 17
NaN value found in the imdb_votes colu

In [None]:
movie_list["runtime"].unique

In [None]:
# Drop the old release date column and reorder the current DataFrame to be nicer
movie_list=movie_list[["Title", "Year", "id", "imdb_id", "genres", "runtime", "budget", "revenue", "production_countries",
           "popularity", "tmdb_rating", "imdb_rating", "imdb_votes"]]
movie_list

In [None]:
# Check the DataFrame's budget and revenue values to see how many entries don't have said data
print("Budget value counts:")
print(movie_list["budget"].value_counts())
print("---------------------------------------")
print("Revenue value counts:")
print(movie_list["revenue"].value_counts())

In [None]:
# View if a movie had 0 budget or not (true/false)
no_budget = movie_list["budget"]==0
# null_imdb_id = movie_list.loc[movie_list]
no_budget

In [None]:
# Create a DataFrame from the last block's result
no_budget_df = movie_list[no_budget]
# Remove rows with zero budget values
movie_list=movie_list[~no_budget]
movie_list

In [None]:
# View if a movie had 0 revenue or not (true/false)
no_revenue = movie_list["revenue"]==0
# null_imdb_id = movie_list.loc[movie_list]
no_revenue

In [None]:
# Removing rows with zero revenue movies
movie_list=movie_list[~no_revenue]
movie_list

In [None]:
# Check if there are any 0 values remaining for either budget or revenue in the DataFrame
print(f"Budget check: ")
print(movie_list["budget"].min())
print(f"Revenue check: ")
print(movie_list["revenue"].min())

In [None]:
# # Dummy test on adjusting for inflation over the years using the cpi library
# budget_past = 438656843
# budget_year = 1991
# budget_2023 = cpi.inflate(budget_past, budget_year)
# round(budget_2023, 2)

In [None]:
# Create new columns in the DataFrame for inflation-adjusted budget and revenue
movie_list['infl_adj_budget'] = ''
movie_list['infl_adj_revenue'] = ''
movie_list

In [None]:
# POSSIBLE TEMPORARY CODE 
# DROP ANY 2023 DATES TO FIX CPI FUNCTION
print("True = number of films from 2023")
find23 = movie_list["Year"]==2023
find23.value_counts()

In [None]:
# Create a dataframe to hold onto the 2023 films to concatenate after inflation calculation
hold23 = movie_list[find23]
hold23

In [None]:
hold23['infl_adj_budget'] = hold23['budget'].astype(int)
hold23['infl_adj_revenue'] = hold23['revenue'].astype(float)
hold23

In [None]:
# Drop all rows with the year 2023 from the movie list
movie_list = movie_list[movie_list["Year"]<2023]
print(movie_list.shape)
movie_list.head()

In [None]:
# POSSIBLE TEMPORARY CODE 
# NEW CODE FROM CARTER: I THINK WE (AGAIN) NEED TO RESET OUR INDEX AFTER DROPPING OUR NAN ROWS 
# AND MISSING BUDGET/REVENUE ROWS
# movie_list.reset_index(inplace=True, drop=True)

In [None]:
# # Defining a function to calculate a running inflation adjusted value for a column in a dataframe
def inflate_column(data, column):
    #     Adjust for inflation the series of values in column of the dataframe data
    return data.apply(lambda x: cpi.inflate(x[column], x.Year), axis=1)

# adjust the production budget 
movie_list.loc[:,'budget'] = movie_list['budget'].astype(int)
movie_list.loc[:,'infl_adj_budget'] = inflate_column(movie_list, 'budget')
#adjust the worldwide gross profits
movie_list.loc[:,'infl_adj_revenue'] = inflate_column(movie_list, 'revenue').astype(float)

# calculate the real worldwide net profit
movie_list.loc[:,'real_profit'] = movie_list.apply(lambda x: x.infl_adj_revenue - x.infl_adj_budget, axis=1)
movie_list

In [None]:
# Concatenate DataFrames horizontally (side by side)
movie_list = pd.concat([movie_list, hold23], axis=0)

In [None]:
# Looking at a sampling of the dataframe
print(movie_list.shape)
movie_list.sample(5)

In [None]:
# Do a final cleaning of the data by looping through numeric columns and replacing any non-numeric values with NaN
# Then, drop those NaN rows
for i in range(len(movie_list["Title"])):
    if movie_list.loc[i, "infl_adj_budget"]*0 != 0:
        movie_list.loc[i, "infl_adj_budget"] = np.nan
        print(f"NaN value found in the infl_adj_budget column at index: {i}")
        print(movie_list.loc[i, "infl_adj_budget"])
    if movie_list.loc[i, "infl_adj_revenue"]*0 != 0:
        movie_list.loc[i, "infl_adj_revenue"] = np.nan
        print(f"NaN value found in the infl_adj_revenue column at index: {i}")
        print(movie_list.loc[i, "infl_adj_revenue"])


movie_list.dropna(how='any',inplace=True)
# Reset the index to not run into any index errors while looping later in the cleaning
movie_list.reset_index(inplace=True)

# Movie Data Analysis

In [None]:
# Define custom bin edges
bin_edges = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030]
labels =["1920s", "1930s", "1940s", "1950s", "1960s", "1970s", "1980s", "1990s", "2000s", "2010s", "2020s"]

# Cut the data into custom intervals
movie_list.loc[:,'Decades'] = pd.cut(movie_list['Year'], bins=bin_edges,labels=labels)
movie_list

In [None]:
# Sorting movie_list df revenue column ascending=False
sorted_revenue_desc = movie_list.sort_values(by='infl_adj_revenue', ascending=False)
top_ten_df = sorted_revenue_desc.iloc[0:10, :] 
top_ten_df

In [None]:
# Create a bar chart to display top 10 grossing movies

top_ten_df.plot("Title", "infl_adj_revenue", kind='bar', alpha=0.6, color='blue', title="Top 10 Grossing Movies (1922-2022)",
                xlabel="Movie Titles", ylabel="Movie Revenue", legend='')


# ax=top_ten_df.plot("Title", "revenue", kind='bar', alpha=0.6, color='blue', title="Top 10 Grossing Movies (1922-2022)",
#                 xlabel="Movie Titles", ylabel="Movie Revenue", legend='')
# ax.set_xticklabels(top_ten_df['Title'], rotation=45)
# plt.show()

In [None]:
country_df = movie_list.copy(deep=True)
country_df[['country1', 'country2', 'country3', 'country4', 'country5', 'country6']] = country_df['production_countries'].str.split(', ', expand=True, n=5)
country_df.sample(10)

In [None]:
country_df2 = country_df.sort_values(by='infl_adj_revenue', ascending=False)
country_df3 = country_df2.iloc[0:50, :]
country_df3

In [None]:
countries = list(country_df3["country1"].unique())
countries

In [None]:
# # MAP TIME
# # Import dependencies needed for world map visualizations
import matplotlib.pyplot as plt
import hvplot.pandas
import geoviews
import geopandas 

# Load the world map GeoDataFrame
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# Filter the GeoDataFrame to select the list of top 10 grossing films' production countries

country_data = []
for i in range(len(countries)):
    country_data.append(world[world['name'] == countries[i]])

# Print the USA GeoDataFrame
print(country_data)

In [None]:
# Create a base world map
world.plot(color='w', figsize=(30, 20), edgecolor='black', aspect="auto", linewidth=0.5)

# Extract the indices
indices = [index for index, word in indexed_list]

# Create a list of colors to be looped through when plotting the list of countries
colors = ['red', 'deepskyblue', 'green', 'yellow', 'purple', 'orange', 'pink', 'cyan', 'peachpuff', 'gold', 
          'firebrick', 'darkturquoise', 'blueviolet', 'maroon']

for i in range(len(country_data)):
    current_color = colors[i % len(colors)] 
    country_data[i].plot(color=current_color, alpha=1, aspect='1.25', ax=plt.gca())

plt.show()

In [None]:
# # Create a base world map
# world.plot(color='w', figsize=(30, 20), edgecolor='black', aspect="auto", linewidth=0.5)

# # Assign ascending indices to the length of the list
# indexed_list = list(enumerate(countries))

# # Extract the indices
# indices = [index for index, word in indexed_list]

# # Plot the United States in a different color

# for i in range(len(country_data)):
#     for_alpha=(0.1*float(indices[i]))+0.2
#     country_data[i].plot(color='blue', alpha=for_alpha, aspect='1.25', ax=plt.gca())

# plt.show()

In [None]:
# Create a bar chart based upon the above data
y_axis=movie_list.groupby('Decades').count()['Title']
y_axis.plot(kind='bar', alpha=0.6, color='blue', title="Number of Movies per decade", xlabel="Decades", ylabel="Number of Movies")

## Pie Chart

In [None]:
# Minimum Budget
min_budget = movie_list['infl_adj_budget'].min()
# Average Budget
avg_budget = movie_list['infl_adj_budget'].mean()
# Maximum Budget
max_budget = movie_list['infl_adj_budget'].max()
diff = (max_budget-min_budget)/3
# Define the bins and labels
bins=[min_budget,round(min_budget+diff),round(min_budget+2*diff),round(min_budget+3*diff)]
labels = ['Low Budget', 'Average Budget', 'High Budget']
# Cut the 'Budget' column into categories and create a new column 'Budget_Category'
movie_list['Budget_Category'] = pd.cut(movie_list['infl_adj_budget'], bins=bins, labels=labels)
movie_list.sample(10)

In [None]:
# Pie plot showing the distribution of Budget category using pyplot
budget_distribution = movie_list['Budget_Category'].value_counts().reset_index()
print(budget_distribution)
#create y label
y = [budget_distribution['count'][0],budget_distribution['count'][1],budget_distribution['count'][2]]
#Show Pie Plot
plt.pie(y,autopct='%1.1f%%',labels=['Low Budget','Average Budget','High Budget'])
plt.show()

In [None]:
genre_df = movie_list.copy(deep=True)
# genre_df
genre_df[['primary_genre', 'sub_genre_1', 'sub_genre_2', 'sub_genre_3', 'sub_genre_4', 'sub_genre_5']] = genre_df['genres'].str.split(', ', expand=True, n=5)
genre_df

In [None]:

# colors = plt.cm.Paired(range(len(primary_genre)))
primary_genre = genre_df['primary_genre'].value_counts()
primary_genre_length = len(primary_genre)

# Make blank list for explode values and loop through the primary genre length to append 0 after 0.1 accordingly
explode = []
for i in range(primary_genre_length):
    if i == 0:
        explode.append(0.1)
    else:
        explode.append(0)
print(f"Number of primary genres: {len(primary_genre)}")
primary_genre

In [None]:
plt.figure(figsize=(12, 12))
categories = genre_df['primary_genre'].unique()
# This code would be to include 'exploding' the biggest value out but has an error when 'explode' list is not equal
# to length of primary genres
plt.pie(primary_genre, autopct= "%1.1f%%", shadow=True, startangle=140, pctdistance=1.2, explode=explode)
# plt.pie(primary_genre, autopct= "%1.1f%%", shadow=True, startangle=140, pctdistance=1.2)
plt.legend(categories, loc='best', bbox_to_anchor=(1, 1))
plt.title('Primary Genres')
plt.show()

## Bar Plot

In [None]:
#Bar plot Budget Categories by Decades
grouped_df = movie_list.groupby(['Decades', 'Budget_Category'])['Title'].count().reset_index()
pivot_df=grouped_df.set_index(['Decades', 'Budget_Category'])
# Set the figure size
plt.figure(figsize=(10,2))  # Width, Height in inches
# Create a stacked bar plot
pivot_df.plot(kind='bar',width=1,color='orange', stacked=True)
# Add labels and a title
plt.xlabel('Decades')
plt.ylabel('Count')
plt.title('Bar Plot of Budget Categories by Decades')
plt.legend("Budget")
# Show the plot
plt.show()

## Histogram Plotting

In [None]:
# Plot a histogram of budget
plt.hist(movie_list['infl_adj_budget'],bins=5, color='deepskyblue', edgecolor='k')
plt.xlabel('Budget')
plt.ylabel('Frequency')
plt.title('Histogram of Budget')

In [None]:
# 
from scipy.stats import shapiro
DataToTest = movie_list['infl_adj_budget']
stat,p = shapiro(DataToTest)
print(f"stat: {stat},p: {p}")
if(p>0.05):
    print("Accept NULL Hypothesis :Budget data is not normally Distributed")
else:
    print("Reject NULL Hypothesis: Budget data is not normally Distributed")

## Scatter Plots with Linear Regression

In [None]:
# CARTER NOTE: THIS BLOCK BELOW DID NOT SHOW OUR FIRST SCATTER PLOT WHEN RUN FOR ME
    # Looking over the code, could the error be in the "xvalues = xval" lines?

In [None]:
# Creating a function for computing linear regression
def lin_regress(xval, yval, x_label, y_label): 
    x_values = xval
    y_values = yval
    
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    plt.scatter(x_values,y_values, c='blueviolet')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.plot(x_values,regress_values,"r-")
    plt.annotate(line_eq,(10,20),fontsize=15,color="red")
    plt.legend(['Regression Line: y = {:.2f}x + {:.2f}'.format(slope, intercept)])
    print("The r-value is: ",rvalue)
    plt.show()
#     x_values = 0
#     y_values = 0

In [None]:
# Create a scatter plot for regression analysis

xval = movie_list['Year']
yval = movie_list['runtime'].astype(int)
x_label='Years'
y_label='Runtime(mins) of Movies'

lin_regress(xval, yval, x_label, y_label)

In [None]:
# Create a scatter plot for regression analysis: Budget vs. tmdb

xval = movie_list['infl_adj_budget'].astype(int)
yval = movie_list['tmdb_rating'].astype(float)
x_label='Budget (adjusted for inflation)'
y_label='tmdb rating'

lin_regress(xval, yval, x_label, y_label)

In [None]:
movie_list['imdb_rating'].unique()

In [None]:
# Create a scatter plot for regression analysis: Budget vs. imbd Rating

xval = movie_list['infl_adj_budget'].astype(int)
yval = movie_list['imdb_rating'].astype(float)
x_label='Budget (adjusted for inflation)'
y_label='IMDb rating'

lin_regress(xval, yval, x_label, y_label)

In [None]:
# Create a scatter plot for regression analysis: Revenue vs. imbd Rating

xval = movie_list['infl_adj_revenue'].astype(float)
yval = movie_list['imdb_rating'].astype(float)
x_label='Revenue (adjusted for inflation)'
y_label='imdb rating'

lin_regress(xval, yval, x_label, y_label)

In [None]:
# Create a scatter plot for regression analysis: Revenue vs. Runtime

xval = movie_list['infl_adj_revenue'].astype(float)
yval = movie_list['runtime'].astype(int)
x_label='Revenue (adjusted for inflation)'
y_label='Runtime(mins)'

lin_regress(xval, yval, x_label, y_label)

# Data Testing

## 

In [None]:
#TEST NULL HYPOTHESIS(budget and revenue)
#H0: There is a significant association between budget and revenue of a movie.
#H1: There is no significant association between budget and revenue of a movie
from scipy.stats import chi2_contingency

# Create a contingency table
contingency_table = pd.crosstab(movie_list['infl_adj_budget'], movie_list['infl_adj_revenue'])

# Perform the chi-square test for independence
chi2, p, _, _ = chi2_contingency(contingency_table)

print("Chi-Square Statistic:", chi2)
print("P-value:", p)

alpha = 0.05  # Significance level

if p < alpha:
    print("Accept Null Hypothesis: There is a significant association between budget and revenue of a movie.")
else:
    print("Reject Null Hypothesis:There is no significant association between budget and revenue.")

In [None]:
#Identifying Outliers in Movie Runtimes Over the Years
import seaborn as sns
ax = sns.boxplot(x='Decades',y='runtime',data=movie_list)

# Incorporating Oscar and Golden Globe Data

### Oscar dataset import and cleaning

In [None]:
# Import the Oscar csv to analyze
oscar_list = pd.read_csv("the_oscar_award.csv")
print(oscar_list.shape)
oscar_list.head(2)

In [None]:
# Check for NaN values in the film column of the Oscar List
oscar_list["film"].isnull().value_counts()

In [None]:
# Drop the NaN values and the "False" winners to clean the data
oscar_list.dropna(how="any", inplace=True)
oscar_winners = oscar_list["winner"]==True

oscars = oscar_list[oscar_winners]
print(oscars.shape)
oscars

In [None]:
# Count the number of wins each film has and turn that series into a DataFrame
oscar_wins = oscars[["film", "year_film"]].value_counts()
oscar_win_df = oscar_wins.reset_index()
oscar_win_df

In [None]:
# Rename the columns of the DataFrame to cleanly merge with the movie data
oscar_win_df = oscar_win_df.rename(columns={'film': 'Title', 'year_film': 'Year', 'count': 'Oscar_Wins'})
oscar_win_df

### Golden Globe dataset import and cleaning

In [None]:
# Import the golden globes csv to analyze
gglobe_list = pd.read_csv("golden_globe_awards.csv")
print(gglobe_list.shape)
gglobe_list.head()

# gglobe_list

In [None]:
# Check for NaN values in the film column of the Golden Globe List
gglobe_list["film"].isnull().value_counts()

In [None]:
# Drop the NaN values and the "False" winners to clean the data
# gglobe_list.dropna(how="any", inplace=True)
gglobe_winners = gglobe_list["win"]==True
# gglobe_winners
gglobes = gglobe_list[gglobe_winners]

# Update the 'film' column where it is NaN with values from the 'nominee' column
gglobes['film'] = gglobes['film'].fillna(gglobes['nominee'])
gglobes.sample(10)

# print(gglobes.shape)
# gglobes

In [None]:
# # Drop the NaN values and the "False" winners to clean the data
# gglobe_list.dropna(how="any", inplace=True)
# gglobe_winners = gglobe_list["win"]==True
# gglobe_winners

# gglobes = gglobe_list[gglobe_winners]
# print(gglobes.shape)
# gglobes

In [None]:
# Count the number of wins each film has and turn that series into a DataFrame
gglobe_count = gglobes[["film", "year_film"]].value_counts()
gglobe_win_df = gglobe_count.reset_index()
gglobe_win_df

In [None]:
# Rename the columns of the DataFrame to cleanly merge with the movie data
gglobe_win_df = gglobe_win_df.rename(columns={'film': 'Title', 'year_film': 'Year', 'count': 'Golden_Globe_Wins'})
gglobe_win_df

## Merging the Oscars and Golden Globes DataFrames into the Movie List DataFrame

In [None]:
# Referencing the movie_list for ease of coding
print(movie_list.shape)
movie_list.head(1)

In [None]:
# Individually merge the Oscars and Golden Globes DataFrames into the master movie list
oscar_awards = pd.merge(movie_list, oscar_win_df, how="inner", on=["Title", "Year"])
print(oscar_awards.shape)
oscar_awards.head()

In [None]:
gglobe_awards = pd.merge(movie_list, gglobe_win_df, how="inner", on=["Title", "Year"])
print(gglobe_awards.shape)
gglobe_awards.head()

In [None]:
# Make a DataFrame merging Golden Globe Awards data onto the merged Oscar data to show only films with wins in both
both_awards = pd.merge(oscar_awards, gglobe_win_df, how="inner", on=["Title", "Year"])
print(both_awards.shape)
both_awards

In [None]:
# Sum the Oscar and Golden Globe Award wins into a new column called cumulative awards
both_awards['Cumulative_Awards'] = ''
for index,row in both_awards.iterrows():
    both_awards.loc[index, "Cumulative_Awards"] = both_awards.loc[index,'Oscar_Wins']+both_awards.loc[index,'Golden_Globe_Wins']

both_awards.sort_values(by='Cumulative_Awards',ascending = False,inplace=True)
both_awards.head(50)

In [None]:
# Sort the data descending and grab the top 20 cumulative winners to do visualizations with
top20_both_awards = both_awards.sort_values(by='Cumulative_Awards', ascending=False)
top20_movies = top20_both_awards.iloc[0:20, :] 
top20_movies.head()

In [None]:
# plt.barh(x, y)
x=top20_movies['Title']
y=top20_movies['Cumulative_Awards']
plt.figure(figsize=(10,5))
plt.title('Top 20 movies with total Oscar & Golden Globe wins')
plt.barh(x,y,color='green',alpha=0.7)

In [None]:
# Take a look at how many awards were won by each decade
per_decade_wins=both_awards['Decades'].value_counts()
per_decade_wins

In [None]:
# Bar plot for cumulative Oscar and Golden Globe wins by Decade
decade_groupby = both_awards.groupby('Decades')
decades_sum=decade_groupby['Cumulative_Awards'].sum().reset_index()
decades_sum['Cumulative_Awards']
x=decades_sum['Decades']
y=decades_sum['Cumulative_Awards']
#plt.xlabel("Total oscar and golden globe wins")

plt.figure(figsize=(8,5))
plt.title(f"Each Decades' # of Movies with Oscar and Golden Globe Wins")
plt.barh(x,y,color='red',alpha=0.5)

### Pulling genre and produciton country information out of the DataFrame

In [None]:
# # Testing the Pandas "explode" function
# checkexplode = movie_list["genres"].explode('drama')
# checkexplode