In [1]:
#Import Dependencies
import json
import os
import time

import numpy as np
import pandas as pd
import requests

In [2]:
#Load the movie list CSV into a DataFrame

#The file is decoded as latin1 when it is read
path = "U.S. Released Movies_ 1972-2016.csv"
read_options = {'encoding': 'latin1'}
movies = pd.read_csv(path, **read_options)
movies.head()

Unnamed: 0,Position,Const,Created,Modified,Description,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors
0,1,tt0110912,4/23/2013,4/23/2013,,Pulp Fiction,https://www.imdb.com/title/tt0110912/,movie,8.9,154.0,1994.0,"Crime, Drama",1607823.0,5/21/1994,Quentin Tarantino
1,2,tt1872181,4/23/2013,4/23/2013,,The Amazing Spider-Man 2,https://www.imdb.com/title/tt1872181/,movie,6.6,142.0,2014.0,"Action, Adventure, Sci-Fi",381550.0,4/10/2014,Marc Webb
2,3,tt0111161,4/23/2013,4/23/2013,,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,movie,9.3,142.0,1994.0,Drama,2057262.0,9/10/1994,Frank Darabont
3,4,tt0076759,4/23/2013,4/23/2013,,Star Wars,https://www.imdb.com/title/tt0076759/,movie,8.6,121.0,1977.0,"Action, Adventure, Fantasy, Sci-Fi",1102354.0,5/25/1977,George Lucas
4,5,tt0088763,4/23/2013,4/23/2013,,Back to the Future,https://www.imdb.com/title/tt0088763/,movie,8.5,116.0,1985.0,"Adventure, Comedy, Sci-Fi",915281.0,7/3/1985,Robert Zemeckis


In [3]:
#Narrow the frame down to the columns of interest
keep_cols = ['Const', 'Title', 'Runtime (mins)', 'Year', 'Genres', 'Release Date', 'IMDb Rating']
movies = movies.loc[:, keep_cols]
movies.head()

Unnamed: 0,Const,Title,Runtime (mins),Year,Genres,Release Date,IMDb Rating
0,tt0110912,Pulp Fiction,154.0,1994.0,"Crime, Drama",5/21/1994,8.9
1,tt1872181,The Amazing Spider-Man 2,142.0,2014.0,"Action, Adventure, Sci-Fi",4/10/2014,6.6
2,tt0111161,The Shawshank Redemption,142.0,1994.0,Drama,9/10/1994,9.3
3,tt0076759,Star Wars,121.0,1977.0,"Action, Adventure, Fantasy, Sci-Fi",5/25/1977,8.6
4,tt0088763,Back to the Future,116.0,1985.0,"Adventure, Comedy, Sci-Fi",7/3/1985,8.5


In [4]:
#Give the identifier column a descriptive name
column_renames = {'Const': 'IMDb ID'}
movies = movies.rename(columns=column_renames)
movies.head()

Unnamed: 0,IMDb ID,Title,Runtime (mins),Year,Genres,Release Date,IMDb Rating
0,tt0110912,Pulp Fiction,154.0,1994.0,"Crime, Drama",5/21/1994,8.9
1,tt1872181,The Amazing Spider-Man 2,142.0,2014.0,"Action, Adventure, Sci-Fi",4/10/2014,6.6
2,tt0111161,The Shawshank Redemption,142.0,1994.0,Drama,9/10/1994,9.3
3,tt0076759,Star Wars,121.0,1977.0,"Action, Adventure, Fantasy, Sci-Fi",5/25/1977,8.6
4,tt0088763,Back to the Future,116.0,1985.0,"Adventure, Comedy, Sci-Fi",7/3/1985,8.5


In [5]:
#Drop rows with incomplete data
movies = movies.dropna()

#Extract rows from years 2010 and later
#Year is selected by name rather than by column position, so the filter
#keeps working if the column order ever changes
movies = movies[movies['Year'] >= 2010]

#Use the IMDb ID as the row index
movies = movies.set_index(['IMDb ID'])
movies.head()

Unnamed: 0_level_0,Title,Runtime (mins),Year,Genres,Release Date,IMDb Rating
IMDb ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt1872181,The Amazing Spider-Man 2,142.0,2014.0,"Action, Adventure, Sci-Fi",4/10/2014,6.6
tt1323594,Despicable Me,95.0,2010.0,"Animation, Comedy, Family, Fantasy",6/20/2010,7.7
tt1375670,Grown Ups,102.0,2010.0,Comedy,6/24/2010,6.0
tt0892769,How to Train Your Dragon,98.0,2010.0,"Animation, Action, Adventure, Family, Fantasy",3/18/2010,8.1
tt1375666,Inception,148.0,2010.0,"Action, Adventure, Sci-Fi, Thriller",7/8/2010,8.8


In [6]:
#Change Year and Runtime to integers
#astype converts each column in one vectorized step; DataFrame.applymap
#(element-by-element) is deprecated in recent pandas releases
movies = movies.astype({'Year': np.int64, 'Runtime (mins)': np.int64})
movies.head()

Unnamed: 0_level_0,Title,Runtime (mins),Year,Genres,Release Date,IMDb Rating
IMDb ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt1872181,The Amazing Spider-Man 2,142,2014,"Action, Adventure, Sci-Fi",4/10/2014,6.6
tt1323594,Despicable Me,95,2010,"Animation, Comedy, Family, Fantasy",6/20/2010,7.7
tt1375670,Grown Ups,102,2010,Comedy,6/24/2010,6.0
tt0892769,How to Train Your Dragon,98,2010,"Animation, Action, Adventure, Family, Fantasy",3/18/2010,8.1
tt1375666,Inception,148,2010,"Action, Adventure, Sci-Fi, Thriller",7/8/2010,8.8


In [7]:
# Add empty placeholder columns for the values to pull from omdb:
# Metascore, Rating, and Box Office
for placeholder_col in ('Metascore', 'Rating', 'Box Office'):
    movies[placeholder_col] = ''

# Running count of processed movies, starting at zero
count = 0

movies.head()

Unnamed: 0_level_0,Title,Runtime (mins),Year,Genres,Release Date,IMDb Rating,Metascore,Rating,Box Office
IMDb ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
tt1872181,The Amazing Spider-Man 2,142,2014,"Action, Adventure, Sci-Fi",4/10/2014,6.6,,,
tt1323594,Despicable Me,95,2010,"Animation, Comedy, Family, Fantasy",6/20/2010,7.7,,,
tt1375670,Grown Ups,102,2010,Comedy,6/24/2010,6.0,,,
tt0892769,How to Train Your Dragon,98,2010,"Animation, Action, Adventure, Family, Fantasy",3/18/2010,8.1,,,
tt1375666,Inception,148,2010,"Action, Adventure, Sci-Fi, Thriller",7/8/2010,8.8,,,


In [None]:
#Look up every movie on OMDb by IMDb ID and fill in Rating, Metascore and Box Office
#The API key can be supplied through the OMDB_API_KEY environment variable;
#it falls back to the key that was previously hard-coded here
omdb_url = 'https://www.omdbapi.com/'
omdb_key = os.environ.get('OMDB_API_KEY', 'trilogy')
total = len(movies.index)

#enumerate drives the progress counter, so re-running this cell always starts at 1
for count, imdb in enumerate(movies.index, start=1):
    try:
        #A timeout keeps one stalled request from hanging the whole run
        response = requests.get(omdb_url, params={'i': imdb, 'apikey': omdb_key}, timeout=10)
        response.raise_for_status()
        results = response.json()
        #Read all three fields before writing anything, so a missing key
        #cannot leave the row half-updated
        rated = results['Rated']
        metascore = results['Metascore']
        box_office = results['BoxOffice']
    except (requests.RequestException, KeyError, ValueError) as err:
        #Best effort: the row keeps the values it already had and the loop moves on
        #Only the exception type is printed; the full message may contain the request URL and key
        print(f'Error processing {imdb}, skipping: {count} / {total} ({type(err).__name__})')
        continue

    movies.loc[imdb,'Rating'] = rated
    movies.loc[imdb,'Metascore'] = metascore
    movies.loc[imdb,'Box Office'] = box_office
    print(f'{imdb} processed: {count} / {total}')
    #Short pause between successful requests
    time.sleep(.1)

tt1872181 processed: 1 / 1562
tt1323594 processed: 2 / 1562
tt1375670 processed: 3 / 1562
tt0892769 processed: 4 / 1562
tt1375666 processed: 5 / 1562
tt1228705 processed: 6 / 1562
tt1250777 processed: 7 / 1562
tt1245526 processed: 8 / 1562
tt1130884 processed: 9 / 1562
tt1104001 processed: 10 / 1562
tt0435761 processed: 11 / 1562
tt0780504 processed: 12 / 1562
tt1596343 processed: 13 / 1562
tt1201607 processed: 14 / 1562
tt1219289 processed: 15 / 1562
tt1298650 processed: 16 / 1562
tt1270798 processed: 17 / 1562
tt1568346 processed: 18 / 1562
tt1454029 processed: 19 / 1562
tt1637688 processed: 20 / 1562
tt1232829 processed: 21 / 1562
tt0948470 processed: 22 / 1562
tt1605630 processed: 23 / 1562
tt0848228 processed: 24 / 1562
tt1440129 processed: 25 / 1562
tt1194173 processed: 26 / 1562
tt1217209 processed: 27 / 1562
tt1259521 processed: 28 / 1562
tt1790886 processed: 29 / 1562
tt1371111 processed: 30 / 1562
tt1345836 processed: 31 / 1562
tt1077368 processed: 32 / 1562
tt1853728 process

tt1486185 processed: 261 / 1562
tt1401152 processed: 262 / 1562
tt1240982 processed: 263 / 1562
tt1657507 processed: 264 / 1562
tt0477302 processed: 265 / 1562
tt1448755 processed: 266 / 1562
tt1204342 processed: 267 / 1562
tt0448694 processed: 268 / 1562
tt0471042 processed: 269 / 1562
tt1568911 processed: 270 / 1562
tt1591479 processed: 271 / 1562
tt1366365 processed: 272 / 1562
tt1142977 processed: 273 / 1562
tt1838544 processed: 274 / 1562
tt1397514 processed: 275 / 1562
tt1327194 processed: 276 / 1562
tt1667353 processed: 277 / 1562
tt2109184 processed: 278 / 1562
tt0431021 processed: 279 / 1562
tt1899353 processed: 280 / 1562
tt2083383 processed: 281 / 1562
tt1496025 processed: 282 / 1562
tt1596365 processed: 283 / 1562
tt1959332 processed: 284 / 1562
tt1213663 processed: 285 / 1562
tt1645155 processed: 286 / 1562
tt1132449 processed: 287 / 1562
tt1990314 processed: 288 / 1562
tt1999995 processed: 289 / 1562
tt1308729 processed: 290 / 1562
tt0404978 processed: 291 / 1562
tt174817

In [None]:
#Display the DataFrame to inspect the Metascore, Rating and Box Office columns
movies

In [None]:
#Clean df
#Drop every row where one of these columns is unusable: it contains "N/A",
#or it still holds the empty-string placeholder because the lookup was skipped
#(an empty Box Office would otherwise break the numeric conversion later on)
omdb_cols = ['Box Office', 'Rating', 'Metascore']

movies = movies.dropna()

#Compare as text so the check also works if a column is no longer string-typed
omdb_text = movies[omdb_cols].astype(str)
is_missing = omdb_text.apply(
    lambda col: col.str.contains("N/A", regex=False) | col.str.strip().eq('')
)
movies = movies[~is_missing.any(axis=1)]

movies

In [None]:
#Convert Box Office column to int
#The column is addressed by name instead of by position, and the pattern is a
#raw string so the backslash is not read as a string escape
box_office = movies['Box Office'].replace(r'[\$,]', '', regex=True)

#Go through float first, then to a fixed-width integer
#assign builds a new frame instead of writing into the filtered one
movies = movies.assign(**{'Box Office': box_office.astype(float).astype(np.int64)})
movies

In [None]:
# Write the DataFrame to a csv file
output_csv = "Movie_Data.csv"
movies.to_csv(output_csv)

In [None]:
#Critics Ratings

#subquery

#Statistical difference? 2-sample t-test

#Scatterplots to visualize correlation

#test for significance - Linear Regression

In [None]:
#Genres

#Pareto bar chart to identify top Genres

#Plot changes over time with a Line graph

In [None]:
#MPAA Rating

#Pareto bar chart

In [None]:
#Runtime

#Scatterplot

#test for significance 