In [1]:
import requests
from bs4 import BeautifulSoup as BS
from IPython.core.display import HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
URL = 'https://www.imdb.com/chart/top?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=R7HANXM9HJ8XV2FNPN5W&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=moviemeter&ref_=chtmvm_ql_3'

response = requests.get(URL)

#### Checking that the URL worked succesfully

In [3]:
response.status_code

200

In [4]:
soup = BS(response.text)

In [5]:
soup

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Top 250 Movies - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="https://www.imdb.com/chart/top" rel="canonical"/>
<meta content="http://www.imdb.com/chart/top" property="og:url"/>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadI

In [6]:
soup.find('title').text

'Top 250 Movies - IMDb'

#### Step 1: Find all the movie titles in IMDb's 'Top 250 Movies' list

In [7]:
title_soup = soup.findAll('td', attrs={'class':'titleColumn'})

title = []
for x in range(len(title_soup)):
    title.append(title_soup[x].find('a').text)
title

['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 'The Godfather Part II',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Good, the Bad and the Ugly',
 'Forrest Gump',
 'Fight Club',
 'The Lord of the Rings: The Two Towers',
 'Inception',
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Matrix',
 'Goodfellas',
 "One Flew Over the Cuckoo's Nest",
 'Se7en',
 'Seven Samurai',
 "It's a Wonderful Life",
 'The Silence of the Lambs',
 'City of God',
 'Saving Private Ryan',
 'Life Is Beautiful',
 'Interstellar',
 'The Green Mile',
 'Star Wars',
 'Terminator 2: Judgment Day',
 'Back to the Future',
 'Spirited Away',
 'Psycho',
 'The Pianist',
 'Parasite',
 'Léon: The Professional',
 'The Lion King',
 'Gladiator',
 'American History X',
 'The Departed',
 'The Usual Suspects',
 'The Prestige',
 'Whiplash',
 'Casablanca',
 'Harakiri',
 'The Intouchables',


#### Step 2: Create unique ID codes for the movie titles ... where '1776' is the base and the end digit(s) change down the list corresponding to the ranking on IMDb

In [8]:
id = '1776'
title_id = []
for x in range(1,251):
    title_id.append(id+f'{x}') 
title_id

['17761',
 '17762',
 '17763',
 '17764',
 '17765',
 '17766',
 '17767',
 '17768',
 '17769',
 '177610',
 '177611',
 '177612',
 '177613',
 '177614',
 '177615',
 '177616',
 '177617',
 '177618',
 '177619',
 '177620',
 '177621',
 '177622',
 '177623',
 '177624',
 '177625',
 '177626',
 '177627',
 '177628',
 '177629',
 '177630',
 '177631',
 '177632',
 '177633',
 '177634',
 '177635',
 '177636',
 '177637',
 '177638',
 '177639',
 '177640',
 '177641',
 '177642',
 '177643',
 '177644',
 '177645',
 '177646',
 '177647',
 '177648',
 '177649',
 '177650',
 '177651',
 '177652',
 '177653',
 '177654',
 '177655',
 '177656',
 '177657',
 '177658',
 '177659',
 '177660',
 '177661',
 '177662',
 '177663',
 '177664',
 '177665',
 '177666',
 '177667',
 '177668',
 '177669',
 '177670',
 '177671',
 '177672',
 '177673',
 '177674',
 '177675',
 '177676',
 '177677',
 '177678',
 '177679',
 '177680',
 '177681',
 '177682',
 '177683',
 '177684',
 '177685',
 '177686',
 '177687',
 '177688',
 '177689',
 '177690',
 '177691',
 '177692

#### Step 3: Retrieving the Release Year for each movie title 

In [9]:
year_soup = soup.findAll('span', attrs={'class':'secondaryInfo'})

release_year = []

for x in year_soup:
    y = x.text.strip()
    release_year.append(y)
release_year

['(1994)',
 '(1972)',
 '(2008)',
 '(1974)',
 '(1957)',
 '(1993)',
 '(2003)',
 '(1994)',
 '(2001)',
 '(1966)',
 '(1994)',
 '(1999)',
 '(2002)',
 '(2010)',
 '(1980)',
 '(1999)',
 '(1990)',
 '(1975)',
 '(1995)',
 '(1954)',
 '(1946)',
 '(1991)',
 '(2002)',
 '(1998)',
 '(1997)',
 '(2014)',
 '(1999)',
 '(1977)',
 '(1991)',
 '(1985)',
 '(2001)',
 '(1960)',
 '(2002)',
 '(2019)',
 '(1994)',
 '(1994)',
 '(2000)',
 '(1998)',
 '(2006)',
 '(1995)',
 '(2006)',
 '(2014)',
 '(1942)',
 '(1962)',
 '(2011)',
 '(1988)',
 '(1936)',
 '(1968)',
 '(1954)',
 '(1988)',
 '(1979)',
 '(1931)',
 '(1979)',
 '(2000)',
 '(1981)',
 '(2012)',
 '(2008)',
 '(2006)',
 '(1950)',
 '(1957)',
 '(1940)',
 '(1980)',
 '(2018)',
 '(1957)',
 '(1986)',
 '(2018)',
 '(1999)',
 '(1964)',
 '(2012)',
 '(2003)',
 '(1984)',
 '(2019)',
 '(2009)',
 '(2017)',
 '(1995)',
 '(1995)',
 '(1981)',
 '(2019)',
 '(1997)',
 '(1984)',
 '(1997)',
 '(2022)',
 '(2016)',
 '(2000)',
 '(2009)',
 '(1952)',
 '(2010)',
 '(1963)',
 '(1983)',
 '(2018)',
 '(1968)',

In [10]:
# creating a list comprehension to remove '()' in the years
release_year1 = [x.strip('()') for x in release_year]
release_year1
#in hindsight, I probably could have done this with one code above

['1994',
 '1972',
 '2008',
 '1974',
 '1957',
 '1993',
 '2003',
 '1994',
 '2001',
 '1966',
 '1994',
 '1999',
 '2002',
 '2010',
 '1980',
 '1999',
 '1990',
 '1975',
 '1995',
 '1954',
 '1946',
 '1991',
 '2002',
 '1998',
 '1997',
 '2014',
 '1999',
 '1977',
 '1991',
 '1985',
 '2001',
 '1960',
 '2002',
 '2019',
 '1994',
 '1994',
 '2000',
 '1998',
 '2006',
 '1995',
 '2006',
 '2014',
 '1942',
 '1962',
 '2011',
 '1988',
 '1936',
 '1968',
 '1954',
 '1988',
 '1979',
 '1931',
 '1979',
 '2000',
 '1981',
 '2012',
 '2008',
 '2006',
 '1950',
 '1957',
 '1940',
 '1980',
 '2018',
 '1957',
 '1986',
 '2018',
 '1999',
 '1964',
 '2012',
 '2003',
 '1984',
 '2019',
 '2009',
 '2017',
 '1995',
 '1995',
 '1981',
 '2019',
 '1997',
 '1984',
 '1997',
 '2022',
 '2016',
 '2000',
 '2009',
 '1952',
 '2010',
 '1963',
 '1983',
 '2018',
 '1968',
 '2004',
 '1992',
 '2012',
 '1985',
 '1941',
 '1931',
 '1962',
 '1959',
 '1958',
 '2001',
 '1971',
 '1960',
 '1952',
 '1944',
 '1987',
 '2020',
 '1983',
 '1973',
 '1962',
 '2010',
 

#### Step 4: Pulling out the directors for the movie titles

In [11]:
dir_soup = soup.findAll('td', attrs={'class':'titleColumn'})

dir_list = []
stripped = []
for x in range(len(dir_soup)):
    dir_list.append(dir_soup[x].find('a').get('title'))
    stripped.append(dir_list[x].split(' (dir.)',1)[0])
stripped

['Frank Darabont',
 'Francis Ford Coppola',
 'Christopher Nolan',
 'Francis Ford Coppola',
 'Sidney Lumet',
 'Steven Spielberg',
 'Peter Jackson',
 'Quentin Tarantino',
 'Peter Jackson',
 'Sergio Leone',
 'Robert Zemeckis',
 'David Fincher',
 'Peter Jackson',
 'Christopher Nolan',
 'Irvin Kershner',
 'Lana Wachowski',
 'Martin Scorsese',
 'Milos Forman',
 'David Fincher',
 'Akira Kurosawa',
 'Frank Capra',
 'Jonathan Demme',
 'Fernando Meirelles',
 'Steven Spielberg',
 'Roberto Benigni',
 'Christopher Nolan',
 'Frank Darabont',
 'George Lucas',
 'James Cameron',
 'Robert Zemeckis',
 'Hayao Miyazaki',
 'Alfred Hitchcock',
 'Roman Polanski',
 'Bong Joon Ho',
 'Luc Besson',
 'Roger Allers',
 'Ridley Scott',
 'Tony Kaye',
 'Martin Scorsese',
 'Bryan Singer',
 'Christopher Nolan',
 'Damien Chazelle',
 'Michael Curtiz',
 'Masaki Kobayashi',
 'Olivier Nakache',
 'Isao Takahata',
 'Charles Chaplin',
 'Sergio Leone',
 'Alfred Hitchcock',
 'Giuseppe Tornatore',
 'Ridley Scott',
 'Charles Chaplin

#### Step 5: Grabbing movie ratings by clicking the movie title and moving to its IMDb page 

In [12]:
#title_soup already gives me the information that I need to narrow down, so I am going to use it to get the movie links

href = []
for x in range(len(title_soup)):
    href.append(title_soup[x].find('a').get('href'))
href

['/title/tt0111161/',
 '/title/tt0068646/',
 '/title/tt0468569/',
 '/title/tt0071562/',
 '/title/tt0050083/',
 '/title/tt0108052/',
 '/title/tt0167260/',
 '/title/tt0110912/',
 '/title/tt0120737/',
 '/title/tt0060196/',
 '/title/tt0109830/',
 '/title/tt0137523/',
 '/title/tt0167261/',
 '/title/tt1375666/',
 '/title/tt0080684/',
 '/title/tt0133093/',
 '/title/tt0099685/',
 '/title/tt0073486/',
 '/title/tt0114369/',
 '/title/tt0047478/',
 '/title/tt0038650/',
 '/title/tt0102926/',
 '/title/tt0317248/',
 '/title/tt0120815/',
 '/title/tt0118799/',
 '/title/tt0816692/',
 '/title/tt0120689/',
 '/title/tt0076759/',
 '/title/tt0103064/',
 '/title/tt0088763/',
 '/title/tt0245429/',
 '/title/tt0054215/',
 '/title/tt0253474/',
 '/title/tt6751668/',
 '/title/tt0110413/',
 '/title/tt0110357/',
 '/title/tt0172495/',
 '/title/tt0120586/',
 '/title/tt0407887/',
 '/title/tt0114814/',
 '/title/tt0482571/',
 '/title/tt2582802/',
 '/title/tt0034583/',
 '/title/tt0056058/',
 '/title/tt1675434/',
 '/title/t

In [13]:
len(href)

250

In [14]:
#I need to create a for loop that will enter the 'href' in the corresponding spots and add a number that increases by 1 each time to the established URL
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'
}

i = 0
rating = []


for x in href:
    for i in range(1,251):
        i+=1
    try: 
        IMDb_soup = BS(requests.get(f'https://www.imdb.com{x}?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2PJKZQ0CK19EY9BY16QH&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_{i}', headers = headers).text)
        rating.append(IMDb_soup.findAll('li', attrs={'role':'presentation'})[4].find('a').text)
    except:
        rating.append("Not Available on IMDb") 
    
print(rating)

['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R', 'PG-13', 'R', 'PG-13', 'R', 'PG-13', 'PG-13', 'PG', 'R', 'R', 'R', 'R', 'Not Rated', 'PG', 'R', 'R', 'R', 'PG-13', 'PG-13', 'R', 'PG', 'R', 'PG', 'PG', 'R', 'R', 'R', 'R', 'G', 'R', 'R', 'R', 'R', 'PG-13', 'R', 'PG', 'Not Rated', 'R', 'Not Rated', 'G', 'PG-13', 'PG', 'PG', 'R', 'G', 'R', 'R', 'PG', 'R', 'G', 'R', 'Passed', 'Approved', 'G', 'R', 'PG-13', 'Approved', 'R', 'PG', 'R', 'PG', 'PG-13', 'R', 'PG', 'R', 'R', 'PG', 'G', 'R', 'Not Available on IMDb', 'PG-13', 'PG-13', 'R', 'R', 'PG-13', 'TV-PG', 'Unrated', 'PG-13', 'G', 'G', 'Not Rated', 'PG', 'R', 'G', 'R', 'R', 'R', 'Not Rated', 'PG', 'Passed', 'Approved', 'Approved', 'PG', 'R', 'X', 'Approved', 'Not Rated', 'Passed', 'R', 'PG-13', 'R', 'PG', 'Approved', 'R', 'PG', 'R', 'R', 'PG-13', 'Not Rated', 'R', 'R', 'R', 'Not Rated', 'PG-13', 'PG', 'R', 'R', 'R', 'Not Rated', 'PG-13', 'Passed', 'Passed', 'PG-13', 'Passed', 'PG-13', 'R', 'Approved', 'R', 'R', 'R', 'R', 'R', 'PG', 'PG

In [15]:
len(rating)

250

#### Step 6: Retrieving Genre for each movie

In [16]:
i = 0
genre = []

for x in href:
    for i in range(1,251):
        i+=1
    try: 
        IMDb_soup = BS(requests.get(f'https://www.imdb.com{x}?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2PJKZQ0CK19EY9BY16QH&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_{i}', headers = headers).text)
        genre.append(IMDb_soup.findAll('div', attrs={'class':'ipc-chip-list__scroller'})[0].text)
    except:
        genre.append("Not Available on IMDb") 

In [17]:
genre

['Drama',
 'CrimeDrama',
 'ActionCrimeDrama',
 'CrimeDrama',
 'CrimeDrama',
 'BiographyDramaHistory',
 'ActionAdventureDrama',
 'CrimeDrama',
 'ActionAdventureDrama',
 'AdventureWestern',
 'DramaRomance',
 'Drama',
 'ActionAdventureDrama',
 'ActionAdventureSci-Fi',
 'ActionAdventureFantasy',
 'ActionSci-Fi',
 'BiographyCrimeDrama',
 'Drama',
 'CrimeDramaMystery',
 'ActionDrama',
 'DramaFamilyFantasy',
 'CrimeDramaThriller',
 'CrimeDrama',
 'DramaWar',
 'ComedyDramaRomance',
 'AdventureDramaSci-Fi',
 'CrimeDramaFantasy',
 'ActionAdventureFantasy',
 'ActionSci-Fi',
 'AdventureComedySci-Fi',
 'AnimationAdventureFamily',
 'HorrorMysteryThriller',
 'BiographyDramaMusic',
 'DramaThriller',
 'ActionCrimeDrama',
 'AnimationAdventureDrama',
 'ActionAdventureDrama',
 'CrimeDrama',
 'CrimeDramaThriller',
 'CrimeDramaMystery',
 'DramaMysterySci-Fi',
 'DramaMusic',
 'DramaRomanceWar',
 'ActionDramaMystery',
 'BiographyComedyDrama',
 'AnimationDramaWar',
 'ComedyDramaRomance',
 'Western',
 'MysteryT

In [18]:
len(genre)

250

In [19]:
#need to separate the different genres with a space
genre_clean = []
for x in genre:
    for y in x:
        if(y.isupper()):
            x = x.replace(y,", "+y).strip(", ").replace(" , "," ")
    genre_clean.append(x)
genre_clean

['Drama',
 'Crime, Drama',
 'Action, Crime, Drama',
 'Crime, Drama',
 'Crime, Drama',
 'Biography, Drama, History',
 'Action, Adventure, Drama',
 'Crime, Drama',
 'Action, Adventure, Drama',
 'Adventure, Western',
 'Drama, Romance',
 'Drama',
 'Action, Adventure, Drama',
 'Action, Adventure, Sci-, Fi',
 'Action, Adventure, Fantasy',
 'Action, Sci-, Fi',
 'Biography, Crime, Drama',
 'Drama',
 'Crime, Drama, Mystery',
 'Action, Drama',
 'Drama, Family, Fantasy',
 'Crime, Drama, Thriller',
 'Crime, Drama',
 'Drama, War',
 'Comedy, Drama, Romance',
 'Adventure, Drama, Sci-, Fi',
 'Crime, Drama, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Sci-, Fi',
 'Adventure, Comedy, Sci-, Fi',
 'Animation, Adventure, Family',
 'Horror, Mystery, Thriller',
 'Biography, Drama, Music',
 'Drama, Thriller',
 'Action, Crime, Drama',
 'Animation, Adventure, Drama',
 'Action, Adventure, Drama',
 'Crime, Drama',
 'Crime, Drama, Thriller',
 'Crime, Drama, Mystery',
 'Drama, Mystery, Sci-, Fi',
 'Drama, Mus

#### Step 7: Grabbing Runtime for each movie

In [20]:
test = BS(requests.get(f'https://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2PJKZQ0CK19EY9BY16QH&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2', headers = headers).text)
test.findAll('li', attrs={'class':'ipc-inline-list__item'})[5].text

'2h 55m'

In [21]:
i = 0
rt = []

for x in href:
    for i in range(1,251):
        i+=1
    try: 
        IMDb_soup = BS(requests.get(f'https://www.imdb.com{x}?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2PJKZQ0CK19EY9BY16QH&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_{i}', headers = headers).text)
        rt.append(IMDb_soup.findAll('li', attrs={'class':'ipc-inline-list__item'})[5].text)
    except:
        rt.append("Not Available on IMDb") 

In [22]:
rt

['2h 22m',
 '2h 55m',
 '2h 32m',
 '3h 22m',
 '1h 36m',
 '3h 15m',
 '3h 21m',
 '2h 34m',
 '2h 58m',
 '2h 58m',
 '2h 22m',
 '2h 19m',
 '2h 59m',
 '2h 28m',
 '2h 4m',
 '2h 16m',
 '2h 25m',
 '2h 13m',
 '2h 7m',
 '3h 27m',
 '2h 10m',
 '1h 58m',
 '2h 10m',
 '2h 49m',
 '1h 56m',
 '2h 49m',
 '3h 9m',
 '2h 1m',
 '2h 17m',
 '1h 56m',
 '2h 5m',
 '1h 49m',
 '2h 30m',
 '2h 12m',
 '1h 50m',
 '1h 28m',
 '2h 35m',
 '1h 59m',
 '2h 31m',
 '1h 46m',
 '2h 10m',
 '1h 46m',
 '1h 42m',
 '2h 13m',
 '1h 52m',
 '1h 29m',
 '1h 27m',
 '2h 45m',
 '1h 52m',
 '2h 35m',
 '1h 57m',
 '1h 27m',
 '2h 27m',
 '1h 53m',
 '1h 55m',
 '2h 45m',
 '1h 38m',
 '2h 17m',
 '1h 50m',
 '1h 28m',
 '2h 5m',
 '2h 26m',
 '2h 29m',
 '1h 56m',
 '2h 17m',
 '1h 57m',
 '2h 2m',
 '1h 35m',
 '2h 44m',
 '2h',
 '2h 40m',
 '2h 2m',
 '2h 33m',
 '1h 45m',
 '1h 21m',
 '2h 58m',
 'Wolfgang Petersen',
 '3h 1m',
 '2h 14m',
 '3h 49m',
 '2h 6m',
 '2h 10m',
 '1h 46m',
 '1h 42m',
 '2h 50m',
 '1h 43m',
 '1h 43m',
 '2h 23m',
 '2h 11m',
 '2h 6m',
 '2h 29m',
 '1

In [23]:
#because "Das Boot" does not have a rating on IMDb, the loop needed to say [4] instead of [5]
#I am just going to manually change it

rt_clean = []
for x in rt:
    x = x.replace("Wolfgang Petersen", "2h 29m")
    rt_clean.append(x)
rt_clean

['2h 22m',
 '2h 55m',
 '2h 32m',
 '3h 22m',
 '1h 36m',
 '3h 15m',
 '3h 21m',
 '2h 34m',
 '2h 58m',
 '2h 58m',
 '2h 22m',
 '2h 19m',
 '2h 59m',
 '2h 28m',
 '2h 4m',
 '2h 16m',
 '2h 25m',
 '2h 13m',
 '2h 7m',
 '3h 27m',
 '2h 10m',
 '1h 58m',
 '2h 10m',
 '2h 49m',
 '1h 56m',
 '2h 49m',
 '3h 9m',
 '2h 1m',
 '2h 17m',
 '1h 56m',
 '2h 5m',
 '1h 49m',
 '2h 30m',
 '2h 12m',
 '1h 50m',
 '1h 28m',
 '2h 35m',
 '1h 59m',
 '2h 31m',
 '1h 46m',
 '2h 10m',
 '1h 46m',
 '1h 42m',
 '2h 13m',
 '1h 52m',
 '1h 29m',
 '1h 27m',
 '2h 45m',
 '1h 52m',
 '2h 35m',
 '1h 57m',
 '1h 27m',
 '2h 27m',
 '1h 53m',
 '1h 55m',
 '2h 45m',
 '1h 38m',
 '2h 17m',
 '1h 50m',
 '1h 28m',
 '2h 5m',
 '2h 26m',
 '2h 29m',
 '1h 56m',
 '2h 17m',
 '1h 57m',
 '2h 2m',
 '1h 35m',
 '2h 44m',
 '2h',
 '2h 40m',
 '2h 2m',
 '2h 33m',
 '1h 45m',
 '1h 21m',
 '2h 58m',
 '2h 29m',
 '3h 1m',
 '2h 14m',
 '3h 49m',
 '2h 6m',
 '2h 10m',
 '1h 46m',
 '1h 42m',
 '2h 50m',
 '1h 43m',
 '1h 43m',
 '2h 23m',
 '2h 11m',
 '2h 6m',
 '2h 29m',
 '1h 48m',
 '1

In [24]:
len(rt_clean)

250

In [25]:
r = pd.DataFrame(rt_clean)
r.index = np.arange(1, len(r) + 1)
r.tail(20)

Unnamed: 0,0
231,1h 33m
232,1h 38m
233,1h 52m
234,2h 23m
235,2h 9m
236,2h 34m
237,2h 10m
238,2h 7m
239,2h 19m
240,1h 39m


In [26]:
#need to separate the hours from the minutes
hr = []
min = []

for x in rt_clean:
    y = x.split(" ")
    hr.append(y[0])
    min.append(y[-1])

In [27]:
hr_clean = []
for x in hr:
    x = x.replace("45m", "0")
    hr_clean.append(x)

hr_min = [(int(x.strip('h')))*60 for x in hr_clean]
hr_min

[120,
 120,
 120,
 180,
 60,
 180,
 180,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 120,
 180,
 120,
 60,
 120,
 120,
 60,
 120,
 180,
 120,
 120,
 60,
 120,
 60,
 120,
 120,
 60,
 60,
 120,
 60,
 120,
 60,
 120,
 60,
 60,
 120,
 60,
 60,
 60,
 120,
 60,
 120,
 60,
 60,
 120,
 60,
 60,
 120,
 60,
 120,
 60,
 60,
 120,
 120,
 120,
 60,
 120,
 60,
 120,
 60,
 120,
 120,
 120,
 120,
 120,
 60,
 60,
 120,
 120,
 180,
 120,
 180,
 120,
 120,
 60,
 60,
 120,
 60,
 60,
 120,
 120,
 120,
 120,
 60,
 60,
 60,
 120,
 60,
 60,
 180,
 120,
 120,
 120,
 120,
 120,
 120,
 60,
 60,
 120,
 120,
 120,
 120,
 120,
 60,
 120,
 60,
 120,
 120,
 120,
 60,
 120,
 60,
 120,
 120,
 60,
 120,
 120,
 120,
 120,
 60,
 120,
 60,
 120,
 120,
 180,
 120,
 120,
 120,
 60,
 120,
 120,
 60,
 60,
 120,
 120,
 60,
 120,
 60,
 120,
 120,
 120,
 60,
 60,
 120,
 60,
 60,
 120,
 120,
 120,
 180,
 120,
 60,
 60,
 60,
 120,
 60,
 120,
 60,
 60,
 120,
 60,
 60,
 120,
 60,
 120,
 120,
 60,
 60,
 60,
 60,

In [28]:
min_clean = []
for x in min:
    x = x.replace("2h", "0").replace("3h", "0")
    min_clean.append(x)
min_clean

['22m',
 '55m',
 '32m',
 '22m',
 '36m',
 '15m',
 '21m',
 '34m',
 '58m',
 '58m',
 '22m',
 '19m',
 '59m',
 '28m',
 '4m',
 '16m',
 '25m',
 '13m',
 '7m',
 '27m',
 '10m',
 '58m',
 '10m',
 '49m',
 '56m',
 '49m',
 '9m',
 '1m',
 '17m',
 '56m',
 '5m',
 '49m',
 '30m',
 '12m',
 '50m',
 '28m',
 '35m',
 '59m',
 '31m',
 '46m',
 '10m',
 '46m',
 '42m',
 '13m',
 '52m',
 '29m',
 '27m',
 '45m',
 '52m',
 '35m',
 '57m',
 '27m',
 '27m',
 '53m',
 '55m',
 '45m',
 '38m',
 '17m',
 '50m',
 '28m',
 '5m',
 '26m',
 '29m',
 '56m',
 '17m',
 '57m',
 '2m',
 '35m',
 '44m',
 '0',
 '40m',
 '2m',
 '33m',
 '45m',
 '21m',
 '58m',
 '29m',
 '1m',
 '14m',
 '49m',
 '6m',
 '10m',
 '46m',
 '42m',
 '50m',
 '43m',
 '43m',
 '23m',
 '11m',
 '6m',
 '29m',
 '48m',
 '39m',
 '55m',
 '22m',
 '59m',
 '39m',
 '38m',
 '16m',
 '8m',
 '2m',
 '16m',
 '5m',
 '23m',
 '47m',
 '56m',
 '40m',
 '50m',
 '9m',
 '9m',
 '11m',
 '36m',
 '50m',
 '54m',
 '3m',
 '33m',
 '18m',
 '42m',
 '12m',
 '29m',
 '7m',
 '45m',
 '59m',
 '36m',
 '12m',
 '41m',
 '20m',
 '8m

In [29]:
min2 = [int(x.strip('m')) for x in min_clean]
min2

[22,
 55,
 32,
 22,
 36,
 15,
 21,
 34,
 58,
 58,
 22,
 19,
 59,
 28,
 4,
 16,
 25,
 13,
 7,
 27,
 10,
 58,
 10,
 49,
 56,
 49,
 9,
 1,
 17,
 56,
 5,
 49,
 30,
 12,
 50,
 28,
 35,
 59,
 31,
 46,
 10,
 46,
 42,
 13,
 52,
 29,
 27,
 45,
 52,
 35,
 57,
 27,
 27,
 53,
 55,
 45,
 38,
 17,
 50,
 28,
 5,
 26,
 29,
 56,
 17,
 57,
 2,
 35,
 44,
 0,
 40,
 2,
 33,
 45,
 21,
 58,
 29,
 1,
 14,
 49,
 6,
 10,
 46,
 42,
 50,
 43,
 43,
 23,
 11,
 6,
 29,
 48,
 39,
 55,
 22,
 59,
 39,
 38,
 16,
 8,
 2,
 16,
 5,
 23,
 47,
 56,
 40,
 50,
 9,
 9,
 11,
 36,
 50,
 54,
 3,
 33,
 18,
 42,
 12,
 29,
 7,
 45,
 59,
 36,
 12,
 41,
 20,
 8,
 1,
 37,
 18,
 10,
 0,
 59,
 42,
 58,
 58,
 10,
 38,
 43,
 47,
 15,
 28,
 50,
 18,
 31,
 6,
 7,
 52,
 28,
 51,
 2,
 40,
 49,
 4,
 10,
 9,
 58,
 12,
 35,
 47,
 45,
 9,
 59,
 41,
 55,
 33,
 20,
 38,
 56,
 33,
 26,
 21,
 12,
 29,
 35,
 57,
 48,
 41,
 14,
 10,
 32,
 31,
 29,
 33,
 7,
 39,
 3,
 13,
 11,
 5,
 36,
 19,
 9,
 45,
 12,
 2,
 36,
 58,
 0,
 32,
 38,
 57,
 32,
 4,
 8,
 54,
 

Need to convert hours to minutes and combine this with the remaining minutes already listed

In [30]:
runtime = []
for i in range(0, len(hr_min)):
    runtime.append(hr_min[i] + min2[i])
runtime

[142,
 175,
 152,
 202,
 96,
 195,
 201,
 154,
 178,
 178,
 142,
 139,
 179,
 148,
 124,
 136,
 145,
 133,
 127,
 207,
 130,
 118,
 130,
 169,
 116,
 169,
 189,
 121,
 137,
 116,
 125,
 109,
 150,
 132,
 110,
 88,
 155,
 119,
 151,
 106,
 130,
 106,
 102,
 133,
 112,
 89,
 87,
 165,
 112,
 155,
 117,
 87,
 147,
 113,
 115,
 165,
 98,
 137,
 110,
 88,
 125,
 146,
 149,
 116,
 137,
 117,
 122,
 95,
 164,
 120,
 160,
 122,
 153,
 105,
 81,
 178,
 149,
 181,
 134,
 229,
 126,
 130,
 106,
 102,
 170,
 103,
 103,
 143,
 131,
 126,
 149,
 108,
 99,
 115,
 142,
 119,
 99,
 218,
 136,
 128,
 122,
 136,
 125,
 143,
 107,
 116,
 160,
 170,
 129,
 129,
 131,
 96,
 170,
 114,
 123,
 153,
 138,
 102,
 132,
 89,
 127,
 165,
 119,
 156,
 132,
 161,
 140,
 68,
 121,
 97,
 138,
 130,
 180,
 179,
 162,
 178,
 118,
 130,
 158,
 103,
 107,
 135,
 148,
 110,
 138,
 91,
 126,
 127,
 172,
 88,
 111,
 122,
 100,
 109,
 124,
 130,
 129,
 238,
 132,
 95,
 107,
 105,
 129,
 119,
 161,
 115,
 93,
 140,
 98,
 116,


#### Step 8: Scraping for Awards

In [31]:
test = BS(requests.get(f'https://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2PJKZQ0CK19EY9BY16QH&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1', headers = headers).text)
test.findAll('label', attrs ={'class':'ipc-metadata-list-item__list-content-item'})[0].text

'21 wins & 43 nominations total'

In [32]:
i = 0
awards = []

for x in href:
    for i in range(1,251):
        i+=1
    try: 
        IMDb_soup = BS(requests.get(f'https://www.imdb.com{x}?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2PJKZQ0CK19EY9BY16QH&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_{i}', headers = headers).text)
        awards.append(IMDb_soup.findAll('label', attrs ={'class':'ipc-metadata-list-item__list-content-item'})[0].text)
    except:
        awards.append("Not Available on IMDb") 

In [33]:
awards

['21 wins & 43 nominations total',
 '32 wins & 30 nominations total',
 '160 wins & 163 nominations total',
 '17 wins & 21 nominations total',
 '17 wins & 13 nominations total',
 '91 wins & 49 nominations total',
 '209 wins & 124 nominations total',
 '70 wins & 75 nominations total',
 '121 wins & 126 nominations total',
 '3 wins & 6 nominations',
 '51 wins & 75 nominations total',
 '11 wins & 38 nominations total',
 '126 wins & 138 nominations total',
 '158 wins & 220 nominations total',
 '26 wins & 20 nominations total',
 '42 wins & 51 nominations total',
 '44 wins & 38 nominations total',
 '39 wins & 16 nominations total',
 '29 wins & 43 nominations total',
 '5 wins & 8 nominations total',
 '6 wins & 6 nominations total',
 '69 wins & 51 nominations total',
 '74 wins & 50 nominations total',
 '79 wins & 75 nominations total',
 '72 wins & 52 nominations total',
 '44 wins & 148 nominations total',
 '15 wins & 37 nominations total',
 '64 wins & 29 nominations total',
 '37 wins & 33 nomina

In [38]:
wins = []
noms = []

for x in awards:
    y = x.split(" & ")
    wins.append(y[0])
    noms.append(y[-1])

In [55]:
wins_clean = []
for x in wins: 
    if "nominations" in x:
        x = x.replace(x,'0')
    if "nomination" in x:
        x = x.replace(x,'0')
    if "See the Top 250 movies as rated by IMDb users" in x:
        x = x.replace(x,'0')
    wins_clean.append(x)

wins1 = [int(x.strip('wins').strip('wins total')) for x in wins_clean]
wins1

[21,
 32,
 160,
 17,
 17,
 91,
 209,
 70,
 121,
 3,
 51,
 11,
 126,
 158,
 26,
 42,
 44,
 39,
 29,
 5,
 6,
 69,
 74,
 79,
 72,
 44,
 15,
 64,
 37,
 23,
 58,
 7,
 57,
 308,
 6,
 41,
 60,
 4,
 97,
 37,
 6,
 98,
 13,
 9,
 38,
 3,
 4,
 5,
 6,
 25,
 18,
 4,
 21,
 57,
 38,
 58,
 95,
 80,
 19,
 7,
 7,
 4,
 46,
 4,
 20,
 82,
 112,
 14,
 39,
 40,
 43,
 122,
 134,
 110,
 28,
 33,
 13,
 70,
 15,
 12,
 24,
 4,
 16,
 37,
 63,
 7,
 61,
 3,
 23,
 38,
 17,
 73,
 12,
 38,
 3,
 11,
 2,
 31,
 8,
 11,
 59,
 12,
 24,
 6,
 2,
 8,
 19,
 0,
 18,
 14,
 40,
 79,
 0,
 22,
 89,
 6,
 92,
 4,
 8,
 20,
 8,
 28,
 135,
 22,
 0,
 29,
 13,
 2,
 15,
 36,
 27,
 59,
 37,
 16,
 30,
 4,
 109,
 50,
 118,
 40,
 37,
 37,
 35,
 4,
 11,
 3,
 16,
 44,
 3,
 9,
 29,
 164,
 48,
 0,
 11,
 22,
 24,
 22,
 7,
 101,
 13,
 5,
 53,
 14,
 30,
 133,
 23,
 5,
 83,
 21,
 10,
 5,
 16,
 67,
 13,
 3,
 13,
 30,
 1,
 242,
 46,
 29,
 18,
 64,
 5,
 2,
 136,
 24,
 7,
 6,
 19,
 11,
 56,
 6,
 2,
 33,
 49,
 10,
 108,
 247,
 5,
 25,
 5,
 15,
 15,
 20,
 5,


In [54]:
noms_clean = []
for x in noms: 
    if "wins" in x:
        x = x.replace(x,'0')
    if "win" in x:
        x = x.replace(x,'0')
    if "See the Top 250 movies as rated by IMDb users" in x:
        x = x.replace(x,'0')
    noms_clean.append(x)

noms1 = [int(x.strip('nominations total').strip('nominations')) for x in noms_clean]
noms1

[43,
 30,
 163,
 21,
 13,
 49,
 124,
 75,
 126,
 6,
 75,
 38,
 138,
 220,
 20,
 51,
 38,
 16,
 43,
 8,
 6,
 51,
 50,
 75,
 52,
 148,
 37,
 29,
 33,
 25,
 31,
 14,
 74,
 271,
 15,
 35,
 106,
 15,
 141,
 17,
 45,
 146,
 9,
 3,
 40,
 0,
 1,
 5,
 13,
 32,
 22,
 1,
 33,
 60,
 24,
 158,
 95,
 38,
 20,
 4,
 6,
 8,
 79,
 16,
 23,
 57,
 102,
 11,
 103,
 21,
 15,
 239,
 172,
 40,
 23,
 34,
 11,
 132,
 7,
 12,
 61,
 16,
 26,
 70,
 28,
 9,
 96,
 3,
 20,
 55,
 12,
 111,
 23,
 73,
 0,
 13,
 0,
 15,
 10,
 7,
 74,
 24,
 8,
 2,
 9,
 15,
 46,
 8,
 6,
 16,
 18,
 87,
 14,
 20,
 50,
 7,
 86,
 7,
 6,
 3,
 22,
 21,
 206,
 34,
 2,
 27,
 79,
 0,
 15,
 159,
 20,
 124,
 179,
 25,
 23,
 11,
 115,
 47,
 137,
 69,
 56,
 69,
 67,
 2,
 66,
 3,
 8,
 27,
 11,
 5,
 104,
 140,
 63,
 3,
 22,
 25,
 28,
 12,
 29,
 117,
 9,
 3,
 43,
 20,
 8,
 235,
 35,
 22,
 58,
 22,
 38,
 2,
 46,
 86,
 5,
 3,
 19,
 9,
 7,
 337,
 94,
 16,
 4,
 188,
 4,
 1,
 227,
 27,
 41,
 0,
 15,
 26,
 115,
 16,
 0,
 5,
 58,
 2,
 141,
 234,
 9,
 63,
 18,
 3

#### Step 9: Creating Score_ID

In [62]:
id1 = '0704'
score_id = []
for x in range(1,251):
    score_id.append(id1+f'{x}') 
print(score_id)

['07041', '07042', '07043', '07044', '07045', '07046', '07047', '07048', '07049', '070410', '070411', '070412', '070413', '070414', '070415', '070416', '070417', '070418', '070419', '070420', '070421', '070422', '070423', '070424', '070425', '070426', '070427', '070428', '070429', '070430', '070431', '070432', '070433', '070434', '070435', '070436', '070437', '070438', '070439', '070440', '070441', '070442', '070443', '070444', '070445', '070446', '070447', '070448', '070449', '070450', '070451', '070452', '070453', '070454', '070455', '070456', '070457', '070458', '070459', '070460', '070461', '070462', '070463', '070464', '070465', '070466', '070467', '070468', '070469', '070470', '070471', '070472', '070473', '070474', '070475', '070476', '070477', '070478', '070479', '070480', '070481', '070482', '070483', '070484', '070485', '070486', '070487', '070488', '070489', '070490', '070491', '070492', '070493', '070494', '070495', '070496', '070497', '070498', '070499', '0704100', '070410

#### Step 10: Creating a Pandas DataFrame for Table 'IMDb_Movies'

In [66]:
table = {'Title_ID':title_id,'Title':title,'Director':stripped,'Release_Year':release_year1,'Rating':rating, 'Genre': genre_clean, 'Runtime': runtime, 'Award_Noms':noms1,'Award_Wins':wins1, 'Score_ID': score_id}

IMDb_Movies = pd.DataFrame(table, columns=['Title_ID','Title','Director', 'Rating', 'Genre', 'Runtime', 'Release_Year', 'Award_Noms', 'Award_Wins', 'Score_ID'])

IMDb_Movies[40:46]

#NOTE: Sound of Music and Persona swap themselves in order, all info is correct though

Unnamed: 0,Title_ID,Title,Director,Rating,Genre,Runtime,Release_Year,Award_Noms,Award_Wins,Score_ID
40,177641,The Prestige,Christopher Nolan,PG-13,"Drama, Mystery, Sci-, Fi",130,2006,45,6,70441
41,177642,Whiplash,Damien Chazelle,R,"Drama, Music",106,2014,146,98,70442
42,177643,Casablanca,Michael Curtiz,PG,"Drama, Romance, War",102,1942,9,13,70443
43,177644,Harakiri,Masaki Kobayashi,Not Rated,"Action, Drama, Mystery",133,1962,3,9,70444
44,177645,The Intouchables,Olivier Nakache,R,"Biography, Comedy, Drama",112,2011,40,38,70445
45,177646,Grave of the Fireflies,Isao Takahata,Not Rated,"Animation, Drama, War",89,1988,0,3,70446


In [63]:
IMDb_Movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title_ID      250 non-null    object
 1   Title         250 non-null    object
 2   Director      250 non-null    object
 3   Rating        250 non-null    object
 4   Genre         250 non-null    object
 5   Runtime       250 non-null    int64 
 6   Release_Year  250 non-null    object
 7   Award_Noms    250 non-null    int64 
 8   Award_Wins    250 non-null    int64 
 9   Score_ID      250 non-null    object
dtypes: int64(3), object(7)
memory usage: 19.7+ KB


In [57]:
IMDb_Movies.sort_values('Release_Year').head(250)

Unnamed: 0,Title_ID,Title,Director,Rating,Genre,Runtime,Release_Year,Award_Noms,Award_Wins
127,1776128,The Kid,Charles Chaplin,Passed,"Comedy, Drama, Family",68,1921,0,2
194,1776195,Sherlock Jr.,Buster Keaton,Passed,"Action, Comedy, Romance",45,1924,0,2
175,1776176,The Gold Rush,Charles Chaplin,Passed,"Adventure, Comedy, Drama",95,1925,3,3
185,1776186,The General,Clyde Bruckman,Passed,"Action, Adventure, Comedy",67,1926,1,2
115,1776116,Metropolis,Fritz Lang,Not Rated,"Drama, Sci-, Fi",153,1927,7,6
...,...,...,...,...,...,...,...,...,...
106,1776107,Hamilton,Thomas Kail,PG-13,"Biography, Drama, History",160,2020,46,19
229,1776230,Jai Bhim,T.J. Gnanavel,TV-MA,"Crime, Drama, Mystery",164,2021,17,6
142,1776143,Spider-Man: No Way Home,Jon Watts,PG-13,"Action, Adventure, Fantasy",148,2021,67,35
81,177682,Top Gun: Maverick,Joseph Kosinski,PG-13,"Action, Drama",130,2022,16,4


In [36]:
with pd.option_context("display.max_rows", None):
    display(IMDb_Movies.sort_values('Release_Year'))

Unnamed: 0,Title_ID,Title,Director,Rating,Genre,Runtime,Release_Year
127,1776128,The Kid,Charles Chaplin,Passed,"Comedy, Drama, Family",68,1921
194,1776195,Sherlock Jr.,Buster Keaton,Passed,"Action, Comedy, Romance",45,1924
175,1776176,The Gold Rush,Charles Chaplin,Passed,"Adventure, Comedy, Drama",95,1925
185,1776186,The General,Clyde Bruckman,Passed,"Action, Adventure, Comedy",67,1926
115,1776116,Metropolis,Fritz Lang,Not Rated,"Drama, Sci-, Fi",153,1927
206,1776207,The Passion of Joan of Arc,Carl Theodor Dreyer,Passed,"Biography, Drama, History",114,1928
51,177652,City Lights,Charles Chaplin,G,"Comedy, Drama, Romance",87,1931
96,177697,M,Fritz Lang,Passed,"Crime, Mystery, Thriller",99,1931
241,1776242,It Happened One Night,Frank Capra,Passed,"Comedy, Romance",105,1934
46,177647,Modern Times,Charles Chaplin,G,"Comedy, Drama, Romance",87,1936


#### Saving IMDb_Movies table as a CSV into my Data folder

In [61]:
IMDb_Movies.to_csv('../Data/IMDb_Movies.csv')