# Top 100 Movies of All Time Analysis
## Data Processing

In [84]:
import pandas as pd

In [85]:
# read the raw Top 100 movies CSV and preview its first rows
raw_csv_path = '../data/raw/Top_100_Movies.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,rank,title,description,genre,rating,id,year,imdbid,imdb_link,image
0,0,1,The Shawshank Redemption,Two imprisoned men bond over a number of years...,['Drama'],9.3,top1,1994,tt0111161,https://www.imdb.com/title/tt0111161,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,1,2,The Godfather,The aging patriarch of an organized crime dyna...,"['Crime', 'Drama']",9.2,top2,1972,tt0068646,https://www.imdb.com/title/tt0068646,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,2,3,The Dark Knight,When the menace known as the Joker wreaks havo...,"['Action', 'Crime', 'Drama']",9.0,top3,2008,tt0468569,https://www.imdb.com/title/tt0468569,https://m.media-amazon.com/images/M/MV5BMTMxNT...
3,3,4,The Godfather Part II,The early life and career of Vito Corleone in ...,"['Crime', 'Drama']",9.0,top4,1974,tt0071562,https://www.imdb.com/title/tt0071562,https://m.media-amazon.com/images/M/MV5BMWMwMG...
4,4,5,12 Angry Men,The jury in a New York City murder trial is fr...,"['Crime', 'Drama']",9.0,top5,1957,tt0050083,https://www.imdb.com/title/tt0050083,https://m.media-amazon.com/images/M/MV5BMWU4N2...


In [86]:
# keep every movie's IMDb page URL in a plain Python list for extra data extraction

imdb_links = df.loc[:, 'imdb_link'].to_list()

In [87]:
# drop unnecessary columns and rename rating to imdb_rating
#
# The frame is reassigned instead of mutated with inplace=True, and columns
# that are already gone are ignored, so running this cell a second time does
# not raise KeyError.
df = df.drop(
    columns=['Unnamed: 0', 'imdbid', 'image', 'imdb_link', 'description'],
    errors='ignore',
)

# Rename only while 'imdb_rating' does not exist yet: once it does, any column
# called 'rating' is a different one and must not be renamed into a duplicate.
if 'imdb_rating' not in df.columns:
    df = df.rename(columns={'rating': 'imdb_rating'})

df.head()

Unnamed: 0,rank,title,genre,imdb_rating,id,year
0,1,The Shawshank Redemption,['Drama'],9.3,top1,1994
1,2,The Godfather,"['Crime', 'Drama']",9.2,top2,1972
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,top3,2008
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,top4,1974
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,top5,1957


In [88]:
from bs4 import BeautifulSoup
import requests

In [82]:
# headers to allow scraping: a browser-like User-Agent sent with every request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# seconds to wait on a single request before giving up instead of hanging the kernel
REQUEST_TIMEOUT = 10

# CSS selectors for the second and third list items inside the 'iPPbjm' div
# NOTE(review): 'iPPbjm' looks like a generated class name - confirm it still matches the live page
SECOND_ITEM = 'div.iPPbjm li:nth-of-type(2)'
THIRD_ITEM = 'div.iPPbjm li:nth-of-type(3)'


def runtime_to_minutes(runtime_text):
    """Convert a runtime label such as '2h 22m', '2h' or '45m' to whole minutes.

    Parameters
    ----------
    runtime_text : str
        Whitespace-separated parts, each an integer followed by 'h' or 'm'.

    Returns
    -------
    int
        Total runtime in minutes.

    Raises
    ------
    ValueError
        If the text is empty or a part is not an integer followed by 'h' or 'm'.
    """
    parts = runtime_text.split()
    if not parts:
        raise ValueError('empty runtime text: {!r}'.format(runtime_text))

    minutes = 0
    for part in parts:
        unit = part[-1]
        # read the unit from the suffix so a lone '45m' is not mistaken for 45 hours
        if unit == 'h':
            minutes += int(part[:-1]) * 60
        elif unit == 'm':
            minutes += int(part[:-1])
        else:
            raise ValueError('unrecognised runtime text: {!r}'.format(runtime_text))
    return minutes


# initialize lists for ratings and times
rating_list = []
time_list = []

for link in imdb_links:
    response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    # fail here with the HTTP status rather than later while parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # the rating is the link inside the second list item; no link there means 'Not Rated'
    rating_link = soup.select_one(SECOND_ITEM + ' a')
    rating = rating_link.text if rating_link is not None else 'Not Rated'

    # the time is the third list item, or the second one when there is no third
    runtime_item = soup.select_one(THIRD_ITEM)
    if runtime_item is None:
        runtime_item = soup.select_one(SECOND_ITEM)
    if runtime_item is None:
        raise ValueError('no runtime found on {}'.format(link))
    runtime_minutes = runtime_to_minutes(runtime_item.text)

    # append both values together so the two lists always stay the same length
    rating_list.append(rating)
    time_list.append(runtime_minutes)

[142] ['R']
[142, 175] ['R', 'R']
[142, 175, 152] ['R', 'R', 'PG-13']
[142, 175, 152, 202] ['R', 'R', 'PG-13', 'R']
[142, 175, 152, 202, 96] ['R', 'R', 'PG-13', 'R', 'Approved']
[142, 175, 152, 202, 96, 195] ['R', 'R', 'PG-13', 'R', 'Approved', 'R']
[142, 175, 152, 202, 96, 195, 201] ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13']
[142, 175, 152, 202, 96, 195, 201, 154] ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R']
[142, 175, 152, 202, 96, 195, 201, 154, 178] ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R', 'PG-13']
[142, 175, 152, 202, 96, 195, 201, 154, 178, 178] ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R', 'PG-13', 'R']
[142, 175, 152, 202, 96, 195, 201, 154, 178, 178, 142] ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R', 'PG-13', 'R', 'PG-13']
[142, 175, 152, 202, 96, 195, 201, 154, 178, 178, 142, 139] ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R', 'PG-13', 'R', 'PG-13', 'R']
[142, 175, 152, 202, 96, 195, 201, 154, 178, 178, 142, 139, 

In [89]:
# attach the scraped values as the 'rating' and 'time' columns, then preview the frame
for column_name, column_values in (('rating', rating_list), ('time', time_list)):
    df[column_name] = column_values

df.head()

Unnamed: 0,rank,title,genre,imdb_rating,id,year,rating,time
0,1,The Shawshank Redemption,['Drama'],9.3,top1,1994,R,142
1,2,The Godfather,"['Crime', 'Drama']",9.2,top2,1972,R,175
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,top3,2008,PG-13,152
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,top4,1974,R,202
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,top5,1957,Approved,96
