# Top 100 Movies of All Time Analysis
## Data Processing

In [67]:
import pandas as pd

In [68]:
# read the raw movies CSV into df and preview the first rows
raw_csv_path = '../data/raw/Top_100_Movies.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,rank,title,description,genre,rating,id,year,imdbid,imdb_link,image
0,0,1,The Shawshank Redemption,Two imprisoned men bond over a number of years...,['Drama'],9.3,top1,1994,tt0111161,https://www.imdb.com/title/tt0111161,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,1,2,The Godfather,The aging patriarch of an organized crime dyna...,"['Crime', 'Drama']",9.2,top2,1972,tt0068646,https://www.imdb.com/title/tt0068646,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,2,3,The Dark Knight,When the menace known as the Joker wreaks havo...,"['Action', 'Crime', 'Drama']",9.0,top3,2008,tt0468569,https://www.imdb.com/title/tt0468569,https://m.media-amazon.com/images/M/MV5BMTMxNT...
3,3,4,The Godfather Part II,The early life and career of Vito Corleone in ...,"['Crime', 'Drama']",9.0,top4,1974,tt0071562,https://www.imdb.com/title/tt0071562,https://m.media-amazon.com/images/M/MV5BMWMwMG...
4,4,5,12 Angry Men,The jury in a New York City murder trial is fr...,"['Crime', 'Drama']",9.0,top5,1957,tt0050083,https://www.imdb.com/title/tt0050083,https://m.media-amazon.com/images/M/MV5BMWU4N2...


In [69]:
# keep the IMDb page URLs in a plain Python list; they are used for extra
# data extraction
link_column = df['imdb_link']
imdb_links = link_column.tolist()

In [70]:
# remove the columns that are not needed and rename 'rating' to 'imdb_rating',
# written as a single chained expression
df = (
    df
    .drop(columns=['Unnamed: 0', 'imdbid', 'image', 'imdb_link', 'description', 'id'])
    .rename(columns={'rating': 'imdb_rating'})
)

df.head()

Unnamed: 0,rank,title,genre,imdb_rating,year
0,1,The Shawshank Redemption,['Drama'],9.3,1994
1,2,The Godfather,"['Crime', 'Drama']",9.2,1972
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,2008
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,1974
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,1957


In [71]:
from bs4 import BeautifulSoup
import requests

In [72]:
# headers to allow scraping
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


def _runtime_to_minutes(runtime_text):
    """Convert a runtime string such as '2h 22m', '2h' or '45m' to minutes.

    Args:
        runtime_text: whitespace-separated parts, each an integer followed by
            'h' (hours) or 'm' (minutes).

    Returns:
        The total runtime in minutes as an int.

    Raises:
        ValueError: if the text is empty or a part is not '<int>h' / '<int>m'.
    """
    parts = runtime_text.split()
    if not parts:
        raise ValueError("empty runtime text")

    minutes = 0
    for part in parts:
        if part.endswith('h'):
            minutes += int(part[:-1]) * 60
        elif part.endswith('m'):
            # a lone minutes value (e.g. '45m') must not be treated as hours
            minutes += int(part[:-1])
        else:
            raise ValueError(f"unrecognised runtime component: {part!r}")
    return minutes


# initialize lists for ratings and times
rating_list = []
time_list = []

# NOTE(review): 'iPPbjm' looks like an auto-generated CSS class name; confirm
# the selectors still match the current IMDb page markup before re-running.
for link in imdb_links:
    # timeout so that one stalled connection cannot hang the whole run
    response = requests.get(link, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # check to see if rating exists on IMDB and add to list
    try:
        rating = soup.select_one('div.iPPbjm li:nth-of-type(2) a').text
    except AttributeError:
        rating = 'Not Rated'
    rating_list.append(rating)

    # the runtime is the third list item when present, otherwise the second
    runtime_item = soup.select_one('div.iPPbjm li:nth-of-type(3)')
    if runtime_item is None:
        runtime_item = soup.select_one('div.iPPbjm li:nth-of-type(2)')

    # convert the 'Xh Ym' text to total minutes and add to list
    time = _runtime_to_minutes(runtime_item.text)
    time_list.append(time)

In [73]:
# attach the scraped ratings and runtimes to df as the 'rating' and 'time' columns
df = df.assign(rating=rating_list, time=time_list)

df.head()

Unnamed: 0,rank,title,genre,imdb_rating,year,rating,time
0,1,The Shawshank Redemption,['Drama'],9.3,1994,R,142
1,2,The Godfather,"['Crime', 'Drama']",9.2,1972,R,175
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,2008,PG-13,152
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,1974,R,202
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,1957,Approved,96


In [74]:
# build a two-digit decade label from the year: the year's third digit followed
# by '0' (e.g. 1994 -> '90', 2008 -> '00'). The value is a string.
# Indexing the third character directly avoids splitting on an empty pattern,
# which only works through regex empty-match splitting.
# NOTE: the century is not part of the label, so e.g. 1920 and 2020 both
# give '20'.
df['decade'] = df['year'].astype(str).str[2] + '0'

df.head(5)

Unnamed: 0,rank,title,genre,imdb_rating,year,rating,time,decade
0,1,The Shawshank Redemption,['Drama'],9.3,1994,R,142,90
1,2,The Godfather,"['Crime', 'Drama']",9.2,1972,R,175,70
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,2008,PG-13,152,0
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,1974,R,202,70
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,1957,Approved,96,50


In [79]:
# reorder the columns so that decade comes right after year
column_order = ['rank', 'title', 'genre', 'imdb_rating', 'year', 'decade', 'rating', 'time']
df = df[column_order]

df

Unnamed: 0,rank,title,genre,imdb_rating,year,decade,rating,time
0,1,The Shawshank Redemption,['Drama'],9.3,1994,90,R,142
1,2,The Godfather,"['Crime', 'Drama']",9.2,1972,70,R,175
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,2008,0,PG-13,152
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,1974,70,R,202
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,1957,50,Approved,96
...,...,...,...,...,...,...,...,...
95,96,Reservoir Dogs,"['Crime', 'Thriller']",8.3,1992,90,R,99
96,97,Ikiru,['Drama'],8.3,1952,50,Not Rated,143
97,98,Lawrence of Arabia,"['Adventure', 'Biography', 'Drama']",8.3,1962,60,PG,218
98,99,Citizen Kane,"['Drama', 'Mystery']",8.3,1941,40,PG,119


In [80]:
# write df to the processed-data CSV with a header row and without the index
processed_csv_path = '../data/processed/top_100_movies_processed.csv'
df.to_csv(processed_csv_path, header=True, index=False)

## Data Analysis

In [81]:
# reload the processed CSV into df and preview the first rows
processed_input_path = '../data/processed/top_100_movies_processed.csv'
df = pd.read_csv(processed_input_path)

df.head()

Unnamed: 0,rank,title,genre,imdb_rating,year,decade,rating,time
0,1,The Shawshank Redemption,['Drama'],9.3,1994,90,R,142
1,2,The Godfather,"['Crime', 'Drama']",9.2,1972,70,R,175
2,3,The Dark Knight,"['Action', 'Crime', 'Drama']",9.0,2008,0,PG-13,152
3,4,The Godfather Part II,"['Crime', 'Drama']",9.0,1974,70,R,202
4,5,12 Angry Men,"['Crime', 'Drama']",9.0,1957,50,Approved,96


In [78]:
import re

genres_list = df['genre'].to_list()

fixed_list = []

# each 'genre' cell is a string such as "['Crime', 'Drama']", not a real list,
# so strip the brackets, commas and quotes and split on spaces to get the names
for genre_text in genres_list:
    fixed_list.extend(re.sub(r"[,\[\]']", '', genre_text).split(' '))

# de-duplicate once, after every row has been collected, and sort into a list
# to be used in analysis
unique_genres = sorted(set(fixed_list))

unique_genres

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']