# Extract Transform Load
---

In [2]:
import json
import re
from pprint import pprint
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

from config import omdb_key

## Scraping IMDb for the Top 250 movies
___

In [3]:
# IMDb "Top 250" chart page that the ranking is scraped from
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

# pd.read_html returns a list holding one DataFrame per HTML table on the page
tables = pd.read_html(io=url)

# The first table is the chart itself (see the 'Rank & Title' column below)
Raw_df = tables[0]

Raw_df

Unnamed: 0.1,Unnamed: 0,Rank & Title,IMDb Rating,Your Rating,Unnamed: 4
0,,1. The Shawshank Redemption (1994),9.2,12345678910 NOT YET RELEASED Seen,
1,,2. The Godfather (1972),9.1,12345678910 NOT YET RELEASED Seen,
2,,3. The Godfather: Part II (1974),9.0,12345678910 NOT YET RELEASED Seen,
3,,4. The Dark Knight (2008),9.0,12345678910 NOT YET RELEASED Seen,
4,,5. 12 Angry Men (1957),8.9,12345678910 NOT YET RELEASED Seen,
...,...,...,...,...,...
245,,246. Butch Cassidy and the Sundance Kid (1969),8.0,12345678910 NOT YET RELEASED Seen,
246,,247. Aladdin (1992),8.0,12345678910 NOT YET RELEASED Seen,
247,,248. Akira (1988),8.0,12345678910 NOT YET RELEASED Seen,
248,,249. Throne of Blood (1957),8.0,12345678910 NOT YET RELEASED Seen,


In [4]:
# Discard the scraped columns that carry no data ('Unnamed: *') or only page
# widgets ('Your Rating'), leaving 'Rank & Title' and 'IMDb Rating'.
# errors='ignore' keeps this cell re-runnable: because the cell rebinds Raw_df,
# a second execution would otherwise raise KeyError for the already-dropped columns.
Raw_df = Raw_df.drop(columns=['Unnamed: 0', 'Your Rating', 'Unnamed: 4'], errors='ignore')
Raw_df

Unnamed: 0,Rank & Title,IMDb Rating
0,1. The Shawshank Redemption (1994),9.2
1,2. The Godfather (1972),9.1
2,3. The Godfather: Part II (1974),9.0
3,4. The Dark Knight (2008),9.0
4,5. 12 Angry Men (1957),8.9
...,...,...
245,246. Butch Cassidy and the Sundance Kid (1969),8.0
246,247. Aladdin (1992),8.0
247,248. Akira (1988),8.0
248,249. Throne of Blood (1957),8.0


### Separate the 'Rank & Title' column into Rank, Title and Year columns
---

In [5]:
# Each chart entry looks like "<rank>. <title> (<year>)".
# The pattern is anchored at both ends so that only the FIRST period (after the
# rank) and the LAST parenthesised 4-digit group (the year) act as delimiters.
# Titles that themselves contain '.' or '(' (e.g. "Dr. Strangelove ...",
# "L.A. Confidential", "Monsters, Inc.") are therefore kept intact.
rank_title_pattern = re.compile(r'^\s*(\d+)\.\s*(.*?)\s*\((\d{4})\)\s*$')

Imdb_movies = []

for movie in Raw_df['Rank & Title']:

    match = rank_title_pattern.match(str(movie))

    if match:
        rank, title, year = match.groups()
        # Collapse runs of whitespace left over from the scraped HTML
        title = " ".join(title.split())
    else:
        # Keep a placeholder row so Imdb_df stays row-aligned with Raw_df
        print(f'Could not parse rank/title/year from: {movie!r}')
        rank, title, year = None, str(movie).strip(), None

    Imdb_movies.append({'IMDb Rank': rank,
                        'Movie Title': title,
                        'Year Released': year
                       })

Imdb_df = pd.DataFrame(Imdb_movies)

# Rows were built in Raw_df order, so attach the ratings by position rather
# than by index label.
Imdb_df['IMDb Rating'] = Raw_df['IMDb Rating'].values

Imdb_df

Unnamed: 0,IMDb Rank,Movie Title,Year Released,IMDb Rating
0,1,The Shawshank Redemption,1994,9.2
1,2,The Godfather,1972,9.1
2,3,The Godfather: Part II,1974,9.0
3,4,The Dark Knight,2008,9.0
4,5,12 Angry Men,1957,8.9
...,...,...,...,...
245,246,Butch Cassidy and the Sundance Kid,1969,8.0
246,247,Aladdin,1992,8.0
247,248,Akira,1988,8.0
248,249,Throne of Blood,1957,8.0


## OMDb API
---

In [6]:
# Sample API request: look up a single title to inspect the shape of the
# JSON document OMDb returns.

url = "".join(("http://www.omdbapi.com/?apikey=", omdb_key, "&t="))

requests.get(f"{url}oldboy").json()

{'Title': 'Oldboy',
 'Year': '2003',
 'Rated': 'R',
 'Released': '21 Nov 2003',
 'Runtime': '120 min',
 'Genre': 'Action, Drama, Mystery, Thriller',
 'Director': 'Chan-wook Park',
 'Writer': 'Garon Tsuchiya (story), Nobuaki Minegishi (comic), Chan-wook Park (character created by: Oldboy,  Vengeance Trilogy), Chan-wook Park (screenplay), Joon-hyung Lim (screenplay), Jo-yun Hwang (screenplay)',
 'Actors': 'Min-sik Choi, Ji-Tae Yoo, Hye-jeong Kang, Dae-han Ji',
 'Plot': 'After being kidnapped and imprisoned for fifteen years, Oh Dae-Su is released, only to find that he must find his captor in five days.',
 'Language': 'Korean',
 'Country': 'South Korea',
 'Awards': '39 wins & 18 nominations.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTI3NTQyMzU5M15BMl5BanBnXkFtZTcwMTM2MjgyMQ@@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.4/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '82%'},
  {'Source': 'Metacritic', 'Value': '77/100'}],
 'Metascore': '77',

In [7]:
# Query OMDb once per IMDb title and collect the selected fields into parallel
# lists (one entry per movie, in the same order as Imdb_df).
base_url = "http://www.omdbapi.com/?apikey=" + omdb_key + "&t="

movie_list = Imdb_df['Movie Title']

ID_list = []
Title_list = []
Rated_list = []
Genre_list = []
Runtime_list = []
Director_list = []
Actors_list = []
Production_list = []
Language_list = []
Awards_list = []
Plot_list = []

# OMDb response key -> list that collects that field
omdb_field_lists = {
    'imdbID': ID_list,
    'Title': Title_list,
    'Rated': Rated_list,
    'Genre': Genre_list,
    'Runtime': Runtime_list,
    'Director': Director_list,
    'Actors': Actors_list,
    'Production': Production_list,
    'Language': Language_list,
    'Awards': Awards_list,
    'Plot': Plot_list,
}


for movie in movie_list:

    try:
        # Percent-encode the title so characters such as '&', '#' or '+'
        # cannot corrupt the query string.
        response = requests.get(base_url + quote(str(movie), safe=''), timeout=10)
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as error:
        # Only the exception type is printed: the full message can contain the
        # request URL, which embeds the API key.
        print(f'Movie {movie} request failed ({type(error).__name__})')
        results = {}
    else:
        # A failed lookup comes back without an 'imdbID' entry
        if not isinstance(results, dict) or 'imdbID' not in results:
            print(f'Movie {movie} not found')
            results = {}

    # Append exactly one value to every list per movie so the lists always stay
    # the same length, even when a single field is missing from the response.
    # The title falls back to the searched title so the row can still be
    # matched against Imdb_df; every other field falls back to 'Nan'.
    for key, values in omdb_field_lists.items():
        fallback = movie if key == 'Title' else 'Nan'
        values.append(results.get(key, fallback))
        

Movie Taare Zameen Par not found
Movie Capharnaüm not found
Movie Babam ve Oglum not found
Movie Relatos salvajes not found
Movie Kis Uykusu not found


In [8]:
# Assemble the per-movie lists gathered from OMDb into one DataFrame, pairing
# each output column name with the list that feeds it.
# Note: Runtime_list is not among the lists used here.
OMDb_df = pd.DataFrame(
    dict(
        zip(
            ['IMDb ID', 'Movie Title', 'Rated', 'Genre', 'Director',
             'Actors', 'Production', 'Language', 'Awards', 'Plot'],
            [ID_list, Title_list, Rated_list, Genre_list, Director_list,
             Actors_list, Production_list, Language_list, Awards_list, Plot_list],
        )
    )
)

OMDb_df

Unnamed: 0,IMDb ID,Movie Title,Rated,Genre,Director,Actors,Production,Language,Awards,Plot
0,tt0111161,The Shawshank Redemption,R,Drama,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Columbia Pictures,English,Nominated for 7 Oscars. Another 21 wins & 35 n...,Two imprisoned men bond over a number of years...
1,tt0068646,The Godfather,R,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",Paramount Pictures,"English, Italian, Latin",Won 3 Oscars. Another 26 wins & 30 nominations.,The aging patriarch of an organized crime dyna...
2,tt0071562,The Godfather: Part II,R,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",Paramount Pictures,"English, Italian, Spanish, Latin, Sicilian",Won 6 Oscars. Another 11 wins & 20 nominations.,The early life and career of Vito Corleone in ...
3,tt0468569,The Dark Knight,PG-13,"Action, Crime, Drama, Thriller",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",Warner Bros. Pictures/Legendary,"English, Mandarin",Won 2 Oscars. Another 153 wins & 159 nominations.,When the menace known as the Joker wreaks havo...
4,tt0050083,12 Angry Men,Approved,Drama,Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",Criterion Collection,English,Nominated for 3 Oscars. Another 16 wins & 9 no...,A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...,...,...,...,...,...
245,tt0064115,Butch Cassidy and the Sundance Kid,PG,"Biography, Crime, Drama, Western",George Roy Hill,"Paul Newman, Robert Redford, Katharine Ross, S...",20th Century Fox,"English, Spanish",Won 4 Oscars. Another 17 wins & 14 nominations.,"Wyoming, early 1900s. Butch Cassidy and The Su..."
246,tt0103639,Aladdin,G,"Animation, Adventure, Comedy, Family, Fantasy,...","Ron Clements, John Musker","Scott Weinger, Robin Williams, Linda Larkin, J...",Walt Disney Pictures,English,Won 2 Oscars. Another 31 wins & 22 nominations.,A kindhearted street urchin and a power-hungry...
247,tt0094625,Akira,R,"Animation, Drama, Sci-Fi, Thriller",Katsuhiro Ôtomo,"Mitsuo Iwata, Nozomu Sasaki, Mami Koyama, Tess...",Streamline Pictures,Japanese,1 win.,A secret military project endangers Neo-Tokyo ...
248,tt0050613,Throne of Blood,Not Rated,"Drama, History",Akira Kurosawa,"Toshirô Mifune, Isuzu Yamada, Takashi Shimura,...",Media Home Entertainment,Japanese,3 wins & 1 nomination.,"A war-hardened general, egged on by his ambiti..."


## Merging IMDb_df and OMDb_df
---

In [9]:
# Inner-join the scraped IMDb chart with the OMDb details on the shared
# 'Movie Title' column, then index the result by chart rank.
Top_Movies_df = pd.merge(
    Imdb_df, OMDb_df, how='inner', on='Movie Title'
).set_index('IMDb Rank')

# Relocate 'IMDb ID' so it becomes the first column
first_col = Top_Movies_df.pop("IMDb ID")
Top_Movies_df.insert(loc=0, column="IMDb ID", value=first_col)

Top_Movies_df

Unnamed: 0_level_0,IMDb ID,Movie Title,Year Released,IMDb Rating,Rated,Genre,Director,Actors,Production,Language,Awards,Plot
IMDb Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,tt0111161,The Shawshank Redemption,1994,9.2,R,Drama,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Columbia Pictures,English,Nominated for 7 Oscars. Another 21 wins & 35 n...,Two imprisoned men bond over a number of years...
2,tt0068646,The Godfather,1972,9.1,R,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",Paramount Pictures,"English, Italian, Latin",Won 3 Oscars. Another 26 wins & 30 nominations.,The aging patriarch of an organized crime dyna...
3,tt0071562,The Godfather: Part II,1974,9.0,R,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",Paramount Pictures,"English, Italian, Spanish, Latin, Sicilian",Won 6 Oscars. Another 11 wins & 20 nominations.,The early life and career of Vito Corleone in ...
4,tt0468569,The Dark Knight,2008,9.0,PG-13,"Action, Crime, Drama, Thriller",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",Warner Bros. Pictures/Legendary,"English, Mandarin",Won 2 Oscars. Another 153 wins & 159 nominations.,When the menace known as the Joker wreaks havo...
5,tt0050083,12 Angry Men,1957,8.9,Approved,Drama,Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",Criterion Collection,English,Nominated for 3 Oscars. Another 16 wins & 9 no...,A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...,...,...,...,...,...,...,...
246,tt0064115,Butch Cassidy and the Sundance Kid,1969,8.0,PG,"Biography, Crime, Drama, Western",George Roy Hill,"Paul Newman, Robert Redford, Katharine Ross, S...",20th Century Fox,"English, Spanish",Won 4 Oscars. Another 17 wins & 14 nominations.,"Wyoming, early 1900s. Butch Cassidy and The Su..."
247,tt0103639,Aladdin,1992,8.0,G,"Animation, Adventure, Comedy, Family, Fantasy,...","Ron Clements, John Musker","Scott Weinger, Robin Williams, Linda Larkin, J...",Walt Disney Pictures,English,Won 2 Oscars. Another 31 wins & 22 nominations.,A kindhearted street urchin and a power-hungry...
248,tt0094625,Akira,1988,8.0,R,"Animation, Drama, Sci-Fi, Thriller",Katsuhiro Ôtomo,"Mitsuo Iwata, Nozomu Sasaki, Mami Koyama, Tess...",Streamline Pictures,Japanese,1 win.,A secret military project endangers Neo-Tokyo ...
249,tt0050613,Throne of Blood,1957,8.0,Not Rated,"Drama, History",Akira Kurosawa,"Toshirô Mifune, Isuzu Yamada, Takashi Shimura,...",Media Home Entertainment,Japanese,3 wins & 1 nomination.,"A war-hardened general, egged on by his ambiti..."


In [10]:
# Write the merged table (indexed by 'IMDb Rank') to the project's Output folder
Top_Movies_df.to_csv(path_or_buf='../Output/Complete_IMDbTopMovies.csv')

In [11]:
# Debug aid: print the kernel's current working directory, i.e. the directory
# that relative paths used in this notebook are resolved against.
# NOTE(review): the printed value is an absolute local path; consider clearing
# this output before sharing the notebook.
import os
print(os.getcwd())

/Users/swarnaguntaka/Desktop/ETL-Project/Code
