In [1]:
import pandas as pd

from splinter import Browser
from bs4 import BeautifulSoup

import json
from pprint import pprint

import time
import re

In [2]:
!which chromedriver

executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

/usr/local/bin/chromedriver


## Scraping IMDB for Top 250 movies
___

In [3]:
# URL for top 250 IMdb movies

url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'

# Read all the tables in the URL

tables = pd.read_html(url)

Raw_df = tables[0]

Raw_df = Raw_df.drop(['Unnamed: 0','Your Rating', 'Unnamed: 4'], axis = 1)

Raw_df

Unnamed: 0,Rank & Title,IMDb Rating
0,1. The Shawshank Redemption (1994),9.2
1,2. The Godfather (1972),9.1
2,3. The Godfather: Part II (1974),9.0
3,4. The Dark Knight (2008),9.0
4,5. 12 Angry Men (1957),8.9
...,...,...
245,246. Butch Cassidy and the Sundance Kid (1969),8.0
246,247. Aladdin (1992),8.0
247,248. Akira (1988),8.0
248,249. Throne of Blood (1957),8.0


In [4]:
Imdb_movies = []

for movie in Raw_df['Rank & Title']:
    
    rank = movie.split('.')[0]
    
    title1 = re.split('[.(]', movie)[1]
    title = re.sub("  ", "", title1)
        
    year = re.split('[()]', movie)[1]
  
    Imdb_movies.append({'IMDb Rank': rank,
                        'Movie Title': title,
                        'Year Released': year
                       })
    
Imdb_df = pd.DataFrame(Imdb_movies)
    
Imdb_df['IMDb Rating'] = Raw_df['IMDb Rating'] 
    
Imdb_df

Unnamed: 0,IMDb Rank,Movie Title,Year Released,IMDb Rating
0,1,The Shawshank Redemption,1994,9.2
1,2,The Godfather,1972,9.1
2,3,The Godfather: Part II,1974,9.0
3,4,The Dark Knight,2008,9.0
4,5,12 Angry Men,1957,8.9
...,...,...,...,...
245,246,Butch Cassidy and the Sundance Kid,1969,8.0
246,247,Aladdin,1992,8.0
247,248,Akira,1988,8.0
248,249,Throne of Blood,1957,8.0


## Building the URL to scrape Reelgood
---

In [5]:
Imdb_df['Movie Title'] = Imdb_df['Movie Title'].str.replace(',', '', n = -1, case=None, regex=True)
Imdb_df['Movie Title'] = Imdb_df['Movie Title'].str.replace("'",'', n = -1, case=None, regex=True)
Imdb_df['Movie Title'] = Imdb_df['Movie Title'].str.replace(":",'', n = -1, case=None, regex=True)
Imdb_df['Movie Title'] = Imdb_df['Movie Title'].str.replace(".",'', n = -1, case=None, regex=True)

In [6]:
movie_list = Imdb_df['Movie Title']
year_list = Imdb_df['Year Released']

m_list = movie_list.str.lower().str.replace(' ', '-', n = -1, case=None, regex=True)

query_urls = []

for i in range(len(movie_list)):
    title_url = f'{m_list[i]}-{year_list[i]}'
    query_url = f'https://reelgood.com/movie/{title_url}'    
    query_urls.append(query_url)
    
query_urls

['https://reelgood.com/movie/the-shawshank-redemption-1994',
 'https://reelgood.com/movie/the-godfather-1972',
 'https://reelgood.com/movie/the-godfather-part-ii-1974',
 'https://reelgood.com/movie/the-dark-knight-2008',
 'https://reelgood.com/movie/12-angry-men-1957',
 'https://reelgood.com/movie/schindlers-list-1993',
 'https://reelgood.com/movie/the-lord-of-the-rings-the-return-of-the-king-2003',
 'https://reelgood.com/movie/pulp-fiction-1994',
 'https://reelgood.com/movie/the-good-the-bad-and-the-ugly-1966',
 'https://reelgood.com/movie/the-lord-of-the-rings-the-fellowship-of-the-ring-2001',
 'https://reelgood.com/movie/fight-club-1999',
 'https://reelgood.com/movie/forrest-gump-1994',
 'https://reelgood.com/movie/inception-2010',
 'https://reelgood.com/movie/star-wars-episode-v---the-empire-strikes-back-1980',
 'https://reelgood.com/movie/the-lord-of-the-rings-the-two-towers-2002',
 'https://reelgood.com/movie/the-matrix-1999',
 'https://reelgood.com/movie/goodfellas-1990',
 'http

In [7]:
Reelgood_query_url_df = pd.DataFrame({'IMDb Rank': Imdb_df['IMDb Rank'],
                                      'Movie Title': Imdb_df['Movie Title'],
                                      'Reelgood Query URL' : query_urls
                                     })

Reelgood_query_url_df = Reelgood_query_url_df.set_index(['IMDb Rank'])

Reelgood_query_url_df

Unnamed: 0_level_0,Movie Title,Reelgood Query URL
IMDb Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Shawshank Redemption,https://reelgood.com/movie/the-shawshank-redem...
2,The Godfather,https://reelgood.com/movie/the-godfather-1972
3,The Godfather Part II,https://reelgood.com/movie/the-godfather-part-...
4,The Dark Knight,https://reelgood.com/movie/the-dark-knight-2008
5,12 Angry Men,https://reelgood.com/movie/12-angry-men-1957
...,...,...
246,Butch Cassidy and the Sundance Kid,https://reelgood.com/movie/butch-cassidy-and-t...
247,Aladdin,https://reelgood.com/movie/aladdin-1992
248,Akira,https://reelgood.com/movie/akira-1988
249,Throne of Blood,https://reelgood.com/movie/throne-of-blood-1957


In [8]:
Reelgood_query_url_df.to_csv('../Output/Reelgood_Query_URL.csv')

## Sample_Scraping Realgood
---

In [9]:
# Sample scraping

title = 'whiplash'

movie_url = 'https://reelgood.com/movie/whiplash-2014'

browser.visit(movie_url)

time.sleep(2)

browser.find_by_css('.css-543au7').first.click()

time.sleep(2)

movie_html = browser.html

movie_soup = BeautifulSoup(movie_html, 'lxml')

s_title_list = []
s_streaming_list = []
s_rent_list = []
s_buy_list = []

name_results = movie_soup.find_all('div', class_ = "css-18xrnt0 e156vy7w13")
price_results = movie_soup.find_all('div', class_ = "css-185a89x e156vy7w14")

for name in name_results:
    
    s_title_list.append(title.capitalize())
    s_streaming_list.append(name.text)
    

for price in price_results:
    
    rent = re.split('[t B]', price.text)[2]
    s_rent_list.append(rent)
    
    buy = price.text.split('Buy ',1)[1]
    s_buy_list.append(buy)


Sample_Reelgood_Streaming_on_df = pd.DataFrame({'Title': s_title_list,
                                                'Streaming' : s_streaming_list,
                                                'Rent' : s_rent_list,
                                                'Buy' : s_buy_list
                                               })

Sample_Reelgood_Streaming_on_df

Unnamed: 0,Title,Streaming,Rent,Buy
0,Whiplash,FandangoNOW,$2.99,$12.99
1,Whiplash,Google Play,$3.99,$12.99
2,Whiplash,iTunes,$3.99,$12.99
3,Whiplash,Microsoft,$3.99,$12.99
4,Whiplash,Prime Video,$2.99,$9.99
5,Whiplash,Vudu,$2.99,$9.99
6,Whiplash,YouTube,$3.99,$12.99


In [10]:
Sample_Reelgood_Streaming_on_df.to_csv('../Output/Sample_Reelgood_Scraping.csv')

## Scraping Reelgood for all the movies

In [11]:
title_list = []
streaming_list = []
price_list = []

url_list = Reelgood_query_url_df['Reelgood Query URL']
title_list = Reelgood_query_url_df['Movie Title']

count = 0

In [12]:
for i in range(1):
    
    browser.visit(url_list[count])
    print(url_list[count])

    time.sleep(2)
    
    try:
        print("here")
        browser.find_by_css('.css-543au7').first.click()

        time.sleep(2)

        movie_html = browser.html

        movie_soup = BeautifulSoup(movie_html, 'lxml')
        
        print(movie_soup)

        name_results = movie_soup.find_all('div', class_ = "css-18xrnt0 e156vy7w13")

        price_results = movie_soup.find_all('div', class_ = "css-185a89x e156vy7w14")

        print('hi')
        
        for name in name_results:
            print('a')
            title_list.append(title_list[count])
            streaming_list.append(name.text)

        for price in price_results:            
            print('c')
            price_list.append(price.text)
            
    except:
        print(f'{movie_list[i]} is not found:-(')
        pass
    
    count = count + 1

https://reelgood.com/movie/the-shawshank-redemption-1994
here
<html class="noScroll nprogress-busy" lang="en-us"><head><meta charset="utf-8"/><meta content="app-id=1031391869" name="apple-itunes-app"/><meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0, minimum-scale=1.0" name="viewport"/><link href="/manifest.3eac3022bb8c2e124d52.json" rel="manifest"/><meta content="yes" name="mobile-web-app-capable"/><meta content="#081017" name="theme-color"/><meta content="Reelgood" name="application-name"/><link href="https://assets.reelgood.com/p/8644b50ab3de0bf7e3c755eeb4dafedfb252a2bd/icons/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/><link href="https://assets.reelgood.com/p/8644b50ab3de0bf7e3c755eeb4dafedfb252a2bd/icons/apple-touch-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/><link href="https://assets.reelgood.com/p/8644b50ab3de0bf7e3c755eeb4dafedfb252a2bd/icons/apple-touch-icon-72x72.png" rel="apple-touch-icon" sizes="72x72"/><link href="htt

In [13]:
print(title_list)
print(len(title_list))

print(streaming_list)
print(len(streaming_list))

print(price_list)
print(len(price_list))

IMDb Rank
1                The Shawshank Redemption
2                           The Godfather
3                   The Godfather Part II
4                         The Dark Knight
5                            12 Angry Men
                      ...                
246    Butch Cassidy and the Sundance Kid
247                               Aladdin
248                                 Akira
249                       Throne of Blood
250                   Fanny and Alexander
Name: Movie Title, Length: 250, dtype: object
250
[]
0
[]
0


In [None]:
Streaming_on_df = []

Streaming_on_df = pd.DataFrame({'Title': title_list,
                                'Streaming' : streaming_list
                               })

Streaming_on_df

In [16]:
import os
os.getcwd()

'/Users/swarnaguntaka/Desktop/ETL-Project/Code'