In [2]:
# Import libraries
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import os

from dvc.api import make_checkpoint

# Display full outputs: with ast_node_interactivity set to "all", every
# expression statement in a cell is rendered, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Set the Java and Spark installation paths in the process environment
# before the Spark session is created.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/project/spark-3.2.1-bin-hadoop3.2",
})

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session; the PostgreSQL JDBC jar is registered
# through the spark.jars setting.
spark = (
    SparkSession.builder
    .appName("PySpark App")
    .config("spark.jars", "postgresql-42.3.2.jar")
    .getOrCreate()
)

### Data Scraping

In [5]:
# Code cited and modified from @Priyank181

# Download the IMDb Top 250 chart page (HTTPS, like the Box Office Mojo requests below)
url = 'https://www.imdb.com/chart/top'
# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() fails loudly on a 4xx/5xx answer instead of silently
# parsing an error page into empty lists.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# One <td class="titleColumn"> cell per film; its text holds rank, title and year
movies = soup.select('td.titleColumn')
# The anchor inside that cell carries both the film link (href) and the crew
# (title attribute), so select it once and read both attributes from it.
title_anchors = soup.select('td.titleColumn a')
links = [a.attrs.get('href') for a in title_anchors]
crew = [a.attrs.get('title') for a in title_anchors]
# The rating is read from the data-value attribute of the span named "ir"
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]

In [6]:
# Create empty list for film information
list_movie = []

# Once whitespace is collapsed, a chart row reads "<rank>. <title> (<year>)".
# Parsing it with one anchored pattern (instead of slicing by the width of the
# 0-based loop index) keeps ranks 10 and 100 intact, avoids a leading space in
# their titles, and preserves periods that belong to the title itself.
chart_row_pattern = re.compile(r'^(\d+)\.\s*(.*)\s\(([^()]*)\)$')

# Iterating over movies to extract each movie's details
for index in range(0, len(movies)):

    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split())
    row_match = chart_row_pattern.match(movie)
    if row_match is None:
        # Fail with the offending text rather than an opaque AttributeError
        raise ValueError(f"Unexpected chart row format at index {index}: {movie!r}")
    place, movie_title, year = row_match.groups()
    movie_title = movie_title.strip()

    # The crew string is comma separated: first entry is the director, the rest are the cast
    crew_names = crew[index].split(',')
    # Drop the 7-character suffix that follows the director's name
    # NOTE(review): presumably " (dir.)" — confirm against the page markup
    director = crew_names[0][:-7]
    star_cast = ','.join(crew_names[1:])

    data = {"movie_title": movie_title,
            "year": year,
            "place": place,
            "director":director,
            "star_cast": star_cast,
            "rating": ratings[index]}
    list_movie.append(data)

In [7]:
# Pull each detail out of the per-film dictionaries into its own column list
list_place = [film['place'] for film in list_movie]
list_title = [film['movie_title'] for film in list_movie]
list_year = [film['year'] for film in list_movie]
list_director = [film['director'] for film in list_movie]
list_cast = [film['star_cast'] for film in list_movie]
list_rating = [film['rating'] for film in list_movie]

In [8]:
# Create a dataframe from the lists
df = pd.DataFrame({'place':list_place, 'title':list_title, 'year':list_year, 'director':list_director, 'cast':list_cast, 'rating':list_rating})
# Manual inspection showed that the places of films 10 and 100 were parsed as 1 and 10.
# Fix the mistake manually, but only when the row actually exists: assigning
# through .at to a missing label would silently append a mostly-NaN row.
for row_label, correct_place in ((9, 10), (99, 100)):
    if row_label in df.index:
        df.at[row_label, 'place'] = correct_place
make_checkpoint()

In [9]:
# A first glance at the dataframe now (the rendered output elides the middle rows)
df

Unnamed: 0,place,title,year,director,cast,rating
0,1,The Shawshank Redemption,1994,Frank Darabont,"Tim Robbins, Morgan Freeman",9.23345899123353
1,2,The Godfather,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino",9.155461706140777
2,3,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger",8.984114552334855
3,4,The Godfather: Part II,1974,Francis Ford Coppola,"Al Pacino, Robert De Niro",8.983749845875042
4,5,12 Angry Men,1957,Sidney Lumet,"Henry Fonda, Lee J. Cobb",8.946315701280204
...,...,...,...,...,...,...
245,246,Aladdin,1992,Ron Clements,"Scott Weinger, Robin Williams",8.006990281544315
246,247,The Help,2011,Tate Taylor,"Emma Stone, Viola Davis",8.004884016636845
247,248,Beauty and the Beast,1991,Gary Trousdale,"Paige O'Hara, Robby Benson",8.004521139445142
248,249,Du rififi chez les hommes,1955,Jules Dassin,"Jean Servais, Carl Möhner",8.002177372842601


In [10]:
# Get box office income and more information from the Box Office Mojo website
# Create empty lists for the additional information columns
list_domestic = []
list_international = []
list_worldwide = []
list_genre = []
list_duration = []
list_distributor = []

# Class string of the summary table that holds the box office figures
summary_class = 'a-section a-spacing-none mojo-gutter mojo-summary-table'


def _first_match(pattern, text, default=0):
    """Return the first capture of `pattern` in `text`, or `default` when nothing matches."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def _money_to_digits(tag):
    """Return the text of a money <span> without the dollar sign and thousands separators."""
    return tag.text.replace('$', '').replace(',', '')


# Web scraping film detailed information from Box Office Mojo website of each film
for i in df.index:
    link = links[i]
    URL =f"https://www.boxofficemojo.com{link}?ref_=bo_se_r_1"
    # Bound the wait per request and stop with a clear HTTP error on a failed page
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    soup_body = str(soup.body)

    # Exactly one value per film must land in every list, otherwise the lists
    # no longer line up with df. Read only the first summary table and fall
    # back to zeros when it is absent or incomplete.
    summary = soup.find('div', attrs={'class': summary_class})
    earning = summary.find_all('span', attrs={'class': 'money'}) if summary is not None else []
    if len(earning) >= 3:
        domestic = _money_to_digits(earning[0])
        international = _money_to_digits(earning[1])
        worldwide = _money_to_digits(earning[2])
    else: # Some films are missing one or more numbers on box office income
        domestic = international = worldwide = 0
    list_domestic.append(domestic)
    list_international.append(international)
    list_worldwide.append(worldwide)

    # Stop the genre capture at the first tag or line break so that no trailing
    # markup is kept. Missing fields default to 0, matching the distributor.
    genre = _first_match(r'Genres</span><span>([^<\n]*)', soup_body)
    list_genre.append(genre)
    duration = _first_match(r'Running Time</span><span>(.*)</span></div><div class="a-section a-spacing-none"><span>Genres', soup_body)
    list_duration.append(duration)
    # Some films are missing distributor information
    distributor = _first_match(r'Domestic Distributor</span><span>(.*)<br/>', soup_body)
    list_distributor.append(distributor)

In [11]:
# Attach the scraped Box Office Mojo details to the data frame, one column per list
scraped_columns = {
    'genre': list_genre,
    'duration': list_duration,
    'domestic': list_domestic,
    'international': list_international,
    'worldwide': list_worldwide,
    'distributor': list_distributor,
}
for column_name, column_values in scraped_columns.items():
    df[column_name] = column_values
make_checkpoint()

In [12]:
# Clean the genre column.
# Some genre values were captured together with the markup that follows the
# genre name (e.g. 'Drama</span></div><div ...><span>'). Drop everything from
# the first closing span onwards: one regex covers every genre, not only a
# hand-picked few. Assigning the result back to the column avoids chained
# `inplace=True` on a column selection, which does not update `df` under
# pandas Copy-on-Write. Non-string entries are left untouched.
df['genre'] = df['genre'].replace(r'</span>.*$', '', regex=True)
make_checkpoint()

In [13]:
# Change units of box office income into thousands
renamed_money_columns = {'domestic':'domestic_k','international':'international_k', 'worldwide':'worldwide_k' }
money_columns = list(renamed_money_columns)
df[money_columns] = df[money_columns].astype(int)
for money_column in money_columns:
    df[money_column] = df[money_column]/1000

# Rename the columns so the names reflect the new unit
df = df.rename(columns = renamed_money_columns)
make_checkpoint()

In [14]:
# Look at the data frame after the box office columns were added and rescaled
df

Unnamed: 0,place,title,year,director,cast,rating,genre,duration,domestic_k,international_k,worldwide_k,distributor
0,1,The Shawshank Redemption,1994,Frank Darabont,"Tim Robbins, Morgan Freeman",9.23345899123353,Drama,2 hr 22 min,28767.189,117.315,28884.504,Columbia Pictures
1,2,The Godfather,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino",9.155461706140777,Crime,2 hr 55 min,136381.073,113960.743,250341.816,Paramount Pictures
2,3,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger",8.984114552334855,Action,2 hr 32 min,534987.076,471115.201,1006102.277,Warner Bros.
3,4,The Godfather: Part II,1974,Francis Ford Coppola,"Al Pacino, Robert De Niro",8.983749845875042,Crime,3 hr 22 min,47834.595,126.415,47961.010,Paramount Pictures
4,5,12 Angry Men,1957,Sidney Lumet,"Henry Fonda, Lee J. Cobb",8.946315701280204,Crime,1 hr 36 min,0.000,0.000,0.000,United Artists
...,...,...,...,...,...,...,...,...,...,...,...,...
245,246,Aladdin,1992,Ron Clements,"Scott Weinger, Robin Williams",8.006990281544315,Adventure,1 hr 30 min,217350.219,286700.000,504050.219,Walt Disney Studios Motion Pictures
246,247,The Help,2011,Tate Taylor,"Emma Stone, Viola Davis",8.004884016636845,Drama,2 hr 26 min,169708.112,46931.000,216639.112,Walt Disney Studios Motion Pictures
247,248,Beauty and the Beast,1991,Gary Trousdale,"Paige O'Hara, Robby Benson",8.004521139445142,Animation,1 hr 24 min,218967.620,186043.788,424967.620,Walt Disney Studios Motion Pictures
248,249,Du rififi chez les hommes,1955,Jules Dassin,"Jean Servais, Carl Möhner",8.002177372842601,Crime,1 hr 58 min,517.975,3.367,521.342,0


In [15]:
# Give every column its final dtype before converting to a Spark data frame
integer_columns = ['place', 'year']
text_columns = ['title', 'director', 'cast', 'genre', 'duration', 'distributor']

df[integer_columns] = df[integer_columns].astype(int)
# Ratings become floats rounded to four decimal places
df['rating'] = df['rating'].astype(float).round(decimals = 4)
df[text_columns] = df[text_columns].astype(str)
make_checkpoint()

In [16]:
# Convert the pandas data frame into a Spark data frame.
# No explicit schema is passed, so the column types are inferred from the data.
df_origin_spark = spark.createDataFrame(df)

In [17]:
# Print the column names, types and nullability of the Spark data frame
df_origin_spark.printSchema()

root
 |-- place: long (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- genre: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- domestic_k: double (nullable = true)
 |-- international_k: double (nullable = true)
 |-- worldwide_k: double (nullable = true)
 |-- distributor: string (nullable = true)



In [18]:
# Persist the Spark data frame in parquet format, replacing any previous export
parquet_writer = df_origin_spark.write.mode('overwrite')
parquet_writer.parquet("/project/Individual/parquet_files/origin.parquet")
make_checkpoint()