# Web Scraping

In [15]:
import requests
from bs4 import BeautifulSoup
import pprint as p
import pandas as pd
import numpy as np
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
import re

## Scraping data from Rotten Tomatoes

In [16]:
## 1. go into a page, grab all links, get a list of links
## 2. create a list_; loop over each links, create a list:  [name, year, score], append to list_
## 3.  turn this list_ into a dataframe; add index and column name 

def get_links(start,end):
    """Collect movie links from the Rotten Tomatoes yearly "best of" pages.

    One page is requested for every year in ``range(start, end)``, so ``end``
    is exclusive.

    Args:
        start: first year to fetch.
        end: year at which to stop (not fetched).

    Returns:
        A list of ``[name, year, url]`` lists, one per movie anchor found.
        The number of collected entries is printed before returning.
    """
    list_link_allyear = []
    for year in range(start, end):
        url = "https://www.rottentomatoes.com/top/bestofrt/?year=%s" % year
        html_data = requests.get(url)

        # Without this guard a failed request would leave `python_data` unbound
        # (first year) or still pointing at the previous year's page.
        if html_data.status_code != 200:
            print('skipping %s: HTTP %s' % (url, html_data.status_code))
            continue
        python_data = BeautifulSoup(html_data.content, 'lxml')

        for row in python_data.find_all('tr'):
            tag = row.find('a', class_='unstyled articleLink')
            if tag is None:
                continue  # table row without a movie link
            # Presumably the stripped anchor text reads "Title (YYYY)": the last
            # 7 characters are cut for the name, characters -5..-2 give the year.
            label = tag.get_text().strip('\n').strip()
            list_link_allyear.append([
                label[:-7],
                label[-5:-1],
                'https://www.rottentomatoes.com' + tag.get('href'),
            ])

    print(len(list_link_allyear))
    return list_link_allyear

In [17]:
def _value_or_nan(extract):
    """Return ``extract()``; fall back to NaN when the page lacks that element."""
    try:
        return extract()
    except (AttributeError, IndexError, TypeError):
        return np.nan


def get_movie(list_link):
    """Scrape the detail page of every movie in *list_link*.

    Args:
        list_link: iterable of ``[name, year, url]`` entries; only ``item[0]``
            (name) and ``item[2]`` (url) are read.

    Returns:
        One 12-item list per input entry, in the order
        ``[name, meter_score, audience_score, rating, genre, cast, director,
        time, studio, runtime, box_office, movie_synopsis]``. A field whose
        element is missing is NaN; a page that does not answer with status 200
        yields the name followed by eleven NaNs.
    """
    list_movie_info = []
    for item in list_link:
        movie_name = item[0]
        # Redirects are not followed, so a moved page shows up as a non-200 status.
        html_data = requests.get(item[2], allow_redirects=False)
        print(html_data)
        if html_data.status_code != 200:
            # Never reuse the previously parsed page for this movie.
            list_movie_info.append([movie_name] + [np.nan] * 11)
            continue
        python_data = BeautifulSoup(html_data.content, 'lxml')

        meter_score = _value_or_nan(
            lambda: python_data.find('span', class_="meter-value superPageFontColor")
            .find('span').get_text().strip('\n'))
        audience_score = _value_or_nan(
            lambda: python_data.find('div', class_="audience-score meter")
            .find('span', class_='superPageFontColor').get_text())
        movie_synopsis = _value_or_nan(
            lambda: python_data.find('div', class_="movie_synopsis clamp clamp-6")
            .get_text().strip('\n').strip().strip('\n'))

        # The metadata values are addressed purely by position in this list.
        # NOTE(review): pages with fewer "meta-value" entries will misalign the
        # positional fields below - confirm against the live markup.
        list_info = python_data.find_all('div', class_="meta-value")
        rating = _value_or_nan(lambda: list_info[0].get_text().strip('\n'))
        genre = _value_or_nan(
            lambda: [gen.get_text().strip('\n').strip()
                     for gen in list_info[1].find_all('a')])
        director = _value_or_nan(lambda: list_info[2].get_text().strip('\n'))
        # Named release_time so the imported `time` module is not shadowed.
        release_time = _value_or_nan(
            lambda: list_info[4].find('time').get_text().strip('\n'))
        studio = _value_or_nan(lambda: list_info[-1].get_text().strip('\n'))
        runtime = _value_or_nan(lambda: list_info[-2].get_text().strip('\n').strip())
        box_office = _value_or_nan(lambda: list_info[-3].get_text().strip('\n'))

        cast = _value_or_nan(
            lambda: [member.get_text().strip('\n').strip()
                     for member in python_data.find('div', class_="castSection")
                     .find_all('a', class_='unstyled articleLink')])

        list_movie_info.append([
            movie_name, meter_score, audience_score, rating, genre, cast,
            director, release_time, studio, runtime, box_office, movie_synopsis,
        ])
    return list_movie_info

In [None]:
# get_links iterates range(start, end), so the second argument is exclusive:
# these two calls cover 2000-2009 and 2010-2018.
link_2000_2009=get_links(2000,2010)
link_2010_2018=get_links(2010,2019)

In [None]:
# Scrape every 2000-2009 movie page and name the 12 fields get_movie returns per movie.
movie_2000_2009=get_movie(link_2000_2009)
df = pd.DataFrame(movie_2000_2009)
df.columns= ['name','meter_score','audience_score','rating','genre','cast',
             'director','time','studio','runtime','box_office','movie_synopsis']

In [None]:
# Same as above for the 2010-2018 links; column order matches the 2000-2009 frame.
movie_2010_2018=get_movie(link_2010_2018)
df_ = pd.DataFrame(movie_2010_2018)
df_.columns=['name','meter_score','audience_score','rating','genre','cast',
             'director','time','studio','runtime','box_office','movie_synopsis']

In [None]:
# Stack both frames (2010-2018 rows first) under a fresh 0..n-1 index.
df_movie=pd.concat([df_, df], ignore_index=True)

# Rows sharing a synopsis are treated as the same movie; the first one is kept,
# then the result is indexed by movie name.
df_movie_=df_movie.drop_duplicates(subset='movie_synopsis', keep='first', inplace=False)
df_movie_.set_index('name',inplace=True)

In [None]:
# Persist the merged, de-duplicated frame: `df` alone holds only the 2000-2009
# rows. reset_index() turns the 'name' index back into a regular column, so the
# file has an unnamed integer index column followed by 'name'.
df_movie_.reset_index().to_csv('raw_data.csv')

## Scraping actor scores

In [None]:
# Collect [rank, name] pairs from the ten grid pages of the IMDb actor list.
big_list = []
for page in range(1, 11):

    url = "https://www.imdb.com/list/ls058011111/?sort=list_order,asc&mode=grid&page=%s&ref_=nmls_vw_grd" % page
    html_data = requests.get(url)
    # A failed request must not fall through: `python_data` would be unbound on
    # the first page or still hold the previous page (duplicating its actors).
    if html_data.status_code != 200:
        print('skipping page %s: HTTP %s' % (page, html_data.status_code))
        continue
    python_data = BeautifulSoup(html_data.content, 'lxml')
    actor_tag = python_data.find_all('div', class_="lister-item mode-grid")

    for entry in actor_tag:
        title = entry.find('div', class_="title")
        # The rank text carries two trailing characters that are not part of
        # the number; they are cut off here.
        actor_rank = title.find('span').get_text()[:-2]
        actor_name = title.find('a').get_text()
        # replace() also removes a separator inside the number, which strip(',')
        # cannot do. NOTE(review): assumes ranks may be rendered like "1,000".
        big_list.append([actor_rank.replace(',', ''), actor_name.strip('\n')])

In [None]:
# Tabulate the scraped actors and derive a score that falls by one point for
# every ten rank positions (100 - rank // 10), then save it.
df = pd.DataFrame(big_list)
df.columns = ['rank', 'actor']
df['actor_score'] = df['rank'].apply(lambda rank: 100 - int(rank) // 10)
df.to_csv('actor_score.csv')

## Scraping movie reviews

In [1]:
def get_review(link, max_retries=3):               #returns all top critic reviews for one movie
    """Return the text of every review listed on a movie's review page.

    Args:
        link: URL of the review page.
        max_retries: how many extra fetches to attempt when the response does
            not contain the review table. Bounded so that a page which never
            contains it cannot recurse or loop forever.

    Returns:
        A list of review strings; empty when the request does not answer with
        status 200 or the review table is never found.
    """
    for _ in range(max_retries + 1):
        response = requests.get(link)
        if response.status_code != 200:
            return []
        result_page = BeautifulSoup(response.content, 'lxml')
        try:
            all_review = result_page.find('div', class_='review_table').find_all('div', class_='the_review')
        except AttributeError:
            # find() returned None: the table is missing from this response.
            continue
        return [review.get_text() for review in all_review]
    return []

In [2]:
def get_movie_review(link,year,name,max_retries=3):       #returns reviews of all top movies for one year
    """Locate a movie's review page and collect its reviews.

    Args:
        link: URL of the movie's main page.
        year: year the movie is listed under; passed through to the result.
        name: movie title; passed through to the result.
        max_retries: how many extra fetches to attempt when the link to the
            review page cannot be found in the response.

    Returns:
        ``[name, year, reviews]`` where ``reviews`` is a list of strings. The
        list is empty when the page does not answer with status 200 or the
        review link is never found, so callers always receive a 3-item row.
    """
    for _ in range(max_retries + 1):
        response1 = requests.get(link)
        if response1.status_code != 200:
            break
        result_page1 = BeautifulSoup(response1.content,'lxml')
        body = result_page1.find('section', id="contentReviews")
        try:
            sibody = body.find('a',class_='small unstyled subtle articleLink')
            # The wanted anchor is two siblings after the one matched above.
            siibody = sibody.next_sibling.next_sibling.get('href')
            review_link = 'https://www.rottentomatoes.com' + siibody
        except (AttributeError, TypeError):
            # Section, anchor or href missing in this response: fetch again.
            continue
        return [name, year, get_review(review_link)]

    return [name, year, []]

In [None]:
# Gather [name, year, reviews] rows for every movie on the 2014-2018 top lists.
all_reviews = []
for year in range(2014,2019):                     #getting 5 years of reviews
    page_link = 'https://www.rottentomatoes.com/top/bestofrt/?year='+str(year)
    init_response = requests.get(page_link)
    if init_response.status_code != 200:
        continue
    topmovie_page = BeautifulSoup(init_response.content,'lxml')
    movie_table = topmovie_page.find('table',class_='table')
    if movie_table is None:
        continue  # page came back without the ranking table
    for movie in movie_table.find_all('a',class_="unstyled articleLink"):
        # https, like every other Rotten Tomatoes URL built in this notebook
        movie_link = 'https://www.rottentomatoes.com' + movie.get('href')
        # Same title parsing as get_links: strip whitespace, then cut the
        # trailing " (YYYY)" instead of relying on a fixed leading offset.
        movie_name = movie.get_text().strip('\n').strip()[:-7]
        #time.sleep(2)
        review = get_movie_review(movie_link,year,movie_name)
        all_reviews.append(review)

In [None]:
import csv
# newline='' hands line-ending control to the csv module (otherwise blank rows
# appear on platforms that translate '\n'); UTF-8 keeps non-ASCII review text
# writable regardless of the platform default encoding.
# Each row's third cell is a list, which csv.writer stores via str().
with open("all_reviews.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(['Movie_Name','Year','Reviews'])
    writer.writerows(all_reviews)

# Data preprocessing

In [3]:
import pandas as pd
import numpy as np
import re
import math
from datetime import datetime

In [5]:
# Load the scraped movie table and preview its first rows.
filename = 'raw_data.csv'
df_origin = pd.read_csv(filename)
df_origin.head()

Unnamed: 0,name,meter_score,audience_score,rating,genre,cast,director,time,studio,runtime,box_office,movie_synopsis
0,Toy Story 3,98,89%,G,"['Animation', 'Comedy', 'Kids & Family', 'Scie...","['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned...",Lee Unkrich,"Jun 18, 2010",Walt Disney Pictures,103 minutes,"$414,984,497","""Toy Story 3"" welcomes Woody, Buzz and the who..."
1,True Grit,96,85%,PG-13 (for some intense sequences of western v...,"['Action & Adventure', 'Drama', 'Western']","['Hailee Steinfeld', 'Jeff Bridges', 'Matt Dam...","Joel Coen, \n \n ...","Dec 22, 2010",Paramount Pictures,110 minutes,"$171,031,347",Fourteen-year-old Mattie Ross (Hailee Steinfel...
2,How to Train Your Dragon,98,91%,PG (for sequences of intense action and some s...,"['Animation', 'Kids & Family', 'Science Fictio...","['Jay Baruchel', 'Gerard Butler', 'Craig Fergu...","Dean DeBlois, Chris Sanders","Mar 26, 2010",Paramount/DWA,98 minutes,"$216,900,000",The son of a Viking chief must capture a drago...
3,The King's Speech,95,92%,PG-13 (for some language),['Drama'],"['Colin Firth', 'Helena Bonham Carter', 'Geoff...",Tom Hooper,"Nov 26, 2010",The Weinstein Company,118 minutes,"$138,300,000",After the death of his father King George V (M...
4,Inside Job,98,91%,PG-13 (for some drug and sex-related material),"['Documentary', 'Special Interest']","['Matt Damon', 'William Ackman', 'Daniel Alper...",Charles Ferguson (III),"Oct 8, 2010",Sony Pictures Classics,108 minutes,"$4,311,834","From Academy Award (R) nominated filmmaker, Ch..."


## Clean data

### examine null value

In [6]:
# how='all' removes only rows in which every column is missing.
df = df_origin.dropna(axis=0,how='all')
# fraction of rows that survived
len(df) / len(df_origin)

1.0

### examine and convert data types

In [4]:
print(df['meter_score'].describe())
# meter_score is already numeric
# no outliers

count    1735.000000
mean       88.605187
std         7.329474
min        57.000000
25%        84.000000
50%        90.000000
75%        94.000000
max       100.000000
Name: meter_score, dtype: float64


In [8]:
print(df['audience_score'].describe())
# audience_score is stored as str (e.g. "86%")

# drop the trailing '%' and convert audience_score from str to int for convenience
df['audience_score'] = df['audience_score'].apply(lambda x: int(x[:-1]))

count     1735
unique      62
top        86%
freq        84
Name: audience_score, dtype: object


### set rating column as categorical data

In [6]:
# set the rating column as categorical data: keep the leading rating code and
# drop the explanation in brackets that follows it
def set_rating(pattern, string):
    """Return the part of *string* that *pattern* matches at its start.

    Args:
        pattern: regular expression applied with ``re.match``.
        string: raw rating text, e.g. ``"PG-13 (for some language)"``.

    Returns:
        The matched prefix. Input that is not a string (such as NaN), or a
        string the pattern does not match, is returned unchanged instead of
        raising.
    """
    if not isinstance(string, str):
        return string
    res = re.match(pattern, string)
    return res.group() if res else string

# leading run of word characters and hyphens, e.g. the "PG-13" in "PG-13 (for ...)"
pattern = r'[\w-]+'
df['rating'] = df['rating'].apply(lambda x: set_rating(pattern,x))

### set genre as string

In [7]:
# drop the first and last character (the surrounding '[' and ']') of the
# stringified genre list
df['genre'] = df['genre'].apply(lambda x: x[1:-1])

### set cast as string

In [8]:
# remove the '[]' outside the cast 
def set_cast(string):
    """Turn a stringified cast list into a bare comma-separated string.

    Args:
        string: cell value from the ``cast`` column - a string such as
            ``"['A', 'B', ...]"`` or a float NaN when the cast is missing.

    Returns:
        For a string: the text without its first and last character and
        without its final 11 characters. For anything else (NaN, None,
        numbers): the placeholder ``'nan'``.
    """
    if isinstance(string, str):
        # NOTE(review): the 11-character cut presumably removes a fixed-width
        # trailing non-name entry - confirm against raw_data.csv.
        return string[1:-1][:-11]
    return 'nan'

# normalise every cast cell with set_cast
df['cast'] = df['cast'].apply(set_cast)

### clean cast data

In [9]:
# drop the last character of every cast string
# NOTE(review): this also shortens the 'nan' placeholder that set_cast returns
# for a missing cast to 'na' - confirm that is acceptable.
df['cast'] = df['cast'].apply(lambda x: x[:-1])

### clean director column

In [10]:
def set_director(string):
    """Format a raw director cell as quoted, comma-separated names.

    The value is converted to ``str``, stripped and split on ``','``. Only
    pieces longer than 20 characters are stripped of surrounding whitespace;
    shorter pieces keep theirs (so a second name may retain its leading
    space). The result is the ``repr`` of that list without the brackets.
    """
    pieces = [
        piece.strip() if len(piece) > 20 else piece
        for piece in str(string).strip().split(',')
    ]
    quoted = str(pieces)
    return quoted[1:-1]

# reformat every director cell with set_director
df['director'] = df['director'].apply(set_director)
# df['director'] = df['director'].apply(lambda x:x[1:-1])

### reset time column

In [11]:
# convert time from str into datetime type
# clean unformatted records
def set_date(string):
    """Parse a release-date cell such as ``"Jun 18, 2010"``.

    Args:
        string: cell value from the ``time`` column (a string, or a float NaN
            when the date is missing).

    Returns:
        A ``datetime`` for strings shorter than 40 characters that match
        ``"%b %d, %Y"``; ``None`` for non-strings, for strings of 40 or more
        characters, and for strings that do not match the format.
    """
    if not isinstance(string, str) or len(string) >= 40:
        return None
    try:
        return datetime.strptime(string, "%b %d, %Y")
    except ValueError:
        # Unformatted record: report it as missing so it is dropped with the
        # other empty dates instead of aborting the whole column conversion.
        return None

# parse every release-date cell with set_date
df['time'] = df['time'].apply(set_date)

### drop empty time records

In [12]:
# report how many rows have no release date, then discard those rows
missing_time = df['time'].isnull()
print(len(df[missing_time]))
df = df[~missing_time]

23


### clean studio column

In [13]:
# split each studio cell on '/' and store the parts as a quoted,
# comma-separated string (the repr of the list without its brackets)
# clean unformatted records
df['studio'] = df['studio'].apply(lambda x: str(x.strip().split('/'))[1:-1])

In [14]:
# some studio names are abbreviations
# be careful!
# rows labelled 1 and 2: a single studio versus a '/'-separated pair
print(df['studio'][1])
print(df['studio'][2])

'Paramount Pictures'
'Paramount', 'DWA'


### reset runtime column

In [15]:
# drop the last 8 characters (the " minutes" suffix); the value stays a string
df['runtime'] = df['runtime'].apply(lambda x: x[:-8])

### reset boxoffice column

In [16]:
# clean unformatted boxoffice records
def set_boxoffice(string):
    """Convert a box-office cell such as ``"$414,984,497"`` to an int.

    Args:
        string: cell value from the ``box_office`` column.

    Returns:
        The amount as ``int`` when the value is a string starting with ``'$'``
        followed by digits and commas; NaN for every other input, so missing
        and unformatted records are marked the same way.
    """
    if isinstance(string, str) and string.startswith('$'):
        try:
            return int(string[1:].replace(',', ''))
        except ValueError:
            pass  # '$' followed by something that is not a number
    return np.nan

# convert every box-office cell with set_boxoffice
df['box_office'] = df['box_office'].apply(set_boxoffice)

In [17]:
# keep only the rows that have a box-office figure
df = df[df['box_office'].notnull()]

## Add features

### add actor scores

In [18]:
# Score each movie by summing the scores of its listed cast members.
df_actor = pd.read_csv('actor_score.csv')

# Lookup from actor name to score; the first character of each stored name is
# dropped before it is used as the key.
actor_dic = {
    stored_name[1:]: points
    for stored_name, points in df_actor[['actor','actor_score']].values
}

# Each cast entry is wrapped in quotes, hence the [1:-1]; actors missing from
# the lookup contribute 0.
actor_score = [
    sum(actor_dic.get(member[1:-1], 0) for member in casts.split(', '))
    for casts in df['cast']
]

df['actor_score'] = np.array(actor_score)

### add director scores

In [19]:
# Score each movie by the rounded mean score of its directors.
df_director = pd.read_csv('director_score_new.csv')

# Lookup from director name to score.
director_dic = {
    listed_name: points
    for listed_name, points in df_director[['director','score']].values
}

director_score = []
for directors in df['director']:
    quoted_names = directors.split(', ')
    total = 0
    for quoted in quoted_names:
        # An entry such as "' Chris Sanders'" has a space right after the
        # opening quote; rebuild it without that space before the lookup.
        if quoted[1] == ' ':
            quoted = "\'" + quoted[2:-1] + "\'"
        total += director_dic.get(quoted[1:-1], 0)
    director_score.append(round(total / len(quoted_names)))

df['director_score'] = np.array(director_score)

## save update dataframe

In [23]:
# preview the cleaned frame, including the two added score columns
df.head()

Unnamed: 0,name,meter_score,audience_score,rating,genre,cast,director,time,studio,runtime,box_office,movie_synopsis,actor_score,director_score
0,Toy Story 3,98,89,G,"'Animation', 'Comedy', 'Kids & Family', 'Scien...","'Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Ned ...",'Lee Unkrich',2010-06-18,'Walt Disney Pictures',103,414984497.0,"""Toy Story 3"" welcomes Woody, Buzz and the who...",233,138
1,True Grit,96,85,PG-13,"'Action & Adventure', 'Drama', 'Western'","'Hailee Steinfeld', 'Jeff Bridges', 'Matt Damo...","'Joel Coen', 'Ethan Coen'",2010-12-22,'Paramount Pictures',110,171031347.0,Fourteen-year-old Mattie Ross (Hailee Steinfel...,273,0
2,How to Train Your Dragon,98,91,PG,"'Animation', 'Kids & Family', 'Science Fiction...","'Jay Baruchel', 'Gerard Butler', 'Craig Fergus...","'Dean DeBlois', ' Chris Sanders'",2010-03-26,"'Paramount', 'DWA'",98,216900000.0,The son of a Viking chief must capture a drago...,191,0
3,The King's Speech,95,92,PG-13,'Drama',"'Colin Firth', 'Helena Bonham Carter', 'Geoffr...",'Tom Hooper',2010-11-26,'The Weinstein Company',118,138300000.0,After the death of his father King George V (M...,281,403
4,Inside Job,98,91,PG-13,"'Documentary', 'Special Interest'","'Matt Damon', 'William Ackman', 'Daniel Alpert...",'Charles Ferguson (III)',2010-10-08,'Sony Pictures Classics',108,4311834.0,"From Academy Award (R) nominated filmmaker, Ch...",95,0


In [25]:
# write the cleaned frame (with its index) to disk
df.to_csv('cleaned_data.csv')