# **MODS 207 Notebook:** Web Scraping IMDb
* **Group members :** Farah Jabri, Lauryne Moyse


## Imports

In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json

## Preliminary cleaning

In [7]:
# Load the awards spreadsheet into the working DataFrame.
AWARDS_XLSX = '../award_until2019.xlsx'
df = pd.read_excel(AWARDS_XLSX)

In [8]:
# Preview the first three rows to check that the spreadsheet loaded as expected.
df.head(3)

Unnamed: 0,year,movie_title,imdbid,date_nomination,date_award,nom_actor,nom_actress,nom_anime,nom_foreign,nom_direct,...,award_actor,award_actress,award_anime,award_foreign,award_direct,award_doc,award_pict,award_screen,tot_award,total
0,1995,Leaving Las Vegas,tt0113627,1996-02-13,1996-03-25,1,1,0,0,1,...,1,0,0,0,0,0,0,0,1,8.0
1,1995,Mr. Holland's Opus,tt0113862,1996-02-13,1996-03-25,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,1995,Nixon,tt0113987,1996-02-13,1996-03-25,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [9]:
# Build, for every movie, the URL of its IMDb review page
# (most recent reviews first, no rating filter).
IMDB_TITLE_PREFIX = 'https://www.imdb.com/title/'
REVIEWS_QUERY = '/reviews?sort=submissionDate&dir=desc&ratingFilter=0'
df['url'] = IMDB_TITLE_PREFIX + df['imdbid'] + REVIEWS_QUERY

In [10]:
# Recompute 'total' as nominations + awards; this overwrites the 'total'
# column that came with the spreadsheet.
df['total'] = df['tot_nom'].add(df['tot_award'])

In [11]:
# Use 'movie_id' as the key name so it matches the scraped table's key.
df = df.rename(columns={'imdbid': 'movie_id'})

In [12]:
# List the columns to confirm the rename and the new 'url' / 'total' columns.
df.columns

Index(['year', 'movie_title', 'movie_id', 'date_nomination', 'date_award',
       'nom_actor', 'nom_actress', 'nom_anime', 'nom_foreign', 'nom_direct',
       'nom_doc', 'nom_pict', 'nom_screen', 'tot_nom', 'award_actor',
       'award_actress', 'award_anime', 'award_foreign', 'award_direct',
       'award_doc', 'award_pict', 'award_screen', 'tot_award', 'total', 'url'],
      dtype='object')

## Scraping

For the scraping : 
* for each movie retrieve : 
        • the number of reviews                        
        <div><span>1&nbsp;790 Reviews</span></div>; 
* for each review retrieve :
        • the date
        <span class="review-date">10 May 2023</span>
        • the grade
        <span class="rating-other-user-rating"><span>9</span><span class="point-scale">/10</span>
        • the author + the link of its profile
        <span class="display-name-link"><a href="/user/ur144384572/?ref_=tt_urv">MoviesGetBetterWithAge_Expert</a></span>
        • the title
        <a href="/review/rw9046423/?ref_=tt_urv" class="title"> A Wild and Entertaining Ride Through Excess</a>
        • the text content  
        <div class="content">
* on the page of each author retrieve :
        • the date of registration
        <div class="timestamp">IMDb member since November 2021</div>
        • the number of reviews issued on the website
        <div class="see-more"><a href="/user/ur144384572/comments">See all 3&nbsp;144 reviews</a> »</div>
        • the description ?
        <div class="toggle-overflow biography markdown">Hi there, I'm a passionate movie lover etc</div>
* There is a "Load More" button that displays the next reviews : • <button class="ipl-load-more__button" data-target-container="reviews-container" id="load-more-trigger">Load More</button>



### Preliminary scraping : html codes 
The reviews of a movie are not all displayed at once: the "Load More" button has to be clicked a number of times to reveal them all. We automated this with Puppeteer (JavaScript) and retrieved the resulting HTML source of the pages.

### Scraping of the data

Functions that retrieve the information that we need :

In [1]:
def get_nb_reviews(soup):
  """Return the text of the first <span> inside the first <div> of `soup`.

  The lookup is purely positional. In the scraped pages this span is
  presumably the review counter (e.g. "1 790 Reviews") -- confirm against
  the HTML dump. The string 'None' is returned when either the <div> or
  the <span> cannot be found.
  """
  first_div = soup.find('div')
  if not first_div:
    return 'None'

  counter = first_div.find('span')
  if not counter:
    return 'None'

  return counter.text

def get_rating(reviews):
  """Return the rating text of each review, in order.

  A review without a 'rating-other-user-rating' span (some reviews carry
  no grade) contributes the string 'None', so the result always has one
  entry per review.
  """
  ratings = []
  for review in reviews:
    rating_span = review.find('span', class_='rating-other-user-rating')
    ratings.append(rating_span.text if rating_span else 'None')
  return ratings

def get_date(reviews):
  """Return the text of each review's 'review-date' span, in order.

  Raises AttributeError if a review has no such span.
  """
  return [review.find('span', class_='review-date').text for review in reviews]

def get_author(reviews):
  """Return the text of each review's 'display-name-link' span, in order.

  Raises AttributeError if a review has no such span.
  """
  return [review.find('span', class_='display-name-link').text for review in reviews]

def get_author_link(reviews):
  """Return the href of each review author's profile link, in order.

  For every review, the 'display-name-link' span is located and the
  'href' attribute of the <a> inside it is returned unchanged.

  Raises AttributeError / TypeError if a review has no such span or the
  span contains no <a>.
  """
  # The original called `i.find_`, which is not a BeautifulSoup method,
  # so the function failed on every call; `find` is the intended lookup.
  name_spans = [review.find('span', class_='display-name-link') for review in reviews]
  return [span.find('a')['href'] for span in name_spans]
  
def get_title(reviews):
  """Return the text of each review's <a class="title"> link, in order.

  Raises AttributeError if a review has no such link.
  """
  return [review.find('a', class_='title').text for review in reviews]

def get_text(reviews):
  """Return the body of each review, each prefixed with 'starthere '.

  The body is the text of the 'text show-more__control' div. A review
  without that div contributes 'starthere No review', so the result
  always has one entry per review.
  """
  bodies = []
  for review in reviews:
    body = review.find('div', class_='text show-more__control')
    bodies.append('starthere ' + (body.text if body else 'No review'))
  return bodies

def get_id(soup):
  """Return the movie id taken from the 'lister-controls' form of `soup`.

  The id is the third '/'-separated piece of the form's 'action'
  attribute (e.g. '/title/tt0113627/reviews' -> 'tt0113627').

  Returns the string 'None' when the form is missing, like the other
  helpers do for missing elements. The original raised UnboundLocalError
  in that case because `res` was never assigned.
  """
  form = soup.find('form', attrs={'name': 'lister-controls'})
  if not form:
    return 'None'
  return form['action'].split('/')[2]

In [2]:
#General function to retrieve all the information
def get_info(results):
    """Extract the review data of every movie fragment into a DataFrame.

    Parameters
    ----------
    results : iterable of str
        HTML fragments, one per movie.

    Returns
    -------
    pandas.DataFrame
        One row per fragment with the columns 'nb_reviews', 'ratings',
        'dates', 'authors', 'author_links', 'review_titles', 'texts' and
        'movie_id'. Apart from 'nb_reviews' and 'movie_id', every cell
        holds a list with one entry per review found in the fragment.
    """
    nb_reviews = []
    rating = []
    date = []
    author = []
    author_link = []
    review_title = []
    review_text = []
    movie_id = []

    for i, movie in enumerate(results):

        print(i)  # progress indicator: index of the fragment being parsed

        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so the parsed tree could
        # differ between machines. 'html.parser' is what get_info2 uses.
        soup = BeautifulSoup(movie, 'html.parser')
        reviews = soup.find_all('div', class_='review-container')

        nb_reviews.append(get_nb_reviews(soup))
        rating.append(get_rating(reviews))
        date.append(get_date(reviews))
        author.append(get_author(reviews))
        author_link.append(get_author_link(reviews))
        review_title.append(get_title(reviews))
        review_text.append(get_text(reviews))
        movie_id.append(get_id(soup))

    # Local name chosen so it does not shadow the notebook-level `df`.
    reviews_df = pd.DataFrame({'nb_reviews':nb_reviews, 'ratings':rating, 'dates':date, 'authors':author,
                               'author_links':author_link, 'review_titles':review_title,
                               'texts':review_text, 'movie_id':movie_id})

    return reviews_df

### We can now work on the different html files that we have and retrieve the relevant information in the codes

Some films may have been skipped during scraping. We are therefore going to create an independent database then make a join with the one we already have according to the primary key 'movie_id'.

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews.html') as f:
    content = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results = content.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews.html into a table with one row per fragment.
db1 = get_info(results)

In [None]:
# Quick look at the first three parsed movies.
db1.head(3)

#### We do exactly the same for the 7 other html files

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews2.html') as f:
    content2 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results2 = content2.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews2.html into a table with one row per fragment.
db2 = get_info(results2)

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews3.html') as f:
    content3 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results3 = content3.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews3.html into a table with one row per fragment.
db3 = get_info(results3)

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews4.html') as f:
    content4 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results4 = content4.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews4.html into a table with one row per fragment.
db4 = get_info(results4)

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews5.html') as f:
    content5 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results5 = content5.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews5.html into a table with one row per fragment.
db5 = get_info(results5)

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews6.html') as f:
    content6 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results6 = content6.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews6.html into a table with one row per fragment.
db6 = get_info(results6)

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews7.html') as f:
    content7 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results7 = content7.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews7.html into a table with one row per fragment.
db7 = get_info(results7)

In [None]:
# Load the raw HTML dump of the scraped review pages.
with open('reviews8.html') as f:
    content8 = f.read()

# Cut the dump into one fragment per movie (the whole file is too big to
# handle at once); whatever precedes the first header marker is discarded.
results8 = content8.split('<div class="header">')[1:]

In [None]:
# Parse every fragment of reviews8.html into a table with one row per fragment.
db8 = get_info(results8)

### Now, we concatenate the databases together

In [None]:
# Stack the eight per-file tables into a single table of scraped movies.
dbs = [db1, db2, db3, db4, db5, db6, db7, db8]
# Each dbN carries its own 0..n-1 index; ignore_index=True renumbers the rows
# so the combined table does not end up with duplicate index labels.
temp_database = pd.concat(dbs, ignore_index=True)

### Finally, we join the two main databases

In [None]:
# Inner join on the shared key: only movies present in both the awards table
# and the scraped table are kept.
final_imdb_db = df.merge(temp_database, how='inner', on='movie_id')

In [None]:
# Remove the leftover index columns that appear when both joined tables carry
# an 'Unnamed: 0' column (e.g. after being reloaded from CSV files).
# errors='ignore' keeps the cell from raising KeyError when the tables were
# built in this session and therefore have no such columns.
final_imdb_db = final_imdb_db.drop(
    columns=["Unnamed: 0_x", "Unnamed: 0_y"], errors='ignore'
)

In [None]:
# Number of rows left after the join.
len(final_imdb_db)

650

In [None]:
# Check which columns the merged table ended up with.
final_imdb_db.columns

In [None]:
# Preview the first three rows of the merged table.
final_imdb_db.head(3)

In [None]:
# Save the merged table to CSV.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if that extra column is not wanted when reloading the file.
final_imdb_db.to_csv('imdb_database_not_clean.csv')

########################################

## Retrieving the indexes of the spoiler reviews

In [46]:
def get_index(results):
    """Return, for each movie fragment, the positions of its spoiler reviews.

    Parameters
    ----------
    results : iterable of str
        HTML fragments, one per movie.

    Returns
    -------
    list of list of int
        One inner list per fragment. Each inner list holds the 0-based
        positions, among that fragment's 'review-container' divs, of the
        reviews that contain a 'spoiler-warning' span.
    """
    indexes = []

    for count, movie in enumerate(results):

        print(count)  # progress indicator: index of the fragment being parsed

        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so the parsed tree could
        # differ between machines.
        soup = BeautifulSoup(movie, 'html.parser')
        containers = soup.find_all('div', class_='review-container')

        indexes.append([
            position
            for position, review in enumerate(containers)
            if review.find('span', class_='spoiler-warning')
        ])

    return indexes

In [47]:
def _load_movie_fragments(path, marker='<div class="header">'):
    """Read an HTML dump and return its per-movie fragments.

    The file content is split on `marker`; the piece before the first
    marker is dropped, so each returned string is the text that follows
    one marker.
    """
    with open(path, 'r') as f:
        content = f.read()
    return content.split(marker)[1:]


# One list of fragments per dump file (replaces eight copy-pasted stanzas).
results = _load_movie_fragments('reviews.html')
results2 = _load_movie_fragments('reviews2.html')
results3 = _load_movie_fragments('reviews3.html')
results4 = _load_movie_fragments('reviews4.html')
results5 = _load_movie_fragments('reviews5.html')
results6 = _load_movie_fragments('reviews6.html')
results7 = _load_movie_fragments('reviews7.html')
results8 = _load_movie_fragments('reviews8.html')

In [None]:
# Spoiler positions for each of the eight dumps, processed in file order.
index1, index2, index3, index4, index5, index6, index7, index8 = [
    get_index(fragments)
    for fragments in (results, results2, results3, results4,
                      results5, results6, results7, results8)
]

In [65]:
# Chain the eight per-file lists into one list with one entry per movie.
indexes = [*index1, *index2, *index3, *index4, *index5, *index6, *index7, *index8]
# Wrap the list in a single-column DataFrame so it can be saved.
df_spoilers = pd.DataFrame(dict(id=indexes))


In [69]:
# Display the table: one row per movie, 'id' holds that movie's spoiler positions.
df_spoilers

Unnamed: 0,id
0,"[8, 10, 13, 18, 21, 23, 30, 34, 44, 47, 63, 67..."
1,"[0, 1, 2, 4, 6, 10, 12, 25, 28, 34, 43, 44, 45..."
2,"[11, 12, 20, 26, 28, 33, 39, 40, 45, 54, 55, 6..."
3,"[2, 11, 12, 13, 33, 44, 46, 49, 50, 56, 61, 62..."
4,"[4, 7, 11, 22, 29, 31, 32, 37, 49, 61, 71, 77,..."
...,...
645,"[6, 33, 36, 43, 63]"
646,"[0, 2, 3, 5, 6, 11, 12, 13, 15, 16, 18, 22, 24..."
647,"[2, 8, 20, 21, 26, 32, 42, 47, 49, 57, 76, 79,..."
648,"[5, 8, 9, 21, 42, 68, 95, 107, 108, 122, 136, ..."


In [70]:
# Save the spoiler positions as CSV (note: the file name has no .csv extension).
df_spoilers.to_csv('spoilers_id')












## Scraping of the authors info

In [None]:
#Changing the format to obtain the authors url 
def author_url(serie):
  """Turn relative IMDb profile links into absolute profile URLs.

  Parameters
  ----------
  serie : iterable of iterables of str
      One collection of relative links (e.g. '/user/ur144384572/') per row.
      A pandas Series of lists works as well as a plain list of lists.

  Returns
  -------
  list of list of str
      Same shape as the input; every link becomes
      'https://www.imdb.com' + <path> + '?ref_=tt_urv'. The input is left
      untouched.

  The original returned from inside the loop, so it converted only the
  first row and returned None, which the caller then stored in the column.
  """
  base = 'https://www.imdb.com'
  suffix = '?ref_=tt_urv'
  # Scraped hrefs may already end with '?ref_=tt_urv'; strip any existing
  # query string first so the suffix is not appended twice.
  return [[base + link.split('?', 1)[0] + suffix for link in links]
          for links in serie]
# NOTE(review): get_info stores the links under 'author_links' and the merged
# table is `final_imdb_db`; confirm that `df` really holds an 'authors_links'
# column at this point before running this cell.
df['authors_links'] = author_url(df['authors_links'])

In [None]:
def get_starts(soup):
  """Return the text of the page's 'timestamp' div, or 'None' if it is absent."""
  timestamp = soup.find('div', class_='timestamp')
  return timestamp.text if timestamp else 'None'

def get_nb_comments(soup):
  """Return the text of the page's 'see-more' div, or 'None' if it is absent."""
  see_more = soup.find('div', class_='see-more')
  return see_more.text if see_more else 'None'

def get_description(soup):
  """Return the text of the page's biography div, or 'None' if it is absent."""
  biography = soup.find('div', class_='toggle-overflow biography markdown')
  return biography.text if biography else 'None'

In [None]:
def get_info2(timeout=30):
    """Scrape every review author's profile page and store the results in `df`.

    For each row of the notebook-level `df`, every URL in its
    'authors_links' list is fetched and get_starts, get_nb_comments and
    get_description are applied to the page. The per-row result lists are
    written back to `df` as the columns 'start_dates', 'nb_comments' and
    'descriptions'.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a single
        stalled connection blocks the whole run indefinitely.

    Returns
    -------
    None
        `df` is modified in place.
    """
    # `headers` is not defined in any visible cell of the notebook; use it when
    # it exists and fall back to the requests defaults instead of a NameError.
    request_headers = globals().get('headers')

    start_dates = []
    nb_comments = []
    descriptions = []

    # Iterate over the values rather than looking rows up by label, so the
    # loop also works when the index of `df` is not 0..n-1.
    for i, links in enumerate(df['authors_links']):
      print(i)  # progress indicator: row being processed
      i_start_dates = []
      i_nb_comments = []
      i_descriptions = []

      for link in links:
          #Loading the content of the page
          try:
              page = requests.get(link, headers=request_headers, timeout=timeout)
          except requests.RequestException as exc:
              # Keep the three lists aligned with the links and carry on: one
              # unreachable profile should not discard everything scraped so far.
              print(f'Request failed for {link}: {exc}')
              i_start_dates.append('None')
              i_nb_comments.append('None')
              i_descriptions.append('None')
              continue
          soup = BeautifulSoup(page.content, 'html.parser')
          #Retrieving the information of authors who commented the movie
          i_start_dates.append(get_starts(soup))
          i_nb_comments.append(get_nb_comments(soup))
          i_descriptions.append(get_description(soup))

      start_dates.append(i_start_dates)
      nb_comments.append(i_nb_comments)
      descriptions.append(i_descriptions)

    #Add the information to the database
    df['start_dates'] = start_dates
    df['nb_comments'] = nb_comments
    df['descriptions'] = descriptions
    return

In [None]:
# Run the author scraping: one HTTP request per author link, so this cell is
# slow (the saved output below shows it was interrupted by hand).
get_info2()

0
1
2
3
4


KeyboardInterrupt: ignored