In [2]:
from bs4 import BeautifulSoup
import lxml
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait       
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
import time
import pandas as pd 
import pymysql

# Preparing Information 

In [4]:
# Load the scraped movie table and split its columns into parallel lists,
# one entry per movie, aligned by index.
titles_df = pd.read_csv('Letterboxd_scrape_first500_final.csv')
links, titles, ratings, synopses = (
    titles_df[column].tolist()
    for column in ('Link', 'Title', 'Rating', 'Synopsis')
)
# Years are coerced to plain ints, whatever dtype pandas inferred for the column.
years = [int(year) for year in titles_df['Year'].tolist()]

# Webscraping Reviews 
### An example line of code to run this (`start` is the first movie index, `end` is one past the last):
### parsing_through(titles, years, links, ratings, synopses, 1987, 2000)


In [3]:
# give lists of all information from csv, plus the index range of movies to process
# function walks the movie lists, collects each movie's reviews and stores them in MySQL
def parsing_through(titles, years, links, ratings, synopses, start, end):
    """Scrape and store reviews for the movies at indexes ``start`` .. ``end - 1``.

    For every index in ``range(start, end)`` the movie's link is passed to
    ``collect_reviews`` and one row per returned review is inserted into the
    ``movie_reviews`` table with ``INSERT IGNORE``. The work is committed after
    each movie, so an interruption keeps the movies already processed.

    Parameters
    ----------
    titles, years, links, synopses : list
        Parallel lists; element ``i`` of each describes the same movie.
    ratings : list
        Accepted for interface compatibility; not used by this function.
    start, end : int
        Half-open index range of the movies to process.

    Raises
    ------
    IndexError
        If the range reaches past the end of one of the lists that is read.
    Any exception raised by ``collect_reviews`` or by the database driver is
    propagated after the cursor and connection have been closed.
    """
    # NOTE(review): connection settings are hard-coded here; consider reading
    # them from the environment or a config file instead of the notebook.
    db = pymysql.connect(host='localhost', user = 'hester',
                             passwd = '****', database = "letterboxd_project")
    insert_query = "INSERT IGNORE INTO movie_reviews (movie_title, release_year, synopsis, review, first_100) VALUES (%s, %s, %s, %s, %s)"
    try:
        cursor = db.cursor()
        try:
            # for each movie
            for i in range(start, end):
                title, year, link, synopsis = titles[i], years[i], links[i], synopses[i]
                review_list = collect_reviews(link)  # list of review strings for this movie
                # first_100 is a short prefix of the review stored next to it as an
                # identifier for uniqueness.
                # NOTE(review): the slice keeps 99 characters although the column is
                # called first_100; left unchanged so new values stay comparable with
                # any rows written earlier - confirm before widening it.
                rows_to_insert = [
                    (title, year, synopsis, review, review[0:99])
                    for review in review_list
                ]
                # insert rows into sql data base, one commit per movie
                cursor.executemany(insert_query, rows_to_insert)
                db.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when scraping or inserting fails.
        db.close()
    
    

In [4]:
# function to reveal the more button when a review is long
def more_review(count_li, p_tags, driver=None, url=None):
    """Expand a truncated review in the browser and return its full text.

    Clicks the "more" link inside the ``count_li``-th review ``<li>`` of the
    page, waits until that link is no longer visible and then reads the review
    body.

    Parameters
    ----------
    count_li : int
        Position of the review's ``<li>`` on the page, as used by
        ``li:nth-child(...)``.
    p_tags : sequence
        The ``<p>`` tags found for this review. Only its length is used: the
        "more" link sits in the last paragraph, so ``len(p_tags)`` selects it.
    driver : selenium WebDriver, optional
        A browser that already has the review page loaded. It is used as-is
        and is NOT closed by this function.
    url : str, optional
        Page to load when no ``driver`` is supplied. A temporary headless
        Chrome is started for the call and shut down before returning.

    Returns
    -------
    object
        An object whose ``.text`` attribute holds the expanded review text.
        The text is captured before any temporary browser is shut down, so it
        remains readable after this function returns.

    Raises
    ------
    ValueError
        If neither ``driver`` nor ``url`` is given: a freshly started browser
        has no page loaded, so the review could never be located.
    """
    from types import SimpleNamespace

    # The review body and its "more" link share one selector prefix.
    body_css = f"#content > div > div > section > section > ul > li:nth-child({count_li}) > div > div.body-text.-prose.collapsible-text"
    if len(p_tags) > 1:
        # the "more" link is always in the last paragraph, so its index equals len(p_tags)
        more_css = f"{body_css} > div > p:nth-child({len(p_tags)}) > a"
    else:
        more_css = f"{body_css} > div > p > a"

    owns_driver = driver is None
    if owns_driver:
        if url is None:
            raise ValueError(
                "more_review needs either a driver with the review page loaded or a url to load"
            )
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--blink-settings=imagesEnabled=false") # disabling images to improve run time
        chrome_options.add_argument('--headless')  # Enable headless mode
        chrome_options.add_argument('--no-sandbox')
        driver = webdriver.Chrome(options=chrome_options)

    try:
        if owns_driver:
            driver.get(url)
        driver.find_element(By.CSS_SELECTOR, more_css).click()
        # The link disappears once the full text has been swapped in.
        WebDriverWait(driver, 20).until(
            EC.invisibility_of_element_located((By.CSS_SELECTOR, more_css))
        )
        time.sleep(1)
        # Read the text now: an element handle is unusable once its browser is gone.
        full_text = driver.find_element(By.CSS_SELECTOR, body_css).text
        return SimpleNamespace(text=full_text)
    finally:
        # Only shut down a browser this function started itself.
        if owns_driver:
            driver.quit()

In [5]:
def _expand_review(driver, count_li, p_tags):
    """Click the "more" link of review ``count_li`` on the page loaded in ``driver`` and return the full review text."""
    body_css = f"#content > div > div > section > section > ul > li:nth-child({count_li}) > div > div.body-text.-prose.collapsible-text"
    if len(p_tags) > 1:
        # the "more" link is always in the last paragraph, so its index equals len(p_tags)
        more_css = f"{body_css} > div > p:nth-child({len(p_tags)}) > a"
    else:
        more_css = f"{body_css} > div > p > a"
    driver.find_element(By.CSS_SELECTOR, more_css).click()
    WebDriverWait(driver, 20).until(
        EC.invisibility_of_element_located((By.CSS_SELECTOR, more_css))
    )
    time.sleep(1)
    return driver.find_element(By.CSS_SELECTOR, body_css).text


# collects reviews from each page
def collect_reviews(link, max_pages=4):
    """Collect review texts from the first ``max_pages`` review pages of a movie.

    Each page ``{link}reviews/by/activity/page/{n}/`` is fetched with
    ``requests`` and parsed with BeautifulSoup; the same URL is also loaded in
    a headless Chrome so that truncated reviews can be expanded there.

    For every review ``<div>`` found on a page:
    * if its first paragraph is the spoiler notice, the text of the hidden
      spoiler ``<div>`` is used;
    * if its last paragraph contains "… more", the review is expanded in the
      browser session that has the page loaded and the full text is read;
    * otherwise the paragraph texts are joined with newlines.
    Review divs without any ``<p>`` are skipped.

    Parameters
    ----------
    link : str
        Base URL of the movie page; the review path is appended directly to it.
    max_pages : int, optional
        Number of review pages to visit, starting at page 1 (default 4).

    Returns
    -------
    list of str
        Stripped review texts, in page order.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--blink-settings=imagesEnabled=false") # disabling images to improve run time
    chrome_options.add_argument('--headless')  # Enable headless mode
    chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=chrome_options)
    movie_reviews = [] # all reviews of this movie end up in one flat list
    try:
        for url_count in range(1, max_pages + 1): # iterate through each review page
            url = f"{link}reviews/by/activity/page/{url_count}/"
            source = requests.get(url).text
            soup = BeautifulSoup(source, 'lxml')
            divs = soup.find_all("div", class_ = 'body-text -prose collapsible-text') # these contain the reviews
            driver.get(url)
            # count_li is the position used in li:nth-child(...) when expanding a review;
            # assumes the browser page lists reviews in the same order as the fetched HTML.
            for count_li, div in enumerate(divs, start=1):
                # find all p tags to be able to get over line breaks
                paragraphs = div.find_all("p")
                if not paragraphs:
                    # nothing to read in this div; indexing it would raise IndexError
                    continue
                if paragraphs[0].text == "This review may contain spoilers. I can handle the truth.":
                    # the spoiler notice is the first paragraph; the real text sits in a hidden div
                    review = div.find("div", class_ = "hidden-spoilers expanded-text").text
                elif "… more" in paragraphs[-1].text:
                    # the truncation marker is in the last paragraph; expansion has to
                    # happen in the browser that has this page loaded
                    review = _expand_review(driver, count_li, paragraphs)
                else:
                    # neither spoilers nor truncated (works for a single paragraph too)
                    review = "".join(p.text + "\n" for p in paragraphs)
                movie_reviews.append(review.strip())
    finally:
        # Always shut the browser down, even when a page fails to load or parse.
        driver.quit()
    return movie_reviews

            