In [9]:
#import package
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed


In [4]:
#set you folder
my_folder="C:/Users/melika/Desktop/master/thesis/github/Recommandation_Systems/Data/"

# first we scrap the films link

In [3]:
def scrap_film_links_by_genre(pages, genres):
    # Set up the WebDriver (e.g., for Chrome)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    
    # Base URL for the pages
    base_url = 'https://letterboxd.com/films/popular/genre/'
    
    # Initialize a list to store the data
    data = []

    # Loop through each genre
    for genre in genres:
        print(f'Scraping genre: {genre}')
        film_links = []

        # Loop through pages from 1 to the specified page number
        for page_num in range(1, pages + 1):
            url = f'{base_url}{genre}/page/{page_num}/'
            print(f'Fetching {url}')
            driver.get(url)

            # Wait for the specific 'ul' tag to be present
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'poster-list'))
                )
                # Parse the HTML content using BeautifulSoup
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')

                # Search for the specific 'ul' tag
                target_section = soup.find('ul', class_='poster-list -p70 -grid')
                
                if target_section:
                    list_items = target_section.find_all('li')
                    for list_item in list_items:
                        # Find the div element with the specified class
                        div_element = list_item.find('div')
                        
                        # Check if the div element was found
                        if div_element:
                            # Extract data-film-link attribute
                            film_link = div_element.get('data-film-link')
                            if film_link:  # Check if the attribute is not None
                                film_links.append(film_link)
                            else:
                                print("data-film-link attribute not found for a div element")
                        else:
                            print("Div element with the specified class not found")
                else:
                    print(f'No target section found on page {page_num}')
            
            except Exception as e:
                print(f'Error fetching page {page_num}: {e}')

        # Append the results to the data list
        for link in film_links:
            data.append({'Genre': genre, 'Film Link': link})
    
    # Close the WebDriver
    driver.quit()
    
    print('Finished fetching pages.')
    
    # Create a DataFrame from the data
    film_links = pd.DataFrame(data)
    
    # Save the DataFrame to a CSV file
    film_links.to_csv('film_links_by_genre.csv', index=False)
    
    return film_links



In [7]:
# Example usage
genres = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance', 'science-fiction', 'war', 'thriller', 'tv-movie', 'western']  # Add more genres as needed
pages = 100  # Specify the number of pages to scrape for each genre
df = scrap_film_links_by_genre(pages, genres)
df.head()
 

Scraping genre: action
Fetching https://letterboxd.com/films/popular/genre/action/page/1/
Fetching https://letterboxd.com/films/popular/genre/action/page/2/
Fetching https://letterboxd.com/films/popular/genre/action/page/3/
Fetching https://letterboxd.com/films/popular/genre/action/page/4/
Fetching https://letterboxd.com/films/popular/genre/action/page/5/
Fetching https://letterboxd.com/films/popular/genre/action/page/6/
No target section found on page 6
Fetching https://letterboxd.com/films/popular/genre/action/page/7/
Fetching https://letterboxd.com/films/popular/genre/action/page/8/
Fetching https://letterboxd.com/films/popular/genre/action/page/9/
Fetching https://letterboxd.com/films/popular/genre/action/page/10/
Fetching https://letterboxd.com/films/popular/genre/action/page/11/
No target section found on page 11
Fetching https://letterboxd.com/films/popular/genre/action/page/12/
No target section found on page 12
Fetching https://letterboxd.com/films/popular/genre/action/page/13

Unnamed: 0,Genre,Film Link
0,action,/film/everything-everywhere-all-at-once/
1,action,/film/spider-man-into-the-spider-verse/
2,action,/film/the-dark-knight/
3,action,/film/inception/
4,action,/film/spider-man-across-the-spider-verse/


## read the film csv

In [5]:
film = pd.read_csv(f"{my_folder}film_links_by_genre.csv", delimiter=',')
# Split the 'Genre,Film Link' column into 'Genre' and 'Film Link'
film[['Genre', 'Film Link']] = film['Genre,Film Link'].str.split(',', expand=True)
# Drop the original 'Genre,Film Link' column
film = film.drop(columns=['Genre,Film Link'])

film.head()

Unnamed: 0,Genre,Film Link
0,action,/film/everything-everywhere-all-at-once/
1,action,/film/spider-man-into-the-spider-verse/
2,action,/film/the-dark-knight/
3,action,/film/inception/
4,action,/film/spider-man-across-the-spider-verse/


In [6]:
len(film)

49787

In [7]:
film["Genre"].value_counts()

Genre
drama              5379
comedy             4839
war                4782
horror             4236
science-fiction    3657
tv-movie           3344
thriller           2806
romance            2627
fantasy            2202
action             2169
documentary        2117
animation          2099
western            1892
crime              1833
history            1549
mystery            1403
adventure          1152
family             1055
music               646
Name: count, dtype: int64

In [19]:
# Randomly select 10,000 rows
film = film.sample(n=10000, random_state=1)  # random_state is optional but ensures reproducibility
film

Unnamed: 0,Genre,Film Link
31635,romance,/film/morocco/
17332,drama,/film/the-barefoot-contessa/
30310,mystery,/film/satans-triangle/
23382,history,/film/the-last-days/
39662,war,/film/the-battle-of-the-eagles/
...,...,...
40481,war,/film/ca-ira-il-fiume-della-rivolta/
20196,family,/film/norm-of-the-north-keys-to-the-kingdom/
17866,drama,/film/toute-une-nuit/
49226,western,/film/the-arizona-kid/


# scrap user name

In [14]:

def setup_driver():
    start_time = time.time()
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    setup_time = time.time() - start_time
    print(f'Setup Driver Time: {setup_time:.2f} seconds')
    return driver

def scrap_username(driver, film_name, page):
    # Measure time to construct and load the URL
    start_time = time.time()
    url = f'https://letterboxd.com{film_name}reviews/page/{page}/'
    print(f'Fetching {url}')
    driver.get(url)
    load_time = time.time() - start_time
    print(f'Page Load Time: {load_time:.2f} seconds')
    
    # Measure time to parse HTML
    start_time = time.time()
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    parse_time = time.time() - start_time
    print(f'HTML Parse Time: {parse_time:.2f} seconds')
    
    # Measure time to find all film detail elements
    start_time = time.time()
    avatars = soup.find_all('a', class_='avatar -a40')
    find_time = time.time() - start_time
    print(f'Find Elements Time: {find_time:.2f} seconds')
    
    username_list = []
    
    # Measure time to extract data from each film detail
    start_time = time.time()
    for avatar in avatars:
        href = avatar.get('href')
        username_list.append({'Film': film_name, 'Username': href})
    extraction_time = time.time() - start_time
    print(f'Data Extraction Time: {extraction_time:.2f} seconds')
    
    return username_list

def scrap_multiple_usernames(film_df):
    all_data = []

    # Setup WebDriver once
    driver = setup_driver()

    def fetch_film_page(film_id, page):
        try:
            return scrap_username(driver, film_id, page)
        except Exception as e:
            print(f'Error scraping {film_id} on page {page}: {str(e)}')
            return []

    # Use ThreadPoolExecutor to scrape multiple pages concurrently
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = []
        for i in range(len(film_df)):
            film_id = film_df['Film Link'][i]
            for j in range(1, 2):  # Pages from 1 to 2
                futures.append(executor.submit(fetch_film_page, film_id, j))
        
        for future in as_completed(futures):
            all_data.extend(future.result())
    
    # Quit the driver after all scraping is done
    driver.quit()
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data)
    return df


In [12]:
film_5=film.head()

In [15]:
df_user = scrap_multiple_usernames(film)
df_user.head()

Setup Driver Time: 2.04 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/1/
Fetching https://letterboxd.com/film/spider-man-into-the-spider-verse/reviews/page/1/
Fetching https://letterboxd.com/film/the-dark-knight/reviews/page/1/
Fetching https://letterboxd.com/film/inception/reviews/page/1/
Fetching https://letterboxd.com/film/spider-man-across-the-spider-verse/reviews/page/1/
Page Load Time: 2.25 seconds
Page Load Time: 3.14 seconds
Page Load Time: 3.97 seconds
Page Load Time: 4.76 seconds
Page Load Time: 5.58 seconds
HTML Parse Time: 5.32 seconds
HTML Parse Time: 6.41 seconds
Find Elements Time: 0.13 seconds
Data Extraction Time: 0.00 seconds
Find Elements Time: 0.10 seconds
Data Extraction Time: 0.00 seconds
HTML Parse Time: 3.31 seconds
HTML Parse Time: 5.01 seconds
HTML Parse Time: 4.23 seconds
Find Elements Time: 0.10 seconds
Data Extraction Time: 0.00 seconds
Find Elements Time: 0.08 seconds
Data Extraction Time: 0.00 seconds
Find Ele

Unnamed: 0,Film,Username
0,/film/spider-man-into-the-spider-verse/,/harrystile/
1,/film/spider-man-into-the-spider-verse/,/madmaulgldstone/
2,/film/spider-man-into-the-spider-verse/,/shardin1/
3,/film/spider-man-into-the-spider-verse/,/arifigueroa/
4,/film/spider-man-into-the-spider-verse/,/landonlepper/
5,/film/spider-man-into-the-spider-verse/,/buswapiti27/
6,/film/spider-man-into-the-spider-verse/,/mk__/
7,/film/spider-man-into-the-spider-verse/,/ezzflash1/
8,/film/spider-man-into-the-spider-verse/,/angellika/
9,/film/spider-man-into-the-spider-verse/,/lapouletta/


# scrap user reviews

In [21]:
users=pd.read_csv(f"{my_folder}usernames_action.csv")
users.head(2)

Unnamed: 0,Film,Username
0,/film/spider-man-into-the-spider-verse/,/teeshaphamu/
1,/film/spider-man-into-the-spider-verse/,/sweetbabyh/


In [35]:

def setup_driver():
    start_time = time.time()
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    setup_time = time.time() - start_time
    print(f'Setup Driver Time: {setup_time:.2f} seconds')
    return driver

def scrap_user_reviews_page(driver, user_id, page):
    # Measure time to construct and load the URL
    start_time = time.time()
    url = f'https://letterboxd.com/{user_id}/films/reviews/page/{page}/'
    print(f'Fetching {url}')
    driver.get(url)
    load_time = time.time() - start_time
    print(f'Page Load Time: {load_time:.2f} seconds')
    
    # Measure time to parse HTML
    start_time = time.time()
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    parse_time = time.time() - start_time
    print(f'HTML Parse Time: {parse_time:.2f} seconds')
    
    # Measure time to find all film detail elements
    start_time = time.time()
    film_details = soup.find_all('li', class_='film-detail viewing-poster-container')
    find_time = time.time() - start_time
    print(f'Find Elements Time: {find_time:.2f} seconds')
    
    user_data = []
    
    # Measure time to extract data from each film detail
    start_time = time.time()
    for film_detail in film_details:
        # Extract film slug
        film_slug = film_detail.find('div', class_='linked-film-poster').get('data-film-slug')
        
        # Extract comment text
        text_div = film_detail.find('div', class_='body-text -prose collapsible-text')
        comment = text_div.find('p').text.strip() if text_div else 'No comment available'
        
        # Extract date
        date_span = film_detail.find('span', class_='date').find('span', class_='_nobr')
        date = date_span.text.strip() if date_span else 'No date available'
        
        # Extract rating
        rating_span = film_detail.find('span', class_='rating')
        rating = rating_span.text.strip() if rating_span else 'No rating available'
        
        # Append data to list
        user_data.append({'film': film_slug, 'comment': comment, 'date': date, 'rating': rating})
    extraction_time = time.time() - start_time
    print(f'Data Extraction Time: {extraction_time:.2f} seconds')
    
    return user_data

def scrap_multiple_users(users):
    all_data = []
    
    # Set up the WebDriver once
    driver = setup_driver()
    
    for i in range(0, len(users)):
        user_id = users['Username'][i]
        for j in range(1, 11):  # Pages from 1 to 10
            try:
                page_data = scrap_user_reviews_page(driver, user_id, j)
                all_data.extend(page_data)  # Add the page data to the main list
            except Exception as e:
                print(f'Error scraping {user_id} on page {j}: {str(e)}')
    
    # Quit the driver after all scraping is done
    driver.quit()
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data)
    return df



In [37]:
users['Username'][1]

'/sweetbabyh/'

In [32]:
user_5 = users.head() # This retrieves the entire 3rd row
user_5 


Unnamed: 0,Film,Username
0,/film/spider-man-into-the-spider-verse/,/teeshaphamu/
1,/film/spider-man-into-the-spider-verse/,/sweetbabyh/
2,/film/spider-man-into-the-spider-verse/,/joshpedia69/
3,/film/spider-man-into-the-spider-verse/,/sophiestaples/
4,/film/spider-man-into-the-spider-verse/,/chrismk777/


In [41]:
df_reviews = scrap_multiple_users(user_5)

Setup Driver Time: 2.60 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/1/
Page Load Time: 4.33 seconds
HTML Parse Time: 0.13 seconds
Find Elements Time: 0.01 seconds
Data Extraction Time: 0.01 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/2/
Page Load Time: 0.52 seconds
HTML Parse Time: 0.08 seconds
Find Elements Time: 0.01 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/3/
Page Load Time: 0.29 seconds
HTML Parse Time: 0.07 seconds
Find Elements Time: 0.00 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/4/
Page Load Time: 0.31 seconds
HTML Parse Time: 0.07 seconds
Find Elements Time: 0.01 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/5/
Page Load Time: 0.30 seconds
HTML Parse Time: 0.08 seconds
Find Elements Time: 0.00 seconds
Data Extraction Time: 0.00 s

In [40]:
df_reviews.head()

Unnamed: 0,film,comment,date,rating
0,rrr,insane,23 Jul 2024,★★★★★
1,sweeney-todd-the-demon-barber-of-fleet-street-...,i like whimsey,23 Jul 2024,★★
2,judas-and-the-black-messiah,😭😭😭,23 Jan 2019,★★★★½
3,digimon-adventure-our-war-game,GORGEOUS 😍,23 Mar 2022,★★★★★
4,cat-soup,i like how they move,23 Jul 2024,★★★½
