In [1]:
#import package
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import threading


In [2]:
#set you folder
my_folder="C:/Users/melika/Desktop/master/thesis/Recommandation_Systems/"

# first we scrap the films link

In [3]:
def scrap_film_links_by_genre(pages, genres):
    # Set up the WebDriver (e.g., for Chrome)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    
    # Base URL for the pages
    base_url = 'https://letterboxd.com/films/popular/genre/'
    
    # Initialize a list to store the data
    data = []

    # Loop through each genre
    for genre in genres:
        print(f'Scraping genre: {genre}')
        film_links = []

        # Loop through pages from 1 to the specified page number
        for page_num in range(1, pages + 1):
            url = f'{base_url}{genre}/page/{page_num}/'
            print(f'Fetching {url}')
            driver.get(url)

            # Wait for the specific 'ul' tag to be present
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'poster-list'))
                )
                # Parse the HTML content using BeautifulSoup
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')

                # Search for the specific 'ul' tag
                target_section = soup.find('ul', class_='poster-list -p70 -grid')
                
                if target_section:
                    list_items = target_section.find_all('li')
                    for list_item in list_items:
                        # Find the div element with the specified class
                        div_element = list_item.find('div')
                        
                        # Check if the div element was found
                        if div_element:
                            # Extract data-film-link attribute
                            film_link = div_element.get('data-film-link')
                            if film_link:  # Check if the attribute is not None
                                film_links.append(film_link)
                            else:
                                print("data-film-link attribute not found for a div element")
                        else:
                            print("Div element with the specified class not found")
                else:
                    print(f'No target section found on page {page_num}')
            
            except Exception as e:
                print(f'Error fetching page {page_num}: {e}')

        # Append the results to the data list
        for link in film_links:
            data.append({'Genre': genre, 'Film Link': link})
    
    # Close the WebDriver
    driver.quit()
    
    print('Finished fetching pages.')
    
    # Create a DataFrame from the data
    film_links = pd.DataFrame(data)
    
    # Save the DataFrame to a CSV file
    film_links.to_csv('film_links_by_genre.csv', index=False)
    
    return film_links



In [7]:
# Example usage
genres = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance', 'science-fiction', 'war', 'thriller', 'tv-movie', 'western']  # Add more genres as needed
pages = 100  # Specify the number of pages to scrape for each genre
df = scrap_film_links_by_genre(pages, genres)
df.head()
 

Scraping genre: action
Fetching https://letterboxd.com/films/popular/genre/action/page/1/
Fetching https://letterboxd.com/films/popular/genre/action/page/2/
Fetching https://letterboxd.com/films/popular/genre/action/page/3/
Fetching https://letterboxd.com/films/popular/genre/action/page/4/
Fetching https://letterboxd.com/films/popular/genre/action/page/5/
Fetching https://letterboxd.com/films/popular/genre/action/page/6/
No target section found on page 6
Fetching https://letterboxd.com/films/popular/genre/action/page/7/
Fetching https://letterboxd.com/films/popular/genre/action/page/8/
Fetching https://letterboxd.com/films/popular/genre/action/page/9/
Fetching https://letterboxd.com/films/popular/genre/action/page/10/
Fetching https://letterboxd.com/films/popular/genre/action/page/11/
No target section found on page 11
Fetching https://letterboxd.com/films/popular/genre/action/page/12/
No target section found on page 12
Fetching https://letterboxd.com/films/popular/genre/action/page/13

Unnamed: 0,Genre,Film Link
0,action,/film/everything-everywhere-all-at-once/
1,action,/film/spider-man-into-the-spider-verse/
2,action,/film/the-dark-knight/
3,action,/film/inception/
4,action,/film/spider-man-across-the-spider-verse/


In [8]:
len(df)

49787

## read the film csv

In [3]:
film = pd.read_csv(f"{my_folder}film_links_by_genre.csv", delimiter=',')
# Split the 'Genre,Film Link' column into 'Genre' and 'Film Link'
film[['Genre', 'Film Link']] = film['Genre,Film Link'].str.split(',', expand=True)
# Drop the original 'Genre,Film Link' column
film = film.drop(columns=['Genre,Film Link'])

film.head()

Unnamed: 0,Genre,Film Link
0,action,/film/everything-everywhere-all-at-once/
1,action,/film/spider-man-into-the-spider-verse/
2,action,/film/the-dark-knight/
3,action,/film/inception/
4,action,/film/spider-man-across-the-spider-verse/


# scrap user name

In [4]:
def scrap_usernames(film_name, pages):
    # Set up the WebDriver (e.g., for Chrome)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)

    usernames_list = []

    for page in range(1, pages + 1):
        url = f'https://letterboxd.com{film_name}reviews/page/{page}/'
        print(f'Fetching {url}')
        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'name'))
            )

            # Parse the HTML content using BeautifulSoup
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # Find all the <a class="avatar -a40"> elements
            avatars = soup.find_all('a', class_='avatar -a40')

            # Extract and store the href from each <a class="avatar -a40"> element
            for avatar in avatars:
                href = avatar.get('href')
                usernames_list.append({'Film': film_name, 'Username': href})
        
        except Exception as e:
            print(f'Error fetching {url}: {e}')

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame from the collected data
    usernames_df = pd.DataFrame(usernames_list)

    # Save the DataFrame to a CSV file
    #usernames_df.to_csv('usernames.csv', index=False)

    return usernames_df


In [7]:
# Example usage
film_name = '/film/everything-everywhere-all-at-once/'  # The name of the film
pages = 2  # Number of pages to scrape for the film
df = scrap_usernames(film_name, pages)
df.head()

Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/1/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/2/


Unnamed: 0,Film,Username
0,/film/everything-everywhere-all-at-once/,/ashleyythi/
1,/film/everything-everywhere-all-at-once/,/bigmoviegrace/
2,/film/everything-everywhere-all-at-once/,/sobmiau/
3,/film/everything-everywhere-all-at-once/,/jpgkay/
4,/film/everything-everywhere-all-at-once/,/suspiriastrider/


In [5]:
film["Film Link"][2]

'/film/the-dark-knight/'

In [None]:
import pandas as pd

user_name_df = pd.read_csv(f"{my_folder}usernames.csv")

# Loop through the films to scrape usernames
for i in range(150, 3000):
    df = scrap_usernames(film["Film Link"][i], 10)
    user_name_df = pd.concat([user_name_df, df], ignore_index=True)
    print(i,"id of movie ")

    # Save the CSV file after every 5 films processed
    if (i % 5 == 0):
        user_name_df.to_csv(f"{my_folder}usernames.csv", index=False)
        print("save")
        user_name_df = pd.read_csv(f"{my_folder}usernames.csv")

# Final save after the loop completes
user_name_df.to_csv(f"{my_folder}usernames.csv", index=False)
print("Final save")


Fetching https://letterboxd.com/film/kick-ass/reviews/page/1/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/2/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/3/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/4/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/5/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/6/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/7/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/8/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/9/
Fetching https://letterboxd.com/film/kick-ass/reviews/page/10/
save
Fetching https://letterboxd.com/film/man-of-steel/reviews/page/1/
Fetching https://letterboxd.com/film/man-of-steel/reviews/page/2/
Fetching https://letterboxd.com/film/man-of-steel/reviews/page/3/
Fetching https://letterboxd.com/film/man-of-steel/reviews/page/4/
Fetching https://letterboxd.com/film/man-of-steel/reviews/page/5/
Fetching https://letterboxd.com/film/man-of-

# scrap user reviews

In [21]:
users=pd.read_csv(f"{my_folder}usernames_action.csv")
users.head(2)

Unnamed: 0,Film,Username
0,/film/spider-man-into-the-spider-verse/,/teeshaphamu/
1,/film/spider-man-into-the-spider-verse/,/sweetbabyh/


In [35]:

def setup_driver():
    start_time = time.time()
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    setup_time = time.time() - start_time
    print(f'Setup Driver Time: {setup_time:.2f} seconds')
    return driver

def scrap_user_reviews_page(driver, user_id, page):
    # Measure time to construct and load the URL
    start_time = time.time()
    url = f'https://letterboxd.com/{user_id}/films/reviews/page/{page}/'
    print(f'Fetching {url}')
    driver.get(url)
    load_time = time.time() - start_time
    print(f'Page Load Time: {load_time:.2f} seconds')
    
    # Measure time to parse HTML
    start_time = time.time()
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    parse_time = time.time() - start_time
    print(f'HTML Parse Time: {parse_time:.2f} seconds')
    
    # Measure time to find all film detail elements
    start_time = time.time()
    film_details = soup.find_all('li', class_='film-detail viewing-poster-container')
    find_time = time.time() - start_time
    print(f'Find Elements Time: {find_time:.2f} seconds')
    
    user_data = []
    
    # Measure time to extract data from each film detail
    start_time = time.time()
    for film_detail in film_details:
        # Extract film slug
        film_slug = film_detail.find('div', class_='linked-film-poster').get('data-film-slug')
        
        # Extract comment text
        text_div = film_detail.find('div', class_='body-text -prose collapsible-text')
        comment = text_div.find('p').text.strip() if text_div else 'No comment available'
        
        # Extract date
        date_span = film_detail.find('span', class_='date').find('span', class_='_nobr')
        date = date_span.text.strip() if date_span else 'No date available'
        
        # Extract rating
        rating_span = film_detail.find('span', class_='rating')
        rating = rating_span.text.strip() if rating_span else 'No rating available'
        
        # Append data to list
        user_data.append({'film': film_slug, 'comment': comment, 'date': date, 'rating': rating})
    extraction_time = time.time() - start_time
    print(f'Data Extraction Time: {extraction_time:.2f} seconds')
    
    return user_data

def scrap_multiple_users(users):
    all_data = []
    
    # Set up the WebDriver once
    driver = setup_driver()
    
    for i in range(0, len(users)):
        user_id = users['Username'][i]
        for j in range(1, 11):  # Pages from 1 to 10
            try:
                page_data = scrap_user_reviews_page(driver, user_id, j)
                all_data.extend(page_data)  # Add the page data to the main list
            except Exception as e:
                print(f'Error scraping {user_id} on page {j}: {str(e)}')
    
    # Quit the driver after all scraping is done
    driver.quit()
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data)
    return df



In [37]:
users['Username'][1]

'/sweetbabyh/'

In [32]:
user_5 = users.head() # This retrieves the entire 3rd row
user_5 


Unnamed: 0,Film,Username
0,/film/spider-man-into-the-spider-verse/,/teeshaphamu/
1,/film/spider-man-into-the-spider-verse/,/sweetbabyh/
2,/film/spider-man-into-the-spider-verse/,/joshpedia69/
3,/film/spider-man-into-the-spider-verse/,/sophiestaples/
4,/film/spider-man-into-the-spider-verse/,/chrismk777/


In [41]:
df_reviews = scrap_multiple_users(user_5)

Setup Driver Time: 2.60 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/1/
Page Load Time: 4.33 seconds
HTML Parse Time: 0.13 seconds
Find Elements Time: 0.01 seconds
Data Extraction Time: 0.01 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/2/
Page Load Time: 0.52 seconds
HTML Parse Time: 0.08 seconds
Find Elements Time: 0.01 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/3/
Page Load Time: 0.29 seconds
HTML Parse Time: 0.07 seconds
Find Elements Time: 0.00 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/4/
Page Load Time: 0.31 seconds
HTML Parse Time: 0.07 seconds
Find Elements Time: 0.01 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com//teeshaphamu//films/reviews/page/5/
Page Load Time: 0.30 seconds
HTML Parse Time: 0.08 seconds
Find Elements Time: 0.00 seconds
Data Extraction Time: 0.00 s

In [40]:
df_reviews.head()

Unnamed: 0,film,comment,date,rating
0,rrr,insane,23 Jul 2024,★★★★★
1,sweeney-todd-the-demon-barber-of-fleet-street-...,i like whimsey,23 Jul 2024,★★
2,judas-and-the-black-messiah,😭😭😭,23 Jan 2019,★★★★½
3,digimon-adventure-our-war-game,GORGEOUS 😍,23 Mar 2022,★★★★★
4,cat-soup,i like how they move,23 Jul 2024,★★★½


# make username scrapping faster

In [6]:

def setup_driver():
    start_time = time.time()
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    setup_time = time.time() - start_time
    print(f'Setup Driver Time: {setup_time:.2f} seconds')
    return driver

def scrap_username(driver, film_name, page):
    # Measure time to construct and load the URL
    start_time = time.time()
    url = f'https://letterboxd.com{film_name}reviews/page/{page}/'
    print(f'Fetching {url}')
    driver.get(url)
    load_time = time.time() - start_time
    print(f'Page Load Time: {load_time:.2f} seconds')
    
    # Measure time to parse HTML
    start_time = time.time()
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    parse_time = time.time() - start_time
    print(f'HTML Parse Time: {parse_time:.2f} seconds')
    
    # Measure time to find all film detail elements
    start_time = time.time()
    # Find all the <a class="avatar -a40"> elements
    avatars = soup.find_all('a', class_='avatar -a40')
    find_time = time.time() - start_time
    print(f'Find Elements Time: {find_time:.2f} seconds')
    
    username_list = []
    
    # Measure time to extract data from each film detail
    start_time = time.time()
    for avatar in avatars:
            href = avatar.get('href')
            username_list.append({'Film': film_name, 'Username': href})
    extraction_time = time.time() - start_time
    print(f'Data Extraction Time: {extraction_time:.2f} seconds')
    
    return username_list

def scrap_multiple_usernames(film_df):
    all_data = []
    
    # Set up the WebDriver once
    driver = setup_driver()
    
    for i in range(0, len(film_df)):
        film_id = film_df['Film Link'][i]
        for j in range(1, 11):  # Pages from 1 to 10
            try:
                page_data = scrap_username(driver, film_id, j)
                all_data.extend(page_data)  # Add the page data to the main list
            except Exception as e:
                print(f'Error scraping {film_id} on page {j}: {str(e)}')
    
    # Quit the driver after all scraping is done
    driver.quit()
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data)
    return df


In [9]:
film_5 = film.head()
len(film_5)

5

In [10]:
df_user = scrap_multiple_usernames(film_5)
df_user

Setup Driver Time: 1.93 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/1/
Page Load Time: 0.57 seconds
HTML Parse Time: 1.63 seconds
Find Elements Time: 0.00 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/2/
Page Load Time: 1.00 seconds
HTML Parse Time: 0.66 seconds
Find Elements Time: 0.02 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/3/
Page Load Time: 0.93 seconds
HTML Parse Time: 0.59 seconds
Find Elements Time: 0.02 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/4/
Page Load Time: 0.84 seconds
HTML Parse Time: 0.63 seconds
Find Elements Time: 0.03 seconds
Data Extraction Time: 0.00 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/5/
Page Load Time: 0.87 seco

Unnamed: 0,Film,Username
0,/film/everything-everywhere-all-at-once/,/jessyanachng/
1,/film/everything-everywhere-all-at-once/,/a1shthefish/
2,/film/everything-everywhere-all-at-once/,/feararcane/
3,/film/everything-everywhere-all-at-once/,/xrjn/
4,/film/everything-everywhere-all-at-once/,/libegafrate/
...,...,...
583,/film/spider-man-across-the-spider-verse/,/deeg0/
584,/film/spider-man-across-the-spider-verse/,/ink_161/
585,/film/spider-man-across-the-spider-verse/,/lukewspencer/
586,/film/spider-man-across-the-spider-verse/,/beckdel_test/


# try to scrap page but load page at same times

In [13]:
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from bs4 import BeautifulSoup

def setup_driver():
    start_time = time.time()
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode for better performance
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    setup_time = time.time() - start_time
    print(f'Setup Driver Time: {setup_time:.2f} seconds')
    return driver

def scrap_username(driver, film_name, page):
    # Measure time to construct and load the URL
    start_time = time.time()
    url = f'https://letterboxd.com{film_name}reviews/page/{page}/'
    print(f'Fetching {url}')
    driver.get(url)
    load_time = time.time() - start_time
    print(f'Page Load Time: {load_time:.2f} seconds')
    
    # Measure time to parse HTML
    start_time = time.time()
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    parse_time = time.time() - start_time
    print(f'HTML Parse Time: {parse_time:.2f} seconds')
    
    # Measure time to find all film detail elements
    start_time = time.time()
    avatars = soup.find_all('a', class_='avatar -a40')
    find_time = time.time() - start_time
    print(f'Find Elements Time: {find_time:.2f} seconds')
    
    username_list = []
    
    # Measure time to extract data from each film detail
    start_time = time.time()
    for avatar in avatars:
        href = avatar.get('href')
        username_list.append({'Film': film_name, 'Username': href})
    extraction_time = time.time() - start_time
    print(f'Data Extraction Time: {extraction_time:.2f} seconds')
    
    return username_list

def scrap_multiple_usernames(film_df):
    all_data = []

    # Setup WebDriver once
    driver = setup_driver()

    def fetch_film_page(film_id, page):
        try:
            return scrap_username(driver, film_id, page)
        except Exception as e:
            print(f'Error scraping {film_id} on page {page}: {str(e)}')
            return []

    # Use ThreadPoolExecutor to scrape multiple pages concurrently
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for i in range(len(film_df)):
            film_id = film_df['Film Link'][i]
            for j in range(1, 11):  # Pages from 1 to 10
                futures.append(executor.submit(fetch_film_page, film_id, j))
        
        for future in as_completed(futures):
            all_data.extend(future.result())
    
    # Quit the driver after all scraping is done
    driver.quit()
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data)
    return df


In [14]:
df_user = scrap_multiple_usernames(film_5)
df_user

Setup Driver Time: 3.00 seconds
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/1/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/2/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/3/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/4/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/5/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/6/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/7/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/8/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/9/
Fetching https://letterboxd.com/film/everything-everywhere-all-at-once/reviews/page/10/
Page Load Time: 5.42 seconds
Page Load Time: 6.35 seconds
Page Load Time: 7.22 seconds
Page Load 

Unnamed: 0,Film,Username
0,/film/everything-everywhere-all-at-once/,/meredithleanza/
1,/film/everything-everywhere-all-at-once/,/javichuuu/
2,/film/everything-everywhere-all-at-once/,/hilinaad/
3,/film/everything-everywhere-all-at-once/,/benvandervoet/
4,/film/everything-everywhere-all-at-once/,/lesbianshortage/
...,...,...
595,/film/spider-man-across-the-spider-verse/,/poopoofart223/
596,/film/spider-man-across-the-spider-verse/,/demyanaderias/
597,/film/spider-man-across-the-spider-verse/,/filthy9foot8/
598,/film/spider-man-across-the-spider-verse/,/leilafradera/
