In [4]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests

In [5]:
base_url = "https://letterboxd.com"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

In [6]:
chrome_options = Options()
chrome_options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=chrome_options)
driver.set_page_load_timeout(60)

movies_data = []

for page in range(1, 11):  # Pages 1 to 10
    url = f'{base_url}/films/popular/country/india/page/{page}/'
    try:
        driver.get(url)
        time.sleep(3)
        html = driver.page_source
        soup_main = BeautifulSoup(html, "html.parser")
        poster_items = soup_main.find_all('li', attrs={'class': 'posteritem'})
        for poster in poster_items:
            a_tag = poster.find('a', href=True)
            if a_tag:
                movie_url = base_url + a_tag['href']
                response = requests.get(movie_url, headers=header)
                soup = BeautifulSoup(response.text, "html.parser")
                
                # Title
                title_tag = soup.find('h1', attrs={'class': 'headline-1'})
                title = title_tag.text.strip() if title_tag else None
                
                #Release Year
                link = a_tag['href']
                release_year_tag = soup.find('span', attrs={'class':'releasedate'})
                release_year = release_year_tag.text.strip() if release_year_tag else None
                
                # Director
                director_tag = soup.find('a', attrs={'href': lambda x: x and '/director/' in x})
                director = director_tag.text.strip() if director_tag else None
                
                # Cast
                cast_tags = soup.find_all('a', attrs={'href': lambda x: x and '/actor/' in x})
                cast = ', '.join([c.text.strip() for c in cast_tags]) if cast_tags else None
                
                # Rating and Number of Ratings
                rating_tag = soup.find('meta', attrs={'name': 'twitter:data2'})
                rating = rating_tag['content'] if rating_tag else None
                
                # Details page
                details_url = movie_url.rstrip('/') + '/details/'
                details_resp = requests.get(details_url, headers=header)
                details_soup = BeautifulSoup(details_resp.text, "html.parser")
                
                # Studio
                studio_tag = details_soup.find('a', attrs={'href': lambda x: x and '/studio/' in x})
                studio = studio_tag.text.strip() if studio_tag else None
                
                # Country
                country_tag = details_soup.find('a', attrs={'href': lambda x: x and '/country/' in x})
                origin_country = country_tag.text.strip() if country_tag else None
                
                # Language
                lang_tag = details_soup.find('a', attrs={'href': lambda x: x and '/language/' in x})
                original_language = lang_tag.text.strip() if lang_tag else None

                # Runtime
                runtime_tag = soup.find('p', attrs={'class': 'text-link text-footer'})
                runtime = runtime_tag.text.strip() if runtime_tag else None

                # genres page
                genres_url = movie_url.rstrip('/') + '/genres/'
                genres_rep = requests.get(genres_url, headers=header)
                genres_soup = BeautifulSoup(genres_rep.text, "html.parser")
                genres_div = genres_soup.find('div', class_='text-sluglist capitalize')
                if genres_div:
                    genre_links = genres_div.find_all('a', class_='text-slug')
                    genres = ', '.join([g.text.strip() for g in genre_links])
                else:
                    genres = None
                    
                movies_data.append({
                    "MovieTitle": title,
                    "ReleaseYear": release_year,
                    "RunTime": runtime,
                    "Genres": genres,
                    "DirectorName": director,
                    "Rating": rating,
                    "Studio": studio,
                    "OriginCountry": origin_country,
                    "OriginalLanguage": original_language,
                    "cast": cast,
                })
    except Exception as e:
        print(f"Error loading page {page}: {e}")

driver.quit()

In [7]:
# Save to CSV
df = pd.DataFrame(movies_data)
df.to_csv("movies_data.csv", index=False)

# Load DataFrame and preview
df = pd.read_csv("movies_data.csv")

In [8]:
df.head()

Unnamed: 0,MovieTitle,ReleaseYear,RunTime,Genres,DirectorName,Rating,Studio,OriginCountry,OriginalLanguage,cast
0,1917,2019,119 mins More at IMDb TMDB Report this page,"War, History, Thriller",Sam Mendes,4.11 out of 5,DreamWorks Pictures,India,English,"George MacKay, Dean-Charles Chapman, Mark Stro..."
1,Life of Pi,2012,127 mins More at IMDb TMDB Report this page,"Drama, Adventure",Ang Lee,3.74 out of 5,Fox 2000 Pictures,India,English,"Suraj Sharma, Irrfan Khan, Ayush Tandon, Gauta..."
2,RRR,2022,185 mins More at IMDb TMDB Report this page,"Adventure, Action, Drama",S. S. Rajamouli,4.20 out of 5,DVV Entertainment,India,Telugu,"N.T. Rama Rao Jr., Ram Charan, Olivia Morris, ..."
3,Pixels,2015,106 mins More at IMDb TMDB Report this page,"Science Fiction, Action, Fantasy, Comedy",Chris Columbus,2.33 out of 5,Columbia Pictures,China,English,"Adam Sandler, Kevin James, Michelle Monaghan, ..."
4,Lincoln,2012,149 mins More at IMDb TMDB Report this page,"History, Drama",Steven Spielberg,3.58 out of 5,DreamWorks Pictures,India,English,"Daniel Day-Lewis, Sally Field, David Strathair..."


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   MovieTitle        720 non-null    object
 1   ReleaseYear       720 non-null    int64 
 2   RunTime           720 non-null    object
 3   Genres            719 non-null    object
 4   DirectorName      720 non-null    object
 5   Rating            720 non-null    object
 6   Studio            713 non-null    object
 7   OriginCountry     720 non-null    object
 8   OriginalLanguage  720 non-null    object
 9   cast              718 non-null    object
dtypes: int64(1), object(9)
memory usage: 56.4+ KB
