<h1 style="color:#2471a3"><i><b>🌐 Data Collection (Web Scraping)</b></i></h1>


<h3 style="color:#a93226"><i><b>📦 Import Necessary Libraries</b></i></h3>

In [1]:
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy import Selector
import time

KeyboardInterrupt: 

<h3 style="color:#a93226"><i><b>🎬 Getting Movie Links Using Selenium with User Agent</b></i></h3>


In [None]:
HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'})

In [None]:
# Start Chrome in headless mode
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run Chrome without GUI
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")

# Start WebDriver
service = Service()  # Auto-detects chromedriver
driver = webdriver.Chrome(service=service, options=options)

# Load IMDb Top 250 page
url = "https://www.imdb.com/chart/top/"
driver.get(url)

# Wait for movies to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='chart-layout-main-column']"))
)

# Scroll down to ensure all movies load
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)  # Wait for lazy-loaded content

# Get fully rendered page source
sel = Selector(text=driver.page_source)

# Extract movie titles (updated IMDb selector)
movies = sel.css("div[data-testid='chart-layout-main-column'] a.ipc-title-link-wrapper::attr(href)").getall()

# Close browser
driver.quit()

# Print first 10 movies
print(movies[:10])

# Print total count
print(f"Total movies found: {len(movies)}")


['/title/tt0111161/?ref_=chttp_t_1', '/title/tt0068646/?ref_=chttp_t_2', '/title/tt0468569/?ref_=chttp_t_3', '/title/tt0071562/?ref_=chttp_t_4', '/title/tt0050083/?ref_=chttp_t_5', '/title/tt0167260/?ref_=chttp_t_6', '/title/tt0108052/?ref_=chttp_t_7', '/title/tt0110912/?ref_=chttp_t_8', '/title/tt0120737/?ref_=chttp_t_9', '/title/tt0060196/?ref_=chttp_t_10']
Total movies found: 250


<h3 style="color:#a93226"><i><b>🧪 Testing Using Single Link</b></i></h3>


In [None]:
test_1="https://www.imdb.com"+movies[3]

In [None]:
response_3=requests.get(test_1,headers=HEADERS)
select_3=Selector(text=response_3.text)

In [None]:
select_3.css("div.ipc-chip-list a span.ipc-chip__text::text").getall()

['Epic', 'Gangster', 'Tragedy', 'Crime', 'Drama']

In [None]:
select_3.css("section:nth-child(3) > div > ul > li > a.ipc-metadata-list-item__label.ipc-metadata-list-item__label--link::text").getall()

['Won 6 Oscars']

In [None]:
select_3.css(" div.sc-c0933c3e-1.fQDAuW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section.ipc-page-section.ipc-page-section--base.sc-cd7dc4b7-0.ycheS.title-cast.title-cast--movie.celwidget > ul > li:nth-child(2) > div > ul > li> a::text").getall()

['Francis Ford Coppola', 'Mario Puzo']

In [None]:
select_3.css(" section:nth-child(3) > div > ul > li > div > ul > li > span::text").get()

'17 wins & 21 nominations total'

In [None]:
select_3.css("div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(2) > div > ul > li > a::text").getall()[:-1]

['United States']

<h3 style="color:#a93226"><i><b>After checking using single link Creating List and getting all movies details</b></i></h3>

In [None]:
titles=[]
release_years=[]
motion_pic_rating=[]
runtimes=[]
runtime_motionpicrate_releaseyear=[]
imdp_rating=[]
directors=[]
writers=[]
genres=[]
budgets=[]
budgets_grossincomes=[]
gross_us_and_canada=[]
grosss_worldwide=[]
Opening_weekend_US_Canada=[]
languages=[]
origins=[]
production_co=[]
# wins=[]
nominations_and_wins=[]
oscars=[]

In [None]:

for link in movies:
    web="https://www.imdb.com"+link
    response_2=requests.get(web,headers=HEADERS)
    select_2=Selector(text=response_2.text)
    title=select_2.css(".hero__primary-text::text").get()
    titles.append(title)
    detail=select_2.css(".ipc-inline-list.ipc-inline-list--show-dividers.sc-ec65ba05-2.joVhBE.baseAlt .ipc-inline-list__item ::text").getall()
    runtime_motionpicrate_releaseyear.append(detail)
    rating=select_2.css(".sc-d541859f-1.imUuxf::text").get()
    imdp_rating.append(rating)
    director=select_2.css(".ipc-metadata-list-item__list-content-item.ipc-metadata-list-item__list-content-item--link::text").get()
    directors.append(director)
    writer=select_2.css(" div.sc-c0933c3e-1.fQDAuW.ipc-page-grid__item.ipc-page-grid__item--span-2 > section.ipc-page-section.ipc-page-section--base.sc-cd7dc4b7-0.ycheS.title-cast.title-cast--movie.celwidget > ul > li:nth-child(2) > div > ul > li> a::text").getall()
    writers.append(writer)
    language=select_2.css("li:nth-child(4) > div > ul > li > a::text").getall()
    languages.append(language)
    oscar=select_2.css("section:nth-child(3) > div > ul > li > a.ipc-metadata-list-item__label.ipc-metadata-list-item__label--link::text").getall()
    oscars.append(oscar)
    nomination_and_win=select_2.css(" section:nth-child(3) > div > ul > li > div > ul > li > span::text").get()
    nominations_and_wins.append(nomination_and_win)
    production=select_2.css("li:nth-child(7) > div > ul > li > a::text").getall()
    production_co.append(production)
    origin=select_2.css("div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(2) > div > ul > li > a::text").getall()[:-1]
    origins.append(origin)
    bud = select_2.css(".ipc-metadata-list__item.sc-db2ddaec-2.jyjTbZ .ipc-metadata-list-item__list-content-item::text").getall() or [None]
    budgets_grossincomes.append(bud)
    genre=select_2.css("div.ipc-chip-list a span.ipc-chip__text::text").getall()
    genres.append(genre)

<h3 style="color:#a93226"><i><b>📊 Splitting and Arranging List Data</b></i></h3>

In [None]:
for details in runtime_motionpicrate_releaseyear:  
    try:
        release_years.append(details[0])
        motion_pic_rating.append(details[1])
        runtimes.append(details[2])
    except IndexError:
        print(None)  # Print None when index 2 is out of range


None
None
None
None


In [None]:
for budget_and_grosss in budgets_grossincomes:
    budgets.append(budget_and_grosss[0] if len(budget_and_grosss) > 0 else None)
    gross_us_and_canada.append(budget_and_grosss[1] if len(budget_and_grosss) > 1 else None)
    Opening_weekend_US_Canada.append(budget_and_grosss[2] if len(budget_and_grosss) > 2 else None)
    grosss_worldwide.append(budget_and_grosss[4] if len(budget_and_grosss) > 4 else None)


In [None]:
min_length = min(len(titles), len(directors),len(imdp_rating),
                 len(budgets), len(gross_us_and_canada), len(grosss_worldwide), len(Opening_weekend_US_Canada),
                 len(release_years),len(motion_pic_rating),len(runtimes),len(languages),len(origins),len(oscars),
                 len(nominations_and_wins),len(writers),len(genres),len(production_co))

# Truncate each list to the minimum length
titles = titles[:min_length]
directors = directors[:min_length]
budgets = budgets[:min_length]
gross_us_and_canada = gross_us_and_canada[:min_length]
grosss_worldwide = grosss_worldwide[:min_length]
Opening_weekend_US_Canada = Opening_weekend_US_Canada[:min_length]
release_years=release_years[:min_length]
motion_pic_rating=motion_pic_rating[:min_length]
runtimes=runtimes[:min_length]
writers=writers[:min_length]
imdp_rating=imdp_rating[:min_length]
genres=genres[:min_length]
nominations_and_wins=nominations_and_wins[:min_length]
oscars=oscars[:min_length]
production_co=production_co[:min_length]
origins=origins[:min_length]
languages=languages[:min_length]


<h3 style="color:#a93226"><i><b>📑 Converting List Data to DataFrame and 💾 Saving as CSV</b></i></h3>


In [None]:
df=pd.DataFrame({"title":titles,"directors":directors,"Writers":writers,"Imdb_Ratings":imdp_rating,"Release_year":release_years,"Runtime":runtimes,"Motion_picture_Rating":motion_pic_rating,"budgets":budgets,"gross_us_and_canada":gross_us_and_canada,"gross_worldwide":grosss_worldwide,"opening_weekend_gross_in_uscanada":Opening_weekend_US_Canada,"Languages":languages,"Origin":origins,"Genre":genres,"Production_company":production_co,"Nominations_and_wins":nominations_and_wins,"Oscar":oscars})
df

Unnamed: 0,title,directors,Writers,Imdb_Ratings,Release_year,Runtime,Motion_picture_Rating,budgets,gross_us_and_canada,gross_worldwide,opening_weekend_gross_in_uscanada,Languages,Origin,Genre,Production_company,Nominations_and_wins,Oscar
0,The Shawshank Redemption,Frank Darabont,"[Stephen King, Frank Darabont]",9.3,1994,2h 22m,A,"$25,000,000 (estimated)","$28,767,189","$29,332,133","$727,327",[English],[United States],"[Epic, Period Drama, Prison Drama, Drama]",[Castle Rock Entertainment],21 wins & 42 nominations total,[Nominated for 7 Oscars]
1,The Godfather,Francis Ford Coppola,"[Mario Puzo, Francis Ford Coppola]",9.2,1972,2h 55m,A,"$6,000,000 (estimated)","$136,381,073","$250,342,198","$302,393","[English, Italian, Latin]",[United States],"[Epic, Gangster, Tragedy, Crime, Drama]","[Paramount Pictures, Albert S. Ruddy Productio...",31 wins & 31 nominations total,[Won 3 Oscars]
2,The Dark Knight,Christopher Nolan,"[Jonathan Nolan, Christopher Nolan, David S. G...",9.0,2008,2h 32m,UA,"$185,000,000 (estimated)","$534,987,076","$1,009,057,329","$158,411,483","[English, Mandarin]","[United States, United Kingdom]","[Action Epic, Epic, Superhero, Action, Crime, ...","[Warner Bros., Legendary Entertainment, Syncopy]",164 wins & 165 nominations total,[Won 2 Oscars]
3,The Godfather: Part II,Francis Ford Coppola,"[Francis Ford Coppola, Mario Puzo]",9.0,1974,3h 22m,A,"$13,000,000 (estimated)","$47,834,595","$47,964,222","$171,417","[English, Italian, Spanish, Latin, Sicilian]",[United States],"[Epic, Gangster, Tragedy, Crime, Drama]","[Paramount Pictures, The Coppola Company, Amer...",17 wins & 21 nominations total,[Won 6 Oscars]
4,12 Angry Men,Sidney Lumet,[Reginald Rose],9.0,1957,1h 36m,U,"$350,000 (estimated)","$2,945",,,[],[United States],"[Legal Drama, Psychological Drama, Crime, Drama]",[],16 wins & 12 nominations total,[Nominated for 3 Oscars]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,Into the Wild,Sean Penn,"[Sean Penn, Jon Krakauer]",8.1,2007,2h 34m,U,"$15,000,000 (estimated)","$18,354,356","$56,676,733","$212,440","[English, Danish]",[United States],"[Coming-of-Age, Docudrama, Road Trip, Survival...","[Paramount Vantage, Art Linson Productions, In...",23 wins & 106 nominations total,[Nominated for 2 Oscars]
242,The Grapes of Wrath,John Ford,"[Nunnally Johnson, John Steinbeck]",8.1,1940,2h 26m,Approved,"$800,000 (estimated)","$7,304",,,[],[United States],[Drama],[],10 wins & 6 nominations total,[Won 2 Oscars]
243,Groundhog Day,Harold Ramis,"[Danny Rubin, Harold Ramis]",8.0,1993,2h 10m,U,"$14,600,000 (estimated)","$71,107,962","$71,108,778","$12,517,672",[],[United States],"[Feel-Good Romance, High-Concept Comedy, Holid...",[],7 wins & 17 nominations total,[Won 1 BAFTA Award]
244,A Man Escaped,Robert Bresson,"[André Devigny, Robert Bresson]",8.2,1956,2h 10m,Not Rated,,,,,[],[France],"[Prison Drama, Drama, Thriller, War]",[],4 wins & 3 nominations total,[Nominated for 1 BAFTA Award]


In [17]:
df.to_csv("../data/raw/imdb_raw_data.csv")