In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import random
from time import sleep

In [2]:
#Read csv which has wiki urls
df = pd.read_csv("old_imdb_with_wiki_urls.csv", na_values = "NaN")
df.head()

Unnamed: 0,title,year,wikipedia_url
0,Kanoon,1960,https://en.wikipedia.org/wiki/Kanoon
1,Bewaqoof,1960,https://en.wikipedia.org/wiki/Bewaqoof
2,Honeymoon,1960,
3,Lal Quila,1960,https://en.wikipedia.org/wiki/Lal_Qila_(disamb...
4,Maa Baap,1960,


In [3]:
df.shape

(8340, 3)

In [4]:
df_new = df[df['wikipedia_url'].notna()]

In [5]:
df_filter = df_new[~df_new['wikipedia_url'].str.contains("List_of")]

In [6]:
df_filter

Unnamed: 0,title,year,wikipedia_url
0,Kanoon,1960,https://en.wikipedia.org/wiki/Kanoon
1,Bewaqoof,1960,https://en.wikipedia.org/wiki/Bewaqoof
3,Lal Quila,1960,https://en.wikipedia.org/wiki/Lal_Qila_(disamb...
7,Love in Simla,1960,https://en.wikipedia.org/wiki/Love_in_Simla
8,Anuradha,1960,https://en.wikipedia.org/wiki/Anuradha_(1960_f...
...,...,...,...
8329,Aankhon Ki Gustaakhiyan,2025,https://en.wikipedia.org/wiki/Aankhon_Ki_Gusta...
8332,Sant Tukaram,2025,https://en.wikipedia.org/wiki/Tukaram_(disambi...
8334,Mahavatar Narsimha,2025,https://en.wikipedia.org/wiki/Mahavatar_Narsimha
8335,Son of Sardaar 2,2025,https://en.wikipedia.org/wiki/Son_of_Sardaar_2


In [7]:
batch_size = 100
total_rows = len(df_filter)

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

for start in range(0, total_rows, batch_size):
    end = min(start + batch_size, total_rows)
    batch = df_filter.iloc[start:end]
    rows = []

    print(f"Starting batch {start}–{end}")
    

    for _, row in batch.iterrows():
        url = row["wikipedia_url"]

        if not url or not url.startswith("http"):
            print(f"Skipping invalid URL: {url}")
            continue

        try:
            response = requests.get(url, headers=headers, timeout=10)

            if response.status_code == 429:
                print(f"429 Too Many Requests at {url} — sleeping 60s")
                sleep(60)
                continue
            elif response.status_code != 200:
                print(f"HTTP {response.status_code} at {url}")
                continue

            doc = BeautifulSoup(response.text, "html.parser")
            table = doc.find('table', class_=lambda x: x and 'infobox' in x)
            if not table:
                continue

            data = {
                'title': row["title"],
                'year': row["year"],
                'wikipedia_url': url
            }

            try:
                data['Title_wiki'] = table.find('th', class_=lambda x: x and 'infobox-above' in x).get_text(strip=True)
            except:
                data['Title_wiki'] = None

            try:
                data['Director'] = table.find('th', string="Directed by").find_next_sibling('td').get_text(strip=True)
            except:
                data['Director'] = None

            try:
                data['Cast'] = table.find('th', string="Starring").find_next_sibling('td').get_text(separator=", ", strip=True)
            except:
                data['Cast'] = None

            try:
                data['Budget'] = table.find('th', string="Budget").find_next_sibling('td').get_text(separator=" ", strip=True).split('[')[0]
            except:
                data['Budget'] = None

            try:
                data['Box_Office'] = table.find('th', string="Box office").find_next_sibling('td').get_text(separator=" ", strip=True).split('[')[0]
            except:
                data['Box_Office'] = None

            try:
                data['Distributor'] = table.find('th', string="Distributed by").find_next_sibling('td').get_text(strip=True)
            except:
                data['Distributor'] = None

            data['Production'] = None
            try:
                for th in table.find_all('th'):
                    if 'Production' in th.get_text() and 'company' in th.get_text():
                        data['Production'] = th.find_next_sibling('td').get_text(strip=True)
                        break
            except:
                data['Production'] = None

            data['Genre'] = None
            try:
                for p in doc.find_all('p'):
                    text = p.get_text().lower()
                    match = re.search(r'(?:(?:hindi(?:-language)?|indian)[ ,]*)*([\w\s/-]{3,40}?) film', text)
                    if match:
                        genre = match.group(1).strip().title()
                        if genre and not genre.lower().startswith(("this", "it is", "the film")):
                            data['Genre'] = genre
                            break
            except:
                data['Genre'] = None

            rows.append(data)

            # Pause to avoid 429
            sleep(random.uniform(0.5, 1.0))

        except Exception as e:
            print(f"Error fetching {url}: {e}")

    # Save this batch
    batch_df = pd.DataFrame(rows)
    batch_df.to_csv(f"/Users/Jasmin.Nihalani/Desktop/imdb_batch_{start}_{end}.csv", index=False)
    print(f"Saved batch {start}–{end} with {len(rows)} rows")

Starting batch 0–100
Saved batch 0–100 with 95 rows
Starting batch 100–200
Saved batch 100–200 with 94 rows
Starting batch 200–300
Saved batch 200–300 with 94 rows
Starting batch 300–400
Saved batch 300–400 with 95 rows
Starting batch 400–500
Saved batch 400–500 with 95 rows
Starting batch 500–600
Saved batch 500–600 with 92 rows
Starting batch 600–700
Saved batch 600–700 with 95 rows
Starting batch 700–800
Saved batch 700–800 with 93 rows
Starting batch 800–900
Saved batch 800–900 with 97 rows
Starting batch 900–1000
Saved batch 900–1000 with 96 rows
Starting batch 1000–1100
Saved batch 1000–1100 with 97 rows
Starting batch 1100–1200
Saved batch 1100–1200 with 91 rows
Starting batch 1200–1300
Saved batch 1200–1300 with 92 rows
Starting batch 1300–1400
Saved batch 1300–1400 with 97 rows
Starting batch 1400–1500
Saved batch 1400–1500 with 100 rows
Starting batch 1500–1600
Saved batch 1500–1600 with 98 rows
Starting batch 1600–1700
Saved batch 1600–1700 with 97 rows
Starting batch 1700–1

In [33]:
df_final.head(11)

Unnamed: 0,title,year,wikipedia_url,Title_wiki,Director,Cast,Budget,Box_Office,Distributor,Production,Genre
0,Kanoon,1960,https://en.wikipedia.org/wiki/Kanoon,Kanoon,B. R. Chopra,"Rajendra Kumar, Ashok Kumar, Nanda, Mehmood",,,,,60 Indian Hindi-Language Courtroom Drama
1,Bewaqoof,1960,https://en.wikipedia.org/wiki/Bewaqoof,Bewaqoof (Movie),I. S. Johar,"Kishore Kumar, Mala Sinha, I. S. Johar, Pran, ...",,,,,Is A 1960 Indian Drama
2,Love in Simla,1960,https://en.wikipedia.org/wiki/Love_in_Simla,Love in Simla,R. K. Nayyar,"Joy Mukherjee, Sadhana",,,,,Love In Simla Is A 1960 Indian Romance
3,Anuradha,1960,https://en.wikipedia.org/wiki/Anuradha_(1960_f...,Anuradha,Hrishikesh Mukherjee,"Balraj Sahni, Leela Naidu",,,,,Is A 1960 Hindi-Language Indian
4,Mughal-E-Azam,1960,https://en.wikipedia.org/wiki/Mughal-e-Azam,Mughal-e-Azam,K. Asif,"Prithviraj Kapoor, Dilip Kumar, Madhubala, Dur...",₹ 10.5–15 million,est. ₹ 110 million (India),,Sterling Investment Corporation,Is A 1960 Indian Epic Historical Drama
5,Zameen Ke Tare,1960,https://en.wikipedia.org/wiki/Zameen_Ke_Tare,Zameen Ke Tare,Chandulal Shah,"Agha, Master Bhagwan, Daisy Irani, Honey Irani...",,,Chandra Movies,Chandra Movies,S Drama
6,Kohinoor,1960,https://en.wikipedia.org/wiki/Kohinoor_(1960_f...,Kohinoor,S. U. Sunny,"Dilip Kumar, Meena Kumari, Leela Chitnis",,1.5 crore,,,Oor Is A 1960 Bollywood Action Adventure
7,Barsaat Ki Raat,1960,https://en.wikipedia.org/wiki/Barsaat_Ki_Raat,Barsaat Ki Raat,P. L. Santoshi,"Madhubala, Bharat Bhushan, Shyama",,est. ₹35 million,,,0 Indian Hindi-Language Romantic Musical
8,Jis Desh Men Ganga Behti Hai,1960,https://en.wikipedia.org/wiki/Jis_Desh_Mein_Ga...,Jis Desh Mein Ganga Behti Hai,Radhu Karmakar,"Raj Kapoor, Padmini Ramachandran, Pran",,₹2 crore,,,Was An Indian
