In [1]:
#import required libraries
import re
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [2]:
# Films of the 1930s: a single Wikipedia page covers the whole decade.
# The loop below walks the page's <h2> headings and tables in document order,
# remembers the most recent 193x year seen in a heading, and stamps that year
# onto every 'wikitable' that follows it.
url = "https://en.wikipedia.org/wiki/List_of_Tamil_films_of_the_1930s"
response = requests.get(
    url,
    # Wikimedia asks automated clients to identify themselves.
    headers={"User-Agent": "tamil-films-notebook/1.0 (Wikipedia table scraper)"},
    timeout=30,  # never hang the kernel on a stalled connection
)
response.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')
movies = []  # one DataFrame per scraped table; later cells append to this list
current_year = None

for tag in soup.find_all(['h2', 'table']):
    if tag.name == 'h2':
        # Headings without a 193x year leave current_year unchanged.
        year_match = re.search(r'\b(193\d)\b', tag.get_text())
        if year_match:
            # Stored as int so 'Year' has the same type as in the per-year pages.
            current_year = int(year_match.group(1))

    elif tag.name == 'table' and 'wikitable' in tag.get('class', []):
        # read_html expects a file-like object; literal HTML strings are deprecated.
        decade_table = pd.read_html(StringIO(str(tag)))[0]
        if current_year is not None:
            decade_table['Year'] = current_year
        decade_table['Language'] = 'Tamil'
        decade_table['url'] = url
        movies.append(decade_table)
        print(current_year)

1931
1932
1933
1934
1935
1936
1937
1938
1939


In [3]:
# Films of 1940-2025: every year has its own Wikipedia page.
# Each 'wikitable' on a page is kept only if it has Title, Director and Cast
# columns (compared case-insensitively); other tables are ignored.
# NOTE: this cell appends to the existing `movies` list, so re-running it
# without first re-running the cell that creates `movies` adds every table twice.
years = list(range(1940, 2026))
# Wikimedia asks automated clients to identify themselves.
request_headers = {"User-Agent": "tamil-films-notebook/1.0 (Wikipedia table scraper)"}
for year in years:
    url = "https://en.wikipedia.org/wiki/List_of_Tamil_films_of_" + str(year)
    response = requests.get(url, headers=request_headers, timeout=30)
    if not response.ok:
        # Keep the loop best-effort, but report the gap instead of hiding it.
        print(f"skipped {url} (HTTP {response.status_code})")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    for table in tables:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        df = pd.read_html(StringIO(str(table)))[0]
        col_names = [str(col).lower().strip() for col in df.columns]
        if "title" in col_names and "director" in col_names and 'cast' in col_names:
            df['Year'] = year
            df['Language'] = "Tamil"
            df['url'] = url
            movies.append(df)
            print(url)

https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1940
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1941
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1942
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1943
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1944
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1945
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1946
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1947
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1948
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1949
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1950
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1951
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1952
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1953
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1954
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1955
https://en.wikipedia.org/wiki/List_of_Tamil_films_of_1956
https://en.wik

In [4]:
# Stack all scraped tables row-wise into one frame with a fresh 0..n-1 index.
# Columns missing from a given table are filled with NaN.
df = pd.concat(objs=movies, axis=0, ignore_index=True)

In [5]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,Title,Director,Production,Music,Cast,Year,Language,url,Release date (D-M-Y),Release date,...,Ref.,Studio,Music director,Genre,Notes,Production Studio,Unnamed: 6,Ref.(s),Production company,Production Company
0,Kalidas,H. M. Reddy,Ardeshir Irani,Madhurakavi Bhaskara Das,"T. P. Rajalakshmi, P. G. Venkatesan, L. V. Prasad",1931,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,...,,,,,,,,,,
1,Harichandra (Sampoorna Harichandra),"Sarvottam Badami, T. C. Vadivelu Naicker",Sagar Film Company Chimanlal Desai,Raja Chandrasekar,"V. S Sunderasa Iyer, D. R. Muthulakshmi, T. P....",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,...,,,,,,,,,,
2,Kalava (Galavarishi),P. B. Rangachari,Sagar Film Company,G. Sundara Bhagavathar,"P. B. Rangachari, V. S. Sunderasa Iyer, T. R. ...",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,...,,,,,,,,,,
3,Paarijaatha Pushpaha ragam,P. K. Raja Sandow,Imperial Film Company,,"R. Nagendra Rao, K. T. Rukmani, Leela, Narasmi...",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,...,,,,,,,,,,
4,Ramayanam,,East India Film Company,,"T. P. Rajalakshmi, T. S. Mani",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,...,,,,,,,,,,


In [6]:
# Show the column labels of the combined frame.
df.columns

Index(['Title', 'Director', 'Production', 'Music', 'Cast', 'Year', 'Language',
       'url', 'Release date (D-M-Y)', 'Release date', 'Producer', 'Opening',
       'Opening.1', 'Ref', 'Date', 'Music Director', 'Ref.', 'Studio',
       'Music director', 'Genre', 'Notes', 'Production Studio', 'Unnamed: 6',
       'Ref.(s)', 'Production company', 'Production Company'],
      dtype='object')

In [7]:
# The source tables label the same information in different ways.
# For each merged column, take the first non-null value per row, scanning the
# listed source columns from left to right.
# NOTE(review): 'Opening' / 'Opening.1' are not merged into 'release' —
# confirm whether they hold release dates.
merged_sources = {
    'music': ['Music', 'Music Director', 'Music director'],
    'production': ['Production', 'Studio', 'Production Studio', 'Production company', 'Production Company'],
    'release': ['Release date (D-M-Y)', 'Release date', 'Date'],
    'ref': ['Ref', 'Ref.', 'Ref.(s)'],
}
for merged_name, source_names in merged_sources.items():
    candidates = df.loc[:, source_names]
    # Back-filling along each row moves the first non-null value into column 0.
    df[merged_name] = candidates.bfill(axis=1).iloc[:, 0]

In [8]:
# Keep only the columns of interest, in the order they should appear.
selected_columns = [
    'Title', 'Director', 'Cast', 'Year', 'Language', 'url',
    'Producer', 'Genre', 'production', 'music', 'release', 'ref',
]
df = df.loc[:, selected_columns]

In [9]:
# Display the trimmed frame.
df

Unnamed: 0,Title,Director,Cast,Year,Language,url,Producer,Genre,production,music,release,ref
0,Kalidas,H. M. Reddy,"T. P. Rajalakshmi, P. G. Venkatesan, L. V. Prasad",1931,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Ardeshir Irani,Madhurakavi Bhaskara Das,,
1,Harichandra (Sampoorna Harichandra),"Sarvottam Badami, T. C. Vadivelu Naicker","V. S Sunderasa Iyer, D. R. Muthulakshmi, T. P....",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Sagar Film Company Chimanlal Desai,Raja Chandrasekar,,
2,Kalava (Galavarishi),P. B. Rangachari,"P. B. Rangachari, V. S. Sunderasa Iyer, T. R. ...",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Sagar Film Company,G. Sundara Bhagavathar,,
3,Paarijaatha Pushpaha ragam,P. K. Raja Sandow,"R. Nagendra Rao, K. T. Rukmani, Leela, Narasmi...",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Imperial Film Company,,,
4,Ramayanam,,"T. P. Rajalakshmi, T. S. Mani",1932,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,East India Film Company,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7707,The Proof,I.Radhika,"Sai Dhanshika, Rudhvir Vadhan, Ashok Kumar Bal...",2025,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Golden Studios,,,[194]
7708,Titanic,M. Janakiraman,"Kalaiyarasan, Anandhi, Kaali Venkat, Ashna Zaveri",2025,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Thirukumaran Entertainment,,,[195]
7709,Train,Mysskin,"Vijay Sethupathi, Nassar, Shruti Haasan, Yugi ...",2025,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,V Creations,,,[196]
7710,Vaa Vaathiyaar,Nalan Kumarasamy,"Karthi, Krithi Shetty, Sathyaraj, Anandaraj",2025,Tamil,https://en.wikipedia.org/wiki/List_of_Tamil_fi...,,,Studio Green,,,[197]


In [10]:
# Count missing values per column, listed from fewest to most.
df.isna().sum().sort_values()

Year             0
Language         0
url              0
Title           45
Director       174
Cast           180
production    2853
music         4826
ref           4902
Producer      6060
Genre         6587
release       7067
dtype: int64

In [11]:
# Write the final table to a CSV file, leaving out the row index.
df.to_csv(path_or_buf="tamil_movies_1930_2025.csv", index=False)