In [1]:
#import required libraries
import re
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [2]:
# Scrape the decade pages (1930s-1950s). One article covers a whole decade, so
# the release year comes either from the table's own 'Year' column or from the
# most recent <h2> section heading seen before the table.
years = ['1930s', '1940s', '1950s']
movies = []
for year in years:
    url = "https://en.wikipedia.org/wiki/List_of_Kannada_films_of_the_" + year
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Report the failed page and move on instead of parsing an error page.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    # Only accept heading years that belong to this decade,
    # e.g. '1950s' -> 195\d, '1940s' -> 194\d.
    year_pattern = re.compile(r'\b(' + year[:3] + r'\d)\b')
    current_year = None

    # Headings and tables are visited together so that each table can be
    # tagged with the year of the heading that was seen last.
    for tag in soup.find_all(['h2', 'table']):
        if tag.name == 'h2':
            year_match = year_pattern.search(tag.get_text())
            if year_match:
                # Stored as int so it matches the integer years assigned to
                # the per-year pages (1960 onwards).
                current_year = int(year_match.group(1))
        elif tag.name == 'table' and 'wikitable' in tag.get('class', []):
            # Wrap the markup in StringIO: passing literal HTML straight to
            # read_html is deprecated in recent pandas releases.
            df = pd.read_html(StringIO(str(tag)))[0]
            # A table that already carries its own 'Year' column keeps it;
            # the heading year is only used to fill the gap.
            if current_year and 'Year' not in df.columns:
                df['Year'] = current_year
                print(current_year)
            df['Language'] = 'Kannada'
            df['url'] = url
            movies.append(df)

1950
1951
1952
1953
1954
1955
1956
1957
1958
1959


In [3]:
# year 1960 - 2025: each of these years has its own article.
# NOTE: the tables are appended to the existing `movies` list, so running this
# cell a second time without rebuilding `movies` adds every table twice.
years = list(range(1960, 2026))
for year in years:
    url = "https://en.wikipedia.org/wiki/List_of_Kannada_films_of_" + str(year)
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Report the failed page and move on instead of parsing an error page.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    print(len(tables))
    for table in tables:
        # Wrap the markup in StringIO: passing literal HTML straight to
        # read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(table)))[0]
        col_names = [str(col).lower().strip() for col in df.columns]
        print(col_names)
        # Keep only tables that have title, director and cast columns; other
        # wikitables on the page (e.g. the two-column ['0', '1'] ones) are skipped.
        if {"title", "director", "cast"}.issubset(col_names):
            df['Year'] = year
            df['Language'] = "Kannada"
            df['url'] = url
            movies.append(df)
            print(url)

2
['0', '1']
['title', 'director', 'cast', 'music director', 'producer', 'banner']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1960
2
['0', '1']
['title', 'director', 'cast', 'music director', 'producer', 'banner']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1961
1
['title', 'director', 'cast', 'music director', 'producer']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1962
1
['title', 'director', 'cast', 'music director', 'producer']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1963
1
['title', 'director', 'cast', 'music director', 'producer']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1964
1
['title', 'director', 'cast', 'music director', 'producer']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1965
1
['title', 'director', 'cast', 'music director', 'producer']
https://en.wikipedia.org/wiki/List_of_Kannada_films_of_1966
1
['title', 'director', 'cast', 'music director', 'producer']
https://en.wikipedia.org/wiki/List_of_Kann

In [4]:
# Stack every scraped table row-wise into a single frame and renumber the rows
# from 0, discarding each table's own index.
df = pd.concat(objs=movies, axis=0, ignore_index=True)

In [5]:
# Preview the combined frame produced by the concat step.
df

Unnamed: 0,Year,Title,Director,Cast,Music Director,Producer,Banner,Language,url,Director(s),...,Opening.1,Genre,Other notes,Ref.,Ref,#,Studio,Unnamed: 7,Note,Production company
0,1934,Sati Sulochana,Y. V. Rao,"R. Nagendra Rao, Tripuramba, Subbaiah Naidu, L...",R. Nagendra Rao H. R. Padmanabha Shastry,Chamanlal Doongaji,South India Movietone,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,
1,1934,Bhakta Dhruva,Parshwanatha Altekar,"Master Muthu, G. Nagesha Rao, T. Kanakalakshma...",Harmonium Sheshagirirao,U. L. Narayana Rao,Jayavani Talkies,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,
2,1935,Sadaarame,Raja Chandrashekar,"Gubbi Veeranna, K.Ashwathamma, Muraaraachaar, ...",Venkataramaiah (Papanna),Gubbi Veeranna,Shakuntala Films,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,
3,1936,Samsara Nauka,H. L. N. Simha,"B. R. Panthulu, M. V. Rajamma, S. K. Padma Dev...",M. Madhavarao,K. Nanjappa,Devi Films,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,
4,1936,Chiranjeevi,K. P. Bhave,"Devudu Narasimha Sastri, Sharada, Master Naray...",Harmonium Sheshagirirao,,Canarese Talkies,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4812,2025,Rachayya,Jadesh Hampi,"Duniya Vijay, Rachita Ram, Raj B. Shetty",,,,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,Sarathi Films
4813,2025,Ramarasa,B. M. Giriraj,Karthik Mahesh,,,,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,G Cinemas
4814,2025,Richard Anthony,Rakshit Shetty,Rakshit Shetty,,,,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,Hombale Films
4815,2025,Theertharoopa Tandeyavarige,Ramenahalli Jagannatha,Nihar Mukesh,,,,Kannada,https://en.wikipedia.org/wiki/List_of_Kannada_...,,...,,,,,,,,,,Jai Chamundeshwari Productions


In [6]:
# List every column label present in the combined frame.
df.columns

Index(['Year', 'Title', 'Director', 'Cast', 'Music Director', 'Producer',
       'Banner', 'Language', 'url', 'Director(s)', 'Production House', 'Film',
       'Music director', 'Music', 'Cinematographer', 'Reference',
       'Title in Kannada', 'Notes', 'Month', 'Opening', 'Opening.1', 'Genre',
       'Other notes', 'Ref.', 'Ref', '#', 'Studio', 'Unnamed: 7', 'Note',
       'Production company'],
      dtype='object')

In [7]:
# The scraped tables label the same field in different ways (for example
# 'Music Director' / 'Music director' / 'Music'), so each group of equivalent
# columns is collapsed into one, keeping for every row the first non-null
# value in the listed order.
def _first_non_null(frame, candidates):
    """Return, row by row, the first non-null value among ``candidates``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding (some of) the candidate columns.
    candidates : list of str
        Column labels in priority order.

    Returns
    -------
    pandas.Series
        One value per row of ``frame``; NaN where every candidate is null.

    ``reindex`` is used instead of ``frame[candidates]`` so that a label that
    is absent from ``frame`` is treated as an all-NaN column rather than
    raising ``KeyError``.
    """
    return frame.reindex(columns=candidates).bfill(axis=1).iloc[:, 0]


df['music'] = _first_non_null(df, ['Music Director', 'Music director', 'Music'])
df['production'] = _first_non_null(df, ['Production House', 'Production company'])
df['ref'] = _first_non_null(df, ['Reference', 'Ref.', 'Ref'])
df['title'] = _first_non_null(df, ['Title', 'Film'])
# Some tables use the header 'Director(s)'; fold it into 'Director' so those
# rows do not end up without a director.
df['Director'] = _first_non_null(df, ['Director', 'Director(s)'])
# NOTE(review): 'Banner' and 'Studio' may also name the production house -
# confirm before folding them into `production`.

In [8]:
# Keep only the columns that go into the exported dataset, in this order.
selected_columns = [
    'title', 'Director', 'Cast', 'Year', 'Language', 'url',
    'Producer', 'Genre', 'production', 'music', 'ref',
]
df = df.loc[:, selected_columns]

In [9]:
# Only the coalesced 'title' column needs a new label. Renaming it by name
# (instead of assigning a positional list of labels) keeps every other column
# correctly labelled even if the order of the selected columns changes.
df = df.rename(columns={'title': 'Title'})

In [10]:
# Write the cleaned table to disk; the row index is left out of the file.
output_csv = "kannada_movies_1930_2025.csv"
df.to_csv(output_csv, index=False)