### Scraping Telugu movies from Wiki

In [1]:
# import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# year 1930-39
url = "https://en.wikipedia.org/wiki/List_of_Telugu_films_of_the_1930s"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.find_all('table', {'class': 'wikitable'})
movies = []
for table in tables:
    df = pd.read_html(str(table))[0]
    col_names = [str(col).lower().strip() for col in df.columns]
    if "title" in col_names and "director" in col_names and 'cast' in col_names:
        df['Year'] = '1930s'
        df['Language'] = "Telugu"
        df['url'] = url
        movies.append(df)
        print(url)

https://en.wikipedia.org/wiki/List_of_Telugu_films_of_the_1930s


In [3]:
# year 1940 - 2025
years = list(range(1940, 2026))
for year in years:
    url = "https://en.wikipedia.org/wiki/List_of_Telugu_films_of_" + str(year)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    for table in tables:
        df = pd.read_html(str(table))[0]
        col_names = [str(col).lower().strip() for col in df.columns]
        if "title" in col_names and "director" in col_names and 'cast' in col_names:
            df['Year'] = year
            df['Language'] = "Telugu"
            df['url'] = url
            movies.append(df)
            print(url)

https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1940
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1941
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1942
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1943
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1944
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1945
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1946
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1947
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1948
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1949
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1950
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1951
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1952
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1953
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1954
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_1955
https://en.wikipedia.org/wiki/List_of_Telugu_films_of_19

In [4]:
final_df = pd.concat(movies, ignore_index=True)

In [5]:
final_df.head()

Unnamed: 0,Title,Director,Production,Cast,Release,Unnamed: 5,Unnamed: 6,Year,Language,url,...,Source,Sources,Production studio,Opening,Opening.1,Ref,Studio,Production house,Production house,Production House
0,1931,1931,1931,1931,1931,1931.0,,1930s,Telugu,https://en.wikipedia.org/wiki/List_of_Telugu_f...,...,,,,,,,,,,
1,Kalidas,H. M. Reddy,Imperial Movi-Tone,"T. P. Rajalakshmi, P. G. Venkatesan, L. V. Prasad",31 October 1931,,,1930s,Telugu,https://en.wikipedia.org/wiki/List_of_Telugu_f...,...,,,,,,,,,,
2,1932,1932,1932,1932,1932,1932.0,1932.0,1930s,Telugu,https://en.wikipedia.org/wiki/List_of_Telugu_f...,...,,,,,,,,,,
3,Bhakta Prahlada,H. M. Reddy,Sri Krishna Film Company,"Venkata Subbaiah Municipalle, Surabhi Kamalaba...",6 February 1932,,,1930s,Telugu,https://en.wikipedia.org/wiki/List_of_Telugu_f...,...,,,,,,,,,,
4,Sri Rama Paduka Pattabhishekam,Sarvottam Badami,Sagar Movietone,"C. S. R. Anjaneyulu, Surabhi Kamalabai, Yadava...",,,,1930s,Telugu,https://en.wikipedia.org/wiki/List_of_Telugu_f...,...,,,,,,,,,,


In [6]:
final_df.columns

Index(['Title', 'Director', 'Production', 'Cast', 'Release', 'Unnamed: 5',
       'Unnamed: 6', 'Year', 'Language', 'url', 'Producer', 'Genre', 'Ref.',
       'Composer', 'Release Date', 'Production company', 'Music',
       'Music Director', 'Unnamed: 4', 'Music director', 'Notes', 'Source',
       'Sources', 'Production studio', 'Opening', 'Opening.1', 'Ref', 'Studio',
       'Production house', 'Production house', 'Production House'],
      dtype='object')

In [7]:
# keep non-null values from all columns, row by row
final_df['production'] = final_df[['Production', 'Production company', 'Production studio', 'Studio','Production house','Production House']].bfill(axis=1).iloc[:, 0]
final_df['music'] = final_df[['Composer', 'Music', 'Music Director', 'Music director']].bfill(axis=1).iloc[:, 0]
final_df['release'] = final_df[['Release', 'Release Date', 'Opening', 'Opening.1']].bfill(axis=1).iloc[:, 0]
final_df['ref'] = final_df[['Ref.', 'Ref', 'Source', 'Sources']].bfill(axis=1).iloc[:, 0]

In [8]:
# once merged drop extras
final_df.drop(columns=['Production', 'Production company', 'Production studio', 'Studio','Production house','Production House'], inplace=True)
final_df.drop(columns=['Composer', 'Music', 'Music Director', 'Music director'], inplace=True)
final_df.drop(columns=['Release', 'Release Date', 'Opening', 'Opening.1'], inplace=True)
final_df.drop(columns=['Ref.', 'Ref', 'Source', 'Sources'], inplace=True)

In [13]:
final_df.columns

Index(['Title', 'Director', 'Cast', 'Year', 'Language', 'url', 'Producer',
       'Genre', 'Production house', 'production', 'music', 'release', 'ref'],
      dtype='object')

In [12]:
final_df.drop(columns=['Unnamed: 5','Unnamed: 6','Unnamed: 4','Notes'],inplace=True)

In [14]:
df = final_df[['Title', 'Director', 'Cast', 'Year', 'Language', 'url', 'Producer','Genre', 'production', 'music', 'release', 'ref']]

In [15]:
# Drop any column where >90% of values are NaN:
null_threshold = 0.9
df = df.loc[:, df.isnull().mean() < null_threshold]

In [18]:
df.isnull().sum()

Title           28
Director       213
Cast           140
Year             0
Language         0
url              0
production    2852
music         4084
release       1921
ref           3234
dtype: int64

In [19]:
df.to_csv("telugu_movies_1930_2025.csv", index=False)