In [1]:
#import required libraries
import re
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [2]:
# Films released before 1960: one Wikipedia page, scraped table by table.
# `movies` collects one DataFrame per scraped table; later cells append to it.
movies = []
url = "https://en.wikipedia.org/wiki/List_of_Malayalam_films_before_1960"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error rather than silently parsing an error page
# that contains no tables.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.find_all('table', {'class': 'wikitable'})
print(len(tables))
for table in tables:
    # Passing literal HTML to read_html is deprecated (pandas >= 2.1), so the
    # markup is wrapped in a file-like StringIO object.
    df = pd.read_html(StringIO(str(table)))[0]
    # Tag every row with its language and source page.
    df['Language'] = "Malayalam"
    df['url'] = url
    movies.append(df)
    print(url)

2
https://en.wikipedia.org/wiki/List_of_Malayalam_films_before_1960
https://en.wikipedia.org/wiki/List_of_Malayalam_films_before_1960


In [3]:
# Years 1960-2025: one Wikipedia page per year.
# Year 2000 is skipped because scraping that page raised an error.
# NOTE: this cell appends to the existing `movies` list, so running it twice
# without re-creating the list adds every yearly table a second time.
years = list(range(1960, 2026))
for year in years:
    if year == 2000:
        continue
    url = "https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_" + str(year)
    # A timeout keeps the loop from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Report the failure instead of silently dropping the whole year,
        # then carry on with the remaining years.
        print(f"skipping {url}: HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    for table in tables:
        # Passing literal HTML to read_html is deprecated (pandas >= 2.1), so
        # the markup is wrapped in a file-like StringIO object.
        df = pd.read_html(StringIO(str(table)))[0]
        # Normalise header names so the checks below ignore case and padding.
        col_names = [str(col).lower().strip() for col in df.columns]
        has_title = "title" in col_names or "film" in col_names
        has_cast = 'cast' in col_names or 'cast(s)' in col_names
        # Keep only tables that have title, director and cast columns.
        if has_title and "director" in col_names and has_cast:
            # Tag every row with its release year, language and source page.
            df['Year'] = year
            df['Language'] = "Malayalam"
            df['url'] = url
            movies.append(df)
            print(url)

1
['opening', 'opening.1', 'sl. no.', 'title', 'director', 'story', 'cast(s)', 'music director']
https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_1960
1
['opening', 'opening.1', 'sl. no.', 'film', 'cast', 'director', 'music director', 'notes']
https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_1961
2
['opening', 'opening.1', 'sl. no.', 'film', 'cast', 'director', 'music director', 'notes']
https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_1962
['opening', 'opening.1', 'sl. no.', 'film', 'cast', 'director', 'music director', 'basic language', 'notes']
https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_1962
1
['opening', 'opening.1', 'sl. no.', 'film', 'cast', 'director', 'music director', 'notes']
https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_1963
1
['opening', 'opening.1', 'sl. no.', 'film', 'cast', 'director', 'music director', 'notes']
https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_1964
1
['opening', 'opening.1', 'sl. no.', 'film', 'cas

In [4]:
# Stack all scraped tables into a single frame with a fresh 0..n-1 index;
# columns that a table does not have are filled with NaN for its rows.
df = pd.concat(objs=movies, ignore_index=True)

In [5]:
# Display the concatenated frame to inspect the combined result.
df

Unnamed: 0,Movies,Year,Director,Story,Screenplay,Main actors,Language,url,No.,Movie,...,Music Director,Lyricist,Mo,Unnamed: 1,Producer,Genre,Ref,Production house,OTT,Production Company / Studio
0,Vigathakumaran,1928.0,J. C. Daniel,J. C. Daniel,J. C. Daniel,"J. C. Daniel, P. K. Rosy",Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,,,,
1,Marthanda Varma,1933.0,P. V. Rao,C. V. Raman Pillai,P. V. Rao,"Jaidev, A. V. P. Menon, Devaki, Padmini",Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,,,,
2,Balan,1938.0,S. Nottani,Sundaram Pillai A,Muthukulam Raghavan Pillai,"K. K. Aroor, Alleppey Vincent, M. K. Kamalam",Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,,,,
3,Gnanambika,1940.0,S. Nottani,C. Madhavan Pillai,S. P. Pillai (debut),"K. K. Aroor, Alleppey Vincent, Sebastian Kunju...",Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,,,,
4,Prahlada,1941.0,K. Subramanyam,Mythology,N. P. Chellappan Nair,"T. K. Balachandran, N. P. Chellappan Nair, Gur...",Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5993,,2025.0,Sreedev Kappur,,,,Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,[176][177],,,Kalarikkal Films
5994,,2025.0,Sai Krishna,,,,Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,[178],,,Vinreels Digital
5995,,2025.0,Sathyan Anthikad,,,,Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,[179],,,Aashirvad Cinemas
5996,,2025.0,Fazil Faziludeen,,,,Malayalam,https://en.wikipedia.org/wiki/List_of_Malayala...,,,...,,,,,,,[180],,,Spire Productions


In [6]:
# List every column name in the concatenated frame; the names listed here are
# the ones the following cells merge and select.
df.columns

Index(['Movies', 'Year', 'Director', 'Story', 'Screenplay', 'Main actors',
       'Language', 'url', 'No.', 'Movie', 'Date', 'Direction', 'Music',
       'Opening', 'Opening.1', 'Sl. no.', 'Title', 'Cast(s)', 'Music director',
       'Film', 'Cast', 'Notes', 'Basic Language', 'Sl. No.', 'Music Director',
       'Lyricist', 'Mo', 'Unnamed: 1', 'Producer', 'Genre', 'Ref',
       'Production house', 'OTT', 'Production Company / Studio'],
      dtype='object')

In [7]:
# Row by row, keep the first non-null value among the three differently
# named music columns.
df['music'] = (
    df[['Music', 'Music director', 'Music Director']]
    .bfill(axis='columns')  # pull the next non-null value leftwards
    .iloc[:, 0]             # the leftmost column now holds that value
)

In [8]:
# The scraped tables name the same field differently, so each target column
# takes, row by row, the first non-null value among its source columns.
coalesce_sources = {
    'title': ['Movies', 'Movie', 'Title', 'Film'],
    'production': ['Production house', 'Production Company / Studio'],
    'cast': ['Main actors', 'Cast(s)', 'Cast'],
    # Only year-like columns belong here; 'Music Director' was previously
    # listed as a fallback and could have put a person's name in the year.
    'year': ['Year', 'Date'],
    # 'Direction' also exists in the concatenated frame; fold it into
    # 'Director' so those rows keep their director when only 'Director' is
    # selected afterwards.
    'Director': ['Director', 'Direction'],
}
for target, sources in coalesce_sources.items():
    # bfill along the columns pulls the next non-null value leftwards, so the
    # first column ends up holding the first non-null value of each row.
    df[target] = df[sources].bfill(axis=1).iloc[:, 0]

In [9]:
# Keep only the merged columns plus the fields that are exported as they are.
df = df[
    [
        'title',
        'Director',
        'cast',
        'year',
        'Language',
        'url',
        'Producer',
        'Genre',
        'production',
        'music',
        'Ref',
    ]
]

In [10]:
# Relabel the columns positionally with the header names used in the export.
df = df.set_axis(
    [
        'Title',
        'Director',
        'Cast',
        'Year',
        'Language',
        'url',
        'Producer',
        'Genre',
        'production',
        'music',
        'ref',
    ],
    axis=1,
)

In [11]:
# Write the final frame to a CSV file in the working directory, without the
# row index.
df.to_csv(
    path_or_buf="malayalam_movies_1928_2025.csv",
    index=False,
)