# Web scraping script for the movie recommendation engine

## Import libraries

In [None]:
!pip install BeautifulSoup4



In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Define DataFrame structure

In [None]:
# Column layout used by the scraping cells; `df` starts out empty and is the
# frame those cells append their rows to.
columns = [
    'Movie_ID',
    'Title',
    'Year',
    'Genre',
]
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,Movie_ID,Title,Year,Genre


## 2000-2007

In [None]:
# Wikipedia list pages for 2000-2007, keyed by release year.
URL = {
    year: f'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_{year}'
    for year in range(2000, 2008)
}

# Movie_IDs for this batch count up from 10000.
movie_id = 10000

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating one row at a time copies the whole frame on every row.
records = []

# Scrape data for each year
for year, url in URL.items():
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The second 'wikitable' on the page is the one that gets scraped.
    # find_all('tr') searches recursively, so it works whether or not the
    # markup contains an explicit <tbody>.
    table = soup.find_all('table', {'class': 'wikitable'})[1]
    rows = table.find_all('tr')

    # Skip the header row; keep only rows with at least four <td> cells.
    for row in rows[1:]:
        tds = row.find_all('td')
        if len(tds) >= 4:
            # Cell 0 is stored as the title and cell 3 as the genre.
            records.append([movie_id, tds[0].text.strip(), year, tds[3].text.strip()])
            movie_id += 1

# Append this batch to the main frame in a single concat.
df = pd.concat([df, pd.DataFrame(records, columns=columns)], ignore_index=True)

## 2008-2009


In [None]:
# Wikipedia list pages for 2008-2009, keyed by release year.
URL = {
    2008: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2008',
    2009: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2009',
}

# Movie_IDs for this batch count up from 20000.
movie_id = 20000

# Number of <td> cells in a row -> (title cell index, genre cell index).
# NOTE(review): the varying cell counts presumably come from date cells that
# are covered by a rowspan in an earlier row — confirm against the pages.
cell_layout = {
    4: (0, 3),
    5: (1, 4),
    6: (2, 5),
}

# Collect rows in a list and build the DataFrame once at the end.
records = []

# Scrape data for each year
for year, url in URL.items():
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every 'wikitable' on the page is scanned; the first row of each is skipped.
    for t in soup.find_all('table', {'class': 'wikitable'}):
        for row in t.find_all('tr')[1:]:
            tds = row.find_all('td')
            layout = cell_layout.get(len(tds))
            if layout is None:
                # Unrecognised row shape (e.g. a header row with no <td>).
                # Skip it: otherwise the previous row's values would be
                # appended again under a fresh Movie_ID.
                continue
            title_idx, genre_idx = layout
            records.append([
                movie_id,
                tds[title_idx].text.strip(),
                year,
                tds[genre_idx].text.strip().replace('\n', ''),
            ])
            movie_id += 1

# Append this batch to the main frame in a single concat.
df = pd.concat([df, pd.DataFrame(records, columns=columns)], ignore_index=True)

## 2010


In [None]:
import pandas as pd

# Wikipedia page that lists the 2010 releases
url_2010 = 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2010'

# Movie_IDs for 2010 count up from this value
movie_id_start = 30000

# Accumulates the rows of every 2010 table processed below
df_2010 = pd.DataFrame(columns=['Movie_ID', 'Title', 'Year', 'Genre'])

# Parse every HTML table on the page into a list of DataFrames
tables = pd.read_html(url_2010)

# Only the tables at positions 4, 5, 6 and 7 are used
for table_idx in (4, 5, 6, 7):
    # Keep the column at position 2 (used as Title) and the last column (used as Genre)
    df_quarter = tables[table_idx].iloc[:, [2, -1]].copy()
    df_quarter.columns = ['Title', 'Genre']

    # Hand out consecutive Movie_IDs and stamp the year
    next_movie_id = movie_id_start + len(df_quarter)
    df_quarter.insert(0, 'Movie_ID', range(movie_id_start, next_movie_id))
    df_quarter.insert(2, 'Year', 2010)

    # The next table continues where this one stopped
    movie_id_start = next_movie_id

    # Add this table's rows to the 2010 accumulator
    df_2010 = pd.concat([df_2010, df_quarter], ignore_index=True)


# Append the 2010 rows to the end of the main df
df = pd.concat([df, df_2010], ignore_index=True)

## 2011

In [None]:
# Wikipedia list page for 2011, keyed by release year.
URL = {
    2011: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2011'
}

# Movie_IDs for this batch count up from 40000.
movie_id = 40000

# Number of <td> cells in a row -> (title cell index, genre cell index).
# NOTE(review): the varying cell counts presumably come from date cells that
# are covered by a rowspan in an earlier row — confirm against the page.
cell_layout = {
    7: (2, 3),
    6: (1, 2),
    5: (0, 1),
}

# Collect rows in a list and build the DataFrame once at the end.
records = []

# Scrape data for the year 2011
for year, url in URL.items():
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every 'wikitable' on the page is scanned; the first row of each is skipped.
    for t in soup.find_all('table', {'class': 'wikitable'}):
        for row in t.find_all('tr')[1:]:
            tds = row.find_all('td')
            layout = cell_layout.get(len(tds))
            if layout is None:
                # Unrecognised row shape (e.g. a header row with no <td>).
                # Skip it: otherwise the previous row's values would be
                # appended again under a fresh Movie_ID.
                continue
            title_idx, genre_idx = layout
            records.append([
                movie_id,
                tds[title_idx].text.strip(),
                year,
                tds[genre_idx].text.strip().replace('\n', ''),
            ])
            movie_id += 1

# Append this batch to the main frame in a single concat.
df = pd.concat([df, pd.DataFrame(records, columns=columns)], ignore_index=True)

## 2012

In [None]:
# Movie_IDs for 2012 count up from this value
movie_id_start = 50000

# Parse every HTML table on the 2012 page into a list of DataFrames
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2012')

# Accumulates the rows of every 2012 table processed below
final_df = pd.DataFrame(columns=columns)

# Only the tables at positions 3, 4, 5 and 6 are used
for i in (3, 4, 5, 6):
    df_2012 = tables[i]

    # Remove everything except the two columns that become Title and Genre
    df_2012.drop(columns=['Opening', 'Opening.1', 'Director', 'Cast'], inplace=True)
    df_2012.columns = ['Title', 'Genre']

    # Hand out consecutive Movie_IDs and stamp the year
    next_movie_id = movie_id_start + len(df_2012)
    df_2012.insert(0, 'Movie_ID', range(movie_id_start, next_movie_id))
    df_2012.insert(2, 'Year', 2012)

    # The next table continues where this one stopped
    movie_id_start = next_movie_id

    final_df = pd.concat([final_df, df_2012], ignore_index=True)

# Append the 2012 rows to the end of the main df
df = pd.concat([df, final_df], ignore_index=True)


## 2013-2015

In [None]:
import pandas as pd

# Wikipedia list pages handled by this cell, keyed by release year
URLs = {
    2013: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2013',
    2014: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2014',
    2015: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2015'
}

# Movie_IDs for this batch count up from this value
movie_id_start = 60000

# Accumulates the rows gathered across all three years
final_df = pd.DataFrame(columns=['Movie_ID', 'Title', 'Year', 'Genre'])

# Columns removed from each table before it is reduced to Title and Genre
unwanted_columns = ['Opening', 'Opening.1', 'Director', 'Cast', 'Source']

for year, url in URLs.items():
    # Parse every HTML table on the page into a list of DataFrames
    tables = pd.read_html(url)

    # Only the tables at positions 2, 3, 4 and 5 are used
    for i in (2, 3, 4, 5):
        df_year = tables[i]

        # errors='ignore' because a table may lack some of these columns
        df_year.drop(columns=unwanted_columns, errors='ignore', inplace=True)

        if len(df_year.columns) == 2:
            # Exactly two columns are left: name them and add the ID and year
            df_year.columns = ['Title', 'Genre']
            next_movie_id = movie_id_start + len(df_year)
            df_year.insert(0, 'Movie_ID', range(movie_id_start, next_movie_id))
            df_year.insert(2, 'Year', year)

            # The next table continues where this one stopped
            movie_id_start = next_movie_id

            final_df = pd.concat([final_df, df_year], ignore_index=True)
        else:
            # Any other column count is reported and the table is left out
            print(f"Table format unexpected in year {year}, table index {i}. Skipping this table.")

# Append the 2013-2015 rows to the end of the main df
df = pd.concat([df, final_df], ignore_index=True)


## 2016-2017

In [None]:
import pandas as pd

# Wikipedia list pages handled by this cell, keyed by release year
URLs = {
    2016: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2016',
    2017: 'https://en.wikipedia.org/wiki/List_of_Bollywood_films_of_2017'
}

# Movie_IDs for this batch count up from this value
movie_id_start = 70000

# Empty frame that fixes the column layout of the combined result
final_df = pd.DataFrame(columns=['Movie_ID', 'Title', 'Year', 'Genre'])

# Per-table frames are gathered here and concatenated once after the loops
frames = [final_df]

for year, url in URLs.items():
    # Parse every HTML table on the page into a list of DataFrames
    tables = pd.read_html(url)

    # Only the tables at positions 2, 3, 4 and 5 are used
    for i in range(2, 6):
        # Keep the columns at zero-indexed positions 2 (Title) and 5 (Genre).
        # NOTE(review): an earlier note listed the headers as Opening, Title,
        # Director, Cast, Genre, Studio, Ref.; positions 2 and 5 only match
        # that if 'Opening' occupies two columns — confirm against the pages.
        # .copy() makes the selection an independent frame before it is
        # modified, matching how the 2010 cell handles its selection.
        df_year = tables[i].iloc[:, [2, 5]].copy()
        df_year.columns = ['Title', 'Genre']

        # Hand out consecutive Movie_IDs and stamp the year
        df_year.insert(0, 'Movie_ID', range(movie_id_start, movie_id_start + len(df_year)))
        df_year.insert(2, 'Year', year)

        # The next table continues where this one stopped
        movie_id_start += len(df_year)
        frames.append(df_year)

# Combine everything collected for 2016-2017
final_df = pd.concat(frames, ignore_index=True)

# Append the 2016-2017 rows to the end of the main df
df = pd.concat([df, final_df], ignore_index=True)

In [None]:
# Show the combined frame as the cell output.
df

Unnamed: 0,Movie_ID,Title,Year,Genre
0,10000,Aaghaaz,2000,Thriller
1,10001,Aaj Ka Ravan,2000,Drama
2,10002,Anjaane,2000,Romance
3,10003,Anokha Moti,2000,Family
4,10004,Apradhi Kaun,2000,Thriller
...,...,...,...,...
1930,70248,Fukrey Returns,2017,Comedy
1931,70249,Game Over,2017,Suspense/thriller
1932,70250,Sallu Ki Shaadi,2017,Comedy drama
1933,70251,Monsoon Shootout,2017,Crime/thriller


## Save the dataset as a .csv file

In [None]:
def _normalize_text(value):
    """Return `value` lower-cased and stripped if it is a string, else unchanged."""
    return value.lower().strip() if isinstance(value, str) else value


# DataFrame.apply hands each whole column (a Series) to the function, so a
# per-value isinstance(..., str) check at that level never matches. Map the
# normaliser over the values of every column instead.
df = df.apply(lambda column: column.map(_normalize_text))

# Write the dataset without the index column.
df.to_csv('movies_dataset.csv', index=False)