In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import scipy as sp
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

tqdm.pandas()
pd.set_option("display.max_columns", None)

In [2]:
# Folder containing the yearly "Indian Boards Data as on 31-Mar-<year>.xlsx" workbooks.
input_folder = "board_data"

# Folder the merged pickles are written to. DataFrame.to_pickle does not create
# missing directories, so make sure it exists before any export cell runs.
output_folder = "director_merge_output"
Path(output_folder).mkdir(parents=True, exist_ok=True)

## Helper Functions

In [3]:
# Read each yearly workbook, skipping the first spreadsheet row so that the second row is used as the header.
# Also insert an AsOnDate column set to 31-Mar of the year that appears in the file name.

def read_data(sheet_name: str, years=range(2006, 2025), folder=None) -> pd.DataFrame:
    """Load one sheet from every yearly workbook and stack the results.

    For each year the file "Indian Boards Data as on 31-Mar-<year>.xlsx" is read
    with the first spreadsheet row skipped, an ``AsOnDate`` column holding
    31 March of that year is inserted, and the frame is indexed by
    (Symbol, Company, AsOnDate) so the yearly frames line up when concatenated.
    Years whose file or sheet cannot be read are reported with ``print`` and
    skipped.

    Parameters
    ----------
    sheet_name : str
        Name of the worksheet to read from each workbook.
    years : iterable of int, optional
        Years to load. Defaults to 2006 through 2024 inclusive.
    folder : str or path-like, optional
        Directory holding the workbooks. Defaults to the notebook-level
        ``input_folder``.

    Returns
    -------
    pd.DataFrame
        All loaded years stacked, with Symbol / Company / AsOnDate as the
        leading columns, sorted by Company then AsOnDate, with a fresh
        0..n-1 index. An empty DataFrame if nothing could be loaded.
    """
    # Path joins with the platform's separator; the previous hard-coded
    # backslash only resolved to a file inside the folder on Windows.
    base_dir = Path(input_folder if folder is None else folder)
    key_cols = ["Symbol", "Company", "AsOnDate"]

    dfs = []
    for year in years:
        file_path = base_dir / f"Indian Boards Data as on 31-Mar-{year}.xlsx"
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1)
            df.insert(2, column="AsOnDate", value=pd.to_datetime(f"{year}-03-31", format="%Y-%m-%d"))
            dfs.append(df.set_index(key_cols))

        except ValueError:
            # Treated as "sheet missing from this workbook".
            # NOTE(review): any other ValueError raised in the try body is reported the same way.
            print(f"No {sheet_name} data found for year {year} in the file. Skipping...")
        except FileNotFoundError:
            print(f"No file found for year {year}. Skipping...")
        except Exception as e:
            # Best effort: report and carry on with the remaining years.
            print(f"Unexpected error with file year {year}: {e}")

    if not dfs:
        print("No data files found using this sheet name...")
        return pd.DataFrame()

    # reset_index turns the three key levels back into the leading columns.
    merged = (
        pd.concat(dfs, axis=0)
        .reset_index()
        .sort_values(["Company", "AsOnDate"])
        .reset_index(drop=True)
    )
    return merged

## Individual Director Data Only

In [4]:
# Stack the "Master Profile" sheet from every yearly workbook.
main_direct = read_data(sheet_name="Master Profile")

In [5]:
# Row count of the stacked director data.
main_direct.shape[0]

446909

In [6]:
# Build the target with Path so the file lands inside output_folder on any OS;
# a hard-coded backslash is only a directory separator on Windows.
main_direct.to_pickle(Path(output_folder) / "Main_Director_incl Cess.pkl")

## Committees Data Only

In [7]:
# Stack the "Committee Details" sheet from every yearly workbook that has one.
main_comm = read_data(sheet_name="Committee Details")

No Committee Details data found for year 2006 in the file. Skipping...
No Committee Details data found for year 2007 in the file. Skipping...
No Committee Details data found for year 2008 in the file. Skipping...
No Committee Details data found for year 2009 in the file. Skipping...
No Committee Details data found for year 2010 in the file. Skipping...
No Committee Details data found for year 2011 in the file. Skipping...
No Committee Details data found for year 2012 in the file. Skipping...
No Committee Details data found for year 2013 in the file. Skipping...
No Committee Details data found for year 2014 in the file. Skipping...
No Committee Details data found for year 2015 in the file. Skipping...


In [8]:
# Row count of the stacked committee data.
main_comm.shape[0]

267668

In [9]:
# Build the target with Path so the file lands inside output_folder on any OS;
# a hard-coded backslash is only a directory separator on Windows.
main_comm.to_pickle(Path(output_folder) / "Main_Committees_incl Cess.pkl")

## Board Attendance Data Only

In [10]:
# Stack the "Board Meetings Attendance" sheet from every yearly workbook that has one.
main_board = read_data(sheet_name="Board Meetings Attendance")

No Board Meetings Attendance data found for year 2006 in the file. Skipping...
No Board Meetings Attendance data found for year 2007 in the file. Skipping...
No Board Meetings Attendance data found for year 2008 in the file. Skipping...
No Board Meetings Attendance data found for year 2009 in the file. Skipping...
No Board Meetings Attendance data found for year 2010 in the file. Skipping...
No Board Meetings Attendance data found for year 2011 in the file. Skipping...
No Board Meetings Attendance data found for year 2012 in the file. Skipping...
No Board Meetings Attendance data found for year 2013 in the file. Skipping...
No Board Meetings Attendance data found for year 2014 in the file. Skipping...


In [11]:
# Row count of the stacked board-attendance data.
main_board.shape[0]

150650

In [12]:
# Build the target with Path so the file lands inside output_folder on any OS;
# a hard-coded backslash is only a directory separator on Windows.
main_board.to_pickle(Path(output_folder) / "Main_Board Attendance_incl Cess.pkl")