In [1]:
import pandas_datareader.data as web #to collect data
import datetime as dt #to specify start and end dates

# import yfinance as yf

import eventstudy as es
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.rolling import RollingOLS

from patsy import dmatrices
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Folder locations used by this notebook.
# NOTE(review): import_folder_path is written with Windows backslash separators,
# so it only resolves on Windows; it is not referenced in the cells visible here.
import_folder_path = r"..\..\[IN USE] Rookie Directors\[1] Director Level\director_wrangle_output"
output_folder_path = "car_filtering"  # destination folder for the pickles written by the export cell
supporting_folder_path = "supporting_datafiles"  # folder holding the collated adjusted-close CSV

## Data reading and melting

In [3]:
# Load the wide adjusted-close table and discard the "Unnamed: 0" column
# (presumably an index written out by an earlier to_csv -- confirm).
# A forward slash is used as the separator because Python accepts it on
# Windows as well as POSIX; a hard-coded backslash only resolves on Windows.
data = pd.read_csv(f"{supporting_folder_path}/Adjusted Clos_collated.csv").drop(columns="Unnamed: 0")

In [4]:
# Reshape the wide table to one row per (AsOnDate, CompanyName) and resolve
# repeated keys.
# NOTE(review): the slice 1:3908 hard-codes the number of value columns --
# confirm it still matches the width of the CSV.
dataLong = (
    data
    .melt(id_vars="AsOnDate", value_vars=data.columns[1:3908])
    .rename(columns={"value": "ACP", "variable": "CompanyName"})
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(AsOnDate=lambda d: pd.to_datetime(d["AsOnDate"], format="%Y-%m-%d"))
    # For keys that occur more than once, first discard the rows whose price is missing ...
    .loc[lambda d: ~(d.duplicated(subset=["CompanyName", "AsOnDate"], keep=False) & d["ACP"].isnull())]
    # ... then discard every row of any key that is still repeated.
    .loc[lambda d: ~d.duplicated(subset=["CompanyName", "AsOnDate"], keep=False)]
    .drop_duplicates()
    .reset_index(drop=True)
)

## Data Cleaning

## Data snipping at both ends (leading and trailing missing prices)

In [5]:
def dataSnip(frame):
    """Trim the rows before the first and after the last non-null ``ACP`` value.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame with an ``ACP`` column.

    Returns
    -------
    pd.DataFrame
        The label-based slice of ``frame`` running from its first valid
        ``ACP`` row to its last valid ``ACP`` row (missing values in between
        are kept), or an empty slice when ``ACP`` holds no valid value.
    """
    acp = frame["ACP"]
    start_label = acp.first_valid_index()

    # No valid price at all: nothing to keep.
    if start_label is None:
        return frame.iloc[0:0]

    # A first valid label guarantees that a last valid label exists as well.
    end_label = acp.last_valid_index()
    return frame.loc[start_label:end_label]

In [6]:
# Snip every company's series separately, then flatten back to a plain
# 0..n-1 row index.
dataLong2 = (
    dataLong
    .groupby("CompanyName")
    .progress_apply(dataSnip)
    .reset_index(drop=True)
)

  0%|          | 0/3907 [00:00<?, ?it/s]

## Missing Data dummies

### Missing ACP data dummy

In [7]:
# 1 where the adjusted close price is missing, 0 otherwise.
dataLong2["missing_acp_dummy"] = dataLong2["ACP"].isna().astype(int)

In [8]:
# How many rows have a price (0) versus a missing price (1).
dataLong2["missing_acp_dummy"].value_counts()

missing_acp_dummy
0    12485135
1      979958
Name: count, dtype: int64

In [9]:
# Preview of the snipped long-format frame with its missing-price dummy.
dataLong2

Unnamed: 0,AsOnDate,CompanyName,ACP,missing_acp_dummy
0,2008-10-06,20 Microns Ltd.,16.82,0
1,2008-10-07,20 Microns Ltd.,15.05,0
2,2008-10-08,20 Microns Ltd.,13.25,0
3,2008-10-10,20 Microns Ltd.,11.60,0
4,2008-10-13,20 Microns Ltd.,12.32,0
...,...,...,...,...
13465088,2024-03-21,Zylog Systems Ltd.,0.35,0
13465089,2024-03-22,Zylog Systems Ltd.,0.35,0
13465090,2024-03-26,Zylog Systems Ltd.,0.35,0
13465091,2024-03-27,Zylog Systems Ltd.,0.35,0


### Insufficient-data dummy: at least 50% of estimation-period trading days missing

In [10]:
def sufficient_trading_days_dummy(frame: pd.DataFrame, estimation_period: int, gap: int = 31) -> pd.DataFrame:
    """Return the rows whose lagged estimation window has too many missing prices.

    For every row, the number of missing prices in the trailing window of
    ``estimation_period`` rows is counted and then moved ``gap`` rows down, so
    each row carries the count of the window that ended ``gap`` rows earlier
    (presumably to keep the event window out of the estimation period --
    confirm). A row is flagged when that count is at least half of
    ``estimation_period``. Rows without a full lagged window (count is NaN)
    are never flagged.

    Despite the function's name, the result holds only the *insufficient* rows.

    Unlike the earlier version, ``frame`` is left untouched: the new columns
    are attached to a copy, because mutating the object handed over by
    ``groupby(...).apply`` is not supported by pandas.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows with a 0/1 ``missing_acp_dummy`` column. The notebook calls this
        once per company via ``groupby``; rows are assumed to be in date
        order -- the function does not sort them.
    estimation_period : int
        Length of the rolling window, in rows.
    gap : int, default 31
        Number of rows by which the window count is lagged.

    Returns
    -------
    pd.DataFrame
        The flagged rows of ``frame`` with two extra columns,
        ``na_trading_days`` (lagged count of missing prices) and
        ``insufficient_data_dummy`` (always 1 in the returned rows).
    """
    lagged_missing = (
        frame["missing_acp_dummy"]
        .rolling(estimation_period, center=False)
        .sum()
        .shift(gap)
    )

    # NaN >= threshold evaluates to False, so rows without a full lagged
    # window get 0, exactly as with the explicit null check used before.
    insufficient = np.where(lagged_missing >= 0.5 * estimation_period, 1, 0)

    flagged = frame.assign(
        na_trading_days=lagged_missing,
        insufficient_data_dummy=insufficient,
    )
    return flagged.loc[flagged["insufficient_data_dummy"] == 1]

In [11]:
# Window lengths (in rows) passed as estimation_period.
CARs = [120, 150, 180, 210]

# One frame of flagged rows per window length, in the same order as CARs.
insuffDataLong = []
for CAR in CARs:
    result = (
        dataLong2
        .groupby("CompanyName")
        .progress_apply(sufficient_trading_days_dummy, estimation_period=CAR)
    )
    insuffDataLong.append(result.reset_index(drop=True))

  0%|          | 0/3721 [00:00<?, ?it/s]

  0%|          | 0/3721 [00:00<?, ?it/s]

  0%|          | 0/3721 [00:00<?, ?it/s]

  0%|          | 0/3721 [00:00<?, ?it/s]

In [12]:
# Flagged rows for the last window length (index 3 of CARs, i.e. 210).
insuffDataLong[3]

Unnamed: 0,AsOnDate,CompanyName,ACP,missing_acp_dummy,na_trading_days,insufficient_data_dummy
0,2005-10-13,Aarti Pharmalabs Ltd.,,1,105.0,1
1,2005-10-14,Aarti Pharmalabs Ltd.,,1,106.0,1
2,2005-10-17,Aarti Pharmalabs Ltd.,,1,107.0,1
3,2005-10-18,Aarti Pharmalabs Ltd.,,1,108.0,1
4,2005-10-19,Aarti Pharmalabs Ltd.,,1,109.0,1
...,...,...,...,...,...,...
972446,2010-05-27,Zydus Wellness Ltd.,484.75,0,109.0,1
972447,2010-05-28,Zydus Wellness Ltd.,485.85,0,108.0,1
972448,2010-05-31,Zydus Wellness Ltd.,484.60,0,107.0,1
972449,2010-06-01,Zydus Wellness Ltd.,472.60,0,106.0,1


In [14]:
# Columns kept in the exported pickles.
output_col = ["AsOnDate", "CompanyName", "insufficient_data_dummy", "na_trading_days"]

# Write one pickle per window length. A forward slash is used as the
# separator because Python accepts it on Windows as well as POSIX; with a
# hard-coded backslash the file only lands inside output_folder_path on Windows.
for i, CAR in enumerate(CARs):
    insuffDataLong[i][output_col].to_pickle(f"{output_folder_path}/Insufficient Data for {CAR}CAR.pkl")

In [None]:
# Largest lagged missing-day count among the rows flagged for the last window length.
insuffDataLong[3]["na_trading_days"].max()