In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import statsmodels.api as sm

In [None]:
#excel files are read and a ticker column is added to each sheet

repertoire = "data/stocks"
dataframes_dict = {}  #key "<file name>_<sheet name>" -> DataFrame of that sheet

#sorted() makes the load order (and so the row order after concatenation) reproducible
for fichier in sorted(os.listdir(repertoire)):
    #only Excel workbooks are parsed; any other file in the folder is ignored
    if not fichier.endswith((".xls", ".xlsx")):
        continue

    chemin_complet = os.path.join(repertoire, fichier)
    print(f"File found : {fichier}")

    try:
        #sheet_name=None loads every sheet of the workbook as {sheet name: DataFrame}
        worksheet = pd.read_excel(chemin_complet, sheet_name=None)

        for sheet_name, df in worksheet.items():
            if 'Ticker' not in df.columns:
                #the sheet name is used as the ticker
                df['Ticker'] = sheet_name
                print(f"Ticker added to {sheet_name}")
            else:
                print(f"Column 'Ticker' already existed in {fichier}")

            #every sheet is registered, including those that already carry a Ticker column
            dataframes_dict[f"{fichier}_{sheet_name}"] = df

    except Exception as e:
        #best effort: a workbook that cannot be processed is reported and the loop continues
        print(f"Error during the process {fichier} : {e}")

In [None]:
#variables are renamed and split by frequency
monthly_list = []
yearly_list = []
quarterly_list = []

daily_data_list = [] #we create multiple lists because the start dates differ
daily_tot_return_list = []
daily_askbid_list = []

gics_list = []

#positional layout of a sheet: column position -> canonical name
#positions that are not listed (11, 16, 19, 23, 28, 31, 35-41) keep their original label
column_layout = {
    0: "Date.1",
    1: "Total_Assets",
    2: "Common_Equity",
    3: "Cash_And_Investments",
    4: "R&D_Expenses",
    5: "Inventories",
    6: "Dividends_Paid",
    7: "Gross_Fixed_Assets",
    8: "Income_Before_Extra_Items",
    9: "Sales_Revenue",
    10: "Depreciation_Amortization",

    12: "Date.2",
    13: "Mkt_Cap_Yearly",
    14: "Shares_Outstanding_Yearly",
    15: "Long_Term_Debt",

    17: "Date.3",
    18: "Net_Income",

    20: "Date.4",
    21: "Shares_Outstanding_Monthly",
    22: "Mkt_Cap_Monthly",

    24: "Date.5",
    25: "Px_Last",
    26: "Shares_Outstanding_Daily",
    27: "Volume",

    29: "Date.6",
    30: "Total_Return",

    32: "Date.7",
    33: "Px_Ask",
    34: "Px_Bid",

    42: "Industry",
    43: "Sector",
}

#(collecting list, columns copied into the frame appended to that list)
#NOTE(review): the yearly frame is dated with "Date.1" only, although the market cap / shares / debt
#columns sit next to "Date.2" - confirm both date columns are aligned in the export
split_spec = [
    (yearly_list, [
        "Date.1", "Total_Assets", "Common_Equity", "Cash_And_Investments",
        "R&D_Expenses", "Inventories", "Dividends_Paid", "Gross_Fixed_Assets",
        "Income_Before_Extra_Items", "Sales_Revenue", "Depreciation_Amortization",
        "Mkt_Cap_Yearly", "Shares_Outstanding_Yearly", "Long_Term_Debt"
    ]),
    (quarterly_list, ["Date.3", "Net_Income"]),
    (monthly_list, ["Date.4", "Shares_Outstanding_Monthly", "Mkt_Cap_Monthly"]),
    (daily_data_list, ["Date.5", "Px_Last", "Shares_Outstanding_Daily", "Volume"]),
    (daily_tot_return_list, ["Date.6", "Total_Return"]),
    (daily_askbid_list, ["Date.7", "Px_Ask", "Px_Bid"]),
    (gics_list, ["Industry", "Sector"]),
]

for name, df in dataframes_dict.items():
    try:
        #the rename is inside the try so that a sheet with fewer columns than expected
        #is reported and skipped instead of aborting the whole cell
        rename_map = {df.columns[position]: new_name for position, new_name in column_layout.items()}
        df.rename(columns=rename_map, inplace=True)

        ticker = df['Ticker'].iloc[0]

        #all pieces are built first; nothing is appended if any of them fails
        pieces = []
        for _, wanted_columns in split_spec:
            piece = df[wanted_columns].copy()
            piece['Ticker'] = ticker
            pieces.append(piece)

    except Exception as e:
        print(f"{name} : error during split : {e}")
        continue

    for (target_list, _), piece in zip(split_spec, pieces):
        target_list.append(piece)

    print(f"{name} : successful yearly / monthly / quarterly / gics split")

#pd.concat fails with an opaque message on an empty list, so the situation is named explicitly
if not yearly_list:
    raise ValueError("No sheet could be split by frequency: check the workbooks and the messages above")

yearly_df = pd.concat(yearly_list, ignore_index=True)
quarterly_df = pd.concat(quarterly_list, ignore_index=True)
monthly_df = pd.concat(monthly_list, ignore_index=True)

daily_data_df = pd.concat(daily_data_list, ignore_index=True)
daily_tot_return_df = pd.concat(daily_tot_return_list, ignore_index=True)
daily_askbid_df = pd.concat(daily_askbid_list, ignore_index=True)

gics_df = pd.concat(gics_list, ignore_index=True)


In [None]:
#registry of the frequency-specific frames (gics_df is not part of it)
lists = {
    "yearly": yearly_df,
    "quarterly": quarterly_df,
    "monthly": monthly_df,
    "daily_tot_return": daily_tot_return_df,
    "daily_data": daily_data_df,
    "daily_askbid": daily_askbid_df,
}

for name, df in lists.items():
    #every column whose label starts with "Date" ("Date.1", "Date.3", ...) becomes "Date"
    df.rename(columns=lambda label: "Date" if label.startswith("Date") else label, inplace=True)

    #unparseable dates become NaT
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors='coerce')

    #every remaining column except "Ticker" is forced to float64; unparseable values become NaN
    value_columns = [label for label in df.columns if label not in ("Date", "Ticker")]
    for label in value_columns:
        df[label] = pd.to_numeric(df[label], errors='coerce').astype("float64")

    #column summary printed for verification
    n_rows, n_cols = df.shape
    print(f"--- {name.capitalize()} DataFrame ---")
    print("  - Colonnes et Types:")
    for label, dtype in df.dtypes.items():
        print(f"    - {label}: {dtype}")
    print(f"  - Nombre de lignes: {n_rows}")
    print(f"  - Nombre de colonnes: {n_cols}")
    print("-" * 50)

In [None]:
#NaN Cells
#We remove rows where the date column is NaN. When splitting the data by frequency, each row is assigned a ticker.
#However, because daily data have more rows than other frequencies, the ticker is excessively duplicated in the
#lower-frequency DataFrames (monthly, quarterly, yearly), leading to rows that are mostly empty.
#Dropping rows without a date removes these redundancies without any loss of actual data.

frames_by_name = {
    "yearly": yearly_df,
    "quarterly": quarterly_df,
    "monthly": monthly_df,
    "daily_data": daily_data_df,
    "daily_tot_return": daily_tot_return_df,
    "daily_askbid": daily_askbid_df,
}

for key, frame in frames_by_name.items():
    #.copy() gives each cleaned frame its own data, so the column assignments made in later
    #cells do not trigger SettingWithCopyWarning
    lists[key] = frame.dropna(subset=["Date"]).copy()

all_nan_matrices = {}  #frame name -> percentage of NaN per ticker and per column

for name, df in lists.items():
    if "Ticker" in df.columns:
        cols = [col for col in df.columns if col != "Ticker"]
        #mean of the boolean NaN mask within each ticker = share of missing values
        nan_matrix = (
            df[cols].isna()
              .groupby(df["Ticker"])
              .mean()
              .mul(100)
              .reset_index()
        )
        all_nan_matrices[name] = nan_matrix
        print(f"\n % Pourcentage de NaN pour {name} :\n", nan_matrix)

In [None]:
#NaN handling: each missing value is replaced by the mean of the same column for the same ticker
#(a ticker whose column is entirely NaN keeps its NaN)
#NOTE(review): the mean is taken over the ticker's whole history, so imputed values depend on
#later observations - confirm this is acceptable for the downstream use

for name, df in lists.items():
    cols_to_impute = [col for col in df.columns if col not in ["Date", "Ticker"]]
    #per-row frame holding the ticker-level mean of every column to impute
    ticker_means = df.groupby("Ticker")[cols_to_impute].transform("mean")
    df.loc[:, cols_to_impute] = df[cols_to_impute].fillna(ticker_means)

In [None]:
#the module-level names are pointed at the frames currently stored in `lists`
(
    yearly_df,
    quarterly_df,
    monthly_df,
    daily_data_df,
    daily_tot_return_df,
    daily_askbid_df,
) = (
    lists[key]
    for key in ("yearly", "quarterly", "monthly", "daily_data", "daily_tot_return", "daily_askbid")
)

In [None]:
#yearly computations

#mkt cap and shares outstanding were extracted with a BDH formula and are expressed in millions so we have to multiply by 10^6
#(Long_Term_Debt is rescaled the same way)
#NOTE(review): running this cell twice multiplies the columns twice
cols_to_scale = ["Mkt_Cap_Yearly", "Shares_Outstanding_Yearly", "Long_Term_Debt"]
yearly_df[cols_to_scale] = yearly_df[cols_to_scale].mul(1e6)

daily_data_df["Shares_Outstanding_Daily"] = daily_data_df["Shares_Outstanding_Daily"].mul(1e6)



In [None]:
#first five rows of the yearly frame, for a visual check
print(yearly_df.head(5))

In [None]:
#yearly computations

#previous-row values within each ticker, computed once and reused below
#NOTE(review): "previous row" is the previous year only if rows are in chronological order within each ticker
yearly_by_ticker = yearly_df.groupby("Ticker")
prev_total_assets = yearly_by_ticker["Total_Assets"].shift(1)
prev_inventories = yearly_by_ticker["Inventories"].shift(1)
prev_shares = yearly_by_ticker["Shares_Outstanding_Yearly"].shift(1)
prev_gross_fixed_assets = yearly_by_ticker["Gross_Fixed_Assets"].shift(1)

#asset growth (agr)
yearly_df["agr"] = (yearly_df["Total_Assets"] - prev_total_assets) / prev_total_assets

#cash productivity (cashpr)
enterprise_gap = yearly_df["Mkt_Cap_Yearly"] + yearly_df["Long_Term_Debt"] - yearly_df["Total_Assets"]
yearly_df["cashpr"] = enterprise_gap / yearly_df["Cash_And_Investments"]

#change in inventory (chinv), scaled by current total assets
yearly_df["chinv"] = (yearly_df["Inventories"] - prev_inventories) / yearly_df["Total_Assets"]

#change in shares outstanding (chsh)
yearly_df["chsh"] = (yearly_df["Shares_Outstanding_Yearly"] - prev_shares) / prev_shares

#depreciation / Gross Fixed Assets (depr)
yearly_df["depr"] = yearly_df["Depreciation_Amortization"] / yearly_df["Gross_Fixed_Assets"]

#the next ratios share the same denominator
market_cap = yearly_df["Mkt_Cap_Yearly"]

#dividends to Market Cap (dy)
yearly_df["dy"] = yearly_df["Dividends_Paid"] / market_cap

#earnings to Price (ep)
yearly_df["ep"] = yearly_df["Income_Before_Extra_Items"] / market_cap

#investment to assets (invest): change in gross fixed assets plus change in inventories, over lagged total assets
yearly_df["invest"] = (
    (yearly_df["Gross_Fixed_Assets"] - prev_gross_fixed_assets)
    + (yearly_df["Inventories"] - prev_inventories)
) / prev_total_assets

#R&D to Market Value of Equity (rd_mve)
yearly_df["rd_mve"] = yearly_df["R&D_Expenses"] / market_cap

#sales to Price (sp)
yearly_df["sp"] = yearly_df["Sales_Revenue"] / market_cap

In [None]:
#quarterly computations

#change in net income versus the previous row of the SAME ticker
#(grouping by ticker keeps the first quarter of a company from being compared with the last quarter of the previous one)
#NOTE(review): assumes rows are in chronological order within each ticker
quarterly_df["delta_income"] = quarterly_df.groupby("Ticker")["Net_Income"].diff()

#+1 for an increase, -1 for a decrease, 0 for no change, NaN when the change is undefined
quarterly_df["direction"] = np.sign(quarterly_df["delta_income"])

def compute_nincr(direction_series):
    """Return the signed length of the current run of identical non-zero directions.

    For each element, the result is the number of consecutive elements (ending at
    that element) that share its direction, multiplied by the direction itself.
    A zero yields 0 and resets the run length, but the last non-zero direction is
    remembered, so a run can restart at 1 after a zero.

    Parameters
    ----------
    direction_series : iterable of numbers
        Directions, one per period (the caller passes values produced by np.sign).

    Returns
    -------
    list
        One value per input element.
    """
    streaks = []
    run_length = 0
    last_nonzero = 0
    for step in direction_series:
        if step == 0:
            run_length = 0
            streaks.append(0)
            continue
        #same direction as the last non-zero step extends the run, anything else restarts it
        run_length = run_length + 1 if step == last_nonzero else 1
        streaks.append(run_length * step)
        last_nonzero = step
    return streaks

#signed streak of identical income directions, computed separately for each ticker
direction_by_ticker = quarterly_df.groupby("Ticker")["direction"]
quarterly_df["nincr"] = direction_by_ticker.transform(compute_nincr)

In [None]:
#monthly computations

#copy of the total return index taken BEFORE the dates are turned into monthly periods:
#it is needed afterwards to compute the weekly returns used for beta
tot_return_raw = daily_tot_return_df[["Date", "Ticker", "Total_Return"]]

monthly_data = {
    "daily_tot_return_df": daily_tot_return_df,
    "monthly_df": monthly_df,
    "daily_data_df": daily_data_df,
    "daily_askbid_df": daily_askbid_df,
}

#date to monthly period (frames whose Date is no longer a datetime are left untouched)
for name, df in monthly_data.items():
    if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
        print(f"{name} : Date already converted")
        continue
    df["Date"] = df["Date"].dt.to_period("M")
    print(f"{name} : Date converted to monthly period")

#size (mvel1): log market cap of the previous row of the same ticker
log_market_cap = np.log(monthly_df["Mkt_Cap_Monthly"])
monthly_df["mvel1"] = log_market_cap.groupby(monthly_df["Ticker"]).shift(1)


#dolvol computation
#.copy() so that adding a column does not write into a slice of daily_data_df (SettingWithCopyWarning)
dolvol_df = daily_data_df[["Date", "Ticker", "Volume", "Px_Last"]].copy()
dolvol_df["dolvol"] = dolvol_df["Volume"] * dolvol_df["Px_Last"]

#total dollar volume per ticker and per month ("Date" is a monthly period at this point)
dolvol_df = (
    dolvol_df.groupby(["Ticker", "Date"])["dolvol"]
    .sum()
    .reset_index(name="dolvol_sum")
)

#a month without any dollar volume has no defined log: it is stored as NaN rather than -inf
positive_dolvol = dolvol_df["dolvol_sum"].where(dolvol_df["dolvol_sum"] > 0)
dolvol_df["dolvol_monthly"] = np.log(positive_dolvol)

#value observed two rows earlier for the same ticker
#NOTE(review): this is a two-month lag only if no month is missing for the ticker
dolvol_df["dolvol_lag2"] = dolvol_df.groupby("Ticker")["dolvol_monthly"].shift(2)

#bid-ask spread
quoted_spread = daily_askbid_df["Px_Ask"] - daily_askbid_df["Px_Bid"]
mid_price = (daily_askbid_df["Px_Ask"] + daily_askbid_df["Px_Bid"]) / 2
ticker_month = [daily_askbid_df["Ticker"], daily_askbid_df["Date"]]

#monthly mean of the daily quoted spread; the spread is built BEFORE grouping
#(without the intermediate variable, .groupby binds to Px_Bid alone because of operator precedence)
daily_askbid_df["mean bid-ask spread"] = quoted_spread.groupby(ticker_month).transform("mean")

#daily mid price (average of ask and bid); the column name is kept for compatibility
daily_askbid_df["mean daily spread"] = mid_price

#baspread: monthly mean of the daily spread relative to the mid price
#(a spread minus a price level is not a meaningful quantity, so the ratio is used)
#NOTE(review): confirm this matches the definition of baspread you want to replicate
relative_spread = quoted_spread / mid_price.where(mid_price != 0)
daily_askbid_df["baspread"] = relative_spread.groupby(ticker_month).transform("mean")

#momentums and maxret computations

#daily return computation: relative change of the total return index versus the previous row of the same ticker
previous_index_level = daily_tot_return_df.groupby("Ticker")["Total_Return"].shift(1)
daily_tot_return_df["daily return"] = (
    daily_tot_return_df["Total_Return"] - previous_index_level
) / previous_index_level

#daily returns grouped by ticker and month, shared by maxret and retvol
returns_by_ticker_month = daily_tot_return_df.groupby(["Ticker", "Date"])["daily return"]

#max return (maxret): largest daily return of the month, lagged by one row within each ticker
monthly_max_return = returns_by_ticker_month.max()
maxret_df = (
    monthly_max_return.groupby(level=0).shift(1)
    .reset_index()
    .rename(columns={"daily return": "maxret"})
)

#return volatility (retvol): standard deviation of the month's daily returns, lagged the same way
monthly_return_std = returns_by_ticker_month.std()
retvol_df = (
    monthly_return_std.groupby(level=0).shift(1)
    .reset_index()
    .rename(columns={"daily return": "retvol"})
)

#share turnover (turn)
#mean daily volume of each ticker for each month ("Date" is a monthly period here);
#grouping by ticker alone would give one constant per ticker and make the rolling mean below meaningless
daily_data_df["Mean_Volume"] = (
    daily_data_df.groupby(["Ticker", "Date"])["Volume"].transform("mean")
)

#one row per ticker and month
turn_df = (
    daily_data_df.groupby(["Ticker", "Date"])["Mean_Volume"].first().reset_index()
)

turn_df = turn_df.merge(
    monthly_df[["Ticker", "Date", "Shares_Outstanding_Monthly"]],
    how="left",
    on=["Ticker", "Date"]
)

#3-row rolling mean of the monthly volume within each ticker, scaled by shares outstanding;
#the grouping key comes from turn_df itself so it is aligned with the rows being averaged
#NOTE(review): Shares_Outstanding_Monthly is not rescaled by 1e6 in the visible cells, unlike the
#yearly and daily share counts - confirm the units are consistent with Volume
turn_df["turn"] = (
    turn_df.groupby("Ticker")["Mean_Volume"]
    .transform(lambda x: x.rolling(window=3, min_periods=3).mean())
    / turn_df["Shares_Outstanding_Monthly"]
)

#stdturn: standard deviation, per ticker and month, of the daily share turnover
share_turnover_df = daily_data_df[["Ticker", "Date", "Volume", "Shares_Outstanding_Daily"]].copy()

#daily turnover = volume over shares outstanding of the same day
daily_volume = share_turnover_df["Volume"]
daily_shares = share_turnover_df["Shares_Outstanding_Daily"]
share_turnover_df["daily_share_turnover"] = daily_volume / daily_shares

turnover_by_ticker_month = share_turnover_df.groupby(["Ticker", "Date"])["daily_share_turnover"]
stdturn_df = turnover_by_ticker_month.std().reset_index()
stdturn_df = stdturn_df.rename(columns={"daily_share_turnover": "stdturn"})

#momentums computations

def _compound(returns):
    """Compound a series of simple returns into one return."""
    return (1 + returns).prod() - 1


def _lagged_compound(monthly_returns, lag, window):
    """Compound `window` consecutive monthly returns ending `lag` rows before each row.

    The result is NaN until `window` lagged observations are available.
    """
    gross = 1 + monthly_returns.shift(lag)
    return gross.rolling(window=window, min_periods=window).apply(np.prod, raw=True) - 1


#monthly compounded return computation
monthly_return_df = (
    daily_tot_return_df.groupby(["Ticker", "Date"])["daily return"]
    .apply(_compound)
    .reset_index()
    .rename(columns={"daily return": "monthly return"})
)

returns_by_ticker = monthly_return_df.groupby("Ticker")["monthly return"]

#36-month momentum (mom36m): 24 rows ending 13 rows back
monthly_return_df["mom36m"] = returns_by_ticker.transform(lambda x: _lagged_compound(x, 13, 24))

#12-month momentum (mom12m): 11 rows ending 2 rows back
monthly_return_df["mom12m"] = returns_by_ticker.transform(lambda x: _lagged_compound(x, 2, 11))

#6-month momentum (mom6m): 5 rows ending 2 rows back
monthly_return_df["mom6m"] = returns_by_ticker.transform(lambda x: _lagged_compound(x, 2, 5))

#1-month momentum (mom1m): previous row's return
monthly_return_df["mom1m"] = returns_by_ticker.shift(1)

#chmom: compounded return of the 6 most recent lagged rows minus that of the 6 rows before them
recent_six = returns_by_ticker.transform(lambda x: _lagged_compound(x, 1, 6))
previous_six = returns_by_ticker.transform(lambda x: _lagged_compound(x, 7, 6))
monthly_return_df["chmom"] = recent_six - previous_six

#illiquidity
ill_df = daily_data_df[["Date", "Ticker", "Volume", "Px_Last"]].copy()

ill_df["dolvol"] = ill_df["Volume"] * ill_df["Px_Last"]

#daily price return versus the previous row of the same ticker
previous_close = ill_df.groupby("Ticker")["Px_Last"].shift(1)
ill_df["daily return"] = (ill_df["Px_Last"] - previous_close) / previous_close

ill_df["abs return"] = ill_df["daily return"].abs()

#absolute return per unit of dollar volume; a day with zero dollar volume is left as NaN
#so that a single infinite value does not turn the whole monthly mean into inf
ill_df["ill daily"] = ill_df["abs return"] / ill_df["dolvol"].where(ill_df["dolvol"] != 0)

#isinstance on the dtype replaces pd.api.types.is_period_dtype, which is deprecated
if not isinstance(ill_df["Date"].dtype, pd.PeriodDtype):
    ill_df["Date"] = ill_df["Date"].dt.to_period("M")
    print("Date column converted to monthly period")
else:
    print("Date column is already a period object - no conversion needed")

#monthly mean of the daily measure, per ticker
illiquidity_monthly = (
    ill_df.groupby(["Ticker", "Date"])["ill daily"]
    .mean()
    .reset_index()
    .rename(columns={"ill daily": "illiq"})
)

print(illiquidity_monthly.head())

In [None]:
#indmom
#one row per ticker (the first one encountered) carries the classification
#NOTE(review): if the first row of a ticker has no Industry value the ticker ends up unclassified - confirm
gics_df = gics_df.drop_duplicates(subset=["Ticker"])

#.copy() so that adding columns does not write into a slice of monthly_return_df
df_indmom = monthly_return_df[["Ticker", "Date", "monthly return"]].copy()
df_indmom = df_indmom.merge(gics_df[['Ticker', 'Industry']], on='Ticker', how='left')
df_indmom = df_indmom.sort_values(['Ticker', 'Date'])

#compounded return of the 12 previous rows of each ticker
df_indmom['ret_12m'] = (
    df_indmom.groupby('Ticker')['monthly return']
    .transform(lambda x: (1 + x.shift(1)).rolling(window=12, min_periods=12).apply(np.prod, raw=True) - 1)
)

#equal-weighted mean of that return within each industry, for each month
indret_df = (
    df_indmom.dropna(subset=['ret_12m', 'Industry'])
    .groupby(['Date', 'Industry'])['ret_12m']
    .mean()
    .reset_index()
    .rename(columns={'ret_12m': 'indret_12m'})
)

#industry momentum is kept per (Date, Industry): averaging it again across industries and
#merging on Date alone would give every stock the same value in a given month
indmom_df = indret_df.rename(columns={'indret_12m': 'indmom'})

#each stock receives the momentum of its own industry
df_indmom = df_indmom.merge(indmom_df, on=['Date', 'Industry'], how='left')

In [None]:
#features to attach to monthly_df: source frame and name of the column to bring over
merge_tasks = (
    [
        {"df": dolvol_df, "col": "dolvol_lag2"},
        {"df": maxret_df, "col": "maxret"},
        {"df": retvol_df, "col": "retvol"},
    ]
    + [
        {"df": monthly_return_df, "col": momentum_col}
        for momentum_col in ("mom36m", "mom12m", "mom6m", "mom1m", "chmom")
    ]
    + [
        {"df": turn_df, "col": "turn"},
        {"df": df_indmom, "col": "indmom"},
        {"df": illiquidity_monthly, "col": "illiq"},
        {"df": stdturn_df, "col": "stdturn"},
    ]
)

for task in merge_tasks:
    df_to_merge, col = task["df"], task["col"]

    #a column that is already present is not merged a second time
    if col in monthly_df.columns:
        print(f"{col} already in monthly_df — skipping.")
        continue

    print(f"{col} merged into monthly_df")
    monthly_df = monthly_df.merge(
        df_to_merge[["Ticker", "Date", col]],
        how="left",
        on=["Ticker", "Date"]
    )


In [None]:
#monthly computations → beta

#we need the weekly returns to compute beta
#the date-indexed frame gets its own name: tot_return_raw keeps its "Date" column,
#so this cell can be run again without failing on a missing column
tot_return_indexed = tot_return_raw.set_index("Date")

#last total return index value of each week, per ticker
weekly_prices = tot_return_indexed.groupby("Ticker")["Total_Return"].resample("W").last().reset_index()

#weekly returns computation: relative change versus the previous week of the same ticker
previous_week_level = weekly_prices.groupby("Ticker")["Total_Return"].shift(1)
weekly_prices["weekly_return"] = (
    weekly_prices["Total_Return"] - previous_week_level
) / previous_week_level

#wide layout: one row per week, one column per ticker
pivot = weekly_prices.pivot(index="Date", columns="Ticker", values="weekly_return")

#market proxy: equal-weighted mean of the available ticker returns of the week
pivot["Market"] = pivot.mean(axis=1, skipna=True)

In [None]:
#rolling market beta from weekly returns
#tqdm (progress bar, the computation is long) and statsmodels are imported in the first cell

MIN_BETA_WEEKS = 156  #156 weeks = 3 years

#the month-end grid starts 3 years after the first date; freq="ME" since we move month by month
#it does not depend on the ticker, so it is built once
beta_months = pd.date_range(start=pivot.index.min() + pd.DateOffset(years=3),
                            end=pivot.index.max(), freq="ME")

results = []

for ticker in tqdm(pivot.columns.drop("Market")):
    #weeks where both the stock and the market return are available
    stock_and_market = pivot[[ticker, "Market"]].dropna()

    for current_month in beta_months:
        #the estimation window ends one month before the month being labelled and spans 3 years
        end_date = current_month - pd.DateOffset(months=1)
        start_date = end_date - pd.DateOffset(years=3)

        #a boolean mask works whether or not the boundary dates exist in the index
        in_window = (stock_and_market.index >= start_date) & (stock_and_market.index <= end_date)
        window = stock_and_market.loc[in_window]

        #the length check cannot be removed: it skips windows with too few (or no) observations
        if len(window) < MIN_BETA_WEEKS:
            continue

        X = sm.add_constant(window["Market"])  #explanatory variable + constant (alpha)
        y = window[ticker]  #dependent variable

        model = sm.OLS(y, X).fit()  #linear regression with statsmodels

        results.append({
            "Ticker": ticker,
            "Month": current_month.to_period("M"),
            "Beta": model.params["Market"],  #slope = beta
            "Alpha": model.params["const"],  #intercept = alpha
            "R2": model.rsquared,  #model explanatory power
            "p_value": model.pvalues["Market"]  #p-value of beta
        })

# final DataFrame (the columns are named explicitly so they exist even when no window qualified)
beta_df = pd.DataFrame(results, columns=["Ticker", "Month", "Beta", "Alpha", "R2", "p_value"])

In [None]:
import matplotlib.pyplot as plt

#beta sanity check: are the estimates coherent?
#mean and standard deviation of beta, alpha, R² and the p-value
beta_cols = ["Beta", "Alpha", "R2", "p_value"]
beta_estimates = beta_df[beta_cols]
beta_means = beta_estimates.mean()
beta_std = beta_estimates.std()

print("Standard deviation is:", beta_std)
print("Means is:", beta_means)

#beta histogram
beta_df["Beta"].hist(bins=30)
plt.title("Distribution des beta")
plt.show()

In [None]:
#beta squared
beta_df["Beta_squared"] = beta_df["Beta"].pow(2)

#columns meant to be merged into the monthly frame
beta_cols = beta_df.loc[:, ["Ticker", "Month", "Beta", "Beta_squared"]]


In [None]:
#idiovol computation

MIN_IDIOVOL_WEEKS = 52  #at least one year of data

#market proxy rebuilt from the ticker columns only (an existing "Market" column is left out of its own mean)
pivot["Market"] = pivot.drop(columns="Market", errors="ignore").mean(axis=1, skipna=True)

#month-end grid starting 3 years after the first date; built once, it does not depend on the ticker
idiovol_months = pd.date_range(start=pivot.index.min() + pd.DateOffset(years=3),
                               end=pivot.index.max(), freq="ME")

idio_vol_results = []

for ticker in tqdm(pivot.columns.drop("Market")):
    #weeks where both the stock and the market return are available
    stock_and_market = pivot[[ticker, "Market"]].dropna()

    for current_month in idiovol_months:
        #3-year window ending at the labelled month end
        #NOTE(review): unlike the beta window, this one includes the labelled month itself - confirm this is intended
        end_date = current_month
        start_date = end_date - pd.DateOffset(years=3)

        #a boolean mask works whether or not the boundary dates exist in the index
        in_window = (stock_and_market.index >= start_date) & (stock_and_market.index <= end_date)
        window = stock_and_market.loc[in_window]

        if len(window) < MIN_IDIOVOL_WEEKS:
            continue

        X = sm.add_constant(window["Market"])
        y = window[ticker]
        model = sm.OLS(y, X).fit()

        #idiosyncratic volatility = standard deviation of the regression residuals
        idio_vol_results.append({
            "Ticker": ticker,
            "Month": current_month.to_period("M"),
            "Idiovol": np.std(model.resid)
        })

#the columns are named explicitly so they exist even when no window qualified
idio_vol_df = pd.DataFrame(idio_vol_results, columns=["Ticker", "Month", "Idiovol"])

In [None]:
# merge Beta and Beta_squared
# "Month" is renamed to "Date" before merging: joining with right_on="Month" would leave a
# redundant "Month" column in monthly_df, and a second such merge would create Month_x / Month_y
if "Beta" not in monthly_df.columns:
    beta_to_merge = (
        beta_df[["Ticker", "Month", "Beta", "Beta_squared"]]
        .rename(columns={"Month": "Date"})
    )
    monthly_df = monthly_df.merge(beta_to_merge, how="left", on=["Ticker", "Date"])
    print("Beta and Beta_squared merged.")
else:
    print("Beta already in monthly_df — skipping.")

# merge Idiovol
if "Idiovol" not in monthly_df.columns:
    idiovol_to_merge = idio_vol_df.rename(columns={"Month": "Date"})
    monthly_df = monthly_df.merge(idiovol_to_merge, how="left", on=["Ticker", "Date"])
    print("Idiovol merged.")
else:
    print("Idiovol already in monthly_df — skipping")

In [None]:
#yearly : cross-sectional ranking and normalization

#first we need to standardize the date (because we exported the fiscal year, it's not the same for each stock)
yearly_df["Year"] = pd.to_datetime(yearly_df["Date"]).dt.year

# Filter to keep only years >= 1990 (.copy() so the filtered frame owns its data)
yearly_df = yearly_df[yearly_df["Year"] >= 1990].copy()

covariates = ["agr", "cashpr", "chinv", "depr", "dy", "ep", "invest", "rd_mve", "sp"]

#Dictionary to store the rankings: rankings[covariate][year] -> frame with Ticker, covariate, rank, score
rankings = {}

#per-row normalized score of each covariate, aligned on yearly_df's index
normalized_scores = {}

#the ranking of the first year present in the data is printed for a visual check
first_year = yearly_df["Year"].iloc[0] if len(yearly_df) else None

by_year = yearly_df.groupby("Year")

for covariate in covariates:
    print(f"\n=== RANKING PAR {covariate.upper()} ===")

    #ascending rank within the year; ties are broken by order of appearance so ranks stay distinct.
    #missing values get no rank and are not counted in n, so they no longer occupy the top ranks
    ranks = by_year[covariate].rank(method="first")
    n_ranked = by_year[covariate].transform("count")

    #linear map of rank 1..n onto [-1, 1]; a year with a single ranked value scores 0
    scores = (
        (2 * (ranks - 1) / (n_ranked - 1) - 1)
        .where(n_ranked > 1, 0.0)
        .where(ranks.notna())
    )
    normalized_scores[covariate] = scores

    #the ranking is stored in a dictionary, one frame per year sorted by rank
    ranked = yearly_df[["Year", "Ticker", covariate]].assign(rank=ranks, score=scores)
    rankings[covariate] = {
        year: group.drop(columns="Year").sort_values("rank").reset_index(drop=True)
        for year, group in ranked.groupby("Year")
    }

    if first_year is not None:
        print(f"\nDate {first_year}:")
        print(rankings[covariate][first_year].head())

# Create yearly_covariates with original variables first (complete cases only)
cols = ["Year", "Ticker"] + covariates
yearly_covariates = yearly_df[cols].dropna().copy()

# Add normalized scores to yearly_covariates; the assignment aligns on the row index,
# which replaces the row-by-row lookup in the rankings dictionary
for covariate in covariates:
    yearly_covariates[f"{covariate}_normalized"] = normalized_scores[covariate]

In [1]:
pip install git-filter-repo


Collecting git-filter-repo
  Downloading git_filter_repo-2.47.0-py3-none-any.whl.metadata (31 kB)
Downloading git_filter_repo-2.47.0-py3-none-any.whl (76 kB)
Installing collected packages: git-filter-repo
Successfully installed git-filter-repo-2.47.0
Note: you may need to restart the kernel to use updated packages.


