In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import statsmodels.api as sm

In [None]:
# Every Excel workbook found in `repertoire` is read and each sheet is tagged
# with a "Ticker" column (the sheet name) before being stored in
# `dataframes_dict` under the key "<file name>_<sheet name>".

repertoire = "data"
dataframes_dict = {}

# sorted() makes the load order independent of the file system's listing order
for fichier in sorted(os.listdir(repertoire)):
    # Only Excel workbooks are parsed; "~$..." files are the lock files Excel
    # creates next to a workbook that is currently open.
    if fichier.startswith("~$") or not fichier.lower().endswith((".xls", ".xlsx")):
        continue

    chemin_complet = os.path.join(repertoire, fichier)
    print(f"File found : {fichier}")

    try:
        # sheet_name=None loads all sheets as {sheet name: DataFrame}
        worksheet = pd.read_excel(chemin_complet, sheet_name=None)

        for sheet_name, df in worksheet.items():
            if 'Ticker' not in df.columns:
                df['Ticker'] = sheet_name
                print(f"Ticker added to {sheet_name}")
            else:
                print(f"Column 'Ticker' already existed in {fichier}")

            # The sheet is kept in both cases: a pre-existing Ticker column
            # must not cause the sheet to be dropped from the analysis.
            dataframes_dict[f"{fichier}_{sheet_name}"] = df

    except Exception as e:
        # Best effort: an unreadable workbook is reported and skipped
        print(f"Error during the process {fichier} : {e}")

In [None]:
#variables are renamed and split by frequency
monthly_list = []
yearly_list = []
quarterly_list = []
daily_tot_return_list = []
daily_vol_list = [] #two daily lists are created because the start dates differ

# Column position in a sheet -> canonical name. Positions 11, 15, 18, 24 and 27
# are deliberately left untouched (presumably blank separator columns between
# the frequency blocks -- TODO confirm against the workbook layout).
COLUMN_NAMES_BY_POSITION = {
    0: "Date.1",
    1: "Total_Assets",
    2: "Common_Equity",
    3: "Cash_And_Investments",
    4: "R&D_Expenses",
    5: "Inventories",
    6: "Dividends_Paid",
    7: "Gross_Fixed_Assets",
    8: "Income_Before_Extra_Items",
    9: "Sales_Revenue",
    10: "Depreciation_Amortization",
    12: "Date.2",
    13: "Mkt_Cap_Yearly",
    14: "Shares_Outstanding_Yearly",
    16: "Date.3",
    17: "Net_Income",
    19: "Date.4",
    20: "Px_Bid",
    21: "Px_Ask",
    22: "Shares_Outstanding_Monthly",
    23: "Mkt_Cap_Monthly",
    25: "Date.5",
    26: "Px_Last",
    28: "Date.6",
    29: "Volume",
}

# NOTE(review): Mkt_Cap_Yearly / Shares_Outstanding_Yearly are paired with
# "Date.1" and their own "Date.2" column is discarded -- this assumes both
# yearly blocks are aligned row by row; confirm.
YEARLY_COLUMNS = [
    "Date.1", "Total_Assets", "Common_Equity", "Cash_And_Investments",
    "R&D_Expenses", "Inventories", "Dividends_Paid", "Gross_Fixed_Assets",
    "Income_Before_Extra_Items", "Sales_Revenue", "Depreciation_Amortization",
    "Mkt_Cap_Yearly", "Shares_Outstanding_Yearly",
]
QUARTERLY_COLUMNS = ["Date.3", "Net_Income"]
MONTHLY_COLUMNS = ["Date.4", "Px_Bid", "Px_Ask", "Shares_Outstanding_Monthly", "Mkt_Cap_Monthly"]
DAILY_TOT_RETURN_COLUMNS = ["Date.5", "Px_Last"]
DAILY_VOL_COLUMNS = ["Date.6", "Volume"]

for name, df in dataframes_dict.items():
    try:
        # The positional rename is inside the try so that a sheet with fewer
        # columns than expected is reported and skipped instead of aborting
        # the whole loop with an IndexError.
        rename_map = {
            df.columns[position]: new_name
            for position, new_name in COLUMN_NAMES_BY_POSITION.items()
        }
        df.rename(columns=rename_map, inplace=True)

        ticker = df['Ticker'].iloc[0]

        df_yearly = df[YEARLY_COLUMNS].copy()
        df_quarterly = df[QUARTERLY_COLUMNS].copy()
        df_monthly = df[MONTHLY_COLUMNS].copy()
        df_daily_tot_return = df[DAILY_TOT_RETURN_COLUMNS].copy()
        df_daily_vol = df[DAILY_VOL_COLUMNS].copy()

        for part in (df_yearly, df_monthly, df_quarterly, df_daily_tot_return, df_daily_vol):
            part['Ticker'] = ticker

    except (IndexError, KeyError) as e:
        # IndexError: too few columns / empty sheet; KeyError: missing column
        print(f"{name} : error during split : {e}")
        continue

    # Appended only once all five pieces were built, so the lists stay in sync
    yearly_list.append(df_yearly)
    monthly_list.append(df_monthly)
    quarterly_list.append(df_quarterly)
    daily_tot_return_list.append(df_daily_tot_return)
    daily_vol_list.append(df_daily_vol)

    print(f"{name} : successful yearly / monthly / quarterly split")

if not yearly_list:
    # pd.concat([]) would fail with a much less explicit message
    raise ValueError("No sheet could be split by frequency; check the input workbooks.")

yearly_df = pd.concat(yearly_list, ignore_index=True)
monthly_df = pd.concat(monthly_list, ignore_index=True)
quarterly_df = pd.concat(quarterly_list, ignore_index=True)
daily_tot_return_df = pd.concat(daily_tot_return_list, ignore_index=True)
daily_vol_df = pd.concat(daily_vol_list, ignore_index=True)

In [None]:
# Sanity check: print the Python type of each concatenated frequency table.
for label, frame in (
    ("yearly_df", yearly_df),
    ("monthly_df", monthly_df),
    ("daily_tot_return_df", daily_tot_return_df),
    ("daily_vol_df", daily_vol_df),
    ("quarterly_df", quarterly_df),
):
    print(f"Type de {label} : {type(frame)}")

In [None]:
# Registry of the five frequency tables. The values are the very same objects
# as yearly_df, monthly_df, ..., so the in-place changes below are visible
# through both names.
lists = dict(
    yearly=yearly_df,
    quarterly=quarterly_df,
    monthly=monthly_df,
    daily_tot_return=daily_tot_return_df,
    daily_vol=daily_vol_df,
)

for name, df in lists.items():
    # every column whose name starts with "Date" ("Date.1", "Date.3", ...) becomes "Date"
    date_like = [col for col in df.columns if col.startswith("Date")]
    df.rename(columns=dict.fromkeys(date_like, "Date"), inplace=True)

    # "Date" is parsed as datetime; unparseable entries become NaT
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # every remaining column except "Ticker" is coerced to float64
    value_cols = [col for col in df.columns if col not in ("Date", "Ticker")]
    for col in value_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

    # column overview for visual verification
    print(f"--- {name.capitalize()} DataFrame ---")
    print("  - Colonnes et Types:")
    for col, dtype in df.dtypes.items():
        print(f"    - {col}: {dtype}")
    n_rows, n_cols = df.shape
    print(f"  - Nombre de lignes: {n_rows}")
    print(f"  - Nombre de colonnes: {n_cols}")
    print("-" * 50)

In [None]:
#NaN Cells
# Rows whose date is NaN are removed. When the data are split by frequency,
# every row of a sheet is assigned a ticker. Because the daily blocks have more
# rows than the lower-frequency blocks, the monthly, quarterly and yearly tables
# end up with many rows that carry only a ticker. Dropping the rows without a
# date removes that padding without losing any actual observation.

lists["yearly"] = yearly_df.dropna(subset=["Date"])
lists["quarterly"] = quarterly_df.dropna(subset=["Date"])
lists["monthly"] = monthly_df.dropna(subset=["Date"])
lists["daily_vol"] = daily_vol_df.dropna(subset=["Date"])

# The two daily blocks may start on different dates, so the total-return table
# can be padded as well. Its global name is rebound here so that
# `daily_tot_return_df` and `lists["daily_tot_return"]` remain the same object
# for the cells that follow.
daily_tot_return_df = daily_tot_return_df.dropna(subset=["Date"])
lists["daily_tot_return"] = daily_tot_return_df

all_nan_matrices = {}

for name, df in lists.items():

    if "Ticker" in df.columns:
        # percentage of NaNs per column for each ticker (vectorised equivalent
        # of groupby("Ticker").apply(lambda g: g.isna().mean() * 100))
        nan_matrix = df.drop(columns="Ticker").isna().groupby(df["Ticker"]).mean() * 100
        all_nan_matrices[name] = nan_matrix
        print(f"\n % Pourcentage de NaN pour {name} :\n", nan_matrix)

In [None]:
#NaN handling using SimpleImputer (imported in the first cell)

#create an imputer to replace NaNs with the column mean
# NOTE(review): the mean is computed over all tickers pooled together, not per
# ticker -- confirm this is intended for level variables such as prices.
imputer = SimpleImputer(strategy="mean")

for name, df in lists.items():
    candidate_cols = [col for col in df.columns if col not in ["Date", "Ticker"]]

    # SimpleImputer discards columns that contain no observed value, which
    # would make the returned array narrower than the target slice and break
    # the assignment. Such columns are left as NaN instead.
    cols_to_impute = [col for col in candidate_cols if df[col].notna().any()]
    if not cols_to_impute:
        continue

    df.loc[:, cols_to_impute] = imputer.fit_transform(df[cols_to_impute])

In [None]:
# Point the global names at the cleaned tables stored in `lists`.
yearly_df, quarterly_df, monthly_df, daily_vol_df = (
    lists[key] for key in ("yearly", "quarterly", "monthly", "daily_vol")
)

In [None]:
#yearly computations

# Market cap and shares outstanding were extracted with a BDH formula and are
# expressed in millions, hence the conversion to units.
# Careful: running this cell twice applies the factor twice.
for scaled_col in ("Mkt_Cap_Yearly", "Shares_Outstanding_Yearly"):
    yearly_df[scaled_col] = yearly_df[scaled_col] * 1e6

In [None]:
#yearly computations

# Previous-year values per ticker. The lag is taken on a copy sorted by
# (Ticker, Date) and realigned to yearly_df's own index, so the result does not
# depend on the row order of yearly_df (which is never sorted explicitly) and
# yearly_df itself keeps its current row order.
_lagged_cols = ["Total_Assets", "Inventories", "Gross_Fixed_Assets"]
_prev_year = (
    yearly_df.sort_values(["Ticker", "Date"])
    .groupby("Ticker")[_lagged_cols]
    .shift(1)
    .reindex(yearly_df.index)
)
_delta_inventories = yearly_df["Inventories"] - _prev_year["Inventories"]
_delta_fixed_assets = yearly_df["Gross_Fixed_Assets"] - _prev_year["Gross_Fixed_Assets"]

# NOTE(review): Mkt_Cap_Yearly is rescaled by 1e6 in the previous cell while the
# balance-sheet items are used as extracted -- confirm that both sides of the
# ratios below are expressed in the same unit.

#asset growth (agr): relative change in total assets
yearly_df["agr"] = (
    yearly_df["Total_Assets"] - _prev_year["Total_Assets"]
) / _prev_year["Total_Assets"]

#cash productivity (cashpr)
yearly_df["cashpr"] = (
    yearly_df["Mkt_Cap_Yearly"] - yearly_df["Common_Equity"]
) / yearly_df["Cash_And_Investments"]

#change in inventory (chinv), scaled by current total assets
yearly_df["chinv"] = _delta_inventories / yearly_df["Total_Assets"]

#depreciation / Gross Fixed Assets (depr)
yearly_df["depr"] = yearly_df["Depreciation_Amortization"] / yearly_df["Gross_Fixed_Assets"]

#dividends to Market Cap (dy)
yearly_df["dy"] = yearly_df["Dividends_Paid"] / yearly_df["Mkt_Cap_Yearly"]

#earnings to Price (ep)
yearly_df["ep"] = yearly_df["Income_Before_Extra_Items"] / yearly_df["Mkt_Cap_Yearly"]

#investment to assets (invest): change in gross fixed assets plus change in
#inventories, scaled by previous-year total assets
yearly_df["invest"] = (_delta_fixed_assets + _delta_inventories) / _prev_year["Total_Assets"]

#R&D to Market Value of Equity (rd_mve)
yearly_df["rd_mve"] = yearly_df["R&D_Expenses"] / yearly_df["Mkt_Cap_Yearly"]

#sales to Price (sp)
yearly_df["sp"] = yearly_df["Sales_Revenue"] / yearly_df["Mkt_Cap_Yearly"]

In [None]:
# Write the yearly table (inputs and derived ratios) to "yearly_df.xlsx" in the
# current working directory; an existing file of that name is overwritten.
yearly_df.to_excel("yearly_df.xlsx")

In [None]:
#quarterly computations

# Quarter-over-quarter change in net income, computed within each ticker so
# that the first quarter of a ticker is NaN instead of being differenced
# against the last quarter of the ticker stored just above it.
# Assumes rows are in chronological order within each ticker.
quarterly_df["delta_income"] = quarterly_df.groupby("Ticker")["Net_Income"].diff()

# +1 for an increase, -1 for a decrease, 0 for no change, NaN when undefined
quarterly_df["direction"] = np.sign(quarterly_df["delta_income"])


def compute_nincr(direction_series):
    """Return the signed length of the current run of same-sign changes.

    For every element of `direction_series` (expected values: +1, -1, 0) the
    output holds the number of consecutive elements sharing that sign,
    multiplied by the sign itself (e.g. 1, 2, 3 then -1, -2). A zero yields 0
    and restarts the count; the sign seen before the zero is still remembered,
    so the next non-zero element starts again at length 1.

    Parameters
    ----------
    direction_series : iterable of numbers

    Returns
    -------
    list
        One value per input element.
    """
    streaks = []
    run_length = 0
    last_sign = 0
    for sign in direction_series:
        if sign == 0:
            # flat quarter: the run is broken, last_sign is kept as is
            run_length = 0
            streaks.append(0)
            continue
        # extend the run when the sign repeats, otherwise start a new one
        run_length = run_length + 1 if sign == last_sign else 1
        streaks.append(run_length * sign)
        last_sign = sign
    return streaks

# Signed streak of earnings increases / decreases, evaluated ticker by ticker
_direction_by_ticker = quarterly_df.groupby("Ticker")["direction"]
quarterly_df["nincr"] = _direction_by_ticker.transform(compute_nincr)

print(quarterly_df.head())

In [None]:
#monthly computations

#px_last and volume are needed to compute the monthly covariates


def _attach_last_daily_value(monthly, daily, value_col):
    """Add `value_col` from `daily` to `monthly`, matched per ticker as of each date.

    For every monthly row the most recent daily observation of the same ticker
    dated on or before the monthly date is used. An exact-date merge followed by
    a forward fill would instead copy the value matched for an earlier *month*
    whenever the monthly date is not a trading day.

    Parameters
    ----------
    monthly : DataFrame with "Date" (datetime, no NaT) and "Ticker" columns.
    daily : DataFrame with "Date" (column or index), "Ticker" and `value_col`.
    value_col : name of the daily column to attach.

    Returns
    -------
    DataFrame
        `monthly` plus `value_col`, ordered by Date; rows with no earlier daily
        observation get NaN.
    """
    if "Date" not in daily.columns:
        # "Date" may have been moved to the index by a later cell on a re-run
        daily = daily.reset_index()

    right = (
        daily[["Date", "Ticker", value_col]]
        .dropna(subset=["Date", "Ticker", value_col])
        .sort_values("Date")  # merge_asof requires both sides sorted on the key
    )
    # dropping a previous copy of the column keeps the cell re-runnable
    left = monthly.drop(columns=[value_col], errors="ignore").sort_values("Date")

    return pd.merge_asof(left, right, on="Date", by="Ticker", direction="backward")


#merge for Px_Last
monthly_df = _attach_last_daily_value(monthly_df, daily_tot_return_df, "Px_Last")

#merge for Volume
monthly_df = _attach_last_daily_value(monthly_df, daily_vol_df, "Volume")

#the per-ticker lags computed in the next cell rely on this ordering
monthly_df = monthly_df.sort_values(["Ticker", "Date"])

In [None]:
#monthly computations

#"Date" becomes "Month"; it is converted to a monthly period only if it is still
#a datetime, so re-running the cell leaves an already converted column untouched
monthly_df = monthly_df.rename(columns={"Date": "Month"})

if pd.api.types.is_datetime64_any_dtype(monthly_df["Month"]):
    monthly_df["Month"] = monthly_df["Month"].dt.to_period("M")

#bid-ask spread (baspread): spread divided by the bid/ask midpoint
monthly_df["baspread"] = (
    monthly_df["Px_Ask"] - monthly_df["Px_Bid"]
) / ((monthly_df["Px_Ask"] + monthly_df["Px_Bid"]) / 2)

#dollar trading volume (dolvol): log of volume times price, two rows earlier
#within the same ticker
monthly_df["dolvol"] = np.log(
    monthly_df.groupby("Ticker")["Volume"].shift(2)
    * monthly_df.groupby("Ticker")["Px_Last"].shift(2)
)

#size (mvel1): log market cap of the previous row of the same ticker. The lag is
#taken per ticker; a plain shift(1) would give the first month of each ticker
#the last market cap of the ticker stored just above it.
monthly_df["mvel1"] = (
    np.log(monthly_df["Mkt_Cap_Monthly"]).groupby(monthly_df["Ticker"]).shift(1)
)

#more complex monthly calculations (maxret, momentums) are built from the daily
#total-return table

#"Date" may have been moved to the index by the beta cell on a previous run
if daily_tot_return_df.index.name == "Date":
    daily_tot_return_df = daily_tot_return_df.reset_index()

#convert daily date to monthly period
daily_tot_return_df["Month"] = daily_tot_return_df["Date"].dt.to_period("M")

#daily return computation: simple return versus the previous row of the same
#ticker (assumes rows are in chronological order within each ticker)
_previous_close = daily_tot_return_df.groupby("Ticker")["Px_Last"].shift(1)
daily_tot_return_df["daily return"] = (
    daily_tot_return_df["Px_Last"] - _previous_close
) / _previous_close

# Daily returns grouped by ticker and calendar month
_daily_returns_by_month = daily_tot_return_df.groupby(["Ticker", "Month"])["daily return"]


def _previous_month_stat(monthly_stat, new_name):
    """Shift a (Ticker, Month)-indexed statistic by one row within each ticker."""
    lagged = monthly_stat.groupby(level=0).shift(1)  # level 0 = Ticker
    return lagged.reset_index().rename(columns={"daily return": new_name})


#max return (maxret): highest daily return of the previous month
maxret_df = _previous_month_stat(_daily_returns_by_month.max(), "maxret")

#return volatility (retvol): standard deviation of the previous month's daily returns
retvol_df = _previous_month_stat(_daily_returns_by_month.std(), "retvol")


#momentums computations


def _compound(returns):
    """Compound a series of simple returns into one simple return."""
    return (1 + returns).prod() - 1


def _lagged_compound_return(returns, lag, window):
    """Compounded return over `window` rows ending `lag` rows before each row.

    The result is NaN until `window` lagged observations are available.
    """
    growth = (1 + returns.shift(lag)).rolling(window=window, min_periods=window)
    return growth.apply(np.prod, raw=True) - 1


def _momentum(lag, window):
    """Apply `_lagged_compound_return` to the monthly returns of each ticker."""
    return monthly_return_df.groupby("Ticker")["monthly return"].transform(
        lambda x: _lagged_compound_return(x, lag, window)
    )


#monthly compounded return computation
monthly_return_df = (
    daily_tot_return_df.groupby(["Ticker", "Month"])["daily return"]
    .apply(_compound)
    .reset_index()
    .rename(columns={"daily return": "monthly return"})
)

#36-month momentum (mom36m): 24 months ending 13 months back
monthly_return_df["mom36m"] = _momentum(lag=13, window=24)

#12-month momentum (mom12m): 11 months ending 2 months back
monthly_return_df["mom12m"] = _momentum(lag=2, window=11)

#6-month momentum (mom6m): 5 months ending 2 months back
monthly_return_df["mom6m"] = _momentum(lag=2, window=5)

#1-month momentum (mom1m): previous month's return
monthly_return_df["mom1m"] = (
    monthly_return_df.groupby("Ticker")["monthly return"].shift(1)
)

#chmom: latest 6-month compounded return minus the one of the 6 months before
monthly_return_df["chmom"] = _momentum(lag=1, window=6) - _momentum(lag=7, window=6)

#share turnover (turn)
daily_vol_df["Date(M)"] = daily_vol_df["Date"].dt.to_period("M")
#average daily volume of the (ticker, month), repeated on every daily row
daily_vol_df["Mean_Volume"] = daily_vol_df.groupby(["Ticker", "Date(M)"])["Volume"].transform("mean")

#one row per (ticker, month)
turn_df = (
    daily_vol_df.groupby(["Ticker", "Date(M)"])["Mean_Volume"].first().reset_index()
)

#left merge: "Month" and "Shares_Outstanding_Monthly" are NaN for the months
#that have no row in monthly_df, "Date(M)" is always populated
turn_df = turn_df.merge(
    monthly_df[["Ticker", "Month", "Shares_Outstanding_Monthly"]],
    how="left",
    left_on=["Ticker", "Date(M)"],
    right_on=["Ticker", "Month"]
)

#3-month rolling mean of the average volume, scaled by shares outstanding.
#The rolling window is grouped by turn_df's own Ticker column; grouping by
#monthly_df["Ticker"] would align two unrelated indexes and mix tickers.
turn_df["turn"] = (
    turn_df.groupby("Ticker")["Mean_Volume"]
    .transform(lambda x: x.rolling(window=3, min_periods=3).mean())
    / turn_df["Shares_Outstanding_Monthly"]
)

#merging of all df: monthly returns and momentums, then maxret and retvol
monthly_calculations_df = monthly_return_df
for _extra in (maxret_df, retvol_df):
    monthly_calculations_df = monthly_calculations_df.merge(
        _extra, on=["Ticker", "Month"], how="left"
    )

In [None]:
# Preview of the tables built so far. merged_monthly_df is only created in the
# next cell, so printing it here fails with a NameError on a fresh kernel.
print(turn_df.head())
print(monthly_calculations_df.head())

In [None]:
# Make sure the merge key is a monthly period
monthly_calculations_df["Month"] = pd.PeriodIndex(monthly_calculations_df["Month"], freq="M")

# return-based covariates + monthly market data
merged_monthly_df = monthly_calculations_df.merge(
    monthly_df,
    on=["Ticker", "Month"],
    how="left"
)

# turn_df carries two month columns: "Date(M)", always populated, and "Month",
# which comes from a left merge with monthly_df and is NaN wherever that merge
# found no match. "Date(M)" is therefore used as the key.
_turn_by_month = turn_df[["Ticker", "Date(M)", "turn"]].rename(columns={"Date(M)": "Month"})

merged_monthly_df = _turn_by_month.merge(
    merged_monthly_df,
    on=["Ticker", "Month"],
    how="left"
)

In [None]:
#monthly computations → beta

#we need the weekly returns to compute beta

# A date-indexed view is built without touching daily_tot_return_df, so the
# cells that expect "Date" as a regular column still work after this one ran.
if daily_tot_return_df.index.name == "Date":
    _daily_prices = daily_tot_return_df
else:
    _daily_prices = daily_tot_return_df.set_index("Date")

# rows without a date cannot be assigned to a week
_daily_prices = _daily_prices[_daily_prices.index.notna()]

# last available price of each week, per ticker
weekly_prices = _daily_prices.groupby("Ticker")["Px_Last"].resample("W").last().reset_index()

#weekly returns computation: simple return versus the previous week of the same ticker
_previous_week = weekly_prices.groupby("Ticker")["Px_Last"].shift(1)
weekly_prices["weekly_return"] = (
    weekly_prices["Px_Last"] - _previous_week
) / _previous_week

# one row per week, one column per ticker
pivot = weekly_prices.pivot(index="Date", columns="Ticker", values="weekly_return")
# market proxy: equal-weighted average of the available weekly returns
pivot["Market"] = pivot.mean(axis=1, skipna=True)

In [None]:
# Rolling market beta per ticker and month.
# tqdm (progress bar, the computation is long) and statsmodels are imported in
# the first cell.

# Minimum number of weekly observations required in the 3-year window.
# The check cannot be dropped: windows of tickers with missing weeks are shorter.
MIN_WEEKLY_OBS_BETA = 156  #156 weeks = 3 years

results = []

# One estimation per month-end, starting 3 years after the first weekly date.
# The range does not depend on the ticker, so it is built once.
_estimation_months = pd.date_range(
    start=pivot.index.min() + pd.DateOffset(years=3),
    end=pivot.index.max(),
    freq="ME",
)

for ticker in tqdm(pivot.columns.drop("Market")):
    # weeks where both the stock and the market return are available
    pair = pivot[[ticker, "Market"]].dropna()

    for current_month in _estimation_months:
        # the window ends one month before the month the beta is assigned to
        end_date = current_month - pd.DateOffset(months=1)
        start_date = end_date - pd.DateOffset(years=3)

        # a boolean mask works whether or not the bounds are actual index labels
        window = pair.loc[(pair.index >= start_date) & (pair.index <= end_date)]

        if len(window) >= MIN_WEEKLY_OBS_BETA:
            X = sm.add_constant(window["Market"])  #explanatory variable + constant (alpha)
            y = window[ticker]  #dependent variable

            model = sm.OLS(y, X).fit()  #linear regression with statsmodels

            results.append({
                "Ticker": ticker,
                "Month": current_month.to_period("M"),
                "Beta": model.params["Market"],      #slope
                "Alpha": model.params["const"],      #intercept
                "R2": model.rsquared,                #explanatory power
                "p_value": model.pvalues["Market"],  #p-value of beta
            })

# final DataFrame; explicit columns keep the later cells working even when no
# window met the minimum length
beta_df = pd.DataFrame(
    results, columns=["Ticker", "Month", "Beta", "Alpha", "R2", "p_value"]
)

In [None]:
# Write the rolling-regression results to "beta_df.xlsx" in the current working
# directory; an existing file of that name is overwritten.
beta_df.to_excel("beta_df.xlsx")

In [None]:
import matplotlib.pyplot as plt

# Plausibility check of the regression output: mean and standard deviation of
# beta, alpha, R² and the p-value, followed by a histogram of the betas.
beta_cols = ["Beta", "Alpha", "R2", "p_value"]
_beta_stats = beta_df[beta_cols]
beta_means = _beta_stats.mean()
beta_std = _beta_stats.std()

print("Standard deviation is:", beta_std)
print("Means is:", beta_means)

#beta histogram
beta_df["Beta"].hist(bins=30)
plt.title("Distribution des beta")
plt.show()

In [None]:
#beta squared
beta_df["Beta_squared"] = beta_df["Beta"] ** 2

#merge with merged_monthly_df
beta_cols = beta_df[["Ticker", "Month", "Beta", "Beta_squared"]]
merged_monthly_df = (
    merged_monthly_df
    # Columns left by a previous run of this cell are removed first; otherwise
    # the merge would produce Beta_x / Beta_y and the ranking cell would no
    # longer find "Beta". Same guard as the one used for Idiovol.
    .drop(columns=["Beta", "Beta_squared"], errors="ignore")
    .merge(beta_cols, on=["Ticker", "Month"], how="left")
)

print(merged_monthly_df.tail())

In [None]:
#idiovol computation: standard deviation of the residuals of a regression of
#weekly stock returns on the weekly market return

# Equal-weighted market return. An existing "Market" column is left out of its
# own average so that the definition does not depend on whether it was already
# computed by an earlier cell.
pivot["Market"] = pivot.drop(columns="Market", errors="ignore").mean(axis=1, skipna=True)

MIN_WEEKLY_OBS_IDIOVOL = 52  # at least one year of data

idio_vol_results = []

# One estimation per month-end, starting 3 years after the first weekly date
_estimation_months = pd.date_range(
    start=pivot.index.min() + pd.DateOffset(years=3),
    end=pivot.index.max(),
    freq="ME",
)

for ticker in tqdm(pivot.columns.drop("Market")):
    # weeks where both the stock and the market return are available
    pair = pivot[[ticker, "Market"]].dropna()

    for current_month in _estimation_months:
        # NOTE(review): this window ends at the end of `current_month`, whereas
        # the beta cell ends its window one month earlier -- confirm which
        # timing is intended.
        end_date = current_month
        start_date = end_date - pd.DateOffset(years=3)

        # a boolean mask works whether or not the bounds are actual index labels
        window = pair.loc[(pair.index >= start_date) & (pair.index <= end_date)]

        if len(window) >= MIN_WEEKLY_OBS_IDIOVOL:
            X = sm.add_constant(window["Market"])
            y = window[ticker]
            model = sm.OLS(y, X).fit()
            # np.std uses its default ddof=0 here
            idio_std = np.std(model.resid)

            idio_vol_results.append({
                "Ticker": ticker,
                "Month": current_month.to_period("M"),
                "Idiovol": idio_std
            })

# explicit columns keep the following merge working even when no window met the
# minimum length
idio_vol_df = pd.DataFrame(idio_vol_results, columns=["Ticker", "Month", "Idiovol"])

In [None]:
# An "Idiovol" column left by a previous run is removed before merging, so the
# cell can be executed more than once.
_idiovol_already_merged = "Idiovol" in merged_monthly_df.columns
if _idiovol_already_merged:
    print("Colonne 'Idiovol' déjà présente — suppression avant merge.")

# Merge on (Ticker, Month), keeping every row of merged_monthly_df
merged_monthly_df = (
    merged_monthly_df
    .drop(columns=["Idiovol"], errors="ignore")
    .merge(idio_vol_df, on=["Ticker", "Month"], how="left")
)

# Check the result
print(merged_monthly_df.tail())


In [None]:
#data check for the yearly covariates (the other frequencies remain to be done)
covariate_cols = ["agr", "cashpr", "chinv", "depr", "dy", "ep", "invest", "rd_mve", "sp"]
_yearly_covariate_values = yearly_df[covariate_cols]

#mean of each covariate
covariate_means = _yearly_covariate_values.mean()
print("Moyennes des covariates :")
print(covariate_means)

#standard deviation of each covariate
covariate_stds = _yearly_covariate_values.std()
print("\nÉcarts-types des covariates :")
print(covariate_stds)

In [None]:
# Show the dtype of every column of the yearly table.
print(yearly_df.dtypes)

In [None]:
#yearly : cross-sectional ranking and normalization

#first we need to standardize the date (because we exported the fiscal year, it's not the same for each stock)
yearly_df["Year"] = pd.to_datetime(yearly_df["Date"]).dt.year

# Filter to keep only years >= 1990 (.copy() so the frame can be modified later
# without acting on a slice of the unfiltered table)
yearly_df = yearly_df[yearly_df["Year"] >= 1990].copy()

covariates = ["agr", "cashpr", "chinv", "depr", "dy", "ep", "invest", "rd_mve", "sp"]
cols = ["Year", "Ticker"] + covariates

# rankings[covariate][year] -> DataFrame(Ticker, covariate, rank, score)
rankings = {}
# covariate -> score Series aligned on yearly_df's index
_yearly_scores = {}

# year shown as an example for each covariate (first one encountered)
_first_year = yearly_df["Year"].iloc[0] if len(yearly_df) else None

for covariate in covariates:
    print(f"\n=== RANKING PAR {covariate.upper()} ===")

    by_year = yearly_df.groupby("Year")[covariate]

    # Rank 1..n within each year, ascending. Missing values receive no rank and
    # are not counted in n, so the scores of the valid observations span the
    # full [-1, 1] interval. Ties are ordered by position (method="first").
    rank = by_year.rank(method="first")
    n_valid = by_year.transform("count")

    # rank -> [-1, 1]; a year with a single valid observation gets 0
    score = (
        (2 * (rank - 1) / (n_valid - 1) - 1)
        .where(n_valid != 1, 0.0)
        .where(rank.notna())
    )
    _yearly_scores[covariate] = score

    #the ranking is stored in a dictionary, one table per year
    ranked = yearly_df[["Year", "Ticker", covariate]].assign(rank=rank, score=score)
    rankings[covariate] = {
        year: group.drop(columns="Year").sort_values("rank").reset_index(drop=True)
        for year, group in ranked.groupby("Year")
    }

    #print the first year as an example
    if _first_year in rankings[covariate]:
        print(f"\nDate {_first_year}:")
        print(rankings[covariate][_first_year].head())

# yearly_covariates: original variables first, complete cases only
yearly_covariates = yearly_df[cols].dropna()

# normalized scores are attached by index alignment instead of a row-by-row lookup
for covariate in covariates:
    yearly_covariates[f"{covariate}_normalized"] = (
        _yearly_scores[covariate].reindex(yearly_covariates.index)
    )

In [None]:
# Keep the months from December 1990 onwards
_first_kept_month = pd.Period("1990-12", freq="M")
merged_monthly_df = merged_monthly_df.loc[merged_monthly_df["Month"] >= _first_kept_month]

# Share of missing values per column, in percent, largest first
nan_percent = merged_monthly_df.isna().mean().mul(100)
print(nan_percent.sort_values(ascending=False).round(2))

# Export the monthly table for inspection
merged_monthly_df.to_excel("merged_monthly_df.xlsx")

In [None]:
# Write beta_df to "beta_df.xlsx" again. The same export already runs right
# after the beta estimation; at this point the file additionally contains the
# "Beta_squared" column added in between.
beta_df.to_excel("beta_df.xlsx")

In [None]:
#monthly : cross-sectional ranking and normalization

# .copy() so the frame is an independent table rather than a slice
merged_monthly_df = merged_monthly_df[
    merged_monthly_df["Month"] >= pd.Period("1990-01", freq="M")
].copy()

covariates = ["mom12m", "mom6m", "mom1m", "chmom", "maxret", "retvol", "baspread", "dolvol", "mvel1", "turn", "Beta",
"Beta_squared", "Idiovol"]
cols = ["Month", "Ticker"] + covariates

# monthly_rankings[covariate][month] -> DataFrame(Ticker, covariate, rank, score)
monthly_rankings = {}
# covariate -> score Series aligned on merged_monthly_df's index
_monthly_scores = {}

# month shown as an example for each covariate (earliest one)
_first_month = merged_monthly_df["Month"].min()

for covariate in covariates:
    print(f"\n=== RANKING PAR {covariate.upper()} ===")

    by_month = merged_monthly_df.groupby("Month")[covariate]

    # Rank 1..n within each month, ascending. Missing values receive no rank and
    # are not counted in n, so the scores of the valid observations span the
    # full [-1, 1] interval. Ties are ordered by position (method="first").
    rank = by_month.rank(method="first")
    n_valid = by_month.transform("count")

    # rank -> [-1, 1]; a month with a single valid observation gets 0
    score = (
        (2 * (rank - 1) / (n_valid - 1) - 1)
        .where(n_valid != 1, 0.0)
        .where(rank.notna())
    )
    _monthly_scores[covariate] = score

    # the ranking is stored in a dictionary, one table per month
    ranked = merged_monthly_df[["Month", "Ticker", covariate]].assign(rank=rank, score=score)
    monthly_rankings[covariate] = {
        month: group.drop(columns="Month").sort_values("rank").reset_index(drop=True)
        for month, group in ranked.groupby("Month")
    }

    # print the first month as an example
    if _first_month in monthly_rankings[covariate]:
        print(f"\nDate {_first_month}:")
        print(monthly_rankings[covariate][_first_month].head())

# monthly_covariates: original variables first, complete cases only
monthly_covariates = merged_monthly_df[cols].dropna()

# normalized scores are attached by index alignment instead of a row-by-row
# lookup, which was quadratic in the number of rows
for covariate in covariates:
    monthly_covariates[f"{covariate}_normalized"] = (
        _monthly_scores[covariate].reindex(monthly_covariates.index)
    )