In [None]:
# pip install pandas numpy statsmodels
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Example: read the three input tables from CSV, parsing `date` as a datetime.
fund_data, syn_data, factor_data = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in ('fund_data.csv', 'syn_data.csv', 'factor_data.csv')
)

# Left-join the synthetic data on (date, fund_id), then the factor data on date.
# If fund_return is not already an excess return, subtract Rf (same for syn_return);
# both excess-return columns are derived at the end of the chain.
merged_df = (
    fund_data
    .merge(syn_data, how='left', on=['date', 'fund_id'])
    .merge(factor_data, how='left', on='date')
    .assign(
        excess_fund=lambda frame: frame['fund_return'] - frame['Rf'],
        excess_syn=lambda frame: frame['syn_return'] - frame['Rf'],
    )
)

## Check that MKT_RF is already (Rm - Rf). If it is not, it must be converted first!

In [None]:
# Label each row with "<year>_Q<quarter>" derived from its date.
merged_df['year_quarter'] = (
    merged_df['date'].dt.year.astype(str)
    .str.cat(merged_df['date'].dt.quarter.astype(str), sep='_Q')
)

In [None]:
def correlation_fund_syn(group):
    """Correlate fund and synthetic excess returns within one group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain the columns ``excess_fund``
        and ``excess_syn``.

    Returns
    -------
    pandas.Series
        One-element Series whose ``corr_fs`` entry is the result of
        ``Series.corr`` between the two columns.
    """
    fund_excess = group['excess_fund']
    syn_excess = group['excess_syn']
    return pd.Series([fund_excess.corr(syn_excess)], index=['corr_fs'])

# One fund/synthetic correlation per (fund_id, year_quarter).
# Only the two return columns are handed to `apply`: applying over the full
# frame (grouping columns included) is deprecated in recent pandas and emits a
# DeprecationWarning, while this explicit selection yields the same table.
corr_data = (
    merged_df
    .groupby(['fund_id', 'year_quarter'])[['excess_fund', 'excess_syn']]
    .apply(correlation_fund_syn)
    .reset_index()
)

In [None]:
def _ols_intercept(group, return_col, factor_cols, min_obs):
    """Return the OLS intercept of `return_col` on `factor_cols`, or NaN if too few rows."""
    # Keep only rows where the return and every factor are present, so the
    # observation count below matches what the regression actually uses.
    sample = group[[return_col, *factor_cols]].dropna()
    if len(sample) < min_obs:
        return np.nan
    # has_constant='add' forces the intercept column to exist. With the default
    # ('skip'), a sample in which some factor is a non-zero constant (e.g. a
    # one-row group) gets no 'const' column and params['const'] raises KeyError.
    exog = sm.add_constant(sample[factor_cols], has_constant='add')
    return sm.OLS(sample[return_col], exog).fit().params['const']


def fit_carhart_alpha(group, periods_per_year=252, min_obs=6):
    """Estimate annualised four-factor alphas for the fund and its synthetic series.

    Two OLS regressions are run on the rows of `group`, one with
    ``excess_fund`` and one with ``excess_syn`` as the dependent variable,
    each on ``MKT_RF``, ``SMB``, ``HML``, ``WML`` plus an intercept. The
    intercept of each regression is the (per-period) alpha.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain ``excess_fund``, ``excess_syn``
        and the four factor columns.
    periods_per_year : int or float, default 252
        Multiplier used to annualise the per-period intercept. The default
        corresponds to daily data.
    min_obs : int, default 6
        Minimum number of complete (non-missing) rows required to run a
        regression. The default is one more than the five estimated
        parameters, so the fit keeps at least one residual degree of freedom.
        Below this threshold the corresponding alpha is NaN.

    Returns
    -------
    pandas.Series
        ``alpha_fund`` and ``alpha_syn`` (annualised intercepts) and
        ``alpha_gap`` = ``alpha_fund`` - ``alpha_syn``. ``alpha_gap`` is NaN
        whenever either alpha is NaN.
    """
    factor_cols = ['MKT_RF', 'SMB', 'HML', 'WML']

    # Same regression for both series; only the dependent variable changes.
    alpha_fund_annual = _ols_intercept(group, 'excess_fund', factor_cols, min_obs) * periods_per_year
    alpha_syn_annual = _ols_intercept(group, 'excess_syn', factor_cols, min_obs) * periods_per_year

    return pd.Series({
        'alpha_fund': alpha_fund_annual,
        'alpha_syn':  alpha_syn_annual,
        'alpha_gap':  alpha_fund_annual - alpha_syn_annual
    })

# One row of annualised alphas per (fund_id, year_quarter).
# Only the columns the regression reads are handed to `apply`: applying over
# the full frame (grouping columns included) is deprecated in recent pandas and
# emits a DeprecationWarning, while this explicit selection yields the same table.
alpha_data = (
    merged_df
    .groupby(['fund_id', 'year_quarter'])[
        ['excess_fund', 'excess_syn', 'MKT_RF', 'SMB', 'HML', 'WML']
    ]
    .apply(fit_carhart_alpha)
    .reset_index()
)

In [None]:
# Keep only the (fund_id, year_quarter) pairs present in both tables.
full_quarter_data = pd.merge(
    corr_data,
    alpha_data,
    how='inner',
    on=['fund_id', 'year_quarter'],
)

In [None]:
# Pooled quintiles of corr_fs over all fund-quarters.
# labels=False makes qcut return bin codes 0..4, so adding 1 gives quintiles 1..5.
full_quarter_data['corr_quintile'] = pd.qcut(
    full_quarter_data['corr_fs'], q=5, labels=False
).add(1)

# Mean alpha gap and number of rows in each quintile.
stats_quintiles = (
    full_quarter_data
    .groupby('corr_quintile')
    .agg(
        mean_alpha_gap=('alpha_gap', 'mean'),
        count=('alpha_gap', 'size'),
    )
    .reset_index()
)

# High (Q5) minus Low (Q1) difference of the mean alpha gap.
alpha_gap_highlow = (
    stats_quintiles.loc[stats_quintiles['corr_quintile'].eq(5), 'mean_alpha_gap'].iloc[0]
    - stats_quintiles.loc[stats_quintiles['corr_quintile'].eq(1), 'mean_alpha_gap'].iloc[0]
)
print("High-Low difference (alpha_gap) =", alpha_gap_highlow)
print(stats_quintiles)


In [None]:
def assign_quintiles(df):
    """Return a copy of `df` with a ``corr_quintile`` column (1..5) from ``corr_fs``.

    The input frame is left untouched; an existing ``corr_quintile`` column
    is overwritten in the returned copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``corr_fs`` column.

    Returns
    -------
    pandas.DataFrame
        New frame whose ``corr_quintile`` is the 0-based ``pd.qcut`` bin code
        of ``corr_fs`` over five quantiles, shifted by one.
    """
    bin_codes = pd.qcut(df['corr_fs'], q=5, labels=False)
    return df.assign(corr_quintile=bin_codes + 1)

# Recompute the correlation quintiles separately within each quarter.
# The groups are iterated explicitly instead of going through
# `groupby(...).apply(...)`: `apply` over the grouping column is deprecated in
# recent pandas, and once the grouping column is no longer passed to the
# function, `reset_index(drop=True)` would discard `year_quarter` and the
# aggregation below would fail. Iterating always yields the full sub-frame,
# so `year_quarter` is kept. Rows come out concatenated in sorted-quarter order.
full_quarter_data_byQ = pd.concat(
    [
        assign_quintiles(quarter_df)
        for _, quarter_df in full_quarter_data.groupby('year_quarter')
    ],
    ignore_index=True,
)

# Mean alpha gap and number of rows per (quarter, within-quarter quintile).
stats_quintiles_byQ = (
    full_quarter_data_byQ
    .groupby(['year_quarter','corr_quintile'])
    .agg(mean_alpha_gap=('alpha_gap','mean'),
         count=('alpha_gap','size'))
    .reset_index()
)

# The High-Low spread can then be computed per quarter and averaged:
def high_low_by_quarter(df):
    """Return the Q5 minus Q1 ``mean_alpha_gap`` for one quarter's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single ``year_quarter`` group, with the columns
        ``corr_quintile`` and ``mean_alpha_gap``.

    Returns
    -------
    float
        ``mean_alpha_gap`` of quintile 5 minus that of quintile 1, or NaN
        unless each of those two quintiles appears exactly once.
    """
    gaps_by_quintile = {
        quintile: df.loc[df['corr_quintile'] == quintile, 'mean_alpha_gap'].values
        for quintile in (1, 5)
    }
    low_gap, high_gap = gaps_by_quintile[1], gaps_by_quintile[5]

    # Guard: a missing or duplicated extreme quintile gives no usable spread.
    if len(low_gap) != 1 or len(high_gap) != 1:
        return np.nan
    return high_gap[0] - low_gap[0]

# One High-Low spread per quarter.
# Only the two columns the function reads are handed to `apply`: applying over
# the full frame (grouping column included) is deprecated in recent pandas and
# emits a DeprecationWarning, while this explicit selection yields the same result.
high_low_perQ = (
    stats_quintiles_byQ
    .groupby('year_quarter')[['corr_quintile', 'mean_alpha_gap']]
    .apply(high_low_by_quarter)
    .reset_index(name='HL_diff')
)

print(high_low_perQ.head(10))
print("Moyenne High–Low sur tous les trimestres =", high_low_perQ['HL_diff'].mean())

In [None]:
# Attach fund characteristics to every fund-quarter (left join keeps all fund-quarters).
fund_chars = pd.read_csv('fund_chars.csv')
reg_df = pd.merge(
    full_quarter_data,
    fund_chars,
    how='left',
    on=['fund_id', 'year_quarter'],
)

# Example: explain alpha_fund by corr_fs plus controls,
# as an OLS regression written in formula syntax.
model_ols = smf.ols(
    "alpha_fund ~ corr_fs + np.log(size) + np.log(age) + np.log(manager_tenure) + np.log(turnover) + vol_mkt",
    reg_df,
).fit(cov_type='HC3')  # HC3 heteroskedasticity-robust covariance

print(model_ols.summary())

In [None]:
# Same specification with alpha_gap as the dependent variable,
# again with an HC3 heteroskedasticity-robust covariance.
model_gap = smf.ols(
    "alpha_gap ~ corr_fs + np.log(size) + np.log(age) + np.log(manager_tenure) + np.log(turnover) + vol_mkt",
    reg_df,
).fit(cov_type='HC3')

print(model_gap.summary())