**ÉTAPE 0** : préparation des données

In [16]:
import process 
import pandas as pd

# Nail path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
# Jerome path : 'C:\Users\33640\OneDrive\Documents\GitHub\Portfolio_clustering_project\Data\DATA_Statapp.csv'
# Mohamed path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
df = pd.read_csv('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv')

# Apply conversion function to 'open' and 'close' columns
df['open'] = df['open'].apply(process.safe_literal_eval)
df['close'] = df['close'].apply(process.safe_literal_eval)

# Calculate returns for each line
df['return'] = df.apply(lambda row: [(close - open) / open for open, close in zip(row['open'], row['close'])], axis=1)

new_df = df[['ticker', 'return']] # create a new data frame with the column ticker and return 

# Créons le DataFrame à partir des listes dans 'return'
# On suppose ici que 'new_df' est déjà défini et contient la colonne 'return'

# Convertir chaque liste dans la colonne 'return' en plusieurs colonnes dans le nouveau DataFrame
returns_df = pd.DataFrame(new_df['return'].tolist())

# Ajouter la colonne 'ticker' du 'new_df' au début de 'returns_df'
returns_df.insert(0, 'ticker', new_df['ticker'])

# Renommer les colonnes pour refléter qu'elles sont des rendements
returns_df.columns = ['ticker'] + [f'return_{i}' for i in range(len(returns_df.columns) - 1)]

df_cleaned = process.remove_rows_with_nan(returns_df)
df_cleaned.reset_index(drop=True, inplace=True)

process.check_nan_inf(df_cleaned)

df_cleaned.shape

There are no NaN values in the dataframe


(632, 5532)

**ÉTAPE 1** : Phase d'entraînement

1. Obtention de la matrice de corrélation des actifs sur une fenêtre arrière de 30 jours (1 mois)

In [17]:
lookback_window = 30
correlation_matrix = process.correlation_matrix(lookback_window, df_cleaned)

In [18]:
correlation_matrix

ticker,AA,ABM,ABT,ADI,ADM,ADX,AEE,AEG,AEM,AEP,...,XLI,XLK,XLP,XLU,XLV,XLY,XOM,XRX,YUM,ZTR
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA,1.000000,0.084826,0.069125,-0.152919,-0.046098,-0.154451,0.317171,-0.015085,-0.170085,0.342500,...,0.088587,0.016432,0.046136,0.048944,0.065589,0.042056,0.368304,0.183335,0.002706,-0.007626
ABM,0.084826,1.000000,0.044162,0.217418,-0.163298,0.213833,0.018391,-0.080820,0.530876,0.162230,...,-0.020913,0.141027,-0.168587,0.205586,-0.011564,-0.092234,-0.106523,-0.123164,-0.066744,0.077440
ABT,0.069125,0.044162,1.000000,-0.005490,0.042437,0.197073,-0.022161,0.052698,0.258604,0.091366,...,0.440291,0.110833,0.659009,0.239819,0.097978,0.340625,0.201946,0.227100,0.123941,0.165431
ADI,-0.152919,0.217418,-0.005490,1.000000,0.250068,0.004143,-0.147510,0.151744,0.212227,-0.078771,...,0.143722,0.538367,0.052274,0.206315,0.172525,0.236995,-0.019961,0.193005,0.129434,0.076021
ADM,-0.046098,-0.163298,0.042437,0.250068,1.000000,0.316557,-0.063487,0.313152,0.094817,0.134954,...,0.440868,0.214385,0.310297,0.041392,0.244821,0.560923,0.173681,0.345404,0.155754,0.258549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XLY,0.042056,-0.092234,0.340625,0.236995,0.560923,0.414531,0.097922,0.288977,-0.008969,0.197951,...,0.781828,0.519283,0.628317,0.323088,0.562699,1.000000,0.226892,0.352852,0.235341,0.260681
XOM,0.368304,-0.106523,0.201946,-0.019961,0.173681,0.130167,0.281450,-0.072953,-0.206771,0.320972,...,0.475662,0.244207,0.429145,0.552873,0.117290,0.226892,1.000000,0.334096,-0.017089,0.115952
XRX,0.183335,-0.123164,0.227100,0.193005,0.345404,0.244077,0.281934,0.310500,0.031733,0.207695,...,0.438615,0.323883,0.324876,0.048833,0.239357,0.352852,0.334096,1.000000,0.080223,0.110155
YUM,0.002706,-0.066744,0.123941,0.129434,0.155754,0.197853,0.051666,0.207824,-0.048061,0.367187,...,0.064949,-0.072242,0.049041,-0.097012,0.338998,0.235341,-0.017089,0.080223,1.000000,0.246080


2. Obtention de la composition de chaque cluster et du centroïde de chacun d'entre eux

In [19]:
import numpy as np
number_of_clusters = 20

result = pd.DataFrame(index=list(correlation_matrix.columns), columns=['Cluster label'], data=process.apply_SPONGE(correlation_matrix, number_of_clusters))

df_cleaned.set_index('ticker', inplace=True)

## STEP 2: compute the composition of each cluster (in terms of stocks)

cluster_composition = []


  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)


In [20]:
for i in range(1, number_of_clusters):

    if i in result['Cluster label'].values: ## we check that the i-th cluster is not empty

        cluster_composition.append([f'cluster {i}', list(result[result['Cluster label'] == i].index)])


In [42]:
import pandas as pd

# Définir l'option pour afficher toutes les décimales
pd.set_option('display.float_format', '{:.5f}'.format)

In [44]:
lookback_window = 30
x = df_cleaned.loc['AA', :][:lookback_window].to_numpy()
x

array([-0.01,  0.01,  0.05, -0.01, -0.02, -0.03,  0.02, -0.01, -0.02,
       -0.02, -0.03,  0.  , -0.03,  0.02, -0.03, -0.02, -0.02, -0.02,
       -0.02, -0.  ,  0.02,  0.04, -0.03, -0.03, -0.04, -0.04, -0.02,
        0.01,  0.02,  0.05])

In [40]:
lookback_window = 30

## STEP 3: compute the centroid of each cluster 

for i in range(len(cluster_composition)):

    cluster_size = len(cluster_composition[i][1]) ## we average over the size of each cluster to compute the centroid

    return_centroid = np.zeros(lookback_window) ## we prepare the return_centroid array to stock the centroid

    for elem in cluster_composition[i][1]:

        return_centroid = return_centroid + df_cleaned.loc[elem, :][:lookback_window].values

    cluster_composition[i].append(return_centroid/cluster_size) ## the third element contains the centroid of the cluster in question

In [3]:
cluster_composition = process.cluster_composition_and_centroid(df_cleaned=df_cleaned, correlation_matrix=correlation_matrix, number_of_clusters=20)

  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)


**ÉTAPE 2** : construction de portefeuille

1. On donne, au sein d'un même cluster, un poids à chaque actif selon sa distance au centroïde de celui-ci. Cela nous servira plus tard pour calculer le rendement de chaque cluster (alors vu comme un nouvel actif synthétique)

In [4]:
constituent_weights = process.constituent_weights(df_cleaned=df_cleaned, cluster_composition=cluster_composition, sigma=5)

In [9]:
constituent_weights

[['cluster 1',
  [['DLX', 0.018142901897572294],
   ['FLS', 0.01716308411358549],
   ['FNF', 0.016585674701872083],
   ['GCI', 0.018884629672179635],
   ['GPI', 0.01982325557470948],
   ['HIO', 0.012860341887080023],
   ['HIX', 0.012818880659385096],
   ['HR', 0.01596209730992781],
   ['IQI', 0.011671818517530942],
   ['KMB', 0.012017669774928462],
   ['KSM', 0.011619845429169646],
   ['KTF', 0.011779952438135951],
   ['LEO', 0.011892332804546149],
   ['LNC', 0.02122028899197471],
   ['MMU', 0.011480656164162948],
   ['MPA', 0.01141490102532497],
   ['MUA', 0.011617737967993494],
   ['MUJ', 0.011396175511515581],
   ['MVF', 0.01187868298212229],
   ['MYC', 0.011689761849812429],
   ['MYI', 0.011454274794808022],
   ['MYJ', 0.011511476193768053],
   ['SEE', 0.015881151819124587],
   ['SJR', 0.01327295025762739],
   ['TD', 0.01416418281832764],
   ['THO', 0.017035861567575576],
   ['VVI', 0.017051771681523078]]],
 ['cluster 2',
  [['AMT', 0.0006634693100062819],
   ['AVA', 0.000576179624

Le choix des rendements attendus (expected_returns) dans le modèle de Markowitz peut être un défi car il nécessite des prévisions pour chaque actif inclus dans le portefeuille. 

In [6]:
## on récupère le dataframe contenant les return de chaque cluster

cluster_return = process.cluster_return(constituent_weights=constituent_weights, df_cleaned=df_cleaned) 

## on construit la matrice de corrélation associée à ces returns, c'est donc une matrice de corrélation de return de cluster

cov_matrix = cluster_return.corr(method='pearson')

## on construit le vecteur d'expected return du cluster 
expected_returns = cluster_return.mean(axis=0) ## on fait ici le choix de prendre le rendement moyen comme objectif

## !!! LA FENÊTRE DE 30 JOURS --> PAS 5530 !!

In [215]:
cluster_return

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8,cluster 9,cluster 10,cluster 11,cluster 12,cluster 13,cluster 14,cluster 15,cluster 16,cluster 17,cluster 18,cluster 19
0,-0.003916,-1.206012e-05,-1.149447e-09,-3.243262e-13,-2.390094e-10,-3.189495e-06,-0.000483,0.030415,-0.000108,-1.661727e-23,-2.569692e-10,-0.000038,-2.624204e-26,0.016385,0.000013,-0.000548,-0.001794,-0.002300,0.008031
1,-0.001972,-1.399414e-05,-2.917476e-09,-1.617835e-13,-5.815658e-11,-1.984405e-06,-0.000774,0.012295,-0.000561,-2.491607e-24,-2.543656e-10,-0.000038,-1.463054e-26,0.103831,-0.000027,0.000357,0.001052,-0.000637,0.002494
2,-0.001007,4.255347e-06,3.786536e-09,1.045364e-13,6.104471e-12,1.504349e-06,-0.000148,0.000669,0.000245,2.825542e-24,-4.453236e-12,0.000026,-1.125874e-27,-0.045667,0.000028,0.000154,0.003016,0.003460,0.011291
3,0.002157,-2.050375e-06,8.185370e-09,1.409293e-13,-5.518844e-11,4.162841e-07,0.000095,-0.014567,0.000039,1.816023e-23,1.955365e-10,0.000034,-4.172411e-27,-0.004408,0.000003,0.000283,-0.000328,0.005910,0.006265
4,-0.000066,1.363914e-05,5.578441e-09,1.335361e-13,1.392485e-10,7.047807e-07,0.000165,0.015102,-0.000101,1.027468e-23,6.217763e-11,0.000002,4.170944e-26,-0.021683,0.000038,0.000264,0.000852,0.007958,0.005879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,0.002229,3.877053e-06,2.564759e-09,1.313502e-13,8.888332e-11,2.165783e-06,0.000188,0.009063,0.000207,6.669544e-24,6.690253e-11,0.000025,1.085184e-26,0.025376,0.000021,0.000358,0.000295,0.004280,-0.001239
5527,0.001067,2.160810e-06,7.361260e-10,7.362058e-14,1.608172e-11,3.958965e-07,0.000055,-0.000709,0.000088,-2.632170e-24,1.662342e-11,0.000006,2.860437e-27,0.019329,0.000005,0.000090,0.000617,0.000925,-0.004077
5528,0.000324,2.151880e-06,8.753316e-10,3.604102e-14,2.534453e-11,4.047564e-07,0.000147,0.003484,0.000092,-3.863334e-24,6.721514e-12,0.000006,4.241108e-27,0.013394,0.000006,0.000106,0.000261,-0.000382,0.000179
5529,-0.000775,-3.217251e-07,6.353169e-11,-7.715083e-14,-2.817443e-11,-1.077624e-06,0.000259,-0.002904,-0.000096,-3.384603e-24,-1.615852e-11,-0.000003,-5.393119e-27,-0.022351,-0.000003,-0.000057,-0.000025,0.002409,0.010259


In [213]:
from pypfopt.efficient_frontier import EfficientFrontier

# Assuming risk_free_rate is the appropriate value for your analysis
risk_free_rate = 0.02

ef = EfficientFrontier(expected_returns, cov_matrix)
weights = ef.max_sharpe(risk_free_rate=risk_free_rate)

ValueError: at least one of the assets must have an expected return exceeding the risk-free rate