**ÉTAPE 0** : préparation des données

In [9]:
import process 
import pandas as pd

# Nail path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
# Jerome path : 'C:\Users\33640\OneDrive\Documents\GitHub\Portfolio_clustering_project\Data\DATA_Statapp.csv'
# Mohamed path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
df = pd.read_csv('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv')

# Apply conversion function to 'open' and 'close' columns
df['open'] = df['open'].apply(process.safe_literal_eval)
df['close'] = df['close'].apply(process.safe_literal_eval)

# Calculate returns for each line
df['return'] = df.apply(lambda row: [(close - open) / open for open, close in zip(row['open'], row['close'])], axis=1)

new_df = df[['ticker', 'return']] # create a new data frame with the column ticker and return 

# Créons le DataFrame à partir des listes dans 'return'
# On suppose ici que 'new_df' est déjà défini et contient la colonne 'return'

# Convertir chaque liste dans la colonne 'return' en plusieurs colonnes dans le nouveau DataFrame
returns_df = pd.DataFrame(new_df['return'].tolist())

# Ajouter la colonne 'ticker' du 'new_df' au début de 'returns_df'
returns_df.insert(0, 'ticker', new_df['ticker'])

# Renommer les colonnes pour refléter qu'elles sont des rendements
returns_df.columns = ['ticker'] + [f'return_{i}' for i in range(len(returns_df.columns) - 1)]

df_cleaned = process.remove_rows_with_nan(returns_df)
df_cleaned.reset_index(drop=True, inplace=True)

process.check_nan_inf(df_cleaned)

df_cleaned.shape

There are no NaN values in the dataframe


(632, 5532)

**ÉTAPE 1** : Phase d'entraînement

1. Obtention de la matrice de corrélation des actifs sur une fenêtre arrière de 30 jours (1 mois)

In [10]:
lookback_window = 30
correlation_matrix = process.correlation_matrix(lookback_window, df_cleaned)

2. Obtention de la composition de chaque cluster et du centroïde de chacun d'entre eux

In [11]:
cluster_composition = process.cluster_composition_and_centroid(df_cleaned=df_cleaned, correlation_matrix=correlation_matrix, number_of_clusters=20)

  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)


**ÉTAPE 2** : construction de portefeuille

1. On donne, au sein d'un même cluster, un poids à chaque actif selon sa distance au centroïde de celui-ci. Cela nous servira plus tard pour calculer le rendement de chaque cluster (alors vu comme un nouvel actif synthétique)

In [15]:
constituent_weights = process.constituent_weights(df_cleaned=df_cleaned, cluster_composition=cluster_composition, sigma=5)

In [16]:
constituent_weights

[['cluster 1',
  [['AFL', 6.131250756357077e-12],
   ['AIG', 9.941550214338748e-12],
   ['AXP', 7.0216239442504735e-12],
   ['BAC', 1.0238243059107122e-11],
   ['BK', 7.27780545258416e-12],
   ['BLK', 5.3768827599034825e-12],
   ['BMO', 3.553761818813029e-12],
   ['BTO', 6.176583522593656e-12],
   ['CFR', 5.660135726106599e-12],
   ['CMA', 8.405406752622797e-12],
   ['CNI', 3.5189177370220104e-12],
   ['COF', 1.1225620746963076e-11],
   ['DHR', 3.4171097533342298e-12],
   ['GS', 6.619291406914783e-12],
   ['HON', 4.159070183501992e-12],
   ['HRL', 2.2887238038865204e-12],
   ['HUM', 3.74492699655332e-12],
   ['JPM', 8.574590016482155e-12],
   ['KR', 2.703746902826936e-12],
   ['MTG', 2.0420564254197108e-11],
   ['NAC', 1.7987382576254425e-12],
   ['PFE', 2.8154377622465696e-12],
   ['PGR', 4.29347746403897e-12],
   ['PNC', 8.220740964842387e-12],
   ['RDN', 1.9229308351596518e-11],
   ['RY', 3.6030334956775946e-12],
   ['RYN', 4.996715255208448e-12],
   ['SLM', 6.847826248083987e-12],


In [205]:
def cluster_return(constituent_weights, df_cleaned):
    cluster_return = pd.DataFrame(index=None, columns=[f"cluster {i+1}" for i in range(len(constituent_weights))])

    for i in range(len(constituent_weights)):
        res = 0
        for elem in constituent_weights[i][1]:
            res += elem[1]*df_cleaned.loc[elem[0], :].values
        
        cluster_return[f"cluster {i+1}"] = res

    return cluster_return

Le choix des rendements attendus (expected_returns) dans le modèle de Markowitz peut être un défi car il nécessite des prévisions pour chaque actif inclus dans le portefeuille. 

In [206]:
## on récupère le dataframe contenant les return de chaque cluster

cluster_return = cluster_return(constituent_weights=constituent_weights, df_cleaned=df_cleaned) 

## on construit la matrice de corrélation associée à ces returns, c'est donc une matrice de corrélation de return de cluster

cov_matrix = cluster_return.corr(method='pearson')

## on construit le vecteur d'expected return du cluster 
expected_returns = cluster_return.mean(axis=0) ## on fait ici le choix de prendre le rendement moyen comme objectif

## !!! LA FENÊTRE DE 30 JOURS --> PAS 5530 !!

In [215]:
cluster_return

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8,cluster 9,cluster 10,cluster 11,cluster 12,cluster 13,cluster 14,cluster 15,cluster 16,cluster 17,cluster 18,cluster 19
0,-0.003916,-1.206012e-05,-1.149447e-09,-3.243262e-13,-2.390094e-10,-3.189495e-06,-0.000483,0.030415,-0.000108,-1.661727e-23,-2.569692e-10,-0.000038,-2.624204e-26,0.016385,0.000013,-0.000548,-0.001794,-0.002300,0.008031
1,-0.001972,-1.399414e-05,-2.917476e-09,-1.617835e-13,-5.815658e-11,-1.984405e-06,-0.000774,0.012295,-0.000561,-2.491607e-24,-2.543656e-10,-0.000038,-1.463054e-26,0.103831,-0.000027,0.000357,0.001052,-0.000637,0.002494
2,-0.001007,4.255347e-06,3.786536e-09,1.045364e-13,6.104471e-12,1.504349e-06,-0.000148,0.000669,0.000245,2.825542e-24,-4.453236e-12,0.000026,-1.125874e-27,-0.045667,0.000028,0.000154,0.003016,0.003460,0.011291
3,0.002157,-2.050375e-06,8.185370e-09,1.409293e-13,-5.518844e-11,4.162841e-07,0.000095,-0.014567,0.000039,1.816023e-23,1.955365e-10,0.000034,-4.172411e-27,-0.004408,0.000003,0.000283,-0.000328,0.005910,0.006265
4,-0.000066,1.363914e-05,5.578441e-09,1.335361e-13,1.392485e-10,7.047807e-07,0.000165,0.015102,-0.000101,1.027468e-23,6.217763e-11,0.000002,4.170944e-26,-0.021683,0.000038,0.000264,0.000852,0.007958,0.005879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,0.002229,3.877053e-06,2.564759e-09,1.313502e-13,8.888332e-11,2.165783e-06,0.000188,0.009063,0.000207,6.669544e-24,6.690253e-11,0.000025,1.085184e-26,0.025376,0.000021,0.000358,0.000295,0.004280,-0.001239
5527,0.001067,2.160810e-06,7.361260e-10,7.362058e-14,1.608172e-11,3.958965e-07,0.000055,-0.000709,0.000088,-2.632170e-24,1.662342e-11,0.000006,2.860437e-27,0.019329,0.000005,0.000090,0.000617,0.000925,-0.004077
5528,0.000324,2.151880e-06,8.753316e-10,3.604102e-14,2.534453e-11,4.047564e-07,0.000147,0.003484,0.000092,-3.863334e-24,6.721514e-12,0.000006,4.241108e-27,0.013394,0.000006,0.000106,0.000261,-0.000382,0.000179
5529,-0.000775,-3.217251e-07,6.353169e-11,-7.715083e-14,-2.817443e-11,-1.077624e-06,0.000259,-0.002904,-0.000096,-3.384603e-24,-1.615852e-11,-0.000003,-5.393119e-27,-0.022351,-0.000003,-0.000057,-0.000025,0.002409,0.010259


In [213]:
from pypfopt.efficient_frontier import EfficientFrontier

# Assuming risk_free_rate is the appropriate value for your analysis
risk_free_rate = 0.02

ef = EfficientFrontier(expected_returns, cov_matrix)
weights = ef.max_sharpe(risk_free_rate=risk_free_rate)

ValueError: at least one of the assets must have an expected return exceeding the risk-free rate