## Signed Laplacian Clustering 

We cluster using signed Laplacian Clustering

#### 1. Data Preparation

In [152]:
import pandas as pd 
import numpy as np 

# Put your own path 
df = pd.read_csv('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv')

print(df.columns.tolist())

['ticker', 'open', 'high', 'low', 'close', 'volume', 'OPCL', 'pvCLCL', 'prevAdjClose', 'SPpvCLCL', 'sharesOut', 'PERMNO', 'SICCD', 'PERMCO', 'prevRawOpen', 'prevRawClose', 'prevAdjOpen']


We first prepare the dataframe we will use to construct the portfolio: df_cleaned which contains the 5531 returns of 632 assets

In [153]:
import pandas as pd
import ast

# Function to safely convert a string into a list
def safe_literal_eval(s):
    try:
        # Tries to convert the string into a list
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # If an error occurs, returns a default value, e.g. an empty list
        return []

# Apply conversion function to 'open' and 'close' columns
df['open'] = df['open'].apply(safe_literal_eval)
df['close'] = df['close'].apply(safe_literal_eval)

# Calculate returns for each line
df['return'] = df.apply(lambda row: [(close - open) / open for open, close in zip(row['open'], row['close'])], axis=1)



In [154]:
# create a new data frame with the column ticker and return 
new_df = df[['ticker', 'return']] 

In [155]:
# Créons le DataFrame à partir des listes dans 'return'
# On suppose ici que 'new_df' est déjà défini et contient la colonne 'return'

# Convertir chaque liste dans la colonne 'return' en plusieurs colonnes dans le nouveau DataFrame
returns_df = pd.DataFrame(new_df['return'].tolist())

# Ajouter la colonne 'ticker' du 'new_df' au début de 'returns_df'
returns_df.insert(0, 'ticker', new_df['ticker'])

# Renommer les colonnes pour refléter qu'elles sont des rendements
returns_df.columns = ['ticker'] + [f'return_{i}' for i in range(len(returns_df.columns) - 1)]

def check_nan_inf(df):
    # Vérification des valeurs NaN
    if df.isna().any().any():
        print("There are NaN values in the dataframe")
    else:
        print("There are no NaN values in the dataframe")

def remove_rows_with_nan(df):
    return df.dropna()

df_cleaned = remove_rows_with_nan(returns_df)
df_cleaned.reset_index(drop=True, inplace=True)

check_nan_inf(df_cleaned)

df_cleaned.shape

There are no NaN values in the dataframe


(632, 5532)

#### 2. Signed Laplacian Clustering

Now we get a clustering with Signed Laplacian algorithm

In [156]:
import sys

sys.path.append('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project')  # Ajoute le chemin parent

from signet.cluster import Cluster 
from scipy import sparse
def signed_adjency(mat):
    '''
    L'idée est ici, à partir d'une matrice de corrélation mat, de renvoyer deux matrices 
    A_positive et A_negative qui correspondraient aux matrices des corrélations positives et négatives 
    associées  
    '''

    A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
    A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
    
    return A_pos, A_neg

def apply_signed_laplacian(correlation_matrix, k): 

    '''
    IDÉE : étant donné une matrice de correlation obtenue à partir d'une base de donnée et de la similarité de pearson, renvoyer un vecteur associant 
           à chaque actif le numéro du cluster auquel il appartient une fois qu'on lui a appliqué SPONGE (à partir du package signet)

    PARAMS : 

    - correlation_matrix : a square dataframe of size (number_of_stocks, number_of_stocks)
    - k : the number of clusters to identify. If a list is given, the output is a corresponding list

    RETURNS : array of int, or list of array of int: Output assignment to clusters.

    '''
    
    ## On respecte le format imposé par signet. Pour cela il faut changer le type des matrices A_pos et A_neg, qui ne peuvent pas rester des dataframes 

    A_pos, A_neg = signed_adjency(correlation_matrix)

    A_pos_sparse = sparse.csc_matrix(A_pos.values)
    A_neg_sparse = sparse.csc_matrix(A_neg.values)

    data = (A_pos_sparse, A_neg_sparse)

    cluster = Cluster(data)

    ## On applique la méthode SPONGE : clusters the graph using the Signed Positive Over Negative Generalised Eigenproblem (SPONGE) clustering.

    return cluster.spectral_cluster_laplacian(k)

In [157]:
start_col = 1 * 200 + 1  # Commence à partir de la deuxième colonne
end_col = start_col + 200

# Vérifie si la fin de la tranche dépasse le nombre de colonnes
if end_col > len(df_cleaned.columns):
    end_col = len(df_cleaned.columns)  # Ajuste pour ne pas dépasser
    
data_period.append(df_cleaned.iloc[:, start_col:end_col]) 

returns_transposed = data_period[1].transpose()
correlation_matrix = returns_transposed.corr(method='pearson')
r = apply_signed_laplacian(correlation_matrix, 20)
r

  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)


array([ 6, 19,  8,  0,  6, 14, 17, 11, 16, 17, 15, 15,  2,  2,  1, 11,  1,
        2,  6, 19,  2, 19,  0, 19, 19, 19, 19, 19,  1,  4,  6, 11, 19, 11,
       16, 11, 19,  6,  6,  1, 16,  2, 14,  0,  1,  3,  6, 12, 14, 18, 14,
       19,  6, 19, 18,  0, 19, 19, 14,  2,  1,  8, 14,  0, 19,  1, 14,  2,
       12,  7, 14,  6,  1, 18,  1,  7, 19,  1,  1, 11, 18,  1,  1,  6, 11,
        8,  1,  2,  1, 12, 16, 14,  6, 14, 19,  2, 11,  0, 10, 10, 18,  6,
       15,  0,  2,  1, 14, 17, 10,  6,  1, 15, 19, 15,  6,  8, 15, 11, 14,
        6, 19,  4,  1,  6,  1,  1, 11,  1, 18, 11,  6, 11, 10, 17, 11, 19,
        0, 11,  6, 10,  1,  1, 10, 11,  1, 15,  1, 19, 14,  1, 15,  3,  0,
       17, 17,  4, 19,  6, 11,  2, 17,  6, 14,  7,  6, 14, 11,  6,  6, 14,
        4,  6, 10,  6,  2, 19, 14,  1, 17,  0, 14, 11, 11, 19, 14,  1, 15,
       14, 19, 19,  5,  6, 11, 19, 14, 17,  0, 14,  1, 11, 14,  1, 14, 19,
       19, 13, 14, 11, 19, 19, 11, 14,  6, 14, 14, 11, 11,  1, 11,  0,  7,
        3, 18,  1, 11, 14

We divide the timeframe in 28 periods of 200 days (the last one is 132 days only) and run Signed Laplacian to make 28 clusterings

In [158]:
data_period=[]
result=[]
for i in range(28):
    start_col = i * 200 + 1  # Commence à partir de la deuxième colonne
    end_col = start_col + 200

    # Vérifie si la fin de la tranche dépasse le nombre de colonnes
    if end_col > len(df_cleaned.columns):
        end_col = len(df_cleaned.columns)  # Ajuste pour ne pas dépasser
    
    data_period.append(df_cleaned.iloc[:, start_col:end_col]) 

    returns_transposed = data_period[i].transpose()

    # Calculer la matrice de corrélation sur les actifs transposés
    correlation_matrix = returns_transposed.corr(method='pearson')

    # On prend un nombre de cluster égal à k=9
    result.append(apply_signed_laplacian(correlation_matrix, k=9))



  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = 

#### Side questions : are clusters stable over time ? 

##### 1. Cluster composition

In [159]:
## number of clusters 
k = 9

def cluster_composition(k):
    ## On récupère la liste de tous les tickers
    tickers = df_cleaned['ticker']

    ## On prépare les noms de colonne avant d'initialiser le dataframe
    nb_clustering = [f'Clustering_{i+1}' for i in range(28)]
    nb_cluster = [f'Cluster_{j+1}' for j in range(9)]

    cluster_composition = pd.DataFrame(index= nb_cluster, columns= nb_clustering)
    ## on parcourt l'ensemble de tous les clusterings 

    for i in range(28):

        ## on récupère le résultat du clustering i

        clustering = result[i]
        nb_clustering = f'clustering {i}'
        
        for j in range(k):
            ## on récupère les tickers des stocks qui sont dans le cluster
            ## numéro j 
            nb_cluster = f'cluster {j}'
            index = list(np.where(clustering == j)[0])
            cluster_composition.iloc[j][i] = list(tickers[index])

    return cluster_composition

cluster_composition = cluster_composition(k)
cluster_composition.head(3)

  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[index])
  cluster_composition.iloc[j][i] = list(tickers[

Unnamed: 0,Clustering_1,Clustering_2,Clustering_3,Clustering_4,Clustering_5,Clustering_6,Clustering_7,Clustering_8,Clustering_9,Clustering_10,...,Clustering_19,Clustering_20,Clustering_21,Clustering_22,Clustering_23,Clustering_24,Clustering_25,Clustering_26,Clustering_27,Clustering_28
Cluster_1,"[AEG, AIG, AJG, ALL, APD, ASG, ATR, AVY, AXP, ...","[AEM, ASA, AU, AXL, BKN, BKT, CCK, CDE, CIA, E...","[ABM, AFG, AFL, AIR, ALV, APD, ASG, ATR, AWF, ...","[CIF, CPE, CRY, DDF, DHY, EVF, IIF, LGF, MCR, ...","[AEM, ASA, AU, CDE, HL, NEM]","[CMU, FUN, KSM, MGF, MHF, MIN, NCA, NXP, PFD, ...","[AWF, CIF, DDF, DHF, DHY, DSM, DSU, EVF, HIO, ...","[ABM, ABT, ADI, AFG, AIN, ALB, AME, AN, ANF, A...","[ABT, AFG, AIV, ARE, AVA, AVB, AWF, AWR, BDN, ...","[ABM, ABT, ADM, ALV, ARW, ATO, AU, AVT, AVY, B...",...,"[ALB, AMD, APA, BHE, BRC, CIK, CLB, CLF, CLS, ...","[ALK, DRI, LUV, MMU]","[ABT, AFL, AJG, ALL, AMT, AOS, ARE, ASGN, ATR,...","[AEG, AFG, AIG, AIN, AIR, ALK, AN, ANF, AXL, B...","[AIV, AVB, CLX, CPK, CPT, DNP, EIX, ESS, FE, H...","[AEE, AEP, ATO, BKH, CMS, D, DTE, DUK, ED, ETR...","[ABM, ADX, AES, AFG, AIG, ALL, APH, ASG, AVY, ...","[ABT, ADM, AEE, AEP, ALB, AME, AMG, AOS, APD, ...","[ADM, ADX, AFG, AFL, AIN, AIV, AME, AMG, AN, A...","[ADM, AFG, AFL, AIG, AIV, AJG, AME, AOS, APD, ..."
Cluster_2,"[AU, BFS, BKN, BKT, CDE, CIF, CLB, CNX, CRY, D...","[AES, AFG, AIN, AIV, AOS, ATR, AVB, AVY, BEN, ...","[ABT, ADM, BAX, BDX, BRK, BTI, CAG, CL, CLX, C...","[BKN, BKT, MCA, MFL, MIY, MMU, MQT, MVF, MYC, ...","[APA, BPT, CNX, CRK, DRQ, DVN, EOG, EQT, HAL, ...","[AA, AEG, AIG, AIR, AJG, ALL, ALV, AME, AMG, A...","[AEG, AEP, AFG, AIV, AJG, ALL, AN, ANF, ARE, A...","[AFL, AIR, AJG, ALL, ASGN, AXL, BAX, BK, BMY, ...","[AEM, APA, ASA, BMO, BPT, CDE, CLB, CNX, CPE, ...","[ASH, BPT, CI, CR, CRY, DLX, FDP, FDS, GIB, HR...",...,"[AES, ATO, BF, BMY, CAG, CB, CHD, CI, CLX, CPK...","[ABM, ADI, AEG, AIG, AIN, AIR, AMD, AME, AMG, ...","[APA, ASA, BAC, BBY, BC, BK, BWA, CDE, CFR, CI...","[BDX, BKN, CMU, CXE, DSM, IQI, LEO, MCA, MFL, ...","[ADX, AIN, AIR, ALB, ALK, ALV, AME, AMG, AOS, ...","[ADX, AIG, AME, ASH, AVT, AWF, AZO, B, BA, BCS...","[AIN, ALB, ALV, AMG, ANF, APA, AU, AXL, BBY, B...","[AIN, AIR, AN, ASA, AWF, BKT, BYD, CHH, CIA, C...","[AA, ALB, ANF, APA, CLF, CRY, DVN, EPR, FCX, G...","[AA, APA, AWF, BKN, BYD, CLB, CLF, CPE, CRK, D..."
Cluster_3,"[ABM, ADI, AIR, AMD, ANF, AOS, APA, AVA, AVB, ...","[CIF, CIK, CMU, FT, HEI, KTF, MFL, MFM, MGF, M...","[AES, AMD, AMT, APH, AZO, BHP, BSX, CIEN, CLS,...","[AA, ABM, ADI, ADX, AFG, AIG, AIN, AIR, ALB, A...","[BKN, BKT, CMU, DSM, HYB, IQI, KSM, KTF, MCA, ...","[AWF, BKN, BKT, BPT, CIF, CIK, DHF, DHY, DSM, ...","[AEM, APA, ASA, AU, CDE, CLB, CNX, CPE, CRK, D...","[AA, AEE, AEM, AEP, AMD, AMG, AMT, ASA, ASG, A...","[AEE, AEP, ATO, BKH, BRK, CIA, CMS, CNA, CXE, ...","[AWF, BKT, CIF, CMU, FT, GIM, HIX, HYB, MFM, M...",...,"[AEE, AEP, CMS, DTE, DUK, ED, EIX, ETR, FE, HE...","[ABT, AEM, ALL, ASA, ATO, AWR, BFS, BKH, BKN, ...","[AEE, AEP, ATO, AVA, AWR, CLX, CMS, CPB, CPK, ...","[ADX, AFL, AJG, ALL, ALV, APD, APH, ARW, ASG, ...","[AMD, APA, AXL, BAC, BHP, CMC, DDS, DHF, DLX, ...","[ADI, AIR, AMD, ASG, AXL, BBY, BHE, BKE, BTO, ...","[AA, AN, ARW, ASA, BAC, BKT, BTO, BYD, CAT, CD...","[MFM, MUA, MYC, TEO, UNFI]","[ABT, AJG, AMT, ARE, ATO, BAX, BDX, BF, BLL, C...","[ADX, AEG, AIN, AIR, AN, ASGN, AU, BAC, BBY, B..."


#### 2. Cluster reallocation

Dans un premier temps, on s'intéresse à l'évolution de la taille des clusters

In [160]:

size = []
for i in range(len(cluster_composition.index)): ## cluster_composition.index = number of clusters
    
    size_i = []
    
    for j in range(28):
        
        size_i.append(len(cluster_composition.iloc[i][j]))

    size.append(size_i)

size

  size_i.append(len(cluster_composition.iloc[i][j]))


[[83,
  29,
  94,
  19,
  6,
  12,
  43,
  196,
  80,
  109,
  234,
  154,
  102,
  32,
  399,
  66,
  139,
  94,
  70,
  4,
  150,
  114,
  21,
  23,
  98,
  108,
  173,
  95],
 [24,
  145,
  48,
  13,
  28,
  195,
  115,
  73,
  52,
  26,
  16,
  53,
  54,
  465,
  149,
  37,
  113,
  6,
  45,
  224,
  50,
  32,
  219,
  144,
  59,
  70,
  40,
  53],
 [81,
  25,
  54,
  256,
  31,
  48,
  41,
  114,
  47,
  23,
  37,
  4,
  16,
  100,
  1,
  122,
  4,
  51,
  16,
  73,
  36,
  155,
  49,
  76,
  80,
  5,
  52,
  109],
 [94,
  97,
  50,
  167,
  189,
  2,
  11,
  29,
  7,
  20,
  1,
  19,
  32,
  1,
  15,
  18,
  22,
  146,
  8,
  6,
  102,
  97,
  21,
  56,
  60,
  182,
  122,
  30],
 [42,
  145,
  219,
  6,
  34,
  45,
  173,
  33,
  2,
  165,
  121,
  12,
  252,
  1,
  39,
  34,
  5,
  36,
  32,
  63,
  6,
  49,
  8,
  15,
  15,
  27,
  16,
  101],
 [65,
  37,
  50,
  47,
  89,
  21,
  70,
  25,
  6,
  29,
  64,
  248,
  150,
  25,
  2,
  128,
  234,
  1,
  44,
  14,
  18,
  30,
  

#### 3. How to construct a clustering from the clusterings ? 

Now we use  hierarchical clustering to construct a clustering from the 28 clusterings we made

In [161]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Supposons que vous avez une liste de listes, où chaque sous-liste est un des 28 clusterings
# Chaque sous-liste contient 632 éléments avec des valeurs de 0 à 9
clusterings = result  # Remplacez ceci par vos données de clustering

# Convertir les résultats de clustering en une matrice de co-occurrence
n_data = 632
co_occurrence_matrix = np.zeros((n_data, n_data))

for clustering in clusterings:
    for i in range(n_data):
        for j in range(n_data):
            if clustering[i] == clustering[j]:
                co_occurrence_matrix[i, j] += 1

# Convertir en matrice de similarité
similarity_matrix = co_occurrence_matrix / len(clusterings)

# Convertir la matrice de similarité en matrice de distance
distance_matrix = 1 - similarity_matrix

# Convertir la matrice de distance en format condensé pour la fonction linkage
# car linkage attend une matrice de distance condensée pour les données non euclidiennes
tri_upper_idx = np.triu_indices(n_data, k=1)
distance_condensed = distance_matrix[tri_upper_idx]

# Effectuer le clustering hiérarchique
Z = linkage(squareform(distance_condensed), method='average')

# Former des clusters
# Vous pouvez ajuster le 't' pour obtenir le nombre de clusters souhaité
clusters = fcluster(Z, t=9, criterion='maxclust')
clusters=clusters-1
# Afficher les clusters
print(clusters)

[5 2 8 1 3 1 7 1 5 7 3 1 3 1 2 2 6 3 1 1 3 1 2 1 1 3 1 2 1 5 1 1 6 1 5 1 3
 1 7 1 5 6 6 1 1 4 6 1 3 3 1 1 1 1 3 1 1 1 6 8 1 3 6 2 1 1 2 6 0 4 1 1 1 8
 2 5 1 3 1 3 3 1 1 6 1 8 1 3 1 3 5 2 8 1 1 3 2 1 4 4 8 5 5 2 8 2 1 7 0 1 1
 5 1 3 1 8 5 2 6 1 2 5 1 2 1 1 2 6 8 1 6 2 0 7 1 1 3 1 1 4 1 1 4 1 1 6 1 3
 6 3 5 0 4 7 7 5 1 3 2 1 7 1 6 7 1 2 1 1 1 2 5 3 6 6 5 1 6 1 7 4 1 1 1 1 1
 1 3 2 2 2 3 5 2 3 1 7 3 1 1 1 3 1 6 6 2 4 2 1 1 2 3 1 3 3 2 1 1 1 3 2 2 4
 8 2 1 1 2 1 1 6 1 1 2 5 1 6 1 7 2 3 4 6 4 5 1 3 1 1 5 1 1 6 3 8 1 8 3 3 1
 4 3 7 1 1 2 1 2 1 2 2 1 1 0 3 1 1 2 2 1 8 1 1 1 3 1 8 2 1 1 2 6 8 1 1 8 1
 3 6 0 2 0 2 1 2 0 2 3 3 8 3 2 2 7 3 1 6 2 1 6 1 6 6 3 1 1 1 2 0 3 3 4 2 3
 2 3 3 2 0 0 2 4 0 1 0 4 0 8 1 3 1 3 4 0 8 1 0 0 0 8 5 4 1 1 2 1 1 1 2 2 0
 0 0 5 0 0 1 0 0 1 0 0 0 0 0 5 2 0 1 5 3 6 7 6 3 2 6 6 3 1 2 2 1 4 1 2 0 3
 1 1 0 2 6 2 2 2 7 6 3 0 5 3 1 1 1 1 5 1 4 7 7 6 5 8 4 8 4 8 3 1 1 2 1 1 4
 2 1 1 6 0 0 2 6 1 7 1 7 4 6 6 1 1 5 1 2 1 4 2 3 6 2 3 3 3 5 1 1 3 3 3 2 1
 1 1 1 5 1 3 1 1 3 1 2 5 

  Z = linkage(squareform(distance_condensed), method='average')


#### 3. Markowitz 

Now we want to run markowitz on our portfolio, we need to get the returns and the covariance matrix of the returns of each asset, but before we need to get the composition of each cluster which is a combination of multiple assets. 

In [162]:
cluster_comp=[[],[],[],[],[],[],[],[],[]]
cluster_comp2=[]
for c in range(632):
    cluster_comp[clusters[c]].append(df_cleaned.iloc[c, 0])
for cluster in cluster_comp:
    cluster_indices = [df_cleaned.index[df_cleaned['ticker'] == element].tolist() for element in cluster]
    cluster_comp2.append([index for sublist in cluster_indices for index in sublist])


print(cluster_comp2[3])

[4, 10, 12, 17, 20, 25, 36, 48, 49, 54, 61, 77, 79, 80, 87, 89, 95, 113, 136, 147, 149, 157, 171, 186, 190, 193, 196, 200, 210, 212, 213, 218, 239, 245, 252, 256, 257, 260, 273, 283, 296, 306, 307, 309, 313, 322, 328, 329, 332, 334, 335, 348, 350, 389, 393, 397, 406, 417, 420, 437, 467, 470, 471, 472, 476, 477, 478, 486, 489, 499, 513, 520, 521, 533, 534, 536, 546, 547, 556, 558, 560, 568, 572, 575, 577, 580, 584, 586, 599, 604, 608, 609, 614, 621, 626, 631]


In [None]:
df_cleaned.set_index('ticker', inplace=True)

Now we compute the intracluster weights with gaussianweight

In [166]:
# Fonction pour calculer la distance gaussienne
def gaussian_distance(x, y, sigma):
    return np.exp(-np.linalg.norm(x - y)**2 / (2 * sigma**2))


# Paramètre sigma pour la distance gaussienne
sigma = 1.0  # vous pouvez ajuster cette valeur

# Calculer les poids pour chaque élément dans chaque cluster
cluster_weights = []
for cluster in cluster_comp2:
    cluster_data = df_cleaned.iloc[cluster, :]
    cluster_center = cluster_data.mean()  # Calcul du centre du cluster
    if isinstance(cluster_data, pd.Series):
        cluster_data = pd.DataFrame([cluster_data])
    weights = np.array([np.exp(- 2*(gaussian_distance(row, cluster_center, sigma)**2)) for _, row in cluster_data.iterrows()])
    weights_normalized = weights / weights.sum()
    cluster_weights.append(weights_normalized)

In [168]:
normalized_weights=[0 for i in range(632)]
for i in range(len(cluster_comp2)):
    for j in range(len(cluster_comp2[i])):
        normalized_weights[cluster_comp2[i][j]]=cluster_weights[i][j]

print(normalized_weights)

[0.023397657825850907, 0.007706912968416397, 0.04431442310549088, 0.005733104223692547, 0.009441116003059618, 0.0016768985714301695, 0.02907663179743173, 0.0030563187573704243, 0.028356236825043346, 0.036056583034114444, 0.017706016948367216, 0.004021638766578369, 0.010597643154004353, 0.007300834857718667, 0.00836896905655906, 0.011459576841553522, 0.018953548764079634, 0.00805186990039688, 0.004437274844217215, 0.006372843181230404, 0.00812365962220269, 0.0031343721346366645, 0.012299551202259664, 0.0030376770467317204, 0.004827488489302527, 0.017058574889101844, 0.005478840948636359, 0.0113128670456228, 0.00425846032406641, 0.018928238761430273, 0.0029656431748892828, 0.00427626305852607, 0.012375405315224028, 0.004602244309892746, 0.02392006137787838, 0.0033499223030838358, 0.018071205943029328, 0.004135078329705592, 0.04548992158239447, 0.0032065015825376983, 0.027797129419823322, 0.027852479185590192, 0.011611561695103648, 0.005053285983584417, 0.0030580559495037374, 0.0272245063

In [169]:
return_clusters = pd.DataFrame(columns=df_cleaned.columns[1:])

# Remplissage du nouveau DataFrame avec les moyennes des colonnes pour chaque cluster
for k in range(9):
    # Sélection des lignes appartenant au cluster k
    rows_in_cluster = df_cleaned.iloc[cluster_comp2[k]]

    # Calcul de la moyenne pour chaque colonne, en excluant la première colonne
    mean_values = (rows_in_cluster.iloc[:, 1:].mul(cluster_weights[k], axis=0)).sum()

    # Ajout des moyennes calculées au nouveau DataFrame
    return_clusters.loc[k] = mean_values

print(return_clusters.shape)
returns_transposed = return_clusters.transpose()

# Calculer la matrice de corrélation sur les actifs transposés
correlation_matrix = returns_transposed.cov()*np.sqrt(252)


annual_expected_returns=(return_clusters + 1).prod(axis=1)-1

# Afficher la matrice de corrélation
print(correlation_matrix)
print(annual_expected_returns)

(9, 5530)
          0         1         2         3         4         5         6  \
0  0.000459  0.000248  0.000262  0.000213  0.000342  0.000296  0.000296   
1  0.000248  0.002392  0.002543  0.001746  0.000650  0.002171  0.002029   
2  0.000262  0.002543  0.003001  0.001899  0.000702  0.002333  0.002298   
3  0.000213  0.001746  0.001899  0.001463  0.000512  0.001561  0.001611   
4  0.000342  0.000650  0.000702  0.000512  0.000667  0.000705  0.000686   
5  0.000296  0.002171  0.002333  0.001561  0.000705  0.004476  0.001786   
6  0.000296  0.002029  0.002298  0.001611  0.000686  0.001786  0.003010   
7  0.000196  0.000991  0.001007  0.000942  0.000352  0.001030  0.001249   
8  0.000109  0.000869  0.000872  0.000820  0.000224  0.000691  0.000852   

          7         8  
0  0.000196  0.000109  
1  0.000991  0.000869  
2  0.001007  0.000872  
3  0.000942  0.000820  
4  0.000352  0.000224  
5  0.001030  0.000691  
6  0.001249  0.000852  
7  0.001880  0.000787  
8  0.000787  0.000975  

In [171]:
from module1 import markowitz

weights=markowitz(annual_expected_returns, correlation_matrix)

print(weights)

OrderedDict([(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.34688), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.19965), (8, 0.45347)])


Issue --> put a 0 weight on 6 clusters...

In [185]:
weights

OrderedDict([(0, 0.0),
             (1, 0.0),
             (2, 0.0),
             (3, 0.34688),
             (4, 0.0),
             (5, 0.0),
             (6, 0.0),
             (7, 0.19965),
             (8, 0.45347)])

In [176]:
from module1 import portfolio_pnl_sharpe
overall_pnl_performance = portfolio_pnl_sharpe(returns_transposed, weights, risk_free_rate=0.03)

In [182]:
pnl_evolution = [overall_pnl_performance[0][i] for i in range(len(overall_pnl_performance[0]))]

  pnl_evolution = [overall_pnl_performance[0][i] for i in range(len(overall_pnl_performance[0]))]


In [183]:
pip install -q plotly

Note: you may need to restart the kernel to use updated packages.


In [184]:
import plotly.graph_objects as go

indices = list(range(len(pnl_evolution)))

# Création du graphique
fig = go.Figure()

# Ajout de la courbe
fig.add_trace(go.Scatter(x=indices, y=pnl_evolution, mode='lines+markers', name='Évolution du PNL', line=dict(width=1)))

# Ajout des annotations au survol de la souris
fig.update_layout(
    title='PNL',
    xaxis_title='Time (in days)',
    yaxis_title='PNL ',
    hovermode='x',
    hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell")
)

fig.update_traces(
    hoverinfo='text',
    text=[f"Day {i}: PNL = {pnl}" for i, pnl in enumerate(pnl_evolution)]
)

# Affichage du graphique interactif
fig.show()