## Signed Laplacian Clustering 

We cluster using signed Laplacian Clustering

#### 1. Data Preparation

In [2]:
import pandas as pd 
import numpy as np 

# Put your own path 
df = pd.read_csv('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv')

print(df.columns.tolist())

['ticker', 'open', 'high', 'low', 'close', 'volume', 'OPCL', 'pvCLCL', 'prevAdjClose', 'SPpvCLCL', 'sharesOut', 'PERMNO', 'SICCD', 'PERMCO', 'prevRawOpen', 'prevRawClose', 'prevAdjOpen']


We first prepare the dataframe we will use to construct the portfolio: df_cleaned which contains the 5531 returns of 632 assets

In [3]:
import pandas as pd
import ast

# Function to safely convert a string into a list
def safe_literal_eval(s):
    try:
        # Tries to convert the string into a list
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # If an error occurs, returns a default value, e.g. an empty list
        return []

# Apply conversion function to 'open' and 'close' columns
df['open'] = df['open'].apply(safe_literal_eval)
df['close'] = df['close'].apply(safe_literal_eval)

# Calculate returns for each line
df['return'] = df.apply(lambda row: [(close - open) / open for open, close in zip(row['open'], row['close'])], axis=1)



In [4]:
# create a new data frame with the column ticker and return 
new_df = df[['ticker', 'return']] 

In [17]:
# Créons le DataFrame à partir des listes dans 'return'
# On suppose ici que 'new_df' est déjà défini et contient la colonne 'return'

# Convertir chaque liste dans la colonne 'return' en plusieurs colonnes dans le nouveau DataFrame
returns_df = pd.DataFrame(new_df['return'].tolist())

# Ajouter la colonne 'ticker' du 'new_df' au début de 'returns_df'
returns_df.insert(0, 'ticker', new_df['ticker'])

# Renommer les colonnes pour refléter qu'elles sont des rendements
returns_df.columns = ['ticker'] + [f'return_{i}' for i in range(len(returns_df.columns) - 1)]

def check_nan_inf(df):
    # Vérification des valeurs NaN
    if df.isna().any().any():
        print("There are NaN values in the dataframe")
    else:
        print("There are no NaN values in the dataframe")

def remove_rows_with_nan(df):
    return df.dropna()

df_cleaned = remove_rows_with_nan(returns_df)
df_cleaned.reset_index(drop=True, inplace=True)

check_nan_inf(df_cleaned)

df_cleaned.shape

There are no NaN values in the dataframe


(632, 5532)

#### 2. SPONGE Clustering

Now we get a clustering with SPONGE algorithm

In [10]:
import sys

sys.path.append('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project')  # Ajoute le chemin parent

from signet.cluster import Cluster 
from scipy import sparse
def signed_adjency(mat):
    '''
    L'idée est ici, à partir d'une matrice de corrélation mat, de renvoyer deux matrices 
    A_positive et A_negative qui correspondraient aux matrices des corrélations positives et négatives 
    associées  
    '''

    A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
    A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
    
    return A_pos, A_neg

def apply_signed_laplacian(correlation_matrix, k): 

    '''
    IDÉE : étant donné une matrice de correlation obtenue à partir d'une base de donnée et de la similarité de pearson, renvoyer un vecteur associant 
           à chaque actif le numéro du cluster auquel il appartient une fois qu'on lui a appliqué SPONGE (à partir du package signet)

    PARAMS : 

    - correlation_matrix : a square dataframe of size (number_of_stocks, number_of_stocks)
    - k : the number of clusters to identify. If a list is given, the output is a corresponding list

    RETURNS : array of int, or list of array of int: Output assignment to clusters.

    '''
    
    ## On respecte le format imposé par signet. Pour cela il faut changer le type des matrices A_pos et A_neg, qui ne peuvent pas rester des dataframes 

    A_pos, A_neg = signed_adjency(correlation_matrix)

    A_pos_sparse = sparse.csc_matrix(A_pos.values)
    A_neg_sparse = sparse.csc_matrix(A_neg.values)

    data = (A_pos_sparse, A_neg_sparse)

    cluster = Cluster(data)

    ## On applique la méthode SPONGE : clusters the graph using the Signed Positive Over Negative Generalised Eigenproblem (SPONGE) clustering.

    return cluster.spectral_cluster_laplacian(k)

We divide the timeframe in 28 periods of 200 days (the last one is 132 days only) and run SPONGE to make 28 clusterings

In [12]:
data_period=[]
result=[]
for i in range(28):
    start_col = i * 200 + 1  # Commence à partir de la deuxième colonne
    end_col = start_col + 200

    # Vérifie si la fin de la tranche dépasse le nombre de colonnes
    if end_col > len(df_cleaned.columns):
        end_col = len(df_cleaned.columns)  # Ajuste pour ne pas dépasser
    
    data_period.append(df_cleaned.iloc[:, start_col:end_col]) 

    returns_transposed = data_period[i].transpose()

    # Calculer la matrice de corrélation sur les actifs transposés
    correlation_matrix = returns_transposed.corr(method='pearson')
    result.append(apply_signed_laplacian(correlation_matrix, k=9))



  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = 

Now we use  hierarchical clustering to construct a clustering from the 28 clusterings we made

In [13]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Supposons que vous avez une liste de listes, où chaque sous-liste est un des 28 clusterings
# Chaque sous-liste contient 632 éléments avec des valeurs de 0 à 9
clusterings = result  # Remplacez ceci par vos données de clustering

# Convertir les résultats de clustering en une matrice de co-occurrence
n_data = 632
co_occurrence_matrix = np.zeros((n_data, n_data))

for clustering in clusterings:
    for i in range(n_data):
        for j in range(n_data):
            if clustering[i] == clustering[j]:
                co_occurrence_matrix[i, j] += 1

# Convertir en matrice de similarité
similarity_matrix = co_occurrence_matrix / len(clusterings)

# Convertir la matrice de similarité en matrice de distance
distance_matrix = 1 - similarity_matrix

# Convertir la matrice de distance en format condensé pour la fonction linkage
# car linkage attend une matrice de distance condensée pour les données non euclidiennes
tri_upper_idx = np.triu_indices(n_data, k=1)
distance_condensed = distance_matrix[tri_upper_idx]

# Effectuer le clustering hiérarchique
Z = linkage(squareform(distance_condensed), method='average')

# Former des clusters
# Vous pouvez ajuster le 't' pour obtenir le nombre de clusters souhaité
clusters = fcluster(Z, t=9, criterion='maxclust')
clusters=clusters-1
# Afficher les clusters
print(clusters)

[2 2 7 1 2 1 8 1 4 8 2 1 2 1 2 2 6 2 1 2 2 1 2 1 1 2 2 2 2 4 1 1 6 1 4 1 2
 1 8 1 4 8 6 1 1 3 8 2 1 2 1 1 1 2 7 1 1 1 6 7 1 2 6 2 1 2 5 8 0 3 1 1 1 7
 2 4 1 2 1 2 2 1 1 6 1 7 1 2 1 2 4 5 7 2 2 5 5 1 3 3 7 4 2 4 7 5 1 8 0 2 1
 4 2 2 1 7 4 8 6 1 5 4 1 5 1 1 2 6 2 1 8 2 0 8 2 1 2 1 1 3 2 1 3 1 1 3 1 2
 6 2 4 0 3 8 8 4 1 2 2 1 8 1 6 8 2 2 2 1 1 5 4 1 6 6 4 1 6 1 8 3 1 1 1 1 1
 1 2 5 5 5 5 4 5 2 1 8 1 1 1 2 2 1 6 6 5 3 2 1 1 2 1 2 2 1 2 1 1 1 2 5 2 3
 7 1 1 1 2 2 1 3 1 1 5 4 1 6 1 8 5 2 3 6 3 4 1 2 1 1 4 1 1 6 1 7 1 7 2 5 1
 3 1 8 1 1 2 2 1 1 2 4 1 2 0 2 1 1 5 1 1 7 1 2 2 2 1 7 2 1 2 5 6 7 1 1 7 2
 5 6 0 2 0 5 1 2 0 5 2 2 7 2 1 2 8 1 1 6 4 1 6 1 6 6 1 1 1 1 2 0 2 2 3 1 2
 2 2 2 2 0 0 2 3 0 1 0 0 0 7 1 2 1 1 3 0 7 1 0 0 0 7 4 3 1 1 2 1 1 1 1 1 0
 0 0 4 0 0 1 0 0 1 0 0 0 0 0 4 2 0 1 4 2 6 8 8 2 2 6 6 2 1 2 4 1 3 1 5 0 1
 1 1 0 2 6 5 1 5 8 6 1 0 4 2 1 1 2 2 4 1 3 8 8 6 4 7 3 2 3 7 2 1 1 2 2 1 3
 5 1 1 6 0 0 2 8 1 8 1 8 3 6 6 2 1 4 1 2 2 3 2 2 6 2 1 2 2 4 2 1 2 2 2 1 1
 1 1 1 4 1 2 1 1 2 2 2 4 

  Z = linkage(squareform(distance_condensed), method='average')


In [37]:
def synthesis(data, number_of_clusters):

    '''
    ----------------------------------------------------------------
    PARAMETERS : 
    ----------------------------------------------------------------
    1. data : numpy array of lenght number of stocks. As returned by the 
              previous cell
    2. number_of_clusters : integer corresponding to the number of clusters
    ----------------------------------------------------------------
    '''

    synthesis = pd.DataFrame(index=['Stock Composition', 'Cluster Size', 'Cluster Sharpe Ratio'], columns=[f'Cluster {i}' for i in list(range(number_of_stocks))])

    for i in range(len(data)):
        if len(synthesis.iloc[0, data[i]]) == 0:
            synthesis.iloc[0, data[i]] = []
        else:
            synthesis.iloc[0, data[i]].append(data.index[i])
    
    for i in range(len(number_of_clusters)):
        synthesis.iloc[1, i] = len(synthesis.iloc[0, data[i]])
    
    return synthesis

In [38]:
synthesis(cluster, 9)

NameError: name 'number_of_stocks' is not defined

In [31]:
cluster_df

ticker,AA,ABM,ABT,ADI,ADM,ADX,AEE,AEG,AEM,AEP,...,XLI,XLK,XLP,XLU,XLV,XLY,XOM,XRX,YUM,ZTR
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA,,,,,,,,,,,...,,,,,,,,,,
ABM,,,,,,,,,,,...,,,,,,,,,,
ABT,,,,,,,,,,,...,,,,,,,,,,
ADI,,,,,,,,,,,...,,,,,,,,,,
ADM,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XLY,,,,,,,,,,,...,,,,,,,,,,
XOM,,,,,,,,,,,...,,,,,,,,,,
XRX,,,,,,,,,,,...,,,,,,,,,,
YUM,,,,,,,,,,,...,,,,,,,,,,


#### 3. Markowitz 

Now we want to run markowitz on our portfolio, we need to get the returns and the covariance matrix of the returns of each asset, but before we need to get the composition of each cluster which is a combination of multiple assets. 

In [14]:
cluster_comp=[[],[],[],[],[],[],[],[],[]]
cluster_comp2=[]
for c in range(632):
    cluster_comp[clusters[c]].append(df_cleaned.iloc[c, 0])
for cluster in cluster_comp:
    cluster_indices = [df_cleaned.index[df_cleaned['ticker'] == element].tolist() for element in cluster]
    cluster_comp2.append([index for sublist in cluster_indices for index in sublist])



print(cluster_comp2[3])

[45, 69, 98, 99, 139, 142, 145, 152, 179, 205, 221, 229, 240, 242, 259, 330, 340, 351, 360, 402, 427, 433, 435, 443, 456, 465, 515, 541, 585, 597]


In [15]:
# Fonction pour calculer la distance gaussienne
def gaussian_distance(x, y, sigma):
    return np.exp(-np.linalg.norm(x - y)**2 / (2 * sigma**2))


# Paramètre sigma pour la distance gaussienne
sigma = 1.0  # vous pouvez ajuster cette valeur

# Calculer les poids pour chaque élément dans chaque cluster
cluster_weights = []
for cluster in cluster_comp2:
    cluster_data = df_without_first_column.iloc[cluster, :]
    cluster_center = cluster_data.mean()  # Calcul du centre du cluster
    if isinstance(cluster_data, pd.Series):
        cluster_data = pd.DataFrame([cluster_data])
    weights = np.array([np.exp(- 2*(gaussian_distance(row, cluster_center, sigma)**2)) for _, row in cluster_data.iterrows()])
    weights_normalized = weights / weights.sum()
    cluster_weights.append(weights_normalized)

NameError: name 'df_without_first_column' is not defined