## Signed Positive Over Negative Generalized Eigenproblem (SPONGE)

We cluster using signed Signed Positive Over Negative Generalized Eigenproblem

#### 1. Data Preparation

In [5]:
import pandas as pd
import numpy as np

# Put your own path 
# Nail path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
# Jerome path : 'C:\Users\33640\OneDrive\Documents\GitHub\Portfolio_clustering_project\Data\DATA_Statapp.csv'
df = pd.read_csv(r'/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv')

print(df.columns.tolist())

['ticker', 'open', 'high', 'low', 'close', 'volume', 'OPCL', 'pvCLCL', 'prevAdjClose', 'SPpvCLCL', 'sharesOut', 'PERMNO', 'SICCD', 'PERMCO', 'prevRawOpen', 'prevRawClose', 'prevAdjOpen']


We first prepare the dataframe we will use to construct the portfolio: df_cleaned which contains the 5531 returns of 632 assets

In [6]:
import pandas as pd
import ast

# Function to safely convert a string into a list
def safe_literal_eval(s):
    try:
        # Tries to convert the string into a list
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # If an error occurs, returns a default value, e.g. an empty list
        return []

# Apply conversion function to 'open' and 'close' columns
df['open'] = df['open'].apply(safe_literal_eval)
df['close'] = df['close'].apply(safe_literal_eval)

# Calculate returns for each line
df['return'] = df.apply(lambda row: [(close - open) / open for open, close in zip(row['open'], row['close'])], axis=1)



In [7]:
new_df = df[['ticker', 'return']] # create a new data frame with the column ticker and return 

In [8]:
# Créons le DataFrame à partir des listes dans 'return'
# On suppose ici que 'new_df' est déjà défini et contient la colonne 'return'

# Convertir chaque liste dans la colonne 'return' en plusieurs colonnes dans le nouveau DataFrame
returns_df = pd.DataFrame(new_df['return'].tolist())

# Ajouter la colonne 'ticker' du 'new_df' au début de 'returns_df'
returns_df.insert(0, 'ticker', new_df['ticker'])

# Renommer les colonnes pour refléter qu'elles sont des rendements
returns_df.columns = ['ticker'] + [f'return_{i}' for i in range(len(returns_df.columns) - 1)]

def check_nan_inf(df):
    # Vérification des valeurs NaN
    if df.isna().any().any():
        print("There are NaN values in the dataframe")
    else:
        print("There are no NaN values in the dataframe")

def remove_rows_with_nan(df):
    return df.dropna()

df_cleaned = remove_rows_with_nan(returns_df)
df_cleaned.reset_index(drop=True, inplace=True)

check_nan_inf(df_cleaned)

df_cleaned.shape

There are no NaN values in the dataframe


(632, 5532)

Now we get a clustering with SPONGE algorithm

In [10]:
import sys

## path Nail : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project'
## path Jerome : 'C:/Users/33640/OneDrive/Documents/GitHub/Portfolio_clustering_project'
sys.path.append('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project')  # Ajoute le chemin parent

from signet.cluster import Cluster 
from scipy import sparse
def signed_adjency(mat):
    '''
    L'idée est ici, à partir d'une matrice de corrélation mat, de renvoyer deux matrices 
    A_positive et A_negative qui correspondraient aux matrices des corrélations positives et négatives 
    associées  
    '''

    A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
    A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
    
    return A_pos, A_neg
def apply_SPONGE(correlation_matrix, k): 

    '''
    IDÉE : étant donné une matrice de correlation obtenue à partir d'une base de donnée et de la similarité de pearson, renvoyer un vecteur associant 
           à chaque actif le numéro du cluster auquel il appartient une fois qu'on lui a appliqué SPONGE (à partir du package signet)

    PARAMS : 

    - correlation_matrix : a square dataframe of size (number_of_stocks, number_of_stocks)
    - k : the number of clusters to identify. If a list is given, the output is a corresponding list

    RETURNS : array of int, or list of array of int: Output assignment to clusters.

    '''
    
    ## On respecte le format imposé par signet. Pour cela il faut changer le type des matrices A_pos et A_neg, qui ne peuvent pas rester des dataframes 

    A_pos, A_neg = signed_adjency(correlation_matrix)

    A_pos_sparse = sparse.csc_matrix(A_pos.values)
    A_neg_sparse = sparse.csc_matrix(A_neg.values)

    data = (A_pos_sparse, A_neg_sparse)

    cluster = Cluster(data)

    ## On applique la méthode SPONGE : clusters the graph using the Signed Positive Over Negative Generalised Eigenproblem (SPONGE) clustering.

    return cluster.SPONGE(k)

We divide the timeframe in 28 periods of 200 days (the last one is 132 days only) and run SPONGE to make 28 clusterings

In [11]:

data_period=[]
result=[]
for i in range(28):
    start_col = i * 200 + 1  # Commence à partir de la deuxième colonne
    end_col = start_col + 200

    # Vérifie si la fin de la tranche dépasse le nombre de colonnes
    if end_col > len(df_cleaned.columns):
        end_col = len(df_cleaned.columns)  # Ajuste pour ne pas dépasser
    
    data_period.append(df_cleaned.iloc[:, start_col:end_col]) 

    returns_transposed = data_period[i].transpose()

    # Calculer la matrice de corrélation sur les actifs transposés
    correlation_matrix = returns_transposed.corr(method='pearson')
    result.append(apply_SPONGE(correlation_matrix, k=9))



  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = mat.applymap(lambda x: x if x >= 0 else 0)
  A_neg = mat.applymap(lambda x: abs(x) if x < 0 else 0)
  super()._check_params_vs_input(X, default_n_init=10)
  A_pos = 

Now we use  hierarchical clustering to construct a clustering from the 28 clusterings we made

In [12]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Supposons que vous avez une liste de listes, où chaque sous-liste est un des 28 clusterings
# Chaque sous-liste contient 632 éléments avec des valeurs de 0 à 9
clusterings = result  # Remplacez ceci par vos données de clustering

# Convertir les résultats de clustering en une matrice de co-occurrence
n_data = 632
co_occurrence_matrix = np.zeros((n_data, n_data))

for clustering in clusterings:
    for i in range(n_data):
        for j in range(n_data):
            if clustering[i] == clustering[j]:
                co_occurrence_matrix[i, j] += 1

# Convertir en matrice de similarité
similarity_matrix = co_occurrence_matrix / len(clusterings)

# Convertir la matrice de similarité en matrice de distance
distance_matrix = 1 - similarity_matrix

# Convertir la matrice de distance en format condensé pour la fonction linkage
# car linkage attend une matrice de distance condensée pour les données non euclidiennes
tri_upper_idx = np.triu_indices(n_data, k=1)
distance_condensed = distance_matrix[tri_upper_idx]

# Effectuer le clustering hiérarchique
Z = linkage(squareform(distance_condensed), method='average')

# Former des clusters
# Vous pouvez ajuster le 't' pour obtenir le nombre de clusters souhaité
clusters = fcluster(Z, t=9, criterion='maxclust')
clusters=clusters-1
# Afficher les clusters
print(clusters)

[3 3 7 3 3 3 8 3 1 8 7 3 3 3 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 5 3 3 6 3 1 3 3
 3 7 3 1 7 6 3 3 2 7 3 3 3 3 3 3 3 3 3 3 3 6 3 3 3 6 3 3 3 3 7 0 0 3 3 3 3
 3 1 3 3 3 3 3 3 3 6 3 7 3 3 3 3 1 3 7 3 3 3 3 3 2 2 8 5 3 3 8 3 3 8 0 3 3
 5 3 3 3 8 5 8 6 3 3 5 3 3 3 3 3 6 3 3 7 3 0 8 3 3 4 3 3 2 3 3 2 3 3 2 3 3
 6 3 5 0 2 8 8 5 3 3 3 3 8 3 6 8 3 3 3 3 3 3 5 5 6 6 5 3 6 3 8 2 3 3 3 3 3
 3 3 3 4 3 3 3 3 3 3 8 3 3 3 3 3 3 6 6 3 8 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2
 8 3 3 3 3 3 3 8 3 3 3 5 3 6 3 8 3 3 2 6 2 1 3 3 3 3 5 3 3 6 3 7 3 8 7 3 3
 2 3 8 3 3 3 3 3 3 3 5 3 3 0 3 3 3 3 3 3 7 3 3 3 3 3 8 3 3 3 3 6 7 3 3 7 3
 7 6 0 3 0 4 3 3 0 3 3 3 7 3 3 3 8 3 3 6 5 3 6 3 6 6 3 3 3 3 3 0 3 3 2 3 3
 3 3 3 3 0 0 3 0 0 3 0 0 0 8 3 3 3 3 2 0 7 3 0 0 0 3 5 2 3 3 3 3 3 3 3 3 0
 0 0 5 0 0 3 0 0 3 0 0 0 0 0 5 3 0 3 1 5 6 8 7 3 3 6 6 3 3 3 5 3 2 3 3 0 3
 3 3 0 3 6 3 3 3 8 6 3 0 5 5 3 3 3 3 5 3 2 8 8 6 5 8 2 3 2 7 3 3 3 3 3 3 2
 3 3 3 6 0 0 3 7 3 8 3 8 2 6 6 3 3 5 3 3 3 2 3 3 6 3 3 3 3 5 3 3 3 3 3 3 3
 3 3 3 5 3 3 3 3 3 3 3 1 

  Z = linkage(squareform(distance_condensed), method='average')


Now we want to run markowitz on our portfolio, we need to get the returns and the covariance matrix of the returns of each asset, but before we need to get the composition of each cluster which is a combination of multiple assets. 

In [13]:
cluster_comp=[[],[],[],[],[],[],[],[],[]]
cluster_comp2=[]
for c in range(632):
    cluster_comp[clusters[c]].append(df_cleaned.iloc[c, 0])
for cluster in cluster_comp:
    cluster_indices = [df_cleaned.index[df_cleaned['ticker'] == element].tolist() for element in cluster]
    cluster_comp2.append([index for sublist in cluster_indices for index in sublist])



print(cluster_comp2[3])

[0, 1, 3, 4, 5, 7, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 33, 35, 36, 37, 39, 43, 44, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 63, 64, 65, 66, 70, 71, 72, 73, 74, 76, 77, 78, 79, 80, 81, 82, 84, 86, 87, 88, 89, 91, 93, 94, 95, 96, 97, 102, 103, 105, 106, 109, 110, 112, 113, 114, 119, 120, 122, 123, 124, 125, 126, 128, 129, 131, 134, 135, 137, 138, 140, 141, 143, 144, 146, 147, 149, 156, 157, 158, 159, 161, 164, 165, 166, 167, 168, 169, 175, 177, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 200, 201, 204, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 223, 224, 225, 226, 227, 228, 230, 231, 232, 234, 236, 238, 239, 244, 245, 246, 247, 249, 250, 252, 254, 257, 258, 260, 262, 263, 264, 265, 266, 267, 268, 270, 271, 273, 274, 275, 276, 277, 278, 280, 281, 282, 283, 284, 286, 287, 288, 289, 292, 293, 295, 299, 302, 303, 305, 306, 307, 309, 310, 311, 313, 314, 317, 319

In [15]:
df_cleaned.set_index('ticker', inplace=True)

Now we compute the intracluster weights with gaussianweight

In [17]:
# Fonction pour calculer la distance gaussienne
def gaussian_distance(x, y, sigma):
    return np.exp(-np.linalg.norm(x - y)**2 / (2 * sigma**2))


# Paramètre sigma pour la distance gaussienne
sigma = 1.0  # vous pouvez ajuster cette valeur

# Calculer les poids pour chaque élément dans chaque cluster
cluster_weights = []
for cluster in cluster_comp2:
    cluster_data = df_cleaned.iloc[cluster, :]
    cluster_center = cluster_data.mean()  # Calcul du centre du cluster
    if isinstance(cluster_data, pd.Series):
        cluster_data = pd.DataFrame([cluster_data])
    weights = np.array([np.exp(- 2*(gaussian_distance(row, cluster_center, sigma)**2)) for _, row in cluster_data.iterrows()])
    weights_normalized = weights / weights.sum()
    cluster_weights.append(weights_normalized)

In [18]:
normalized_weights=[0 for i in range(632)]
for i in range(len(cluster_comp2)):
    for j in range(len(cluster_comp2[i])):
        normalized_weights[cluster_comp2[i][j]]=cluster_weights[i][j]

print(normalized_weights)

[0.002947381636606141, 0.0022980929458325514, 0.03283678138108144, 0.00294160789580845, 0.0020660574029301502, 0.0008472365309277881, 0.01922530462263595, 0.0015639749238956057, 0.08733590783262517, 0.02465679166673695, 0.08180908390265249, 0.0020189698721073644, 0.0021849696254996067, 0.003715779665686399, 0.0026222673585138342, 0.0035225164343939824, 0.023956882240268724, 0.0018545653374089218, 0.00230726383297309, 0.0032428049229840123, 0.001780583562803329, 0.0016019978697393617, 0.003730564361436296, 0.0015717493655179509, 0.0024685042869677766, 0.003522019884248032, 0.002796069626375997, 0.003454070364388544, 0.0021682821432191367, 0.02166745775134526, 0.0015401618578241608, 0.0022113170160723917, 0.01685489795813552, 0.0023702344325850873, 0.05976778873055711, 0.001708179365351363, 0.0037033827445079183, 0.0021264301451932315, 0.026650178255601214, 0.0016273615164344366, 0.08411089665830175, 0.04904608602526936, 0.015378642275199935, 0.00260001381989444, 0.0015798413198340499, 0

We get the means of the returns of each cluster, and the covariance of the returns of the clusters.

In [19]:
return_clusters = pd.DataFrame(columns=df_cleaned.columns[1:])

# Remplissage du nouveau DataFrame avec les moyennes des colonnes pour chaque cluster
for k in range(9):
    # Sélection des lignes appartenant au cluster k
    rows_in_cluster = df_cleaned.iloc[cluster_comp2[k]]

    # Calcul de la moyenne pour chaque colonne, en excluant la première colonne
    mean_values = (rows_in_cluster.iloc[:, 1:].mul(cluster_weights[k], axis=0)).sum()

    # Ajout des moyennes calculées au nouveau DataFrame
    return_clusters.loc[k] = mean_values

print(return_clusters.shape)
returns_transposed = return_clusters.transpose()

# Calculer la matrice de corrélation sur les actifs transposés
correlation_matrix = returns_transposed.cov()*np.sqrt(252)


annual_expected_returns=(return_clusters + 1).prod(axis=1)-1

# Afficher la matrice de corrélation
print(correlation_matrix)
print(annual_expected_returns)

(9, 5530)
          0         1         2         3         4         5         6  \
0  0.000405  0.000257  0.000325  0.000232  0.000257  0.000285  0.000303   
1  0.000257  0.004584  0.000569  0.001204  0.001028  0.002441  0.001218   
2  0.000325  0.000569  0.000808  0.000682  0.000629  0.000796  0.000811   
3  0.000232  0.001204  0.000682  0.002297  0.001610  0.002291  0.002268   
4  0.000257  0.001028  0.000629  0.001610  0.005452  0.001723  0.001707   
5  0.000285  0.002441  0.000796  0.002291  0.001723  0.004946  0.002151   
6  0.000303  0.001218  0.000811  0.002268  0.001707  0.002151  0.004077   
7  0.000178  0.000711  0.000413  0.001244  0.000934  0.001264  0.001325   
8  0.000175  0.000601  0.000362  0.000911  0.000738  0.000968  0.001116   

          7         8  
0  0.000178  0.000175  
1  0.000711  0.000601  
2  0.000413  0.000362  
3  0.001244  0.000911  
4  0.000934  0.000738  
5  0.001264  0.000968  
6  0.001325  0.001116  
7  0.001266  0.001011  
8  0.001011  0.001241  

In [20]:
from module1 import markowitz

weights=markowitz(annual_expected_returns, correlation_matrix)

print(weights)

OrderedDict([(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 1.00001)])


Issue --> put a 0 weight on ALL clusters except the last one 

In [50]:
weights

OrderedDict([(0, 0.0),
             (1, 0.0),
             (2, 0.0),
             (3, 0.0),
             (4, 0.0),
             (5, 0.0),
             (6, 0.0),
             (7, 0.0),
             (8, 1.00001)])

In [30]:
from module1 import portfolio_pnl_sharpe
overall_pnl_performance = portfolio_pnl_sharpe(returns_transposed, weights, risk_free_rate=0.03)

In [41]:
pnl_evolution = [overall_pnl_performance[0][i] for i in range(len(overall_pnl_performance[0]))]


  pnl_evolution = [overall_pnl_performance[0][i] for i in range(len(overall_pnl_performance[0]))]


In [44]:
pip install -q plotly

Note: you may need to restart the kernel to use updated packages.


In [49]:
import plotly.graph_objects as go

indices = list(range(len(pnl_evolution)))

# Création du graphique
fig = go.Figure()

# Ajout de la courbe
fig.add_trace(go.Scatter(x=indices, y=pnl_evolution, mode='lines+markers', name='Évolution du PNL', line=dict(width=1)))

# Ajout des annotations au survol de la souris
fig.update_layout(
    title='PNL',
    xaxis_title='Time (in days)',
    yaxis_title='PNL ',
    hovermode='x',
    hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell")
)

fig.update_traces(
    hoverinfo='text',
    text=[f"Day {i}: PNL = {pnl}" for i, pnl in enumerate(pnl_evolution)]
)

# Affichage du graphique interactif
fig.show()

