## Symmetric Normalized version of SPONGE (Signed Positive Over Negative Generalized Eigenproblem)

We perform the clustering using the Signed Positive Over Negative Generalized Eigenproblem (SPONGE) algorithm.

In [1]:
import pandas as pd
import numpy as np
import ast 

In [4]:
# Function to safely convert a string into a list
def safe_literal_eval(s):
    '''
    Parse `s` as a Python literal, falling back to an empty list on failure.

    ----------------------------------------------------------------
    PARAMETERS : s = value handed to ast.literal_eval (typically the string
                 form of a list, e.g. "[1.0, 2.5]")
    ----------------------------------------------------------------
    RETURNS : the evaluated literal, or [] when `s` cannot be evaluated
    ----------------------------------------------------------------
    '''
    try:
        # Tries to convert the string into a Python object (a list in practice)
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # ast.literal_eval raises ValueError / SyntaxError for malformed text and
        # TypeError for some malformed literals (e.g. an unhashable dict key).
        # In every such case we return the default value, an empty list.
        return []

def check_nan_inf(df):
    '''
    Print whether the dataframe `df` contains at least one NaN value.

    Note: despite its name, this function only looks for NaN values;
    infinite values are not checked.
    '''
    # Collapse the boolean NaN mask first per column, then over all columns
    has_nan = df.isna().any().any()
    message = (
        "There are NaN values in the dataframe"
        if has_nan
        else "There are no NaN values in the dataframe"
    )
    print(message)

def remove_rows_with_nan(df):
    '''
    Return a copy of `df` without the rows that contain at least one NaN.

    The input dataframe is left untouched and the surviving rows keep their
    original index labels.
    '''
    rows_without_nan = df.dropna()
    return rows_without_nan

def load_cleaned_data(path):
    '''
    Read the price CSV located at `path` and build a table of returns.

    ----------------------------------------------------------------
    PARAMETERS : path = string, location of a CSV file with (at least) the
                 columns 'ticker', 'open' and 'close'; 'open' and 'close'
                 hold the string representation of a list of prices
    ----------------------------------------------------------------
    RETURNS : dataframe with one row per ticker, a 'ticker' column followed
              by the columns 'return_0', 'return_1', ...; rows containing a
              NaN are removed and the index is renumbered from 0
    ----------------------------------------------------------------
    '''
    prices = pd.read_csv(path)

    # The price columns are stored as text: parse each cell back into a list
    for price_column in ('open', 'close'):
        prices[price_column] = prices[price_column].apply(safe_literal_eval)

    def _row_returns(row):
        # Relative variation (close - open) / open for every (open, close) pair of the row
        return [
            (close_price - open_price) / open_price
            for open_price, close_price in zip(row['open'], row['close'])
        ]

    prices['return'] = prices.apply(_row_returns, axis=1)

    # Spread each list of returns over several columns (one column per position in the list)
    returns_df = pd.DataFrame(prices['return'].tolist())

    # Name the columns so that they read as returns, then put the tickers in front
    returns_df.columns = [f'return_{i}' for i in range(returns_df.shape[1])]
    returns_df.insert(0, 'ticker', prices['ticker'])

    # Drop the incomplete rows and renumber the remaining ones from 0
    df_cleaned = remove_rows_with_nan(returns_df).reset_index(drop=True)

    check_nan_inf(df_cleaned)

    return df_cleaned

# Set the path below to your own local copy of DATA_Statapp.csv
# Nail path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
# Jerome path : 'C:\Users\33640\OneDrive\Documents\GitHub\Portfolio_clustering_project\Data\DATA_Statapp.csv'

# Load the CSV and build the cleaned returns table (load_cleaned_data also prints the NaN check result)
nail_path = '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DATA_Statapp.csv'
df_cleaned = load_cleaned_data(nail_path)

There are no NaN values in the dataframe


Now we compute a clustering with the SPONGE algorithm.

In [None]:
import sys

## Project root directory to add to the import search path (replace with your own location)
## path Nail : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project'
## path Jerome : 'C:/Users/33640/OneDrive/Documents/GitHub/Portfolio_clustering_project'
sys.path.append('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project')  # Adds the parent path (presumably so that the `signet` import below resolves)

from signet.cluster import Cluster 
from scipy import sparse
def signed_adjency(mat):
    '''
    Split a correlation matrix into its positive part and its negative part.

    PARAMS :

    - mat : a dataframe of numeric correlations

    RETURNS : a tuple (A_pos, A_neg) of dataframes with the same index and columns as mat.
              A_pos keeps the entries that are >= 0 and holds 0 elsewhere;
              A_neg holds the absolute value of the entries that are < 0 and 0 elsewhere.
              NaN entries become 0 in both matrices.
    '''

    # Vectorised masks instead of an element-wise DataFrame.applymap: applymap is
    # deprecated in recent pandas releases and calls a Python lambda per cell.
    # `where` keeps the value where the condition holds and writes 0 elsewhere
    # (a NaN compares False, so it is replaced by 0, as with the element-wise test).
    A_pos = mat.where(mat >= 0, 0)
    A_neg = mat.abs().where(mat < 0, 0)

    return A_pos, A_neg
def apply_SPONGE(correlation_matrix, k):

    '''
    IDEA : given a correlation matrix, run the SPONGE clustering of the signet package
           (Signed Positive Over Negative Generalised Eigenproblem) on the signed graph
           made of its positive and negative parts, and return the resulting cluster assignment

    PARAMS :

    - correlation_matrix : a square dataframe of size (number_of_stocks, number_of_stocks)
    - k : the number of clusters to identify. If a list is given, the output is a corresponding list

    RETURNS : array of int, or list of array of int: Output assignment to clusters.

    '''

    positive_part, negative_part = signed_adjency(correlation_matrix)

    ## signet does not take dataframes: each part is converted to a scipy sparse (CSC) matrix,
    ## and the pair (positive, negative) is what Cluster receives
    signed_graph = tuple(
        sparse.csc_matrix(part.values) for part in (positive_part, negative_part)
    )

    ## Clusters the graph with the SPONGE method and hands back its result unchanged
    return Cluster(signed_graph).SPONGE(k)