# Data preparation

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw cohort spreadsheet.
df = pd.read_excel('Ian_CD31805.05.2022._Fatma.xlsx')
df

# Clean-up: replace missing cells with 0, use underscores instead of spaces in
# column names, and drop the apostrophe from the outcome column's name.
df = df.fillna(0)
df.columns = df.columns.str.replace(' ', '_')
df = df.rename(
    columns={"rejet_aigu_cellulaire_dans_l'année": 'rejet_aigu_cellulaire_dans_lannée'}
)

# Replace the date / grade columns by integer codes (one code per distinct value).
df = df.assign(
    **{
        coded_column: pd.factorize(df[coded_column])[0]
        for coded_column in ('date_de_rejet_aigu_cellulaire', 'grade_RAC', 'date_de_TP')
    }
)

# Outcome (y), the NIP and date_de_TP columns (z1, z2), and every remaining column (x).
y = df["rejet_aigu_cellulaire_dans_lannée"]
z1 = df['NIP']
z2 = df['date_de_TP']
x = df.drop(columns=["rejet_aigu_cellulaire_dans_lannée", 'NIP', 'date_de_TP'])

from sklearn.preprocessing import StandardScaler
# Standardise the features by removing the mean and scaling to unit variance.
# The scaled frame keeps the row index and column names of `df`.
# NOTE(review): the scaler is fitted on every column of `df` (outcome column
# included) and on all rows, i.e. before any train/validation split — confirm
# this is intended.
df_scale = pd.DataFrame(
    StandardScaler().fit_transform(df),
    index=df.index,
    columns=df.columns,
)

# Collect one block of rows per outcome class (in order of first appearance of
# the class value in `df_scale`), restricted to the nine H24/H48/H72 features.
outcome_column = "rejet_aigu_cellulaire_dans_lannée"
feature_columns = [
    'CD31_H24', 'PaO2/FiO2_H24', 'SOFA_respiratoire_H24',
    'CD31_H48', 'PaO2/FiO2_H48', 'SOFA_respiratoire_H48',
    'CD31_H72', 'PaO2/FiO2_H72', 'SOFA_respiratoire_H72',
]

class_values = df_scale[outcome_column].unique()

# The classes generally contain different numbers of rows, so the per-class
# blocks cannot be stacked into a regular array: recent NumPy raises
# ValueError when asked to build such a ragged array implicitly. An explicit
# object array with one slot per class keeps `point_clouds[i]` and
# `point_clouds.shape` usable whatever the class sizes are.
point_clouds = np.empty(len(class_values), dtype=object)
for position, class_value in enumerate(class_values):
    # A boolean mask instead of `query("... == @df")`: no loop variable
    # shadowing the DataFrame `df`, and no reliance on frame inspection.
    in_class = df_scale[outcome_column] == class_value
    point_clouds[position] = df_scale.loc[in_class, feature_columns].values
point_clouds.shape

# Turn each row of 9 values into a 3x3 matrix: with the column order above,
# each matrix row is one time point (H24, H48, H72) and each matrix column one
# measurement (CD31, PaO2/FiO2, SOFA_respiratoire).
P0 = np.reshape(point_clouds[0], (len(point_clouds[0]), 3, 3))  # 9 = 3*3
P1 = np.reshape(point_clouds[1], (len(point_clouds[1]), 3, 3))
P0.shape, P1.shape

# All samples in one array: first the rows of P0, then the rows of P1.
P = np.concatenate((P0, P1), axis=0)
P.shape

from gtda.homology import VietorisRipsPersistence

# Vietoris-Rips persistence of every sample in P, for homology dimensions
# 0, 1 and 2, with the Euclidean metric.
persistence = VietorisRipsPersistence(
    metric='euclidean',
    homology_dimensions=[0, 1, 2],
    n_jobs=-1,
    collapse_edges=True,
)

persistence_diagrams = persistence.fit_transform(P)

# Optional visual check of a single diagram:
#from gtda.plotting import plot_diagram

#plot_diagram(persistence_diagrams[0])

# Convert each diagram into a three-dimensional vector.
from gtda.diagrams import PersistenceEntropy

persistence_entropy = PersistenceEntropy(normalize=True)

# Topological feature matrix
X = persistence_entropy.fit_transform(persistence_diagrams)

X.shape

# One label per row of P, in the order P was assembled: 0 for the rows coming
# from P0, 1 for the rows coming from P1. Deriving the counts from the arrays
# (instead of hard-coding 40 samples with a boundary at 33) keeps labels and
# samples aligned if the cohort changes.
# NOTE(review): P0 is the class whose value appears first in the data —
# confirm that this is the class meant to be labelled 0.
labels = np.concatenate([np.zeros(len(P0)), np.ones(len(P1))])


# Multilayer perceptron

## Before topological improvement

In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Baseline classifier, trained directly on the persistence-entropy matrix X.
clf = MLPClassifier(
    solver='sgd',
    batch_size=20,
    hidden_layer_sizes=(18, 2),
    max_iter=10000,
    random_state=1,
)

# Seeded hold-out split with the default test fraction.
# NOTE(review): the split is not stratified although `labels` is built with
# unequal class sizes — consider whether `stratify=labels` is wanted.
X_train, X_valid, labels_train, labels_valid = train_test_split(
    X, labels, random_state=89
)

# Fit on the training part; the cell output is the accuracy on the held-out part.
clf.fit(X_train, labels_train)
clf.score(X_valid, labels_valid)

0.8

## After topological improvement

In [3]:
from itertools import combinations

# Candidate amplitude metrics.
metrics = [
    "bottleneck",
    "wasserstein",
    "landscape",
    "betti",
    "heat",
    "silhouette",
    "persistence_image",
]

# Every non-empty subset of `metrics` as a tuple: all subsets of size 1 first,
# then size 2, and so on up to the full set.
l = [
    subset
    for subset_size in range(1, len(metrics) + 1)
    for subset in combinations(metrics, subset_size)
]
l

[('bottleneck',),
 ('wasserstein',),
 ('landscape',),
 ('betti',),
 ('heat',),
 ('silhouette',),
 ('persistence_image',),
 ('bottleneck', 'wasserstein'),
 ('bottleneck', 'landscape'),
 ('bottleneck', 'betti'),
 ('bottleneck', 'heat'),
 ('bottleneck', 'silhouette'),
 ('bottleneck', 'persistence_image'),
 ('wasserstein', 'landscape'),
 ('wasserstein', 'betti'),
 ('wasserstein', 'heat'),
 ('wasserstein', 'silhouette'),
 ('wasserstein', 'persistence_image'),
 ('landscape', 'betti'),
 ('landscape', 'heat'),
 ('landscape', 'silhouette'),
 ('landscape', 'persistence_image'),
 ('betti', 'heat'),
 ('betti', 'silhouette'),
 ('betti', 'persistence_image'),
 ('heat', 'silhouette'),
 ('heat', 'persistence_image'),
 ('silhouette', 'persistence_image'),
 ('bottleneck', 'wasserstein', 'landscape'),
 ('bottleneck', 'wasserstein', 'betti'),
 ('bottleneck', 'wasserstein', 'heat'),
 ('bottleneck', 'wasserstein', 'silhouette'),
 ('bottleneck', 'wasserstein', 'persistence_image'),
 ('bottleneck', 'landscape

In [4]:
from sklearn.model_selection import train_test_split
from itertools import combinations
from gtda.diagrams import PersistenceEntropy
from sklearn.neural_network import MLPClassifier


from sklearn.ensemble import RandomForestClassifier
from gtda.diagrams import NumberOfPoints , Amplitude

from sklearn.pipeline import make_union

# Rebinds `metrics` (previously the list of seven candidates) to the single
# metric used to build the pipeline's initial "features" step.
metrics =  ["bottleneck"]


def amplitude(metric):
    """Build one ``Amplitude`` transformer per entry of ``metric``.

    Parameters
    ----------
    metric : sequence
        Metric names; each one is passed as the first positional argument
        of ``Amplitude``.

    Returns
    -------
    list
        The ``Amplitude`` instances, in the same order as ``metric``.
    """
    return [Amplitude(metric_name) for metric_name in metric]




def feature_union(metric):
    """Build the feature-extraction union for a set of amplitude metrics.

    The union combines ``PersistenceEntropy(normalize=True)``,
    ``NumberOfPoints(n_jobs=-1)`` and one ``Amplitude`` transformer per entry
    of ``metric`` (as produced by ``amplitude``), in that order.

    Parameters
    ----------
    metric : sequence
        Metric names forwarded to ``amplitude``.

    Returns
    -------
    The object returned by ``make_union`` for those transformers.
    """
    return make_union(
        PersistenceEntropy(normalize=True),
        NumberOfPoints(n_jobs=-1),
        *amplitude(metric)
    )


from gtda.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Seeded hold-out split of the persistence diagrams themselves, so that the
# feature extraction is part of the pipeline being tuned.
X_train, X_valid, labels_train, labels_valid = train_test_split(
    persistence_diagrams, labels, random_state=89
)

# Pipeline: feature union followed by the MLP classifier.
estimators = [
    ("features", feature_union(metric=metrics)),
    (
        "clf",
        MLPClassifier(
            solver='sgd',
            batch_size=20,
            hidden_layer_sizes=(18, 2),
            max_iter=10000,
            random_state=1,
        ),
    ),
]

p = Pipeline(estimators)

# One candidate "features" step per metric subset stored in `l`.
param_grid = {'features': [feature_union(metric=subset) for subset in l]}

# NOTE(review): 15 stratified folds on the training split leaves very few
# samples per validation fold — confirm this is intended.
k_fold = StratifiedKFold(n_splits=15, shuffle=True, random_state=0)

CV = GridSearchCV(estimator=p, param_grid=param_grid, cv=k_fold, scoring='accuracy')

CV

GridSearchCV(cv=StratifiedKFold(n_splits=15, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('persistenceentropy',
                                                                        PersistenceEntropy(normalize=True)),
                                                                       ('numberofpoints',
                                                                        NumberOfPoints(n_jobs=-1)),
                                                                       ('amplitude',
                                                                        Amplitude(metric='bottleneck'))])),
                                       ('clf',
                                        MLPClassifier(batch_size=20,
                                                      hidden_layer_sizes=(18...
                                                                      Amplitude(metric='wa

In [5]:
# Run the grid search on the training split: every candidate "features" step
# in the parameter grid is evaluated with the configured cross-validation.
CV.fit(X_train,labels_train)



GridSearchCV(cv=StratifiedKFold(n_splits=15, random_state=0, shuffle=True),
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('persistenceentropy',
                                                                        PersistenceEntropy(normalize=True)),
                                                                       ('numberofpoints',
                                                                        NumberOfPoints(n_jobs=-1)),
                                                                       ('amplitude',
                                                                        Amplitude(metric='bottleneck'))])),
                                       ('clf',
                                        MLPClassifier(batch_size=20,
                                                      hidden_layer_sizes=(18...
                                                                      Amplitude(metric='wa

In [6]:
# Mean cross-validated accuracy of the best candidate, computed on the folds
# of the training split (this is not the accuracy on the hold-out data).
CV.best_score_

0.8333333333333334

In [7]:
# Verify that there is only one best combination: list the positions of all
# candidates whose test-score rank is 1 (a single index means no tie).
np.where(pd.DataFrame(CV.cv_results_)['rank_test_score']==1)

(array([100]),)

In [8]:
# Print the transformers that make up the selected "features" union.
# NOTE(review): an entry printed as a bare `Amplitude()` presumably uses the
# estimator's default metric — confirm against the giotto-tda documentation.
print('The best combination is : ',CV.best_params_['features'].get_params()['transformer_list'])

The best combination is :  [('persistenceentropy', PersistenceEntropy(normalize=True)), ('numberofpoints', NumberOfPoints(n_jobs=-1)), ('amplitude-1', Amplitude(metric='bottleneck')), ('amplitude-2', Amplitude(metric='wasserstein')), ('amplitude-3', Amplitude()), ('amplitude-4', Amplitude(metric='betti')), ('amplitude-5', Amplitude(metric='persistence_image'))]


In [9]:
# Score the fitted grid search on the hold-out split (X_valid / labels_valid),
# which was not passed to CV.fit.
print('Test accuracy: %.3f' % CV.score(X_valid, labels_valid))

Test accuracy: 0.900
