In [1]:
%%capture capt
!pip install scikit-learn-extra

In [2]:
%%capture capt
import numpy as np
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
import pickle
import time

### Importation des données

In [3]:
# Patient profile table. It is read again by ajout_cluster_base() and its row
# count drives the construction of X_patient below.
base_patient = pd.read_csv("../data/profil_patient.csv")

In [4]:
%%capture capt
# Care-pathway table; rows are read positionally (iloc) when building X_patient,
# so row i here is assumed to describe the same patient as row i of base_patient
# -- TODO confirm both CSVs share the same row order.
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [5]:
# One list per patient row: take row `row_idx` of p_soins, drop missing cells,
# then discard the first remaining value (presumably the patient identifier
# column -- TODO confirm) and keep the rest as the patient's care pathway.
X_patient = [
    p_soins.iloc[row_idx].dropna().tolist()[1:]
    for row_idx in range(len(base_patient))
]

### Accéder aux fonctions de métrique

In [6]:
# Execute the companion notebook so that the names it defines become available
# here. NOTE(review): custom_distance, custom_distance_norm, distance_filter and
# distance_filter_4weights are used below but not defined in this notebook;
# presumably they come from this file -- confirm.
%run Metriques.ipynb

Installation/Importation des librairies necessaires.


In [7]:
#weights = [1, 1, 1]

def custom_dist(arr1, arr2):
    """Two-argument adapter around ``custom_distance``.

    Forwards ``arr1`` and ``arr2`` unchanged and passes the module-level
    ``weights`` global as the ``weights`` keyword. ``weights`` is looked up at
    call time, so it must have been assigned beforehand (``base_finale`` sets
    it from its ``weights_metric`` argument).

    NOTE(review): ``custom_distance`` is not defined in this notebook; it is
    presumably provided by ``%run Metriques.ipynb`` -- confirm.
    """
    return custom_distance(arr1, arr2, weights=weights)

def custom_dist_norm(arr1, arr2):
    """Two-argument adapter around ``custom_distance_norm``.

    Forwards ``arr1`` and ``arr2`` unchanged and passes the module-level
    ``weights`` global as the ``weights`` keyword. ``weights`` is looked up at
    call time, so it must have been assigned beforehand (``base_finale`` sets
    it from its ``weights_metric`` argument).

    NOTE(review): ``custom_distance_norm`` is not defined in this notebook; it
    is presumably provided by ``%run Metriques.ipynb`` -- confirm.
    """
    return custom_distance_norm(arr1, arr2, weights=weights)

def filter_dist(arr1, arr2):
    """Two-argument adapter around ``distance_filter``.

    Forwards ``arr1`` and ``arr2`` unchanged and passes the module-level
    ``weights`` global as the ``weights`` keyword. ``weights`` is looked up at
    call time, so it must have been assigned beforehand (``base_finale`` sets
    it from its ``weights_metric`` argument).

    NOTE(review): ``distance_filter`` is not defined in this notebook; it is
    presumably provided by ``%run Metriques.ipynb`` -- confirm.
    """
    return distance_filter(arr1, arr2, weights=weights)

def filter_dist_4weights(arr1, arr2):
    """Two-argument adapter around ``distance_filter_4weights``.

    Forwards ``arr1`` and ``arr2`` unchanged and passes the module-level
    ``weights`` global as the ``weights`` keyword. ``weights`` is looked up at
    call time, so it must have been assigned beforehand (``base_finale`` sets
    it from its ``weights_metric`` argument).

    NOTE(review): ``distance_filter_4weights`` is not defined in this notebook
    (presumably from ``%run Metriques.ipynb``), and the name suggests it expects
    four weights -- confirm both.
    """
    return distance_filter_4weights(arr1, arr2, weights=weights)

### Calculer les distances

In [8]:
# Number of CPUs reported for the machine. This is the system-wide count, which
# may be larger than the number of CPUs this process is actually allowed to use.
nb_cpu_cores = multiprocessing.cpu_count()
print(f"CPU cores available -> {nb_cpu_cores}")

CPU cores available -> 56


In [9]:
# Execute the companion notebook so that the names it defines become available
# here. NOTE(review): jaro_winkler_distance, levenshtein_distance,
# custom_pairwise_distances, scale_distances and convert_to_sentence are used
# below but not defined in this notebook; presumably (some of) them come from
# this file -- confirm.
%run Distances.ipynb

In [10]:
# Registry mapping the distance name accepted by calculate_distances() and
# base_finale() to the two-argument callable that implements it.
dict_distance_function = {
    'jaro_winkler': jaro_winkler_distance,
    'levenshtein': levenshtein_distance,
    'custom': custom_dist,
    'custom_norm': custom_dist_norm,
    'filter_dist': filter_dist,
    'filter_dist_4weights': filter_dist_4weights,
}

In [11]:
def calculate_distances(data, distance, scaled, weights):
    """Build the pairwise distance matrix of ``data`` for a named distance.

    Parameters
    ----------
    data
        Collection of items handed as-is to ``custom_pairwise_distances``.
    distance
        Key of ``dict_distance_function`` selecting the distance callable.
        An unknown key raises ``KeyError``.
    scaled
        When truthy, ``scale_distances`` is called on the matrix before it is
        returned.
    weights
        Not used by this function; the weighted distance wrappers read the
        module-level ``weights`` global instead.

    Returns
    -------
    The object returned by ``custom_pairwise_distances``.

    NOTE(review): the return value of ``scale_distances`` is discarded, so the
    scaling only takes effect if that helper modifies its argument in place --
    confirm.
    """
    metric = dict_distance_function[distance]
    matrix = custom_pairwise_distances(data, metric)
    if not scaled:
        return matrix
    scale_distances(matrix)
    return matrix

### Implémentation du clustering

In [12]:
def get_kmedoids(n_clusters, distances, random_state=None):
    """Fit a K-Medoids model on a precomputed distance matrix.

    Parameters
    ----------
    n_clusters
        Number of clusters (medoids) to look for.
    distances
        Precomputed pairwise distance matrix, passed directly to ``fit``.
    random_state
        Seed (or random generator) forwarded to ``KMedoids``. The
        ``'k-medoids++'`` initialisation is randomised, so pass a fixed value
        to get the same clustering on every run. The default ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    The fitted ``KMedoids`` estimator.
    """
    kmedoids = KMedoids(
        n_clusters=n_clusters,
        metric='precomputed',
        init='k-medoids++',
        random_state=random_state,
    )
    kmedoids.fit(distances)
    return kmedoids

In [13]:
def ajout_cluster_base(kmedoids, n_clusters):
    """Return a copy of ``base_patient`` with one-hot cluster membership columns.

    The global ``base_patient`` frame is not modified. Any existing ``cluster``
    column is removed from the copy, then columns ``cluster1`` ...
    ``cluster<n_clusters>`` are added; ``cluster<k+1>`` is 1 on the rows whose
    label is ``k`` and 0 elsewhere.

    Labels are matched to rows by position (label ``i`` goes to the ``i``-th
    row), so the result is correct whatever the frame's index is. If there are
    fewer labels than rows, the trailing rows get 0 in every cluster column.

    Parameters
    ----------
    kmedoids
        Fitted clustering object exposing integer ``labels_``.
    n_clusters
        Number of cluster columns to create.

    Raises
    ------
    ValueError
        If there are more labels than rows, or a label is outside
        ``[0, n_clusters)``.
    """
    # errors="ignore" lets the function also work on a table that has no
    # pre-existing "cluster" column.
    base_patient_cluster = base_patient.drop(columns=["cluster"], errors="ignore")

    labels = np.asarray(kmedoids.labels_, dtype="int64")
    n_rows = len(base_patient_cluster)
    if len(labels) > n_rows:
        raise ValueError(
            f"{len(labels)} cluster labels for only {n_rows} patients"
        )
    if labels.size and (labels.min() < 0 or labels.max() >= n_clusters):
        raise ValueError(
            f"cluster labels must lie in [0, {n_clusters}), "
            f"got range [{labels.min()}, {labels.max()}]"
        )

    # Build the whole one-hot matrix at once instead of one .loc write per row.
    membership = np.zeros((n_rows, n_clusters), dtype="int64")
    membership[np.arange(len(labels)), labels] = 1

    # Assigning a NumPy array to a column is positional, independent of the index.
    for k in range(n_clusters):
        base_patient_cluster["cluster" + str(k + 1)] = membership[:, k]
    return base_patient_cluster
        
def ajout_p_soins(base_patient_cluster):
    """Add a ``p_soins`` column holding each patient's pathway as a sentence.

    Every entry of the global ``X_patient`` is converted with
    ``convert_to_sentence`` and written to the row at the same position. Rows
    beyond ``len(X_patient)`` receive an empty string. Values are assigned by
    position, so the frame's index does not need to be ``0..n-1``.

    The frame is modified in place and also returned.

    Raises
    ------
    ValueError
        If ``X_patient`` holds more pathways than the frame has rows.
    """
    n_rows = len(base_patient_cluster)
    sentences = [convert_to_sentence(patient) for patient in X_patient]
    if len(sentences) > n_rows:
        raise ValueError(
            f"{len(sentences)} care pathways for only {n_rows} patients"
        )
    # Pad so that patients without a pathway keep the '' default.
    sentences.extend([''] * (n_rows - len(sentences)))
    # A plain list is assigned positionally, whatever the index labels are.
    base_patient_cluster['p_soins'] = sentences
    return base_patient_cluster

##### Obtention de la base finale

In [14]:
# NOTE: a `global` statement at module (cell) level has no effect and does not
# create the variable. `weights` only exists once it is assigned, which
# base_finale() does on every call.
global weights

In [15]:
def base_finale(data, distance, scaled, weights_metric, n_clusters):
    """Run the full pipeline: distances, K-Medoids clustering, final table.

    Side effect: the module-level ``weights`` global is overwritten with
    ``weights_metric`` on every call (even when the call then bails out), since
    the weighted distance wrappers read it from there.

    Parameters
    ----------
    data
        Items to cluster, forwarded to ``calculate_distances``.
    distance
        Key of ``dict_distance_function``.
    scaled
        Forwarded to ``calculate_distances``.
    weights_metric
        Weights to install in the ``weights`` global.
    n_clusters
        Number of clusters requested from ``get_kmedoids``.

    Returns
    -------
    ``(kmedoids, base_patient_cluster)`` on success. ``None`` when ``distance``
    is not a known key (a message is printed) or when the module is not being
    run as ``__main__``.
    """
    global weights
    weights = weights_metric

    known_distances = list(dict_distance_function.keys())
    if distance not in known_distances:
        print(f'Distance doit etre choisie parmi: {known_distances}')
        return None

    # Nothing is computed unless this code runs as the main module.
    if __name__ != '__main__':
        return None

    # Progress messages end with '\r'; the bare print() calls then move on to a
    # new line before the next message.
    print(f'Calcul des distances, weights={weights}', end='\r')
    t_begin = time.time()
    distances = calculate_distances(data, distance, scaled, weights)
    elapsed = round(time.time() - t_begin)
    print()
    print(f'Distances calculees en {elapsed}s, calcul des clusters -> n_clusters={n_clusters}', end = '\r')

    kmedoids = get_kmedoids(n_clusters, distances)
    print()
    print('Clusters definis, construction de la base finale', end='\r')

    base_patient_cluster = ajout_p_soins(ajout_cluster_base(kmedoids, n_clusters))
    return kmedoids, base_patient_cluster