In [1]:
%%capture capt
!pip install scikit-learn-extra

In [2]:
%%capture capt
import numpy as np
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
import pickle

### Importation Data

In [3]:
base_patient = pd.read_csv("../data/profil_patient.csv")

In [4]:
%%capture capt
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [5]:
X_patient = []
for i in range(len(base_patient)):
    ghm_row = p_soins.iloc[i].dropna().tolist()[1:]
    X_patient.append(ghm_row)

### Acceder aux fonctions de métrique

In [6]:
%run Metriques.ipynb

Installation/Importation des librairies necessaires.


In [7]:
weights = [1, 1, 1]

def custom_dist(arr1, arr2):
    return custom_distance(arr1, arr2, weights=weights)

def custom_dist_norm(arr1, arr2):
    return custom_distance_norm(arr1, arr2, weights=weights)

### Calculer les distances

In [8]:
nb_cpu_cores = multiprocessing.cpu_count()
print(f"CPU cores available -> {nb_cpu_cores}")

CPU cores available -> 56


In [9]:
def custom_pairwise_distances_i_triangle(data, distance_function, i):
    distance_i = [distance_function(data[i], data[j]) for j in range(i)]
    distance_i.append(0)
    return distance_i

def custom_pairwise_distances_triangle_for_multithread(global_data, distance_function, starting_index, length):
    distances = [custom_pairwise_distances_i_triangle(global_data, distance_function, i+starting_index) for i in range(length)]
    return distances

def get_starting_indices(len_data, divide_into):
    linspace = np.linspace(0, len_data, divide_into)
    indices = [math.floor(x) for x in linspace]
    return indices    

def get_lengths(len_data, indices):
    if len(indices) <= 1:
        return [len_data]
    lengths = [indices[1]]
    for i in range(2, len(indices)):
        lengths.append(indices[i]-indices[i-1])
    lengths.append(len_data-indices[-1])
    return lengths

def custom_pairwise_distances_triangle_multithread(data, distance_function, divide_into=nb_cpu_cores):
    distances = []    
    starting_indices = get_starting_indices(len(data), divide_into)
    lengths = get_lengths(len(data), starting_indices)
    
    with Pool() as p:
        async_results = [p.apply_async(custom_pairwise_distances_triangle_for_multithread, 
                                       args=(data, distance_function, starting_indices[i], lengths[i])) for i in range(len(lengths))]
        
        for i in range(len(lengths)):
            distances += async_results[i].get()
    return distances
    
def custom_pairwise_distances(data, distance_function, divide_into=nb_cpu_cores):
    distances_triangle = custom_pairwise_distances_triangle_multithread(data, distance_function, divide_into=divide_into)
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            distances_triangle[i].append(distances_triangle[j][i])
    return np.array(distances_triangle)

### Grid Search

### Parameters

In [10]:
list_weights = [[i, j, k] for i in range(0, 101, 5) for j in range(0, i+1, 5) for k in range(0, j, 5)]

In [11]:
for i in range(len(list_weights)):
    for k in range(3):
        if list_weights[i][k] == 0:
            list_weights[i][k] = 1

In [12]:
list_weights = [[1, 1, 1]] + list_weights

In [13]:
clusters = range(5, 42, 2)

#### Calculate distances

In [14]:
weights = list_weights[0]

In [15]:
dict_distance_function = dict()
dict_distance_function['jaro_winkler'] = jaro_winkler_distance
dict_distance_function['levenshtein'] = levenshtein_distance
dict_distance_function['custom'] = custom_dist
dict_distance_function['custom_norm'] = custom_dist_norm

In [16]:
def calculate_distances(data, distance):
    distance_function = dict_distance_function[distance]
    distances = custom_pairwise_distances(data, distance_function)
    
    scaler = MinMaxScaler()
    distances = scaler.fit_transform(distances)
    return distances

### Calculate clusters

In [17]:
def get_kmedoids(n_clusters, distances):
    kmedoids = KMedoids(n_clusters=n_clusters, metric='precomputed')
    kmedoids.fit(distances)
    return kmedoids

In [18]:
def ajout_cluster_base(kmedoids):
    base_patient_cluster = base_patient.drop(columns=["cluster"])
    for i in range(1, n_clusters+1):
        base_patient_cluster["cluster"+str(i)] = [0]*len(base_patient_cluster)
    for i, label in enumerate(kmedoids.labels_):
        base_patient_cluster.loc[i, "cluster"+str(label+1)] = 1
    return base_patient_cluster
        
def ajout_p_soins(base_patient_cluster):
    base_patient_cluster['p_soins'] = ['']*len(base_patient_cluster)
    for i, patient in enumerate(X_patient):
        p_soin = convert_to_sentence(patient)
        base_patient_cluster.loc[i, 'p_soins'] = p_soin
    return base_patient_cluster

In [19]:
def get_cluster_score(n_clusters, distances):
    kmedoids = get_kmedoids(n_clusters, distances)
    base_patient_cluster = ajout_cluster_base(kmedoids)
    base_patient_cluster = ajout_p_soins(base_patient_cluster)
    base_patient_cluster.to_csv("../data/custom_metric/grid_search.csv", index=False)
    %run Cluster_assessment.ipynb
    score = get_score()
    return score

### Perform Grid Search

In [28]:
try:
    with open('../data/grid_search/gridSearch_scaled.pickle', 'rb') as handle:
        grid_search = pickle.load(handle)
except:
    grid_search = {}

In [29]:
len(grid_search)

4646

In [22]:
def get_str(c, w):
    return 'n_clusters:' + str(c) + ', weights:' + str(w[0]) + ' ' + str(w[1]) + ' ' + str(w[2])

In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
def check_if_clusters_all_calculated_for_distance(grid_search, clusters, weights):
    for cluster in clusters:
        key_grid_search = get_str(cluster, weights)
        if grid_search.get(key_grid_search) == None:
            return False
        return True

In [None]:
if __name__ == '__main__':
    with tqdm(total=len(clusters)*len(list_weights)) as pbar:
        for i in range(len(list_weights)):
            weights = list_weights[i]
            if check_if_clusters_all_calculated_for_distance(grid_search, clusters, weights):
                for cluster in clusters:
                    pbar.update(1)
            else:
                distances = calculate_distances(X_patient, 'custom')

                for n_clusters in clusters:
                    key_grid_search = get_str(n_clusters, weights)
                    if grid_search.get(key_grid_search) != None:
                        pbar.update(1)
                    else:
                        score = get_cluster_score(n_clusters, distances)
                        grid_search[key_grid_search] = score
                        with open('../data/grid_search/gridSearch_scaled.pickle, 'wb') as handle:
                            pickle.dump(grid_search, handle)
                        pbar.update(1)

 15%|█▌        | 4450/29279 [1:14:31<375:49:51, 54.49s/it]

### Meilleurs weights

In [25]:
max_score = -float('inf')
max_key = ''

for key in grid_search.keys():
    if grid_search[key] >= max_score:
        max_score = grid_search[key]
        max_key = key

In [26]:
max_score

0.9684855479335592

In [27]:
max_key

'n_clusters:39, weights:45 10 1'

In [38]:
max_ = 0
max_key = ""
for key in grid_search.keys():
    if grid_search[key] > 0.90 and int(key.split(",")[0][-2:]) <= 21:
        print(key, round(grid_search[key], 3))
        if grid_search[key] >= max_:
            max_ = grid_search[key]
            max_key = key

n_clusters:11, weights:20 20 5 0.908
n_clusters:11, weights:25 15 5 0.908
n_clusters:15, weights:25 15 10 0.932
n_clusters:17, weights:25 15 10 0.94
n_clusters:11, weights:25 20 1 0.908
n_clusters:13, weights:25 20 1 0.922
n_clusters:15, weights:25 20 1 0.932
n_clusters:11, weights:25 20 5 0.908
n_clusters:15, weights:25 25 1 0.932
n_clusters:17, weights:25 25 1 0.939
n_clusters:11, weights:30 10 5 0.908
n_clusters:13, weights:30 10 5 0.922
n_clusters:15, weights:30 10 5 0.932
n_clusters:17, weights:30 10 5 0.94
n_clusters:13, weights:30 15 1 0.922
n_clusters:15, weights:30 15 1 0.932
n_clusters:17, weights:30 15 1 0.94
n_clusters:11, weights:30 20 1 0.908
n_clusters:13, weights:30 20 1 0.922
n_clusters:15, weights:30 20 1 0.916
n_clusters:17, weights:30 20 1 0.926
n_clusters:15, weights:35 10 1 0.932
n_clusters:17, weights:35 10 1 0.94
n_clusters:15, weights:35 15 5 0.932
n_clusters:17, weights:35 15 5 0.939
n_clusters:19, weights:35 15 5 0.945
n_clusters:21, weights:35 15 5 0.951
n_c

In [39]:
max_key, max_

('n_clusters:21, weights:35 15 5', 0.9505888047381758)