In [1]:
%%capture capt
!pip install scikit-learn-extra

In [2]:
%%capture capt
import numpy as np
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
import pickle
import random

### Importation Data

In [3]:
# Patient profile table (presumably one row per patient). It must contain a
# "cluster" column, which ajout_cluster_base drops before adding its own columns.
base_patient = pd.read_csv("../data/profil_patient.csv")

In [4]:
%%capture capt
# Care-pathway table. Its rows are read by position (iloc) while iterating over
# base_patient, so both files are assumed to list patients in the same order — TODO confirm.
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [5]:
# For every row position of base_patient, take the matching p_soins row, drop
# the missing cells and discard the first remaining value (presumably an
# identifier column — TODO confirm); what is left is that patient's sequence.
X_patient = [
    p_soins.iloc[row_pos].dropna().tolist()[1:]
    for row_pos in range(len(base_patient))
]

### Accéder aux fonctions de métrique

In [6]:
# Executes Metriques.ipynb. The metric helpers used further down
# (custom_distance, custom_distance_norm, distance_filter, ...) are not defined
# in this notebook and presumably come from there — TODO confirm.
%run Metriques.ipynb

Installation/Importation des librairies necessaires.


In [7]:
# Module-level weight triplet. The wrappers below look this name up at call
# time, so rebinding `weights` later (as the grid-search loop does) changes the
# metric they compute without redefining them.
weights = [1, 1, 1]

def custom_dist(arr1, arr2):
    """Return custom_distance(arr1, arr2) evaluated with the current global `weights`."""
    return custom_distance(arr1, arr2, weights=weights)

def custom_dist_norm(arr1, arr2):
    """Return custom_distance_norm(arr1, arr2) evaluated with the current global `weights`."""
    return custom_distance_norm(arr1, arr2, weights=weights)

def filter_dist(arr1, arr2):
    """Return distance_filter(arr1, arr2) evaluated with the current global `weights`."""
    return distance_filter(arr1, arr2, weights=weights)

### Calculer les distances

In [8]:
# Number of CPUs reported by multiprocessing; printed for information only in this notebook.
nb_cpu_cores = multiprocessing.cpu_count()
print(f"CPU cores available -> {nb_cpu_cores}")

CPU cores available -> 72


In [9]:
# Executes Distances.ipynb. custom_pairwise_distances and scale_distances, used
# by calculate_distances below, are not defined in this notebook and presumably
# come from there — TODO confirm.
%run Distances.ipynb

### Grid Search

### Parameters

In [10]:
# Candidate weight triplets on a step-5 grid, strictly decreasing:
# 100 >= first > second > third >= 0.
list_weights = [
    [first, second, third]
    for first in range(0, 101, 5)
    for second in range(0, first, 5)
    for third in range(0, second, 5)
]

In [11]:
# Replace every zero weight by 1, editing the triplets in place.
for triplet in list_weights:
    for position, value in enumerate(triplet[:3]):
        if value == 0:
            triplet[position] = 1

In [12]:
# Shuffle in place so the grid is explored in random order.
# NOTE(review): no seed is set, so the exploration order differs between kernel sessions.
random.shuffle(list_weights)

In [13]:
# Put the uniform triplet first so it is always evaluated before the shuffled grid.
# Re-running this cell prepends another copy.
list_weights = [[1, 1, 1], *list_weights]

In [14]:
# Cluster counts tried for each weight triplet: the odd numbers 5, 7, ..., 31.
clusters = range(5, 32, 2)

#### Calculate distances

In [15]:
# Rebind the global weights (read by custom_dist, custom_dist_norm and filter_dist)
# to the first triplet of the grid.
weights = list_weights[0]

In [16]:
# Registry mapping a distance name to the callable that implements it;
# calculate_distances looks functions up here by name.
dict_distance_function = {
    'jaro_winkler': jaro_winkler_distance,
    'levenshtein': levenshtein_distance,
    'custom': custom_dist,
    'custom_norm': custom_dist_norm,
    'filter_dist': filter_dist,
}

In [17]:
def calculate_distances(data, distance, scaled=False):
    """Compute the pairwise distance matrix of `data` for a named metric.

    Parameters
    ----------
    data : sequence of items handed to custom_pairwise_distances.
    distance : key of dict_distance_function selecting the metric;
        an unknown key raises KeyError.
    scaled : when true, scale_distances is applied to the matrix.

    Returns
    -------
    The object returned by custom_pairwise_distances.
    """
    metric = dict_distance_function[distance]
    matrix = custom_pairwise_distances(data, metric)

    if not scaled:
        return matrix

    # NOTE(review): the return value of scale_distances is ignored, so this only
    # has an effect if it rescales `matrix` in place — confirm against its definition.
    scale_distances(matrix)
    return matrix

### Calculate clusters

In [18]:
def get_kmedoids(n_clusters, distances, random_state=None):
    """Fit a k-medoids model on a precomputed distance matrix.

    Parameters
    ----------
    n_clusters : number of medoids/clusters to fit.
    distances : square matrix of pairwise distances (metric='precomputed').
    random_state : optional seed forwarded to KMedoids. The 'k-medoids++'
        initialisation is randomised, so pass an int to make a run repeatable.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    The fitted KMedoids estimator.
    """
    kmedoids = KMedoids(
        n_clusters=n_clusters,
        metric='precomputed',
        init='k-medoids++',
        random_state=random_state,
    )
    kmedoids.fit(distances)
    return kmedoids

In [19]:
def ajout_cluster_base(kmedoids):
    """Return a copy of base_patient with one indicator column per cluster.

    The existing "cluster" column is dropped and columns "cluster1" ...
    "cluster<k>" are added, where k is taken from the fitted model itself
    (kmedoids.n_clusters) rather than from a global variable. Column
    "cluster<c+1>" holds 1 for the rows whose label is c and 0 elsewhere.

    Labels are matched to rows by position, so kmedoids.labels_ must have one
    entry per row of base_patient (pandas raises ValueError otherwise).
    base_patient itself is not modified.
    """
    base_patient_cluster = base_patient.drop(columns=["cluster"])
    labels = np.asarray(kmedoids.labels_)
    for cluster_id in range(kmedoids.n_clusters):
        # Whole-column assignment instead of one .loc write per patient.
        base_patient_cluster["cluster" + str(cluster_id + 1)] = (labels == cluster_id).astype(int)
    return base_patient_cluster
        
def ajout_p_soins(base_patient_cluster):
    """Add a 'p_soins' column holding each patient's sequence as a sentence.

    The column is first filled with empty strings, then row i receives
    convert_to_sentence(X_patient[i]). The frame passed in is modified in
    place and also returned.
    """
    base_patient_cluster['p_soins'] = [''] * len(base_patient_cluster)
    sentences = map(convert_to_sentence, X_patient)
    for row_label, sentence in enumerate(sentences):
        base_patient_cluster.loc[row_label, 'p_soins'] = sentence
    return base_patient_cluster

In [20]:
def get_cluster_score(n_clusters, distances):
    """Cluster the patients and return the score given by get_score().

    Fits k-medoids with `n_clusters` on `distances`, builds the patient table
    with the cluster columns and the 'p_soins' column, writes it to
    ../data/custom_metric/grid_search.csv, executes Cluster_assessment.ipynb
    and returns the value of get_score().
    """
    kmedoids = get_kmedoids(n_clusters, distances)
    base_patient_cluster = ajout_cluster_base(kmedoids)
    base_patient_cluster = ajout_p_soins(base_patient_cluster)
    # The CSV is overwritten on every call; presumably Cluster_assessment.ipynb
    # reads it back — TODO confirm.
    base_patient_cluster.to_csv("../data/custom_metric/grid_search.csv", index=False)
    # get_score is not defined in this notebook; presumably it is provided by
    # the notebook executed on the next line — TODO confirm.
    %run Cluster_assessment.ipynb
    score = get_score()
    return score

### Perform Grid Search

In [21]:
# Pickle file used as a checkpoint of the grid-search results: it is loaded
# below and rewritten after each new score.
path_dict = '../data/grid_search/gridSearch_filter.pickle'

In [22]:
# Resume from a previous run when a checkpoint exists.
# NOTE: pickle.load can execute arbitrary code; only load checkpoints written
# by this notebook.
try:
    with open(path_dict, 'rb') as handle:
        grid_search = pickle.load(handle)
except FileNotFoundError:
    # No checkpoint yet: start with an empty result table. Any other failure
    # (e.g. a corrupt file) propagates on purpose, because the search loop
    # rewrites this file and a silent reset would discard the stored scores.
    grid_search = {}

In [23]:
# Number of scores restored from the checkpoint (0 when starting fresh).
len(grid_search)

4584

In [24]:
def get_str(c, w):
    """Build the grid_search dictionary key for a cluster count and weight triplet.

    `c` is the number of clusters and `w` an indexable holding at least three
    weights; only w[0], w[1] and w[2] are used.
    Example: get_str(5, [100, 60, 45]) -> 'n_clusters:5, weights:100 60 45'.
    """
    weight_part = ' '.join(str(value) for value in (w[0], w[1], w[2]))
    return 'n_clusters:' + str(c) + ', weights:' + weight_part

In [25]:
import warnings
warnings.filterwarnings("ignore")

In [26]:
def check_if_clusters_all_calculated_for_distance(grid_search, clusters, weights):
    """Tell whether every cluster count already has a score for `weights`.

    Parameters
    ----------
    grid_search : dict mapping get_str(...) keys to scores.
    clusters : iterable of cluster counts to check.
    weights : weight triplet passed to get_str.

    Returns
    -------
    True only if grid_search holds a non-None value for every cluster count
    (vacuously True when `clusters` is empty), False as soon as one is
    missing. The previous version returned after inspecting the first cluster
    count only, so partially computed triplets were reported as complete.
    """
    return all(
        grid_search.get(get_str(cluster, weights)) is not None
        for cluster in clusters
    )

In [27]:
def best_from_dict(dic):
    """Return (best_score, key) for the highest value stored in `dic`.

    The dictionary passed in is the one inspected; the previous version
    ignored the argument and read the global grid_search. On ties the key
    seen last in iteration order wins (comparison is >=). An empty dictionary
    yields (-inf, '').
    """
    max_score = -float('inf')
    max_key = ''

    for key, score in dic.items():
        if score >= max_score:
            max_score = score
            max_key = key
    return max_score, max_key

### Distance avec filtre et scaling ; le cluster assessment prend en compte la taille du cluster

In [None]:
# Resumable grid search: for each weight triplet, compute the scaled
# 'filter_dist' distance matrix once, then score every cluster count that is
# not in grid_search yet. The dictionary is pickled after each new score so an
# interrupted run can be resumed.
nb_done = len(grid_search)
if __name__ == '__main__':
    # The loop variable must stay named `weights`: filter_dist reads that
    # global when the distances are computed.
    for weights in list_weights:
        print(f"Nb calculated -> {nb_done}, amounts for {round(100*len(grid_search)/(len(clusters)*len(list_weights)), 2)}%, best so far -> {best_from_dict(grid_search)} // current weights -> {weights}", end='\r')
        if check_if_clusters_all_calculated_for_distance(grid_search, clusters, weights):
            continue

        distances = calculate_distances(X_patient, 'filter_dist', scaled=True)

        for n_clusters in clusters:
            key_grid_search = get_str(n_clusters, weights)
            if grid_search.get(key_grid_search) is not None:
                continue

            grid_search[key_grid_search] = get_cluster_score(n_clusters, distances)
            with open(path_dict, 'wb') as handle:
                pickle.dump(grid_search, handle)
            nb_done += 1

Nb calculated -> 4584, amounts for 24.6%, best so far -> (1.8567908109912595e-05, 'n_clusters:5, weights:100 60 45') // current weights -> [90, 60, 35]

### Meilleurs weights

In [28]:
# Highest score recorded so far and the 'n_clusters:..., weights:...' key it belongs to.
max_score, max_key = best_from_dict(grid_search)

In [29]:
# Display the best score.
max_score

1.8567908109912595e-05

In [30]:
# Display the cluster count and weights that produced the best score.
max_key

'n_clusters:5, weights:100 60 45'