In [1]:
%%capture capt
!pip install scikit-learn-extra
!pip install optuna

In [2]:
%%capture capt
import numpy as np
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
import pickle
import random
import optuna

### Importation des données

In [3]:
# Patient profile table; its row count drives the construction of X_patient below.
base_patient = pd.read_csv("../data/profil_patient.csv")

In [4]:
%%capture capt
# Care-pathway table. NOTE(review): later cells pair row i of this table with row i of
# base_patient, so both files are presumably in the same patient order -- confirm.
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [5]:
# One list per patient: the non-null cells of the matching p_soins row, minus the
# first remaining cell (presumably an identifier column -- TODO confirm).
X_patient = [
    p_soins.iloc[row_pos].dropna().tolist()[1:]
    for row_pos in range(len(base_patient))
]

### Accéder aux fonctions de métrique

In [6]:
%run Metriques.ipynb

Installation/Importation des librairies necessaires.


In [7]:
# NOTE: a `global` statement at module level has no effect; `weights` itself is
# assigned in a later cell and must exist before any wrapper below is called.
global weights

def custom_dist(arr1, arr2):
    """Call `custom_distance` on the two inputs using the module-level `weights`."""
    return custom_distance(arr1, arr2, weights=weights)

def custom_dist_norm(arr1, arr2):
    """Call `custom_distance_norm` on the two inputs using the module-level `weights`."""
    return custom_distance_norm(arr1, arr2, weights=weights)

def filter_dist(arr1, arr2):
    """Call `distance_filter` on the two inputs using the module-level `weights`."""
    return distance_filter(arr1, arr2, weights=weights)

def filter_dist_4weights(arr1, arr2):
    """Call `distance_filter_4weights` on the two inputs using the module-level `weights`."""
    return distance_filter_4weights(arr1, arr2, weights=weights)

### Calculer les distances

In [8]:
# CPU count reported by the system.
# NOTE(review): presumably consumed by Distances.ipynb (run next) -- confirm.
nb_cpu_cores = multiprocessing.cpu_count()
print(f"CPU cores available -> {nb_cpu_cores}")

CPU cores available -> 72


In [9]:
%run Distances.ipynb

### Grid Search

### Parameters

In [10]:
# Every strictly decreasing 4-tuple (i > j > k > l) drawn from the multiples of 5 in 0..100.
list_weights = [[i, j, k, l] for i in range(0, 101, 5) for j in range(0, i, 5) for k in range(0, j, 5) for l in range(0, k, 5)]

In [11]:
# Replace every weight equal to 0 by 1, in place.
for combo in list_weights:
    for pos in range(len(list_weights[0])):
        if combo[pos] == 0:
            combo[pos] = 1

In [12]:
# Put the uniform baseline [1, 1, 1, 1] at index 0.
# Guarded so that re-running this cell alone does not insert it a second time: a
# duplicate would shift every index, and the Optuna study persists those indices
# (parameter 'i') in its SQLite storage.
if list_weights[:1] != [[1, 1, 1, 1]]:
    list_weights = [[1, 1, 1, 1]] + list_weights

In [13]:
# Candidate numbers of clusters: 5, 7, ..., 21.
clusters = range(5, 22, 2)

#### Calculate distances

In [14]:
# Initialise the module-level `weights` read by the distance wrappers with the
# first candidate of list_weights.
weights = list_weights[0]

In [15]:
# Registry mapping a distance name to the callable that compares two care pathways.
dict_distance_function = {
    'jaro_winkler': jaro_winkler_distance,
    'levenshtein': levenshtein_distance,
    'custom': custom_dist,
    'custom_norm': custom_dist_norm,
    'filter_dist': filter_dist,
    'filter_dist_4weights': filter_dist_4weights,
}

In [16]:
def calculate_distances(data, distance, scaled=False):
    """Build the pairwise distance matrix of `data` for a named distance.

    Args:
        data: collection of items handed to `custom_pairwise_distances`.
        distance: key of `dict_distance_function` selecting the metric.
        scaled: when true, `scale_distances` is applied to the matrix.

    Returns:
        The object returned by `custom_pairwise_distances`.

    Raises:
        KeyError: if `distance` is not a key of `dict_distance_function`.
    """
    metric = dict_distance_function[distance]
    matrix = custom_pairwise_distances(data, metric)
    if not scaled:
        return matrix
    # NOTE(review): the return value of scale_distances is discarded, so this only
    # has an effect if it rescales its argument in place -- confirm in Distances.ipynb.
    scale_distances(matrix)
    return matrix

### Calculate clusters

In [17]:
def get_kmedoids(n_clusters, distances, random_state=None):
    """Fit a K-Medoids model on a precomputed distance matrix.

    Args:
        n_clusters: number of clusters (medoids) to fit.
        distances: precomputed pairwise distance matrix passed to `fit`.
        random_state: optional seed forwarded to `KMedoids` so that the
            'k-medoids++' initialisation can be made reproducible. The default
            (None) keeps the previous, unseeded behaviour.

    Returns:
        The fitted `KMedoids` estimator.
    """
    kmedoids = KMedoids(
        n_clusters=n_clusters,
        metric='precomputed',
        init='k-medoids++',
        random_state=random_state,
    )
    kmedoids.fit(distances)
    return kmedoids

In [18]:
def ajout_cluster_base(kmedoids, n_clusters):
    """Return a copy of `base_patient` with one-hot cluster membership columns.

    The existing "cluster" column is dropped and columns "cluster1" ..
    "cluster<n_clusters>" are added; "cluster<k>" is 1 for rows whose label in
    `kmedoids.labels_` equals k - 1 and 0 otherwise. `base_patient` itself is
    not modified.

    Labels are matched to rows by position, so the result no longer depends on
    `base_patient` having a default 0..n-1 index (row-wise `.loc[i, ...]` writes
    would add new rows for a different index).

    Args:
        kmedoids: fitted model exposing `labels_`, one label per patient row.
        n_clusters: number of indicator columns to create.

    Returns:
        The new DataFrame.

    Raises:
        KeyError: if `base_patient` has no "cluster" column.
        ValueError: if the number of labels differs from the number of rows.
    """
    base_patient_cluster = base_patient.drop(columns=["cluster"])
    labels = np.asarray(kmedoids.labels_)
    for cluster_id in range(1, n_clusters + 1):
        # Labels are 0-based, column names are 1-based.
        membership = (labels == cluster_id - 1).astype("int64")
        base_patient_cluster["cluster" + str(cluster_id)] = membership
    return base_patient_cluster
        
def ajout_p_soins(base_patient_cluster):
    """Add a 'p_soins' column holding each patient's care pathway as a sentence.

    The column is filled, by position, with `convert_to_sentence(patient)` for
    every entry of the module-level `X_patient`. A single column assignment
    replaces the per-row `.loc[i, ...]` writes, which were slow and added new
    rows when the frame did not have a default 0..n-1 index.

    Args:
        base_patient_cluster: DataFrame with one row per entry of `X_patient`;
            it is modified in place.

    Returns:
        The same DataFrame object, for chaining.

    Raises:
        ValueError: if `X_patient` and the frame have different lengths.
    """
    base_patient_cluster['p_soins'] = [convert_to_sentence(patient) for patient in X_patient]
    return base_patient_cluster

In [19]:
def get_cluster_score(n_clusters, distances):
    """Cluster the patients with K-Medoids and return the score from `get_score()`.

    Steps: fit K-Medoids on `distances`, build the patient table with cluster
    indicator columns and the 'p_soins' column, write it to a fixed CSV path,
    run the Cluster_assessment notebook, then call `get_score()`.

    Args:
        n_clusters: number of clusters to fit.
        distances: precomputed pairwise distance matrix.

    Returns:
        Whatever `get_score()` returns.
    """
    kmedoids = get_kmedoids(n_clusters, distances)
    base_patient_cluster = ajout_cluster_base(kmedoids, n_clusters)
    base_patient_cluster = ajout_p_soins(base_patient_cluster)
    # The file is overwritten on every call.
    base_patient_cluster.to_csv("../data/custom_metric/grid_search.csv", index=False)
    # NOTE(review): Cluster_assessment.ipynb presumably reads the CSV written above and
    # defines get_score() -- confirm; the order of these three statements matters.
    %run Cluster_assessment.ipynb
    score = get_score()
    return score

In [20]:
def _lookup_cached_score(weights_tested, weights_str):
    """Return (score, n_clusters) cached for `weights_str`, or None if absent.

    Cache keys have the form "n_clusters:<n>, weights: <w0> <w1> <w2> <w3>".
    """
    prefix = "n_clusters:"
    suffix = f", weights: {weights_str}"
    for key, cached_score in weights_tested.items():
        if isinstance(key, str) and key.startswith(prefix) and key.endswith(suffix):
            n_part = key[len(prefix):-len(suffix)]
            if n_part.isdigit():
                return cached_score, int(n_part)
    return None


def get_weights_score(distance, weights_tested=None, scaled=True, cv=2):
    """Score the current module-level `weights` and pick the best cluster count.

    For every value in `clusters`, the clustering score is averaged over `cv`
    runs; the best average and the matching number of clusters are returned.

    Args:
        distance: key of `dict_distance_function` naming the distance to use.
        weights_tested: optional cache of earlier results, keyed as
            "n_clusters:<n>, weights: <w0> <w1> <w2> <w3>". When it holds an
            entry for the current weights, that entry is returned without
            recomputing. Entries are not keyed by `distance`, `scaled` or `cv`,
            so pass None (the default) or an empty dict to force a fresh
            evaluation.
        scaled: forwarded to `calculate_distances`.
        cv: number of clustering runs averaged per cluster count.

    Returns:
        Tuple `(best_score, best_n_clusters)`, on a cache hit as well.
    """
    # `weights` is only read here, so no `global` declaration is required.
    if weights_tested:
        cached = _lookup_cached_score(weights_tested, get_str_weights(weights))
        if cached is not None:
            return cached

    distances = calculate_distances(X_patient, distance, scaled=scaled)

    max_score = -np.inf
    best_n_clusters = 0
    for n_clusters in clusters:
        score = np.mean([get_cluster_score(n_clusters, distances) for _ in range(cv)])
        if score > max_score:
            max_score = score
            best_n_clusters = n_clusters
    return max_score, best_n_clusters

# Optuna GridSearch

In [21]:
# Pickle file in which the dictionary of already evaluated weights is persisted.
path_dict = '../data/grid_search/gridSearch_filter_4weights.pickle'

In [22]:
# Reload the results of earlier runs, or start with an empty cache.
# NOTE: pickle.load executes arbitrary code from the file; only load a cache that
# this notebook produced itself.
try:
    with open(path_dict, 'rb') as handle:
        weights_tested = pickle.load(handle)
except FileNotFoundError:
    # First run: nothing has been cached yet.
    weights_tested = {}
except (EOFError, pickle.UnpicklingError) as err:
    # Empty or truncated file (e.g. an interrupted dump): report it instead of
    # hiding it, then start over.
    print(f"Ignoring unreadable cache {path_dict}: {err}")
    weights_tested = {}

print(len(weights_tested))

9


In [23]:
def get_str_weights(w):
    """Format the first four entries of `w` as a space-separated string."""
    return " ".join(str(w[pos]) for pos in range(4))

In [24]:
def find_best_weights(trial):
    """Optuna objective: score the weight vector selected by index 'i'.

    The trial picks an index into `list_weights`; the chosen vector becomes the
    module-level `weights`, is scored with the 'filter_dist_4weights' distance
    (cv=1), and the result is recorded in `weights_tested`, which is written
    back to `path_dict` after every trial.

    Args:
        trial: Optuna trial used to suggest the integer parameter 'i'.

    Returns:
        The score obtained for the selected weights.
    """
    global weights
    last_index = len(list_weights) - 1
    chosen = trial.suggest_int('i', 0, last_index)
    weights = list_weights[chosen]

    score, nb_clusters = get_weights_score('filter_dist_4weights', weights_tested, scaled=True, cv=1)

    result_key = f"n_clusters:{nb_clusters}, weights: {get_str_weights(weights)}"
    weights_tested[result_key] = score
    # Persist after each trial so an interrupted study keeps its results.
    with open(path_dict, 'wb') as handle:
        pickle.dump(weights_tested, handle)
    return score

In [25]:
# Unique identifier of the study; also used as the base name of its SQLite file.
study_name = "4weights_study"
storage_name = f"sqlite:///{study_name}.db"
print(storage_name)

sqlite:///4weights_study.db


In [None]:
# Create the study in `storage_name`, or reload it if a study with this name already
# exists there (load_if_exists=True). Scores are maximised.
study = optuna.create_study(direction = 'maximize', study_name=study_name, storage=storage_name, load_if_exists=True)
# Trial budget: half as many trials as there are candidate weight vectors.
study.optimize(find_best_weights, n_trials=len(list_weights)//2, show_progress_bar=True)

  self._init_valid()


  0%|          | 0/2993 [00:00<?, ?it/s]

[32m[I 2023-04-13 10:36:10,918][0m Trial 18 finished with value: 1.2082279018899515e-05 and parameters: {'i': 3846}. Best is trial 1 with value: 1.85647381322038e-05.[0m
[32m[I 2023-04-13 10:45:31,082][0m Trial 19 finished with value: 1.8548273254445637e-05 and parameters: {'i': 80}. Best is trial 1 with value: 1.85647381322038e-05.[0m
[32m[I 2023-04-13 10:54:56,621][0m Trial 20 finished with value: 1.676572980574659e-05 and parameters: {'i': 2364}. Best is trial 1 with value: 1.85647381322038e-05.[0m
[32m[I 2023-04-13 11:04:13,425][0m Trial 21 finished with value: 1.7929329173519945e-05 and parameters: {'i': 1140}. Best is trial 1 with value: 1.85647381322038e-05.[0m
[32m[I 2023-04-13 11:13:54,277][0m Trial 22 finished with value: 1.775879538382395e-05 and parameters: {'i': 3895}. Best is trial 1 with value: 1.85647381322038e-05.[0m
[32m[I 2023-04-13 11:23:45,210][0m Trial 23 finished with value: 1.4083121604331946e-05 and parameters: {'i': 2864}. Best is trial 1 with

In [25]:
study.best_params

{'i': 712}

### Check best params found

In [29]:
# Read the best index by its parameter name ('i', the only parameter suggested by
# find_best_weights) instead of relying on the position of the first dict value.
index_best_weights = study.best_params['i']

In [30]:
index_best_weights

712

In [31]:
list_weights[index_best_weights]

[60, 55, 50, 30]

In [32]:
# Re-evaluate the best weights with more repetitions (cv=3).
# get_weights_score reads the module-level `weights`, so set it first; a module-level
# `global` statement is not needed for that.
weights = list_weights[index_best_weights]
# An empty cache is passed so the score is recomputed rather than looked up, and the
# (score, n_clusters) result is unpacked so that `best_score` is the score itself.
best_score, best_n_clusters = get_weights_score('filter_dist_4weights', {}, scaled=True, cv=3)

In [33]:
best_score

1.4902595147040688e-05

In [36]:
# Tally the entries of weights_tested whose value is at least 1: how many there are
# (nb), the sum of their values (nb_keys) and the largest value (max_).
# NOTE(review): the >= 1 threshold suggests the values were counts when this cell was
# written, not scores -- confirm against the contents of the pickle.
values_at_least_one = [value for value in weights_tested.values() if value >= 1]
nb = len(values_at_least_one)
nb_keys = sum(values_at_least_one)
max_ = max(values_at_least_one, default=0)

In [37]:
# Number of entries with a value >= 1, the sum of those values, and the largest one.
print(nb)
print(nb_keys)
print(max_)

233
240
2
