In [1]:
%%capture capt
!pip install scikit-learn-extra
!pip install optuna

In [2]:
%%capture capt
import numpy as np
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
import pickle
import random
import optuna

### Importation des données

In [3]:
# Patient profile table; later cells rely on its row count and on its "cluster" column.
base_patient = pd.read_csv("../data/profil_patient.csv")

In [4]:
%%capture capt
# Care-pathway table; each row is later reduced to its non-null values (first value skipped).
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [5]:
# One list per patient: the non-null cells of the matching p_soins row, minus the
# first one (presumably an identifier column — TODO confirm).
X_patient = [
    p_soins.iloc[row_idx].dropna().tolist()[1:]
    for row_idx in range(len(base_patient))
]

### Accéder aux fonctions de métrique

In [6]:
%run Metriques.ipynb

Installation/Importation des librairies necessaires.


In [7]:
# NOTE: a `global` statement at module level has no effect; `weights` is only
# bound once a later cell assigns it.
global weights

def custom_dist(arr1, arr2):
    """Call ``custom_distance`` on the two inputs with the module-level ``weights``."""
    return custom_distance(arr1, arr2, weights=weights)

def custom_dist_norm(arr1, arr2):
    """Call ``custom_distance_norm`` on the two inputs with the module-level ``weights``."""
    return custom_distance_norm(arr1, arr2, weights=weights)

def filter_dist(arr1, arr2):
    """Call ``distance_filter`` on the two inputs with the module-level ``weights``."""
    return distance_filter(arr1, arr2, weights=weights)

def filter_dist_4weights(arr1, arr2):
    """Call ``distance_filter_4weights`` on the two inputs with the module-level ``weights``."""
    return distance_filter_4weights(arr1, arr2, weights=weights)

### Calculer les distances

In [8]:
# Number of CPUs reported by the system (presumably used by Distances.ipynb to size
# its worker pool — TODO confirm).
nb_cpu_cores = multiprocessing.cpu_count()
print(f"CPU cores available -> {nb_cpu_cores}")

CPU cores available -> 72


In [9]:
%run Distances.ipynb

### Grid Search

### Parameters

In [10]:
# Candidate numbers of clusters tried for every weight set: 5, 7, ..., 21.
clusters = range(5, 22, 2)

#### Calculate distances

In [11]:
# Registry mapping a distance name to the pairwise callable that implements it.
dict_distance_function = {
    'jaro_winkler': jaro_winkler_distance,
    'levenshtein': levenshtein_distance,
    'custom': custom_dist,
    'custom_norm': custom_dist_norm,
    'filter_dist': filter_dist,
    'filter_dist_4weights': filter_dist_4weights,
}

In [12]:
def calculate_distances(data, distance, scaled=False):
    """Build the pairwise distance matrix of ``data`` for a named distance.

    Parameters
    ----------
    data : sequence passed as-is to ``custom_pairwise_distances``.
    distance : key of ``dict_distance_function`` selecting the distance callable.
    scaled : when true, ``scale_distances`` is called on the matrix before returning.

    Returns
    -------
    The object returned by ``custom_pairwise_distances``.

    Raises
    ------
    KeyError if ``distance`` is not a registered name.
    """
    metric = dict_distance_function[distance]
    matrix = custom_pairwise_distances(data, metric)

    if not scaled:
        return matrix

    # NOTE(review): the return value is discarded, so this only has an effect if
    # scale_distances works in place — confirm against its definition.
    scale_distances(matrix)
    return matrix

### Calculate clusters

In [13]:
def get_kmedoids(n_clusters, distances, random_state=None):
    """Fit a k-medoids model on a precomputed distance matrix.

    Parameters
    ----------
    n_clusters : number of medoids to fit.
    distances : precomputed pairwise distance matrix (``metric='precomputed'``).
    random_state : optional seed forwarded to ``KMedoids``. The default ``None``
        keeps the previous unseeded behaviour, so repeated calls may produce
        different clusterings; pass an int for reproducible results.

    Returns
    -------
    The fitted ``KMedoids`` estimator.
    """
    kmedoids = KMedoids(
        n_clusters=n_clusters,
        metric='precomputed',
        init='k-medoids++',
        random_state=random_state,
    )
    kmedoids.fit(distances)
    return kmedoids

In [14]:
def ajout_cluster_base(kmedoids, n_clusters, base=None):
    """Return a copy of the patient table with one-hot cluster membership columns.

    The existing "cluster" column is dropped and replaced by ``cluster1`` ..
    ``cluster<n_clusters>``, holding 1 when the row was assigned to that cluster
    and 0 otherwise.

    Parameters
    ----------
    kmedoids : fitted estimator exposing ``labels_`` (one 0-based label per row).
    n_clusters : number of indicator columns to create.
    base : optional DataFrame to use instead of the module-level ``base_patient``.

    Returns
    -------
    A new DataFrame; the source table is left untouched.

    Raises
    ------
    KeyError if the table has no "cluster" column.
    ValueError if the number of labels differs from the number of rows.
    """
    source = base_patient if base is None else base
    base_patient_cluster = source.drop(columns=["cluster"])

    labels = np.asarray(kmedoids.labels_)
    if len(labels) != len(base_patient_cluster):
        raise ValueError(
            f"got {len(labels)} labels for {len(base_patient_cluster)} patients"
        )

    # Whole-column positional assignment: correct whatever the frame's index is,
    # whereas label-based .loc[i, ...] would append rows on a non 0..n-1 index.
    for k in range(n_clusters):
        base_patient_cluster["cluster" + str(k + 1)] = (labels == k).astype(int)
    return base_patient_cluster
        
def ajout_p_soins(base_patient_cluster):
    """Add a ``p_soins`` column holding each patient's pathway as a sentence.

    Every entry of the module-level ``X_patient`` is converted with
    ``convert_to_sentence`` and stored, in order, in the new column.

    Parameters
    ----------
    base_patient_cluster : DataFrame with one row per entry of ``X_patient``.
        It is modified in place.

    Returns
    -------
    The same DataFrame, for chaining.

    Raises
    ------
    ValueError if ``X_patient`` and the frame do not have the same length.
    """
    sentences = [convert_to_sentence(patient) for patient in X_patient]
    if len(sentences) != len(base_patient_cluster):
        raise ValueError(
            f"got {len(sentences)} pathways for {len(base_patient_cluster)} patients"
        )
    # Single positional column assignment instead of one label-based .loc write
    # per row: faster, and independent of the frame's index.
    base_patient_cluster['p_soins'] = sentences
    return base_patient_cluster

In [15]:
def get_cluster_score(n_clusters, distances):
    """Cluster the patients with k-medoids and return the value of ``get_score()``.

    Side effect: overwrites ../data/custom_metric/grid_search.csv on every call.

    Parameters
    ----------
    n_clusters : number of clusters passed to ``get_kmedoids``.
    distances : precomputed distance matrix passed to ``get_kmedoids``.
    """
    kmedoids = get_kmedoids(n_clusters, distances)
    base_patient_cluster = ajout_cluster_base(kmedoids, n_clusters)
    base_patient_cluster = ajout_p_soins(base_patient_cluster)
    # Presumably the assessment notebook reads this file back — TODO confirm.
    base_patient_cluster.to_csv("../data/custom_metric/grid_search.csv", index=False)
    # The whole assessment notebook is re-executed on each call; get_score is
    # presumably defined there (it is not defined in this notebook).
    %run Cluster_assessment.ipynb
    score = get_score()
    return score

In [16]:
def get_weights_score(distance, weights_tested, scaled=True, cv=2):
    """Score the current module-level ``weights`` over every candidate cluster count.

    For each value in ``clusters`` the clustering is scored ``cv`` times and the
    scores are averaged; the best average and its cluster count are returned.

    Parameters
    ----------
    distance : name of the distance, as registered in ``dict_distance_function``.
    weights_tested : dict used as a cache of previously scored weights.
    scaled : forwarded to ``calculate_distances``.
    cv : number of repeated scorings averaged per cluster count.

    Returns
    -------
    ``(best_mean_score, best_n_clusters)``, or the cached value on a cache hit.
    """
    global weights

    # NOTE(review): the lookup key is "w1 w2 w3 w4", whereas find_best_weights
    # stores entries under "n_clusters:<k>, weights: <...>" with a bare score as
    # value, so this branch cannot match those entries — and if it did, it would
    # not return a (score, n_clusters) pair. Confirm the intended cache format.
    cache_key = get_str_weights(weights)
    if weights_tested.get(cache_key):
        return weights_tested[cache_key]

    distance_matrix = calculate_distances(X_patient, distance, scaled=scaled)

    top_score, top_k = -np.inf, 0
    for k in clusters:
        runs = [get_cluster_score(k, distance_matrix) for _ in range(cv)]
        mean_score = np.mean(runs)
        # Strict comparison: on ties the smallest cluster count is kept.
        if mean_score > top_score:
            top_score, top_k = mean_score, k
    return top_score, top_k

# Optuna GridSearch

In [17]:
# Pickle file in which the dict of already-scored weights is persisted between runs.
path_dict = '../data/grid_search/gridSearch_filter_4weights.pickle'

In [18]:
# Load the scores persisted by previous runs.
# NOTE: pickle.load can execute arbitrary code — only load cache files written by
# this notebook.
try:
    with open(path_dict, 'rb') as handle:
        weights_tested = pickle.load(handle)
except FileNotFoundError:
    # First run: nothing has been cached yet.
    weights_tested = {}
except (EOFError, pickle.UnpicklingError) as err:
    # Unreadable cache: say so explicitly, because the file will be overwritten
    # by the next trial.
    print(f"Could not read {path_dict} ({err!r}); starting from an empty cache.")
    weights_tested = {}

print(len(weights_tested))

176


In [19]:
def get_str_weights(w):
    """Return the first four entries of ``w`` as one space-separated string.

    Raises IndexError when ``w`` has fewer than four entries.
    """
    first_four = (w[0], w[1], w[2], w[3])
    return " ".join(str(value) for value in first_four)

In [26]:
def find_best_weights(trial):
    """Optuna objective: sample four strictly decreasing weights and score them.

    The weights are multiples of 5 with w1 > w2 > w3 > w4 >= 0. They are published
    through the module-level ``weights`` (read by the distance wrappers), scored
    with ``get_weights_score``, and the best score seen for each
    (n_clusters, weights) pair is persisted to ``path_dict`` after every trial.

    Parameters
    ----------
    trial : the Optuna trial used to sample w1..w4.

    Returns
    -------
    The score of the sampled weights (maximised by the study).
    """
    global weights

    # `step` is passed by keyword: recent Optuna releases no longer accept it
    # positionally.
    w1 = trial.suggest_int('w1', 15, 100, step=5)
    w2 = trial.suggest_int('w2', 10, w1 - 5, step=5)
    w3 = trial.suggest_int('w3', 5, w2 - 5, step=5)
    w4 = trial.suggest_int('w4', 0, w3 - 5, step=5)

    weights = [w1, w2, w3, w4]

    score, nb_clusters = get_weights_score('filter_dist_4weights', weights_tested, scaled=True, cv=1)

    # Keep the best score per key; .get(key, score) also handles a stored score
    # of 0.0, which a truthiness test would treat as missing.
    key = f"n_clusters:{nb_clusters}, weights: {get_str_weights(weights)}"
    weights_tested[key] = max(score, weights_tested.get(key, score))

    with open(path_dict, 'wb') as handle:
        pickle.dump(weights_tested, handle)
    return score

In [27]:
# Unique identifier of the study; it also names the SQLite file backing the storage.
study_name = "4weights_study"
storage_name = f"sqlite:///{study_name}.db"
print(storage_name)

sqlite:///4weights_study.db


In [None]:
# Create the study in the SQLite storage, or reopen the one already stored under
# this name (load_if_exists=True), then run 2500 trials maximising the objective.
study = optuna.create_study(direction = 'maximize', study_name=study_name, storage=storage_name, load_if_exists=True)
study.optimize(find_best_weights, n_trials=2500, show_progress_bar=True)

In [29]:
study.best_params

{'w1': 80, 'w2': 75, 'w3': 55, 'w4': 40}

### Check best params found

In [32]:
# Read the weights by name so their order never depends on the key order of the
# best_params dict.
best_weights = [study.best_params[name] for name in ('w1', 'w2', 'w3', 'w4')]

In [33]:
best_weights

[80, 75, 55, 40]

In [36]:
# Rebind the module-level weights read by the distance wrappers, then re-score them
# with 5 repetitions per cluster count. (`global` is a no-op at module level.)
global weights
weights = best_weights
best_score, best_n_clusters = get_weights_score('filter_dist_4weights', weights_tested, scaled=True, cv=5)

In [37]:
best_score

1.061238419217001e-05

In [38]:
best_n_clusters

19