In [1]:
%%capture capt
import numpy as np
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
import pickle

### Importation des données

In [2]:
# Patient profile table. A later cell drops its "cluster" column before adding
# one indicator column per cluster.
base_patient = pd.read_csv("../data/profil_patient.csv")

In [3]:
%%capture capt
# Care-pathway table. Rows are matched to `base_patient` by position when
# X_patient is built -- presumably one row per patient in the same order; verify.
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [4]:
# One list per patient: the non-null values of the matching `p_soins` row,
# minus the first one (presumably an identifier column -- TODO confirm).
# Rows are paired with `base_patient` purely by position.
X_patient = [
    p_soins.iloc[row_position].dropna().tolist()[1:]
    for row_position in range(len(base_patient))
]

### Accéder aux fonctions de métrique

In [5]:
# Runs Metriques.ipynb. The cells below call custom_distance, custom_distance_norm,
# jaro_winkler_distance, levenshtein_distance and convert_to_sentence, none of which
# are defined in this notebook -- presumably they come from here; TODO confirm.
%run Metriques.ipynb

Installation/Importation des librairies necessaires.


In [6]:
weights = [1, 1, 1]

def custom_dist(arr1, arr2):
    """Call ``custom_distance`` on two items using the module-level ``weights``.

    ``weights`` is looked up when the function is called, so rebinding the
    global (as the grid-search loop does) changes the result of later calls.
    ``custom_distance`` is not defined in this notebook -- presumably it comes
    from Metriques.ipynb; confirm there for the expected input types.
    """
    return custom_distance(arr1, arr2, weights=weights)

def custom_dist_norm(arr1, arr2):
    """Call ``custom_distance_norm`` on two items using the module-level ``weights``.

    ``weights`` is looked up when the function is called, so rebinding the
    global (as the grid-search loop does) changes the result of later calls.
    ``custom_distance_norm`` is not defined in this notebook -- presumably it
    comes from Metriques.ipynb; confirm there for the expected input types.
    """
    return custom_distance_norm(arr1, arr2, weights=weights)

### Calculer les distances

In [7]:
# Number of CPU cores reported by the machine; used below as the default
# `divide_into` (number of chunks) of the parallel distance computation.
nb_cpu_cores = multiprocessing.cpu_count()
print(f"CPU cores available -> {nb_cpu_cores}")

CPU cores available -> 56


In [8]:
def custom_pairwise_distances_i_triangle(data, distance_function, i):
    """Build row ``i`` of the lower-triangular distance matrix.

    Returns a list of ``i + 1`` values: ``distance_function(data[i], data[j])``
    for every ``j < i``, followed by the literal ``0`` for the diagonal entry.
    """
    row = []
    for j in range(i):
        row.append(distance_function(data[i], data[j]))
    # Diagonal entry: an item is at distance 0 from itself.
    row.append(0)
    return row

def custom_pairwise_distances_triangle_for_multithread(global_data, distance_function, starting_index, length):
    """Compute a contiguous chunk of lower-triangle rows.

    Produces the rows ``starting_index`` .. ``starting_index + length - 1`` of
    the triangle, each built by ``custom_pairwise_distances_i_triangle`` against
    the full ``global_data``. Returns them as a list of lists, in row order.
    """
    chunk_rows = []
    for offset in range(length):
        chunk_rows.append(
            custom_pairwise_distances_i_triangle(global_data, distance_function, starting_index + offset)
        )
    return chunk_rows

def get_starting_indices(len_data, divide_into):
    """Return the first row index of each chunk when splitting ``len_data`` rows.

    The rows ``0 .. len_data - 1`` are cut into contiguous chunks of nearly
    equal size; element ``k`` of the result is where chunk ``k`` starts. The
    first start is always 0 and no start equals ``len_data``, so every chunk
    is non-empty whenever ``len_data > 0``.

    Parameters
    ----------
    len_data : int
        Number of rows to split.
    divide_into : int
        Requested number of chunks. It is clamped to the range
        ``1 .. len_data`` so that a value of 0 (or less) still yields one
        chunk and no more chunks than rows are created.

    Returns
    -------
    list[int]
        Strictly increasing chunk start indices (``[0]`` when ``len_data`` is 0).
    """
    nb_chunks = max(1, min(int(divide_into), int(len_data)))
    # Exact integer arithmetic: no float rounding, and the end point
    # ``len_data`` is never emitted as a start (that would be an empty chunk).
    return [(chunk * len_data) // nb_chunks for chunk in range(nb_chunks)]

def get_lengths(len_data, indices):
    """Derive chunk lengths from a list of chunk start indices.

    With zero or one start index the whole data set is a single chunk of
    ``len_data`` rows. Otherwise the first length is ``indices[1]`` (the first
    chunk is taken to start at 0), the following ones are the gaps between
    consecutive starts, and the last one runs from ``indices[-1]`` up to
    ``len_data``.
    """
    if len(indices) <= 1:
        return [len_data]
    first = [indices[1]]
    gaps = [later - earlier for earlier, later in zip(indices[1:], indices[2:])]
    last = [len_data - indices[-1]]
    return first + gaps + last

def custom_pairwise_distances_triangle_multithread(data, distance_function, divide_into=nb_cpu_cores):
    """Compute the lower-triangular distance rows of ``data`` in parallel.

    The row range is cut into chunks (``get_starting_indices`` / ``get_lengths``)
    and each chunk is submitted to a ``multiprocessing.Pool`` -- so despite the
    name the work runs in worker processes. Results are collected in
    submission order, which keeps the rows in index order.

    Returns a list of lists where row ``i`` holds ``i + 1`` values.
    """
    chunk_starts = get_starting_indices(len(data), divide_into)
    chunk_lengths = get_lengths(len(data), chunk_starts)

    triangle_rows = []
    with Pool() as pool:
        pending = [
            pool.apply_async(
                custom_pairwise_distances_triangle_for_multithread,
                args=(data, distance_function, chunk_starts[chunk], chunk_lengths[chunk]),
            )
            for chunk in range(len(chunk_lengths))
        ]
        # Results must be fetched while the pool is still alive.
        for task in pending:
            triangle_rows.extend(task.get())
    return triangle_rows
    
def custom_pairwise_distances(data, distance_function, divide_into=nb_cpu_cores):
    """Return the full symmetric distance matrix of ``data`` as a NumPy array.

    Only the lower triangle is actually computed (in parallel); the upper
    triangle is then filled in by mirroring, i.e. entry ``[i][j]`` for
    ``j > i`` is copied from ``[j][i]``.
    """
    matrix_rows = custom_pairwise_distances_triangle_multithread(data, distance_function, divide_into=divide_into)
    size = len(data)
    for row in range(size):
        # Row `row` currently has `row + 1` entries; append the mirrored values
        # taken from column `row` of every later row.
        matrix_rows[row].extend(matrix_rows[col][row] for col in range(row + 1, size))
    return np.array(matrix_rows)

### Grid Search

### Parameters

In [9]:
# Candidate weight triples [i, j, k] on a step-5 grid: i in 0..40, j in 0..i and
# k in 0..j-5 (range(0, j, 5) excludes j, so k is always strictly below j).
list_weights = [[i, j, k] for i in range(0, 41, 5) for j in range(0, i+1, 5) for k in range(0, j, 5)]

In [10]:
# Replace every zero weight by 1, modifying the triples of `list_weights` in place.
for triple in list_weights:
    for position in (0, 1, 2):
        if triple[position] == 0:
            triple[position] = 1

In [11]:
# Put the uniform weighting [1, 1, 1] first so it is the first candidate evaluated.
list_weights = [[1, 1, 1]] + list_weights

#### Calculate distances

In [12]:
# Module-level weights read by custom_dist / custom_dist_norm each time they are called.
weights = list_weights[0]

In [13]:
# Lookup table from a metric name to the function that computes it;
# `calculate_distances` selects its distance function from here.
dict_distance_function = {
    'jaro_winkler': jaro_winkler_distance,
    'levenshtein': levenshtein_distance,
    'custom': custom_dist,
    'custom_norm': custom_dist_norm,
}

In [14]:
def calculate_distances(data, distance):
    """Compute the pairwise distance matrix of ``data`` for a named metric.

    ``distance`` is a key of ``dict_distance_function``; an unknown name
    raises ``KeyError``. The matrix itself is produced by
    ``custom_pairwise_distances``.
    """
    return custom_pairwise_distances(data, dict_distance_function[distance])

### Calculate clusters

In [15]:
# Number of clusters passed to get_cluster_score for every grid-search candidate.
n_clusters=20

In [16]:
def get_kmedoids(n_clusters, distances):
    """Fit a K-Medoids model on a precomputed distance matrix and return it.

    ``distances`` is handed to ``fit`` as-is; with ``metric='precomputed'`` it
    is the pairwise distance matrix rather than raw samples.
    """
    model = KMedoids(metric='precomputed', n_clusters=n_clusters)
    model.fit(distances)
    return model

In [33]:
def ajout_cluster_base(kmedoids):
    """Return a copy of ``base_patient`` with one 0/1 indicator column per cluster.

    Columns are named ``cluster1`` .. ``clusterK`` where ``K`` is the number of
    clusters of the fitted model itself (``kmedoids.n_clusters``), not the
    notebook-level ``n_clusters``; the two can differ when the model was
    fitted with another value. Row ``r`` gets a 1 in ``cluster<label + 1>``
    for its label ``kmedoids.labels_[r]`` and 0 elsewhere.

    Labels are matched to rows by position, so the result does not depend on
    the DataFrame index. ``base_patient`` itself is not modified.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows of
        ``base_patient`` (raised by pandas on column assignment).
    """
    # errors="ignore": tolerate a profile table without a previous "cluster" column.
    base_patient_cluster = base_patient.drop(columns=["cluster"], errors="ignore")
    labels = np.asarray(kmedoids.labels_)
    for cluster_id in range(kmedoids.n_clusters):
        # Vectorised one-hot column instead of one .loc write per patient.
        base_patient_cluster["cluster" + str(cluster_id + 1)] = (labels == cluster_id).astype("int64")
    return base_patient_cluster
        
def ajout_p_soins(base_patient_cluster):
    """Add a ``p_soins`` column holding each patient's pathway as one string.

    The column is first filled with empty strings, then row ``r`` receives
    ``convert_to_sentence(X_patient[r])``. The DataFrame passed in is modified
    in place and also returned.
    """
    base_patient_cluster['p_soins'] = [''] * len(base_patient_cluster)
    for row_label, parcours in enumerate(X_patient):
        base_patient_cluster.loc[row_label, 'p_soins'] = convert_to_sentence(parcours)
    return base_patient_cluster

In [36]:
def get_cluster_score(n_clusters, distances):
    """Cluster the patients with K-Medoids and return the assessment score.

    Steps: fit K-Medoids on ``distances``, build the patient table with the
    cluster indicator columns and the ``p_soins`` column, write it to
    ``../data/custom_metric/grid_search.csv``, run ``Cluster_assessment.ipynb``
    and return ``get_score()``.

    NOTE(review): ``get_score`` is not defined in this notebook -- presumably
    Cluster_assessment.ipynb defines it and reads the CSV written here; confirm.
    NOTE(review): ``ajout_cluster_base`` sizes its columns from the module-level
    ``n_clusters``, not from this function's ``n_clusters`` argument.
    """
    kmedoids = get_kmedoids(n_clusters, distances)
    base_patient_cluster = ajout_cluster_base(kmedoids)
    base_patient_cluster = ajout_p_soins(base_patient_cluster)
    # The CSV file is overwritten on every call.
    base_patient_cluster.to_csv("../data/custom_metric/grid_search.csv", index=False)
    # IPython magic: the assessment notebook is re-run on each call.
    %run Cluster_assessment.ipynb
    score = get_score()
    return score

### Perform Grid Search

In [24]:
# Resume from the scores saved by a previous run; start from an empty dict when
# the cache file is missing, unreadable or truncated. Only these expected
# failures are caught, so KeyboardInterrupt and genuine bugs still surface.
try:
    with open('../data/gridSearch.pickle', 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code -- only load a file
        # written by this notebook.
        grid_search = pickle.load(handle)
except (OSError, EOFError, pickle.UnpicklingError):
    grid_search = {}

In [25]:
def get_str(w):
    return str(w[0]) + ' ' + str(w[1]) + ' ' + str(w[2])

In [26]:
# Score every candidate weighting. `weights` is deliberately rebound at module
# level because custom_dist_norm reads it when the distances are computed.
# The whole loop sits under the __main__ guard so `distances` can never be
# used without having been computed for the current weights.
if __name__ == '__main__':
    for weights in tqdm(list_weights):
        print(weights, end='\r')
        str_weights = get_str(weights)
        if grid_search.get(str_weights) is not None:
            # Already scored in a previous (possibly interrupted) run.
            continue

        distances = calculate_distances(X_patient, 'custom_norm')
        score = get_cluster_score(n_clusters, distances)

        grid_search[str_weights] = score
        # Save after each candidate so an interrupted search can be resumed.
        with open('../data/gridSearch.pickle', 'wb') as handle:
            pickle.dump(grid_search, handle)

  0%|          | 0/121 [06:41<?, ?it/s]


UnboundLocalError: local variable 'ajout_cluster_base' referenced before assignment

### Meilleurs poids (weights)

In [None]:
# Find the weighting with the highest score. On ties the key visited last wins
# (the comparison is >=); with an empty `grid_search` the defaults are kept.
max_key = ''
max_score = -float('inf')

for key in grid_search:
    if not grid_search[key] >= max_score:
        continue
    max_key = key
    max_score = grid_search[key]