In [1]:
%%capture capt
!pip install prefixspan

In [2]:
%%capture capt
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from prefixspan import PrefixSpan

### Load Data

In [3]:
# Load the patient table. Later cells read its `cluster1`..`clusterN` columns
# and its `p_soins` column (comma-separated strings that get split on ',').
patient = pd.read_csv("../data/custom_metric/grid_search.csv")

In [4]:
%%capture capt
# Presumably the raw care-pathway table, judging by the file name.
# NOTE(review): `p_soins` is not referenced by any other visible cell.
p_soins = pd.read_csv("../data/parcours_soins.csv")

### Vecteurs des parcours de GHM

In [5]:
def nb_cluster_df(df):
    """Count the consecutive ``cluster1``, ``cluster2``, ... columns of ``df``.

    Counting starts at ``cluster1`` and stops at the first missing index, so a
    frame holding ``cluster1``, ``cluster2`` and ``cluster4`` yields 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected.

    Returns
    -------
    int
        Number of consecutive cluster columns (0 when ``cluster1`` is absent).
    """
    # Explicit membership test instead of indexing inside a bare ``except``,
    # which would also hide unrelated errors (e.g. KeyboardInterrupt or a
    # non-DataFrame argument).
    nb_cluster = 0
    while f"cluster{nb_cluster + 1}" in df.columns:
        nb_cluster += 1
    return nb_cluster

In [6]:
# Number of consecutive `clusterN` columns present in the patient table.
nb_cluster = nb_cluster_df(patient)

In [7]:
# Hard cluster label per patient: 1-based position of the largest value among
# the `cluster1`..`cluster<nb_cluster>` columns of that row.
cluster_columns = patient.loc[:, 'cluster1':'cluster' + str(nb_cluster)]
patient['cluster'] = [
    1 + np.argmax(cluster_columns.iloc[row_pos])
    for row_pos in range(len(patient))
]

In [8]:
# Map each cluster id (1..nb_cluster) to the pathways of its patients, every
# pathway being the `p_soins` string split on commas.
dico_cluster = {
    cluster_id: [
        parcours.split(',')
        for parcours in patient.loc[patient.cluster == cluster_id, 'p_soins'].values.tolist()
    ]
    for cluster_id in range(1, nb_cluster + 1)
}

In [9]:
# Pathways of all patients (each split on commas) under the single key "Full".
# The column is converted to a Python list once, then iterated, instead of
# being converted again for every row.
dico_full = {"Full": [parcours.split(',') for parcours in patient.p_soins.values.tolist()]}

In [10]:
# Number of distinct items (GHM codes, going by the variable name) that appear
# in at least one pathway.
nb_ghm = len({code for parcours in dico_full['Full'] for code in parcours})

In [11]:
def motifs_frequents_full():
    """Mine the most frequent motifs of length 1, 2 and 3 over all patients.

    Reads the module-level ``dico_full['Full']`` (sequence database),
    ``patient`` (its row count is the frequency denominator) and ``nb_ghm``
    (passed as ``k`` to ``PrefixSpan.topk``).

    Returns
    -------
    pandas.DataFrame
        For each length ``n`` in 1..3, the columns ``len{n}_effectif``
        (support count), ``len{n}_freq`` (support / number of patients) and
        ``len{n}_motif`` (the motif itself). Row ``r`` holds the r-th entry
        returned by ``topk`` for each length. When one length yields fewer
        motifs than another, its columns are padded with NaN.
    """
    effectif = len(patient)
    columns = {}

    for length in range(1, 4):
        ps = PrefixSpan(dico_full['Full'])
        # Restrict mining to motifs of exactly `length` items.
        ps.minlen = length
        ps.maxlen = length
        list_ps = ps.topk(k=nb_ghm)

        # Each entry is indexed as entry[0] = support, entry[1] = motif.
        columns[f"len{length}_effectif"] = [entry[0] for entry in list_ps]
        columns[f"len{length}_freq"] = [entry[0] / effectif for entry in list_ps]
        columns[f"len{length}_motif"] = [entry[1] for entry in list_ps]

    # Wrap every column in a Series so that lengths yielding fewer motifs are
    # NaN-padded; assigning ragged plain lists as DataFrame columns raises
    # ValueError. Empty columns get an explicit object dtype.
    return pd.DataFrame({
        name: pd.Series(values, dtype=None if values else object)
        for name, values in columns.items()
    })

In [12]:
# Frequent motifs (lengths 1 to 3) mined over the whole patient base.
motifs_full = motifs_frequents_full()

In [13]:
def motifs_frequents_cluster(data, dico, topk):
    """Return, for every cluster, the ``topk``-th most frequent motif of each length 1..3.

    Parameters
    ----------
    data : pandas.DataFrame
        Patient table with a ``cluster`` column; used to size each cluster.
    dico : dict
        Maps each cluster id in 1..``nb_cluster`` to its list of sequences.
    topk : int
        1-based rank of the motif to report (1 = most frequent).

    Returns
    -------
    pandas.DataFrame
        One row per cluster (row ``r`` describes cluster ``r + 1``) with the
        columns ``len{n}_effectif``, ``len{n}_freq`` and ``len{n}_motif`` for
        ``n`` in 1..3. A cluster with fewer than ``topk`` motifs of a given
        length gets ``0``, ``0`` and ``[]``.

    Notes
    -----
    Reads the module-level ``nb_cluster``.
    """
    results = pd.DataFrame()

    for length in range(1, 4):
        top_effectif, top_freq, top_motif = [], [], []

        for cluster_p in range(1, nb_cluster + 1):
            ps = PrefixSpan(dico[cluster_p])
            # Mine motifs of exactly `length` items. Without an upper bound a
            # longer motif could be reported under `len{length}_*` and would
            # never match the exact-length motifs of the full-base table.
            ps.minlen = length
            ps.maxlen = length
            # Mine once and reuse the result instead of re-running topk.
            best = ps.topk(k=topk)

            # Guard the rank: fewer than `topk` motifs means "no motif".
            if 1 <= topk <= len(best):
                support, motif = best[topk - 1][0], best[topk - 1][1]
                effectif_cluster = (data.cluster == cluster_p).sum()
                top_effectif.append(support)
                top_freq.append(support / effectif_cluster)
                top_motif.append(motif)
            else:
                top_effectif.append(0)
                top_freq.append(0)
                top_motif.append([])

        results[f"len{length}_effectif"] = top_effectif
        results[f"len{length}_freq"] = top_freq
        results[f"len{length}_motif"] = top_motif

    return results

In [14]:
# Most frequent motif (rank 1) of each length, computed per cluster.
motifs_cluster = motifs_frequents_cluster(patient, dico_cluster, 1)

### Comparer les fréquences des motifs des clusters avec celles de la base

In [1]:
def size_cluster(cluster):
    """Return how many rows of the module-level ``patient`` frame carry the label ``cluster``."""
    is_member = patient.cluster == cluster
    return int(is_member.sum())

In [15]:
# Per-cluster gap between the frequency of the cluster's top motif inside the
# cluster and the frequency of the same motif in the whole base, one column
# per motif length.
# Columns start as floats: the values written below are fractional, and
# writing them into integer columns relies on an implicit upcast that recent
# pandas versions deprecate.
df_diff = pd.DataFrame(
    0.0,
    index=range(nb_cluster),
    columns=[f'diff_len{length}' for length in range(1, 4)],
)
nb_patients = len(patient)

for i in range(len(motifs_cluster)):
    row = motifs_cluster.iloc[i]
    # Patients outside cluster i+1; the gap is scaled by this count.
    nb_outside = nb_patients - size_cluster(i + 1)

    for length in range(1, 4):
        ghm = row[f'len{length}_motif']
        freq_ghm_in_cluster = row[f'len{length}_freq']

        # Motifs are lists, so they are matched through their string form.
        same_motif = motifs_full[f'len{length}_motif'].map(str) == str(ghm)
        df_freq_ghm_full = motifs_full.loc[same_motif, f'len{length}_freq']
        # Motif absent from the full-base table: its base frequency counts as 0.
        freq_ghm_full = df_freq_ghm_full.iloc[0] if len(df_freq_ghm_full) else 0

        diff = freq_ghm_in_cluster - freq_ghm_full
        # A cluster holding every patient leaves nobody outside: store 0.0
        # instead of dividing by zero.
        df_diff.loc[i, f'diff_len{length}'] = diff / nb_outside if nb_outside else 0.0

In [None]:
# Reset the frequency differences in df_diff to 0 for clusters that contain
# 0 or 1 patient.
for cluster_idx in range(nb_cluster):
    if size_cluster(cluster_idx + 1) > 1:
        continue
    df_diff.loc[cluster_idx, :] = 0

In [21]:
# Average, over the df_diff columns, of each column's mean.
column_means = [df_diff[column].mean() for column in df_diff.columns]
mean_of_means = np.mean(column_means)

In [None]:
def get_score():
    """Return the module-level ``mean_of_means`` (average of the ``df_diff`` column means).

    The global is read when the function is called, not when it is defined.
    """
    return mean_of_means