In [1]:
# Record the wall-clock start time; the timing cell at the end of the notebook
# subtracts it from `stop` to report the total run time.
import time
start = time.time()

In [2]:
%%capture capt
!pip install prefixspan

In [3]:
%%capture capt
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from prefixspan import PrefixSpan

### Load Data

In [4]:
# Patient table. Later cells read its `cluster<k>` columns and its
# comma-separated `p_soins` column.
patient = pd.read_csv("../data/custom_metric/50cluster_50_10_1.csv")

In [5]:
%%capture capt
# NOTE(review): the `p_soins` variable is not referenced by any other cell
# visible in this notebook (only the `patient.p_soins` column is) — confirm
# this load is still needed.
p_soins = pd.read_csv("../data/parcours_soins.csv")

### Vecteurs des parcours de GHM

In [6]:
def nb_cluster_df(df):
    """Count the consecutive ``cluster1``, ``cluster2``, ... columns of ``df``.

    Counting starts at ``cluster1`` and stops at the first missing number, so
    a frame holding ``cluster1`` and ``cluster3`` but no ``cluster2`` yields 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected.

    Returns
    -------
    int
        Largest ``n`` such that ``cluster1`` .. ``cluster<n>`` are all columns
        of ``df``; 0 when ``cluster1`` is absent.
    """
    nb_cluster = 0
    # Test membership in the column index rather than indexing inside a bare
    # ``except``: a bare ``except`` would also swallow unrelated errors
    # (including KeyboardInterrupt) and silently report a wrong count.
    while f"cluster{nb_cluster + 1}" in df.columns:
        nb_cluster += 1
    return nb_cluster

In [7]:
# Number of consecutive `cluster<k>` columns found in the patient table.
nb_cluster = nb_cluster_df(patient)

In [8]:
# Assign each patient the 1-based position of the largest value among its
# cluster1..cluster<nb_cluster> columns. One row-wise argmax over the whole
# block replaces the per-row `.iloc[i]` lookups, which re-sliced the frame for
# every patient. Ties go to the lowest cluster number.
# NOTE(review): assumes these columns contain no NaN — confirm.
patient['cluster'] = 1 + patient.loc[:, 'cluster1':'cluster'+str(nb_cluster)].values.argmax(axis=1)

In [9]:
# Map every cluster id (1..nb_cluster) to the care pathways of the patients
# assigned to it; each pathway is the `p_soins` string split on commas.
dico_cluster = {
    cluster_id: [
        parcours.split(',')
        for parcours in patient.loc[patient.cluster == cluster_id, 'p_soins'].tolist()
    ]
    for cluster_id in range(1, nb_cluster + 1)
}

In [10]:
# Care pathways of all patients, each split on commas, stored under the single
# key "Full". The column is converted to a list once; the previous form rebuilt
# the whole list for every patient index.
dico_full = {"Full": [parcours.split(',') for parcours in patient.p_soins.values.tolist()]}

In [11]:
# Number of distinct items appearing anywhere in the full set of pathways.
nb_ghm = len({code for parcours in dico_full['Full'] for code in parcours})

In [12]:
def motifs_frequents_full():
    """Mine the most frequent motifs of length 1, 2 and 3 over all patients.

    Reads three notebook-level names: ``dico_full`` (the sequences stored under
    its ``'Full'`` key), ``patient`` (its row count is the frequency
    denominator) and ``nb_ghm`` (the ``k`` passed to ``PrefixSpan.topk``).

    Returns
    -------
    pandas.DataFrame
        For each length ``L`` in 1..3, the columns ``len{L}_effectif`` (support
        reported by PrefixSpan), ``len{L}_freq`` (support divided by
        ``len(patient)``) and ``len{L}_motif`` (the pattern itself), one row
        per mined pattern in the order ``topk`` returned them. When a length
        yields fewer patterns than another, its columns are padded with NaN.
    """
    effectif = len(patient)

    frames = []
    for length in range(1, 4):
        ps = PrefixSpan(dico_full['Full'])
        # Pin both bounds so only patterns of exactly `length` items are mined.
        ps.minlen = length
        ps.maxlen = length
        list_ps = ps.topk(k=nb_ghm)
        frames.append(pd.DataFrame({
            f"len{length}_effectif": [entry[0] for entry in list_ps],
            f"len{length}_freq": [entry[0] / effectif for entry in list_ps],
            f"len{length}_motif": [entry[1] for entry in list_ps],
        }))

    # `topk` may return fewer than `nb_ghm` patterns for some lengths. Assigning
    # such ragged lists column by column raises ValueError; concatenating the
    # per-length frames side by side aligns them on the row index instead.
    return pd.concat(frames, axis=1)

In [13]:
# Motifs of length 1 to 3 mined over all patients, with support and frequency.
motifs_full = motifs_frequents_full()

In [14]:
def motifs_frequents_cluster(data, dico, topk):
    """Report, per cluster, the ``topk``-th ranked motif of length 1, 2 and 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose a ``cluster`` column; the number of rows equal to a given
        cluster id is the denominator of that cluster's frequencies.
    dico : dict
        Maps each cluster id to the list of sequences of that cluster.
    topk : int
        1-based rank, in the order returned by ``PrefixSpan.topk``, of the
        pattern to report for every cluster.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``dico`` (in ascending key order) and, for each
        length ``L`` in 1..3, the columns ``len{L}_effectif`` (support),
        ``len{L}_freq`` (support divided by the cluster size) and
        ``len{L}_motif`` (the pattern). A cluster with fewer than ``topk``
        patterns of a given length gets ``0``, ``0`` and ``[]``.
    """
    columns = {}

    for length in range(1, 4):
        top_effectif, top_freq, top_motif = [], [], []

        # Iterate over the clusters actually present in `dico` rather than
        # relying on a notebook-level cluster count.
        for cluster_p in sorted(dico):
            ps = PrefixSpan(dico[cluster_p])
            # Pin both bounds so the `len{length}_*` columns only ever hold
            # patterns of exactly `length` items, matching the full-database
            # table these motifs are later compared against.
            ps.minlen = length
            ps.maxlen = length
            # Mine once and reuse the result; each call re-runs the search.
            list_ps = ps.topk(k=topk)

            # Guard on the rank itself: a non-empty result shorter than `topk`
            # would otherwise raise IndexError.
            if 0 < topk <= len(list_ps):
                support, motif = list_ps[topk - 1][0], list_ps[topk - 1][1]
                effectif_cluster = (data.cluster == cluster_p).sum()
                top_effectif.append(support)
                top_freq.append(support / effectif_cluster)
                top_motif.append(motif)
            else:
                top_effectif.append(0)
                top_freq.append(0)
                top_motif.append([])

        columns[f"len{length}_effectif"] = top_effectif
        columns[f"len{length}_freq"] = top_freq
        columns[f"len{length}_motif"] = top_motif

    return pd.DataFrame(columns)

In [15]:
# Per-cluster motif table; topk=1 keeps, for each cluster and length, the
# first pattern returned by PrefixSpan.topk.
motifs_cluster = motifs_frequents_cluster(patient, dico_cluster, 1)

### Comparer les fréquences des motifs des clusters avec celles de la base

In [16]:
# First three rows of the full-database motif table.
motifs_full.head(3)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif
0,2093,0.208238,[05M092],868,0.08636,"[02C05J, 02C05J]",152,0.015123,"[05M092, 05M092, 05M092]"
1,1803,0.179385,[05M093],471,0.046861,"[05M092, 05M092]",143,0.014227,"[02C05J, 02C05J, 05M092]"
2,1532,0.152423,[02C05J],396,0.039399,"[05M092, 05M093]",119,0.01184,"[05M092, 05M092, 05M093]"


In [17]:
# First three rows of the per-cluster motif table (one row per cluster).
motifs_cluster.head(3)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif
0,172,0.501458,[02C05J],122,0.355685,"[02C05J, 02C05J]",26,0.075802,"[02C05J, 02C05J, 05M092]"
1,16,0.275862,[05M092],5,0.086207,"[02C05J, 02C05J]",2,0.034483,"[05M091, 02C05J, 02C05J]"
2,30,0.11194,[05M092],5,0.018657,"[05M092, 04M053]",1,0.003731,"[05M092, 04M053, 05M154]"


In [18]:
# For every cluster and motif length, subtract the motif's frequency in the
# full database from its frequency inside the cluster. A motif that does not
# appear in `motifs_full` counts as frequency 0 in the full database.
diff_columns = {}
for length in range(1, 4):
    # Build the lookup once per length: string form of the motif -> frequency
    # in the full database. The first occurrence wins. The previous form
    # re-stringified the whole column for every cluster.
    freq_full_by_motif = {}
    for motif, freq in zip(motifs_full[f'len{length}_motif'], motifs_full[f'len{length}_freq']):
        freq_full_by_motif.setdefault(str(motif), freq)

    diff_columns[f'diff_len{length}'] = [
        freq_in_cluster - freq_full_by_motif.get(str(motif), 0)
        for motif, freq_in_cluster in zip(
            motifs_cluster[f'len{length}_motif'], motifs_cluster[f'len{length}_freq']
        )
    ]

# Build the frame in one go. Writing floats cell by cell into columns
# initialised with integer zeros relies on a silent dtype upcast that pandas
# has deprecated.
df_diff = pd.DataFrame(diff_columns)

In [19]:
# Frequency differences (cluster minus full database), one row per cluster.
df_diff

Unnamed: 0,diff_len1,diff_len2,diff_len3
0,0.349035,0.269326,0.061574
1,0.067624,-0.000153,0.030404
2,-0.096298,0.006718,0.003731
3,0.164643,0.035462,0.016949
4,0.176888,-0.036977,0.024691
5,0.057068,-0.002643,-0.004919
6,0.043044,0.00082,0.008122
7,0.042837,0.036472,0.027778
8,0.130472,0.074931,0.018031
9,0.213715,0.037236,0.043849


In [21]:
# NOTE(review): the saved output of this cell differs from the previous display
# of `df_diff` (only rows with a negative `diff_len1` are shown) and its
# execution count is out of order. Presumably a since-deleted cell filtered
# `df_diff` first; on a fresh Restart & Run All this cell would simply repeat
# the previous table — confirm and restore the filter explicitly if intended.
df_diff

Unnamed: 0,diff_len1,diff_len2,diff_len3
2,-0.096298,0.006718,0.003731
11,-0.031582,0.000164,-0.001053
13,-0.103476,-0.038741,0.019048
17,-0.059471,0.0,0.0
26,-0.102753,-0.014802,0.004219
33,-0.065381,-0.029969,0.009288
42,-0.052879,0.009959,0.005427
43,-0.019212,-0.004108,0.006173
46,-0.086545,-0.054614,0.009886


In [20]:
# Elapsed wall-clock seconds since `start` was recorded in the first cell.
stop = time.time()
print(stop-start)

34.70508432388306
