# PrefixSpan

In [1]:
pip install prefixspan

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Import des librairies

from prefixspan import PrefixSpan

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import plotly.express as px

In [3]:
%%capture capt
# Read the two CSV tables; the first column of each file becomes the row index.
# Because of the %%capture magic above, anything this cell prints (including
# warnings emitted while parsing) is stored in `capt` instead of being displayed.
p_soins=pd.read_csv("./data/p_soins.csv", index_col=0)
p_soins_IC=pd.read_csv("./data/p_soins_IC.csv", index_col=0)

In [4]:
# Preview the first rows as loaded: the codes are still at their full length.
p_soins.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,P0,4,05K051,05M042,05M16T,05M09T,05M092,05C191,05M20Z,05M20Z,...,,,,,,,,,,
1,P6,4,02C05J,05M093,04M132,05C222,23M103,04M053,04M24E,Décès,...,,,,,,,,,,
2,P8,2,11M041,06C194,Décès,,,,,,...,,,,,,,,,,
3,P13,2,05M04T,05K051,,,,,,,...,,,,,,,,,,
4,P16,4,08C11V,23M15Z,05K102,05M08T,05K102,05C191,23M101,05K191,...,,,,,,,,,,


In [5]:
def truncateGHM(df):
    """Cut every code of a table down to its first 5 characters, in place.

    Only the columns labelled ``"2"``, ``"3"``, ... up to
    ``str(df.shape[1] - 1)`` are modified; columns ``"0"`` and ``"1"`` are left
    untouched. A progress bar is shown while the columns are processed.

    Missing cells stay missing: they are skipped by the mapping instead of
    being converted to the literal string ``'nan'``, so the caller does not
    have to clean the frame afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to modify. Its column labels must be the strings ``"0"``,
        ``"1"``, ``"2"``, ... (a missing label raises ``KeyError``).

    Returns
    -------
    None
        The frame passed in is updated in place.
    """
    for i in tqdm(range(2, df.shape[1])):
        column = str(i)
        # na_action="ignore" leaves NaN cells as NaN; without it str(NaN)
        # would produce the 3-character string 'nan'.
        df[column] = df[column].map(lambda code: str(code)[:5], na_action="ignore")

In [6]:
# Shorten the codes of both tables in place (one progress bar per table).
truncateGHM(p_soins)
truncateGHM(p_soins_IC)

# Turn any literal 'nan' string into a real missing value in both tables.
p_soins, p_soins_IC = (
    table.replace('nan', np.nan) for table in (p_soins, p_soins_IC)
)

100%|██████████| 254/254 [00:00<00:00, 284.72it/s]
100%|██████████| 123/123 [00:00<00:00, 704.96it/s]


In [7]:
# Same preview after truncateGHM: each code is now at most 5 characters long.
p_soins.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,P0,4,05K05,05M04,05M16,05M09,05M09,05C19,05M20,05M20,...,,,,,,,,,,
1,P6,4,02C05,05M09,04M13,05C22,23M10,04M05,04M24,Décès,...,,,,,,,,,,
2,P8,2,11M04,06C19,Décès,,,,,,...,,,,,,,,,,
3,P13,2,05M04,05K05,,,,,,,...,,,,,,,,,,
4,P16,4,08C11,23M15,05K10,05M08,05K10,05C19,23M10,05K19,...,,,,,,,,,,


In [18]:
def motifs_frequents(topk, df=None):
    """Return the ``topk``-th most frequent pattern for minimum lengths 1 to 4.

    For each ``length`` in 1..4, PrefixSpan is run on the whole corpus with
    ``minlen = length`` and the pattern ranked ``topk`` is reported. Only a
    *minimum* length is imposed, so the pattern shown in the ``len{length}_*``
    columns can be longer than ``length``.

    Parameters
    ----------
    topk : int
        1-based rank of the pattern to report (1 = most frequent).
    df : pandas.DataFrame, optional
        Table to mine. Columns ``"0"`` and ``"1"`` are dropped; the remaining
        non-missing cells of each row, read left to right, form one sequence
        (rows sharing an index label are merged into the same sequence).
        Defaults to the notebook-level ``p_soins``.

    Returns
    -------
    pandas.DataFrame
        A single row holding, for each length:
        ``len{length}_effectif`` (support reported by PrefixSpan),
        ``len{length}_freq`` (support divided by the number of sequences,
        rounded to 3 decimals) and ``len{length}_motif`` (the pattern, as a
        list). When fewer than ``topk`` patterns exist, or ``topk`` is below
        1, the three values are ``0``, ``0`` and ``[]``.
    """
    if df is None:
        df = p_soins  # notebook-level table, kept as default for existing calls

    # The corpus does not depend on the pattern length: build it once.
    # dropna() is explicit so that missing cells never enter a sequence,
    # whichever stack() implementation pandas uses.
    corpus = (
        df.drop(["0", "1"], axis=1)
        .stack()
        .dropna()
        .groupby(level=0)
        .apply(list)
        .tolist()
    )
    effectif = len(corpus)

    row = {}
    for length in range(1, 5):
        ps = PrefixSpan(corpus)
        ps.minlen = length
        # Mine once per length and reuse the result for the three columns.
        ranked = ps.topk(k=topk) if topk >= 1 else []

        if 1 <= topk <= len(ranked):
            entry = ranked[topk - 1]
            row[f"len{length}_effectif"] = entry[0]
            row[f"len{length}_freq"] = round(entry[0] / effectif, 3)
            row[f"len{length}_motif"] = entry[1]
        else:
            # No pattern at that rank (the corpus yields fewer than topk).
            row[f"len{length}_effectif"] = 0
            row[f"len{length}_freq"] = 0
            row[f"len{length}_motif"] = []

    return pd.DataFrame([row])

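## Motifs de GHM fréquents sur l'ensemble de la population

*Lecture des tableaux : une colonne `lenN_*` porte sur les motifs de longueur **au moins** N (seule une longueur minimale est imposée à PrefixSpan), le motif affiché peut donc être plus long que N ; `effectif` est le nombre de parcours contenant le motif et `freq` la proportion correspondante.*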

### Motif le plus fréquent

In [19]:
# Rank 1 (most frequent pattern) for each minimum length 1 to 4, over the whole table.
motifs_frequents(1)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif,len4_effectif,len4_freq,len4_motif
0,6618,0.658,[Décès],3298,0.328,"[05M09, Décès]",1206,0.12,"[05M09, 05M09, Décès]",560,0.056,"[05M09, 05M09, 05M09, Décès]"


### Second motif le plus fréquent

In [20]:
# Rank 2 (second most frequent pattern) for each minimum length 1 to 4.
motifs_frequents(2)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif,len4_effectif,len4_freq,len4_motif
0,4637,0.461,[05M09],1632,0.162,"[05M09, 05M09]",728,0.072,"[05M09, 05M09, 05M09]",367,0.037,"[05M09, 05M09, 05M09, 05M09]"


### Troisième motif le plus fréquent

In [21]:
# Rank 3 for each minimum length 1 to 4. Only a minimum length is imposed, so a
# lenN column may show a pattern with more than N items.
motifs_frequents(3)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif,len4_effectif,len4_freq,len4_motif
0,3298,0.328,"[05M09, Décès]",1331,0.132,"[04M05, Décès]",560,0.056,"[05M09, 05M09, 05M09, Décès]",277,0.028,"[05M09, 05M09, 05M09, 05M09, Décès]"


In [10]:
def motifs_frequents_cluster(topk, df=None):
    """Return, per cluster, the ``topk``-th most frequent pattern for minimum lengths 1 to 4.

    Clusters 1 to 4 (values of column ``"1"``) are mined separately. For each
    cluster and each ``length`` in 1..4, PrefixSpan is run with
    ``minlen = length`` and the pattern ranked ``topk`` is reported. Only a
    *minimum* length is imposed, so the pattern shown in the ``len{length}_*``
    columns can be longer than ``length``.

    Parameters
    ----------
    topk : int
        1-based rank of the pattern to report (1 = most frequent).
    df : pandas.DataFrame, optional
        Table to mine. Column ``"1"`` holds the cluster label; columns ``"0"``
        and ``"1"`` are dropped and the remaining non-missing cells of each
        row, read left to right, form one sequence. Defaults to the
        notebook-level ``p_soins``.

    Returns
    -------
    pandas.DataFrame
        Four rows, row ``i`` describing cluster ``i + 1``, with for each
        length: ``len{length}_effectif`` (support reported by PrefixSpan),
        ``len{length}_freq`` (support divided by the number of sequences of
        the cluster, rounded to 3 decimals) and ``len{length}_motif`` (the
        pattern, as a list). When a cluster yields fewer than ``topk``
        patterns, or ``topk`` is below 1, the three values are ``0``, ``0``
        and ``[]``.
    """
    if df is None:
        df = p_soins  # notebook-level table, kept as default for existing calls

    rows = []
    for cluster in range(1, 5):
        # The corpus depends on the cluster only: build it once per cluster.
        # dropna() is explicit so that missing cells never enter a sequence,
        # whichever stack() implementation pandas uses.
        corpus = (
            df[df['1'] == cluster]
            .drop(["0", "1"], axis=1)
            .stack()
            .dropna()
            .groupby(level=0)
            .apply(list)
            .tolist()
        )
        effectif_cluster = len(corpus)

        row = {}
        for length in range(1, 5):
            ps = PrefixSpan(corpus)
            ps.minlen = length
            # Mine once per (cluster, length) and reuse the result.
            ranked = ps.topk(k=topk) if topk >= 1 else []

            if 1 <= topk <= len(ranked):
                entry = ranked[topk - 1]
                row[f"len{length}_effectif"] = entry[0]
                row[f"len{length}_freq"] = round(entry[0] / effectif_cluster, 3)
                row[f"len{length}_motif"] = entry[1]
            else:
                # No pattern at that rank for this cluster.
                row[f"len{length}_effectif"] = 0
                row[f"len{length}_freq"] = 0
                row[f"len{length}_motif"] = []
        rows.append(row)

    return pd.DataFrame(rows)

## Motifs de GHM fréquents par clusters de patients

### Motif le plus fréquent

In [11]:
# Rank 1 (most frequent pattern) per cluster; result row i corresponds to cluster i + 1.
motifs_frequents_cluster(1)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif,len4_effectif,len4_freq,len4_motif
0,833,0.61,[Décès],507,0.371,"[05M09, Décès]",301,0.22,"[05M09, 05M09, Décès]",181,0.133,"[05M09, 05M09, 05M09, Décès]"
1,3467,0.685,[Décès],1542,0.305,"[05M09, Décès]",336,0.066,"[05M09, 05M09, Décès]",91,0.018,"[05M09, 05M09, 05M09, Décès]"
2,22,0.629,[23M20],15,0.429,"[23M20, 23M20]",8,0.229,"[23M20, 23M20, 23M20]",8,0.229,"[07M14, 07M14, 07M14, 07M14]"
3,2082,0.651,[Décès],1111,0.347,"[05M09, Décès]",487,0.152,"[05M09, 05M09, Décès]",234,0.073,"[05M09, 05M09, 05M09, Décès]"


### Second motif le plus fréquent

In [12]:
# Rank 2 (second most frequent pattern) per cluster; result row i corresponds to cluster i + 1.
motifs_frequents_cluster(2)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif,len4_effectif,len4_freq,len4_motif
0,777,0.569,[05M09],431,0.316,"[05M09, 05M09]",256,0.187,"[05M09, 05M09, 05M09]",155,0.113,"[05M09, 05M09, 05M09, 05M09]"
1,2032,0.401,[05M09],489,0.097,"[04M05, Décès]",104,0.021,"[05M09, 05M09, 05M09]",25,0.005,"[05M09, 05M09, 05M09, 05M09]"
2,18,0.514,[05M09],10,0.286,"[23M20, 16M11]",8,0.229,"[23M20, 23M20, 23M20]",7,0.2,"[07M14, 07M14, 07M14, 07M14, 07M14]"
3,1586,0.496,[05M09],642,0.201,"[05M09, 05M09]",285,0.089,"[05M09, 05M09, 05M09]",129,0.04,"[05M09, 05M09, 05M09, 05M09]"


### Troisième motif le plus fréquent

In [13]:
# Rank 3 per cluster; result row i corresponds to cluster i + 1. Only a minimum
# length is imposed, so a lenN column may show a pattern with more than N items.
motifs_frequents_cluster(3)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif,len4_effectif,len4_freq,len4_motif
0,507,0.371,"[05M09, Décès]",301,0.22,"[05M09, 05M09, Décès]",181,0.133,"[05M09, 05M09, 05M09, Décès]",109,0.08,"[05M09, 05M09, 05M09, 05M09, Décès]"
1,1542,0.305,"[05M09, Décès]",418,0.083,"[05M09, 05M09]",102,0.02,"[02C05, 05M09, Décès]",21,0.004,"[05M09, 05M09, 05M09, 05M09, Décès]"
2,15,0.429,"[23M20, 23M20]",9,0.257,"[23M20, 05M09]",8,0.229,"[23M20, 23M20, 23M20]",7,0.2,"[07M14, 07M14, 07M14, 07M14, 07M14, 07M14]"
3,1111,0.347,"[05M09, Décès]",505,0.158,"[04M05, Décès]",234,0.073,"[05M09, 05M09, 05M09, Décès]",109,0.034,"[05M09, 05M09, 05M09, 05M09, Décès]"
