# PrefixSpan

Les algorithmes de **sequential-pattern-mining** sont utilisés pour extraire des motifs fréquents dans des séquences de données. Il existe deux approches principales pour effectuer cette tâche: l'approche basée sur la génération de candidats (ou candidates generation) et l'approche basée sur la croissance de motifs (ou pattern growth).

+ Les algorithmes de **candidates generation**, tels que *AprioriAll* et *GSP* (Generalized Sequential Pattern), génèrent des motifs candidats en combinant des motifs fréquents de taille inférieure. Ils parcourent ensuite la base de données pour compter le nombre d'occurrences de chaque candidat généré et éliminer les candidats non fréquents. Ces étapes de génération de candidats et de filtrage sont répétées jusqu'à ce que tous les motifs fréquents soient extraits.

+ En revanche, les algorithmes de **pattern growth**, tels que *PrefixSpan*, utilisent une approche différente. Ils construisent les motifs fréquents de manière récursive : à partir de chaque préfixe fréquent, ils projettent la base de données sur les suffixes des séquences qui contiennent ce préfixe, puis étendent le préfixe avec les éléments fréquents de cette base projetée. Cette approche évite de générer explicitement des candidats et de reparcourir toute la base de données à chaque étape, ce qui la rend souvent plus efficace que l'approche de candidates generation.

In [1]:
pip install prefixspan

Collecting prefixspan
  Using cached prefixspan-0.5.2.tar.gz (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting docopt>=0.6.2
  Using cached docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting extratools>=0.8.1
  Using cached extratools-0.8.2.1.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: prefixspan, docopt, extratools
  Building wheel for prefixspan (setup.py): started
  Building wheel for prefixspan (setup.py): finished with status 'done'
  Created wheel for prefixspan: filename=prefixspan-0.5.2-py3-none-any.whl size=11239 sha256=3bfab22eebf9ef5769f427d10e2ef8452692af1945d0cb53caf57bb45cfb7604
  Stored in directory: c:\users\trist\appdata\local\pip\cache\wheels\1b\96\e2\4eafa983829d97cbdd88d66b06221e28a35ff5b8c618c991

In [2]:
from prefixspan import PrefixSpan
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [3]:
%%capture capt
# Load the two CSV inputs: the care-pathway table and the patient-profile table.
# Both are re-indexed on the patient identifier in a later cell.
p_soins=pd.read_csv("../data/parcours_soins.csv")
patients=pd.read_csv("../data/profil_patient.csv")

## Motifs fréquents selon les clusters de patients

In [4]:
# Index both tables on the patient identifier so that the cluster column of
# `patients` can be copied onto `p_soins` through index-aligned assignment.
p_soins = (
    p_soins
    .rename(columns={"BEN_NIR_IDT": "CODE_PATIENT"})
    .set_index("CODE_PATIENT")
)
patients = patients.set_index("CODE_PATIENT")
p_soins["cluster"] = patients["cluster"]

  p_soins["cluster"]=patients["cluster"]


In [5]:
# Load the pre-built sequence dictionaries (later indexed with "Cluster {i}" keys).
# NOTE(review): pickle.load executes arbitrary code from the file -- only use it
# on pickles produced by this project, never on files from an untrusted source.
with open('../data/parcours_soins.pickle', 'rb') as pickle_file:
    p_soin = pickle.load(pickle_file)

with open('../data/parcours_soins_dp.pickle', 'rb') as pickle_file:
    p_soin_dp = pickle.load(pickle_file)

In [6]:
def motifs_frequents(data, dico, topk, n_clusters=18, max_length=3):
    """Report, per cluster, the ``topk``-th most frequent sequential pattern.

    For every minimum pattern length from 1 to ``max_length`` and every
    cluster, PrefixSpan is run on ``dico[f"Cluster {i}"]`` and the pattern
    ranked ``topk`` (1-based) is kept together with its support.

    Args:
        data: DataFrame with a ``cluster`` column; the number of rows equal to
            a cluster id is used as the denominator of the frequency.
        dico: Mapping from ``"Cluster {i}"`` to the sequences of that cluster.
        topk: 1-based rank of the pattern to report (1 = most frequent).
        n_clusters: Number of clusters to scan, ids ``0 .. n_clusters - 1``.
        max_length: Largest minimum pattern length to mine.

    Returns:
        DataFrame with one row per cluster and, for each length ``L``, the
        columns ``len{L}_effectif`` (support), ``len{L}_freq`` (support divided
        by the cluster size, rounded to 3 decimals) and ``len{L}_motif`` (the
        pattern). Clusters with fewer than ``topk`` patterns get ``0, 0, []``.
    """
    results = pd.DataFrame()

    for length in range(1, max_length + 1):
        top_effectif = []
        top_freq = []
        top_motif = []

        for cluster_p in range(n_clusters):
            ps = PrefixSpan(dico[f"Cluster {cluster_p}"])
            ps.minlen = length
            # Mine once per (length, cluster): every topk() call re-runs the search.
            patterns = ps.topk(k=topk)

            # Fewer than `topk` patterns means there is no pattern at that rank.
            if patterns and len(patterns) >= topk:
                support, motif = patterns[topk - 1]
                effectif_cluster = (data.cluster == cluster_p).sum()
                top_effectif.append(support)
                # An empty cluster has no defined frequency.
                top_freq.append(
                    round(support / effectif_cluster, 3)
                    if effectif_cluster
                    else float("nan")
                )
                top_motif.append(motif)
            else:
                top_effectif.append(0)
                top_freq.append(0)
                top_motif.append([])

        results[f"len{length}_effectif"] = top_effectif
        results[f"len{length}_freq"] = top_freq
        results[f"len{length}_motif"] = top_motif

    return results

In [7]:
# Most frequent pattern (rank 1) per cluster for minimum lengths 1 to 3,
# mined from the sequences stored in `p_soin`.
motifs_frequents(p_soins, p_soin, 1)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif
0,207,0.162,[02C05J],28,0.022,"[05M092, 05M092]",7,0.005,"[05M091, 05M092, 05M092]"
1,553,0.195,[05M093],96,0.034,"[05M092, 05M092]",18,0.006,"[05M092, 05M092, 05M092]"
2,38,0.158,[06K04J],7,0.029,"[06K04J, 06K04J]",4,0.017,"[06K04J, 06K04J, 06K04J]"
3,112,0.232,[05K101],15,0.031,"[05K101, 05K061]",3,0.006,"[05K101, 05K061, 05K101]"
4,180,0.377,[05M092],83,0.174,"[05M092, 05M092]",40,0.084,"[05M092, 05M092, 05M092]"
5,53,0.262,[05K101],14,0.069,"[05K101, 05K101]",10,0.05,"[11M171, 11M171, 11M171]"
6,136,0.35,[05M092],49,0.126,"[05M092, 05M092]",20,0.051,"[05M092, 05M093, 05M093]"
7,139,0.166,[02C05J],13,0.016,"[05K101, 02C05J]",4,0.005,"[05M092, 05M092, 05M092]"
8,109,0.304,[05M092],41,0.115,"[05M092, 05M092]",14,0.039,"[05M092, 05M092, 05M092]"
9,185,0.304,[05K101],60,0.099,"[05M092, 05M092]",28,0.046,"[05M092, 05M092, 05M092]"


In [8]:
# Same analysis on the sequences stored in `p_soin_dp`.
motifs_frequents(p_soins, p_soin_dp, 1)

Unnamed: 0,len1_effectif,len1_freq,len1_motif,len2_effectif,len2_freq,len2_motif,len3_effectif,len3_freq,len3_motif
0,229,0.179,[I500],58,0.045,"[I500, I500]",22,0.017,"[I500, I500, I500]"
1,703,0.248,[I500],156,0.055,"[I500, I500]",38,0.013,"[I500, I500, I500]"
2,28,0.116,[I500],6,0.025,"[Z098, Z098]",2,0.008,"[Z098, Z098, Z098]"
3,67,0.139,[I501],16,0.033,"[Z098, Z098]",4,0.008,"[Z098, Z098, Z098]"
4,205,0.429,[I500],89,0.186,"[I500, I500]",52,0.109,"[I500, I500, I500]"
5,48,0.238,[I501],23,0.114,"[I501, I501]",14,0.069,"[Z940, Z940, Z940]"
6,141,0.362,[I500],72,0.185,"[I500, I500]",38,0.098,"[I500, I500, I500]"
7,146,0.175,[I500],22,0.026,"[I500, I500]",7,0.008,"[I500, I500, I500]"
8,138,0.385,[I501],53,0.148,"[I501, I501]",27,0.075,"[I501, I501, I501]"
9,198,0.326,[I500],87,0.143,"[I500, I500]",47,0.077,"[I500, I500, I500]"


## Sur tout le dataset

In [9]:
# Pool the sequences of the 18 clusters into one flat list.
corpus = [
    sequence
    for cluster_id in range(18)
    for sequence in p_soin[f"Cluster {cluster_id}"]
]

In [10]:
# Mine the pooled corpus and display its 20 most frequent patterns
# as (support, pattern) pairs.
ps = PrefixSpan(corpus)
ps.topk(k=20)

[(1838, ['05M092']),
 (1631, ['05M093']),
 (1390, ['02C05J']),
 (985, ['05K101']),
 (855, ['04M053']),
 (803, ['05M091']),
 (790, ['05M09T']),
 (664, ['23M20Z']),
 (584, ['23M20T']),
 (555, ['06K04J']),
 (542, ['05K061']),
 (519, ['04M052']),
 (494, ['23Z02Z']),
 (482, ['05M094']),
 (438, ['05M092', '05M092']),
 (429, ['05K102']),
 (414, ['06K02Z']),
 (366, ['02C051']),
 (366, ['05K062']),
 (366, ['05M092', '05M093'])]

In [11]:
# Mine the top 20 once: each ps.topk() call re-runs the whole search, and
# iterating over the result also copes with fewer than 20 patterns.
top_patterns = ps.topk(k=20)
# Relative frequency = support / number of sequences in the corpus.
freq = [support / len(corpus) for support, _ in top_patterns]
motifs = [motif for _, motif in top_patterns]

In [12]:
# Tabulate each pattern next to its relative frequency.
pd.DataFrame({"Motif":motifs, "Frequence":freq})

Unnamed: 0,Motif,Frequence
0,[05M092],0.182867
1,[05M093],0.162272
2,[02C05J],0.138295
3,[05K101],0.098
4,[04M053],0.085066
5,[05M091],0.079893
6,[05M09T],0.078599
7,[23M20Z],0.066063
8,[23M20T],0.058104
9,[06K04J],0.055218


In [166]:
# Load the IC care-pathway table (presumably one row per patient -- TODO confirm).
p_soins_IC=pd.read_csv("../data/p_soins_IC.csv")

In [174]:
# Keep the columns at positions 1 and 2 and preview the first rows.
p_soins_IC_2=p_soins_IC.iloc[:,1:3]
p_soins_IC_2.head()

Unnamed: 0,1,2
0,05M09T,05M092
1,05M093,04M132
2,05M093,04M143
3,05M091,05M171
4,05M091,23K02Z


In [175]:
# One two-item sequence per row: [first selected column, second selected column].
# Column-wise iteration avoids building a Series per row and the deprecated
# positional fallback of `row[0]` on a string-labelled Series.
corpus_IC = [
    [first, second]
    for first, second in zip(p_soins_IC_2.iloc[:, 0], p_soins_IC_2.iloc[:, 1])
]

In [176]:
# Mine patterns of at least two items and keep the 20 most frequent ones.
ps = PrefixSpan(corpus_IC)
ps.minlen=2
output=ps.topk(k=20)

In [177]:
# Number of rows in the IC table.
len(p_soins_IC)

3749

In [178]:
# Print every mined pattern as a "[first, second, relative frequency]," row,
# the frequency being the support divided by the number of sequences.
n_sequences = len(corpus_IC)
for support, motif in output:
    first_code, second_code = str(motif[0]), str(motif[1])
    print("['"+first_code+'\'',",\'"+second_code+" ',",round(support/n_sequences,3),"],")

['05M092' ,'nan ', 0.066 ],
['05M093' ,'nan ', 0.057 ],
['05M092' ,'05M092 ', 0.027 ],
['05M091' ,'nan ', 0.024 ],
['05M09T' ,'nan ', 0.024 ],
['05M092' ,'05M093 ', 0.014 ],
['05M093' ,'05M092 ', 0.014 ],
['05M094' ,'nan ', 0.014 ],
['05M091' ,'05K101 ', 0.013 ],
['05M093' ,'05M093 ', 0.013 ],
['05M091' ,'05M092 ', 0.012 ],
['05M092' ,'05M091 ', 0.01 ],
['05M09T' ,'05M092 ', 0.01 ],
['05M091' ,'05M091 ', 0.009 ],
['05M092' ,'05K101 ', 0.009 ],
['05M09T' ,'05M093 ', 0.006 ],
['05M092' ,'05M09T ', 0.006 ],
['05M092' ,'02C05J ', 0.006 ],
['05M093' ,'05K101 ', 0.006 ],
['05M09T' ,'05M091 ', 0.006 ],


In [179]:
# Keep the columns at positions 2 and 3 and preview the first rows.
p_soins_IC_3=p_soins_IC.iloc[:,2:4]
p_soins_IC_3.head()

Unnamed: 0,2,3
0,05M092,05C191
1,04M132,05C222
2,04M143,11M044
3,05M171,05M13T
4,23K02Z,09C081


In [180]:
# One two-item sequence per row; a missing second value is replaced by the
# string 'nan' so that it is mined as an explicit item.
# Column-wise iteration avoids the deprecated positional fallback of `row[1]`
# on a string-labelled Series, and pd.isna replaces the str(x) != 'nan' test.
corpus_IC_2 = [
    [first, 'nan' if pd.isna(second) else second]
    for first, second in zip(p_soins_IC_3.iloc[:, 0], p_soins_IC_3.iloc[:, 1])
]

In [181]:
# Mine patterns of at least two items and keep the 20 most frequent ones.
ps = PrefixSpan(corpus_IC_2)
ps.minlen=2
output=ps.topk(k=20)

In [183]:
# Print every mined pattern as a "[first, second, relative frequency]," row.
# The denominator is the size of corpus_IC_2, the corpus that was actually
# mined (the original divided by len(corpus_IC), a copy-paste slip).
n_sequences = len(corpus_IC_2)
for support, motif in output:
    ghm1 = str(motif[0])
    ghm2 = str(motif[1])
    print("['"+ghm1+' \'',",\' "+ghm2+" ',",round(support/n_sequences,3),"],")

['nan ' ,' nan ', 0.186 ],
['05M092 ' ,' nan ', 0.011 ],
['05M092 ' ,' 05M092 ', 0.01 ],
['05M093 ' ,' nan ', 0.009 ],
['02C05J ' ,' 02C05J ', 0.007 ],
['05K101 ' ,' nan ', 0.006 ],
['05M091 ' ,' nan ', 0.005 ],
['05M092 ' ,' 05M093 ', 0.005 ],
['02C05J ' ,' nan ', 0.004 ],
['04M053 ' ,' nan ', 0.004 ],
['05M093 ' ,' 05M092 ', 0.004 ],
['05M091 ' ,' 05M092 ', 0.003 ],
['05M09T ' ,' 05M092 ', 0.003 ],
['05M09T ' ,' nan ', 0.003 ],
['05M091 ' ,' 05M091 ', 0.003 ],
['05M092 ' ,' 05M091 ', 0.003 ],
['05M093 ' ,' 05M093 ', 0.002 ],
['04M132 ' ,' nan ', 0.002 ],
['05M094 ' ,' nan ', 0.002 ],
['23M20Z ' ,' nan ', 0.002 ],


In [102]:
# Maps each code (plus the 'nan' placeholder) to the integer node index used
# when building the Sankey source/target lists.
dictionnaire={"05M091":0, "05M092":1, "05M093":2, "05M09T":3, '05K101':4, '02C05J':5, 'nan':6}

In [103]:
# Build the Sankey link lists: one link per mined pattern, going from the node
# of its first item to the node of its second item, weighted by the support.
source = []
target = []
value = []

# Codes without a node index in `dictionnaire` cannot be drawn; skip those
# patterns instead of failing with a KeyError, and report them afterwards.
unknown_codes = set()
for support, motif in ps.topk(k=20):
    ghm1 = str(motif[0])
    ghm2 = str(motif[1])
    missing = [code for code in (ghm1, ghm2) if code not in dictionnaire]
    if missing:
        unknown_codes.update(missing)
        continue
    source.append(dictionnaire[ghm1])
    target.append(dictionnaire[ghm2])
    value.append(support)

if unknown_codes:
    print("Patterns skipped, codes missing from dictionnaire:", sorted(unknown_codes))

In [104]:
import plotly.graph_objects as go

# Node i must carry the code mapped to index i in `dictionnaire`; deriving the
# labels from the mapping keeps every index used in source/target labelled
# (the hard-coded list only covered the first four nodes).
labels = sorted(dictionnaire, key=dictionnaire.get)

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="blue",
    ),
    link=dict(
        source=source,  # node indices, i.e. positions in `labels`
        target=target,
        value=value,
    ),
)])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()