In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names. Prefer the new name when this
# matplotlib provides it, and fall back to the legacy name on older installs.
plt.style.use(
    'seaborn-v0_8-white'
    if 'seaborn-v0_8-white' in plt.style.available
    else 'seaborn-white'
)

Non vengono considerate tutte le sequenze (una per cliente) ottenute a partire dall'intero dataset, ma solo alcuni sottoinsiemi di sequenze, perché altrimenti l'esecuzione dell'algoritmo sarebbe troppo lenta. Il secondo sottoinsieme considerato contiene le sequenze, ottenute a partire dal dataset, che hanno almeno 4 e non più di 20 elementi.

In [2]:
# Load the purchases table from 'acquisti.csv' and discard its
# 'Unnamed: 0' column in the same expression, then preview the first rows.
acquisti = pd.read_csv('acquisti.csv', sep=',').drop(columns=['Unnamed: 0'])
acquisti.head(3)

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta,Spending
0,536365,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,15.3
1,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6,20.34
2,536365,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8,22.0


In [3]:
# Lookup table mapping each ProdID to its description.
# When a ProdID occurs on several rows, the description found on its first
# row (in DataFrame order) is the one kept, exactly as a first-seen-wins scan
# would do. drop_duplicates does this in one vectorised pass instead of a
# Python-level iterrows() loop over every purchase row.
ProID_to_ProdDescr = (
    acquisti
    .drop_duplicates(subset='ProdID', keep='first')
    .set_index('ProdID')['ProdDescr']
    .to_dict()
)

In [4]:
# Each customer has to be represented as a sequence of purchases, so the
# attributes that are not needed for that are removed.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for columns that were already dropped.
acquisti = acquisti.drop(
    columns=['Sale', 'ProdDescr', 'Qta', 'Spending', 'CustomerCountry'],
    errors='ignore',
)

In [5]:
# Step 1: gather the ProdIDs of every (CustomerID, BasketID) pair into a list.
tmp = (
    acquisti
    .groupby(['CustomerID', 'BasketID'])['ProdID']
    .apply(list)
)
# Step 2: gather, per customer, the basket lists into a list of lists.
customer_sequences = tmp.groupby(level='CustomerID').apply(list)
# Plain Python list with one sequence per customer.
sequences = list(customer_sequences)

In [6]:
# Report how many sequences were built; each one comes from a distinct
# CustomerID group, so this is also the number of customers.
print("Numero di sequenze (uguale al numero di clienti):", len(sequences))

Numero di sequenze (uguale al numero di clienti): 4324


In [7]:
# Keep only the sequences made of at least 4 and at most 20 baskets,
# then show how many survive the filter.
subset_of_sequences = [
    baskets for baskets in sequences
    if 4 <= len(baskets) <= 20
]
len(subset_of_sequences)

1391

In [8]:
# Import only the name this notebook uses rather than `from gsp import *`,
# so it is clear where `apriori` comes from and the namespace is not
# flooded with whatever else the module defines.
from gsp import apriori

# Mine the frequent sequential patterns of the filtered customer sequences.
# 60 is passed as the second positional argument - presumably the minimum
# support count (every support shown in the result is >= 60); verify against
# the signature of gsp.apriori.
results3 = apriori(subset_of_sequences, 60, verbose=False)

In [9]:
# Display the (pattern, support) pairs returned by apriori.
# NOTE(review): this prints the whole list; consider slicing (e.g. the first
# few entries) to keep the notebook output short.
results3

[([['15036']], 101),
 ([['15056BL']], 73),
 ([['15056N']], 87),
 ([['16156S']], 68),
 ([['16161P']], 105),
 ([['16161U']], 70),
 ([['16169E']], 78),
 ([['17003']], 75),
 ([['20675']], 81),
 ([['20676']], 95),
 ([['20677']], 70),
 ([['20679']], 70),
 ([['20682']], 90),
 ([['20685']], 182),
 ([['20711']], 85),
 ([['20712']], 145),
 ([['20713']], 131),
 ([['20717']], 94),
 ([['20718']], 145),
 ([['20719']], 140),
 ([['20723']], 139),
 ([['20724']], 195),
 ([['20725']], 328),
 ([['20726']], 235),
 ([['20727']], 280),
 ([['20728']], 289),
 ([['20749']], 115),
 ([['20750']], 114),
 ([['20751']], 67),
 ([['20754']], 87),
 ([['20914']], 192),
 ([['20969']], 96),
 ([['20970']], 83),
 ([['20971']], 146),
 ([['20972']], 173),
 ([['20973']], 114),
 ([['20974']], 114),
 ([['20975']], 129),
 ([['20977']], 60),
 ([['20978']], 74),
 ([['20979']], 90),
 ([['20981']], 85),
 ([['20982']], 83),
 ([['20983']], 100),
 ([['20984']], 77),
 ([['20992']], 88),
 ([['20996']], 68),
 ([['21034']], 212),
 ([['21035

In [10]:
# Number of (pattern, support) pairs returned by apriori
len(results3)

5004

In [11]:
# Subsequences made of 4 elements with the highest support
# (support strictly greater than 66).
seq_l4 = [
    (pattern, count)
    for pattern, count in results3
    if len(pattern) == 4 and count > 66
]
        

In [12]:
# Number of 4-element subsequences kept in seq_l4
len(seq_l4)

6

In [13]:
# Show the selected 4-element subsequences together with their support
seq_l4

[([['84879'], ['84879'], ['84879'], ['84879']], 67),
 ([['85099B'], ['23203'], ['23203'], ['23203']], 67),
 ([['85099B'], ['23203'], ['85099B'], ['23203']], 68),
 ([['85099B'], ['85099B'], ['23203'], ['85099B']], 69),
 ([['85099B'], ['85099B'], ['85099B'], ['85099B']], 83),
 ([['85123A'], ['85123A'], ['85123A'], ['85123A']], 107)]

In [14]:
# For every pattern in seq_l4, print each item as "ProdID: description"
# (one line per item, in element order), followed by the pattern's support.
for pattern, count in seq_l4:
    for element in pattern:
        for prod_id in element:
            print(prod_id + ": " + ProID_to_ProdDescr[prod_id])
    print("support " + str(count))
    print("\n")

84879: ASSORTED COLOUR BIRD ORNAMENT
84879: ASSORTED COLOUR BIRD ORNAMENT
84879: ASSORTED COLOUR BIRD ORNAMENT
84879: ASSORTED COLOUR BIRD ORNAMENT
support 67


85099B: JUMBO BAG RED RETROSPOT
23203: JUMBO BAG DOILEY PATTERNS
23203: JUMBO BAG DOILEY PATTERNS
23203: JUMBO BAG DOILEY PATTERNS
support 67


85099B: JUMBO BAG RED RETROSPOT
23203: JUMBO BAG DOILEY PATTERNS
85099B: JUMBO BAG RED RETROSPOT
23203: JUMBO BAG DOILEY PATTERNS
support 68


85099B: JUMBO BAG RED RETROSPOT
85099B: JUMBO BAG RED RETROSPOT
23203: JUMBO BAG DOILEY PATTERNS
85099B: JUMBO BAG RED RETROSPOT
support 69


85099B: JUMBO BAG RED RETROSPOT
85099B: JUMBO BAG RED RETROSPOT
85099B: JUMBO BAG RED RETROSPOT
85099B: JUMBO BAG RED RETROSPOT
support 83


85123A: WHITE HANGING HEART T-LIGHT HOLDER
85123A: WHITE HANGING HEART T-LIGHT HOLDER
85123A: WHITE HANGING HEART T-LIGHT HOLDER
85123A: WHITE HANGING HEART T-LIGHT HOLDER
support 107




In [15]:
# Split the 3-element patterns of results3 into two groups:
#   seq_l3_1 - at least one element holds more than one item, support == 63
#   seq_l3_2 - every element holds exactly one item, support > 90
seq_l3_1 = []
seq_l3_2 = []
for pattern, count in results3:
    if len(pattern) != 3:
        continue
    # Number of items in each of the three elements.
    sizes = [len(element) for element in pattern]
    if max(sizes) > 1 and count == 63:
        seq_l3_1.append((pattern, count))
    elif sizes == [1, 1, 1] and count > 90:
        seq_l3_2.append((pattern, count))

In [16]:
# Number of patterns collected in seq_l3_1
len(seq_l3_1)

18

In [17]:
# seq_l3_1 holds subsequences that have 3 elements and more than 3 items
seq_l3_1

[([['20725', '20727'], ['20728'], ['20727']], 63),
 ([['20725', '20727'], ['22383'], ['20727']], 63),
 ([['20725', '20728'], ['20728'], ['20725']], 63),
 ([['20725', '20728'], ['22383'], ['22383']], 63),
 ([['20725', '20728'], ['23209'], ['23209']], 63),
 ([['20725', '22383'], ['20725'], ['22383']], 63),
 ([['20725', '22383'], ['20728'], ['20725']], 63),
 ([['20725', '22383'], ['23209'], ['20725']], 63),
 ([['20725', '22383'], ['23209'], ['23209']], 63),
 ([['20725', '22384'], ['22384'], ['20725']], 63),
 ([['20725', '22384'], ['22384'], ['20727']], 63),
 ([['20727', '22383'], ['20727'], ['20725']], 63),
 ([['20727', '22383'], ['22383'], ['20725']], 63),
 ([['20727', '22383'], ['23209'], ['20727']], 63),
 ([['20727', '22384'], ['20725'], ['20725']], 63),
 ([['20728', '22383'], ['20725'], ['22383']], 63),
 ([['20728', '22383'], ['22383'], ['22384']], 63),
 ([['23203', '85099B'], ['23203'], ['85099B']], 63)]

In [18]:
# For every pattern in seq_l3_1, print each item as "ProdID: description"
# (one line per item, in element order), followed by the pattern's support.
for pattern, count in seq_l3_1:
    for element in pattern:
        for prod_id in element:
            print(prod_id + ": " + ProID_to_ProdDescr[prod_id])
    print("support " + str(count))
    print("\n")

20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20728: LUNCH BAG CARS BLUE
20727: LUNCH BAG  BLACK SKULL.
support 63


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
22383: LUNCH BAG SUKI  DESIGN 
20727: LUNCH BAG  BLACK SKULL.
support 63


20725: LUNCH BAG RED RETROSPOT
20728: LUNCH BAG CARS BLUE
20728: LUNCH BAG CARS BLUE
20725: LUNCH BAG RED RETROSPOT
support 63


20725: LUNCH BAG RED RETROSPOT
20728: LUNCH BAG CARS BLUE
22383: LUNCH BAG SUKI  DESIGN 
22383: LUNCH BAG SUKI  DESIGN 
support 63


20725: LUNCH BAG RED RETROSPOT
20728: LUNCH BAG CARS BLUE
23209: LUNCH BAG DOILEY PATTERN 
23209: LUNCH BAG DOILEY PATTERN 
support 63


20725: LUNCH BAG RED RETROSPOT
22383: LUNCH BAG SUKI  DESIGN 
20725: LUNCH BAG RED RETROSPOT
22383: LUNCH BAG SUKI  DESIGN 
support 63


20725: LUNCH BAG RED RETROSPOT
22383: LUNCH BAG SUKI  DESIGN 
20728: LUNCH BAG CARS BLUE
20725: LUNCH BAG RED RETROSPOT
support 63


20725: LUNCH BAG RED RETROSPOT
22383: LUNCH BAG SUKI  DESIGN 

In [19]:
# seq_l3_2 holds subsequences that have 3 elements and 3 items
seq_l3_2

[([['20725'], ['20725'], ['20725']], 122),
 ([['20725'], ['20725'], ['20727']], 106),
 ([['20725'], ['20725'], ['23209']], 99),
 ([['20725'], ['20727'], ['20725']], 99),
 ([['20725'], ['20727'], ['20727']], 94),
 ([['20725'], ['22383'], ['20725']], 93),
 ([['20725'], ['22384'], ['20727']], 92),
 ([['20725'], ['23209'], ['20725']], 98),
 ([['20725'], ['23209'], ['23209']], 95),
 ([['20727'], ['20725'], ['20727']], 99),
 ([['20727'], ['20727'], ['20725']], 92),
 ([['20727'], ['20727'], ['20727']], 95),
 ([['20728'], ['22383'], ['22383']], 91),
 ([['22383'], ['22383'], ['20725']], 91),
 ([['22383'], ['22383'], ['22383']], 103),
 ([['22423'], ['22423'], ['22423']], 117),
 ([['23203'], ['23203'], ['23203']], 98),
 ([['23209'], ['23209'], ['23209']], 93),
 ([['47566'], ['47566'], ['47566']], 99),
 ([['84879'], ['84879'], ['84879']], 127),
 ([['85099B'], ['20725'], ['85099B']], 100),
 ([['85099B'], ['23203'], ['23203']], 113),
 ([['85099B'], ['23203'], ['85099B']], 118),
 ([['85099B'], ['8509

In [20]:
# For every pattern in seq_l3_2, print each item as "ProdID: description"
# (one line per item, in element order), followed by the pattern's support.
for pattern, count in seq_l3_2:
    for element in pattern:
        for prod_id in element:
            print(prod_id + ": " + ProID_to_ProdDescr[prod_id])
    print("support " + str(count))
    print("\n")

20725: LUNCH BAG RED RETROSPOT
20725: LUNCH BAG RED RETROSPOT
20725: LUNCH BAG RED RETROSPOT
support 122


20725: LUNCH BAG RED RETROSPOT
20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
support 106


20725: LUNCH BAG RED RETROSPOT
20725: LUNCH BAG RED RETROSPOT
23209: LUNCH BAG DOILEY PATTERN 
support 99


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20725: LUNCH BAG RED RETROSPOT
support 99


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20727: LUNCH BAG  BLACK SKULL.
support 94


20725: LUNCH BAG RED RETROSPOT
22383: LUNCH BAG SUKI  DESIGN 
20725: LUNCH BAG RED RETROSPOT
support 93


20725: LUNCH BAG RED RETROSPOT
22384: LUNCH BAG PINK POLKADOT
20727: LUNCH BAG  BLACK SKULL.
support 92


20725: LUNCH BAG RED RETROSPOT
23209: LUNCH BAG DOILEY PATTERN 
20725: LUNCH BAG RED RETROSPOT
support 98


20725: LUNCH BAG RED RETROSPOT
23209: LUNCH BAG DOILEY PATTERN 
23209: LUNCH BAG DOILEY PATTERN 
support 95


20727: LUNCH BAG  BLACK SKULL.
20725

In [24]:
# Every 3-element pattern of results3 in which at least one element holds
# more than one item, whatever its support.
seq_l3_3 = [
    (pattern, count)
    for pattern, count in results3
    if len(pattern) == 3 and any(len(element) > 1 for element in pattern)
]

In [25]:
# seq_l3_3 holds all the subsequences that have 3 elements and more than 3 items
seq_l3_3

[([['20725', '20727'], ['20725'], ['20725']], 68),
 ([['20725', '20727'], ['20725'], ['20727']], 78),
 ([['20725', '20727'], ['20725'], ['22384']], 61),
 ([['20725', '20727'], ['20725'], ['23209']], 60),
 ([['20725', '20727'], ['20727'], ['20725']], 70),
 ([['20725', '20727'], ['20727'], ['20727']], 72),
 ([['20725', '20727'], ['20727'], ['22384']], 60),
 ([['20725', '20727'], ['20728'], ['20725']], 62),
 ([['20725', '20727'], ['20728'], ['20727']], 63),
 ([['20725', '20727'], ['22383'], ['20727']], 63),
 ([['20725', '20727'], ['22384'], ['20725']], 61),
 ([['20725', '20727'], ['22384'], ['20727']], 69),
 ([['20725', '20727'], ['23206'], ['20727']], 60),
 ([['20725', '20727'], ['23209'], ['20727']], 64),
 ([['20725', '20727'], ['23209'], ['23209']], 61),
 ([['20725', '20728'], ['20725'], ['20725']], 66),
 ([['20725', '20728'], ['20725'], ['20727']], 64),
 ([['20725', '20728'], ['20728'], ['20725']], 63),
 ([['20725', '20728'], ['22383'], ['20725']], 64),
 ([['20725', '20728'], ['22383'

In [26]:
# For every pattern in seq_l3_3, print each item as "ProdID: description"
# (one line per item, in element order), followed by the pattern's support.
for pattern, count in seq_l3_3:
    for element in pattern:
        for prod_id in element:
            print(prod_id + ": " + ProID_to_ProdDescr[prod_id])
    print("support " + str(count))
    print("\n")

20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20725: LUNCH BAG RED RETROSPOT
20725: LUNCH BAG RED RETROSPOT
support 68


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
support 78


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20725: LUNCH BAG RED RETROSPOT
22384: LUNCH BAG PINK POLKADOT
support 61


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20725: LUNCH BAG RED RETROSPOT
23209: LUNCH BAG DOILEY PATTERN 
support 60


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20727: LUNCH BAG  BLACK SKULL.
20725: LUNCH BAG RED RETROSPOT
support 70


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20727: LUNCH BAG  BLACK SKULL.
20727: LUNCH BAG  BLACK SKULL.
support 72


20725: LUNCH BAG RED RETROSPOT
20727: LUNCH BAG  BLACK SKULL.
20727: LUNCH BAG  BLACK SKULL.
22384: LUNCH BAG PINK POLKADOT
support 60


20725: LUNCH BAG RED RETROSPOT
20727: L