In [1]:
import numpy as np
import pandas as pd
import itertools

df = pd.read_csv('./data.csv')

# Cleaning the data and removing unwanted columns

In [2]:
### Pre Processing Dataset ###

# Convert Invoice Date to Pandas Date format
idate = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')
df['Date'] = idate
del(idate)

# Remove unused columns
df = df.drop(columns=['Description', 'Country', 'UnitPrice', 'InvoiceDate', 'Quantity'])

# Select data of 5 days
df = df[df['Date'] > '2010-12-5 12:43:00']
df = df[df['Date'] < '2010-12-15 12:43:00']

# Some customer IDs are Blank. Removing them
df = df[df['CustomerID'].notnull()]

# Remove Cancelled Invoices
df = df[~df['InvoiceNo'].str.startswith('C')]

# Remove items that were bought less than 15 times; https://stackoverflow.com/a/49137073/6364386
df = df[df.groupby('StockCode')['StockCode'].transform('count').ge(15)]

# Remove items that were bought more than 100 times, as these are might just be common items without any patterns.
# df = df[df.groupby('StockCode')['StockCode'].transform('count').le(100)]

#Generating seqences that act as input for GSP algo

In [3]:
### Generate Purchase Sequences ###

# Change StockCode from String to Integer for efficiency
df['StockCode']  = df['StockCode'].apply(lambda x: hash(x) % (10 ** 8))
df['CustomerID'] = df['CustomerID'].astype('int')
df['InvoiceNo']  = df['InvoiceNo'].astype('int')

sequence_list = []

count = 0
total = len(df['CustomerID'].unique())

for customer in df['CustomerID'].unique():
    sequence = []
    
    # DataFrame of all items this customer purchased
    df_cust = df[df['CustomerID'] == customer]
    
    for invoice in df_cust['InvoiceNo'].unique():
        
        # DataFrame of all items in this invoice
        df_invoice = df_cust[df_cust['InvoiceNo'] == invoice]
        
        sequence.append(df_invoice.sort_values('StockCode')['StockCode'].values)
    
    count += 1
    print('%d/%d' % (count, total), end='\r')
                
    sequence_list.append(sequence)

# sequence_list is a list of lists which has all the purchase sequences. 

517/517

In [4]:
def calc_support(items):    
    support = 0
    for sequence in sequence_list:
        first_item_not_found = True
        for array in sequence:
            if first_item_not_found:
                if items[0] in array: first_item_not_found = False
            else:
                if items[1] in array: 
                    support += 1
                    break
    return support



supports = {}
print('Calculating combinations.')
combinations = [p for p in itertools.product(df['StockCode'].unique(), repeat=2)]
print('Calculating supports for combinations.')

count = 0
total = len(combinations)

for pair in combinations:
    score = calc_support(pair)
    
    if score >= 3:
        supports[str(pair)] = [score]
    count += 1
    if not (count % 100): print('%d/%d' % (count, total), end='\r')
    
print('%d/%d' % (count, total), end='\r')
supports_df = pd.DataFrame.from_dict(supports).T

# supports_df has the supports calculated for this figure: http://3.bp.blogspot.com/-iFZkp8foZbo/VPkGSFwN8zI/AAAAAAAAB6Q/Z88wJHll2Sk/s1600/GSP%2B2%2BItem%2BSequences%2B(1).jpg
# in the article

Calculating combinations.
Calculating supports for combinations.
63504/63504

In [5]:
table = list(supports_df.index)
first = [x.split(',')[0][1:] for x in table ]
last = [x.split(',')[-1][:-1] for x in table ]

table_df = pd.DataFrame({
    '2-seq': table,
    'first': first,
    'last': last
})

table_df['first'] = table_df['first'].astype('int')
table_df['last'] = table_df['last'].astype('int')

# table_df
for index, row in table_df.iterrows():
    for first in table_df[table_df['last'] == row['first']]['first']:
        if (first not in [row['last'], row['first']]) and (row['last'] != row['first']): 
             print('%d, %d, %d' % (row['last'], row['first'], first))

# Matching first and last to make patterns of length 3
# Output of this is stored in sin.txt

39205831, 87561379, 27718800
39205831, 87561379, 93087014
57699076, 87561379, 27718800
57699076, 87561379, 93087014
32022942, 87561379, 27718800
32022942, 87561379, 93087014
56750275, 5408882, 23301214
56750275, 5408882, 9797485
56750275, 5408882, 11047790
56750275, 5408882, 4091042
80585875, 5408882, 23301214
80585875, 5408882, 9797485
80585875, 5408882, 11047790
80585875, 5408882, 4091042
483430, 5408882, 23301214
483430, 5408882, 9797485
483430, 5408882, 11047790
483430, 5408882, 4091042
99000570, 5408882, 23301214
99000570, 5408882, 9797485
99000570, 5408882, 11047790
99000570, 5408882, 4091042
57699076, 39205831, 87561379
56750275, 39205831, 87561379
56750275, 39205831, 57699076
53151389, 39205831, 87561379
53151389, 39205831, 57699076
32022942, 39205831, 87561379
32022942, 39205831, 57699076
56545166, 39205831, 87561379
56545166, 39205831, 57699076
22655197, 39205831, 87561379
22655197, 39205831, 57699076
47113206, 57699076, 87561379
47113206, 57699076, 23301214
47113206, 5769907

32022942, 26664066, 21571832
32022942, 26664066, 80585875
32022942, 26664066, 50568150
32022942, 26664066, 58840401
32022942, 26664066, 99000570
32022942, 26664066, 76447340
56545166, 26664066, 49277359
56545166, 26664066, 57699076
56545166, 26664066, 59897011
56545166, 26664066, 8203104
56545166, 26664066, 86945220
56545166, 26664066, 56750275
56545166, 26664066, 23161772
56545166, 26664066, 65360779
56545166, 26664066, 81096492
56545166, 26664066, 9797485
56545166, 26664066, 11047790
56545166, 26664066, 27718800
56545166, 26664066, 3459518
56545166, 26664066, 70030742
56545166, 26664066, 9238901
56545166, 26664066, 32347333
56545166, 26664066, 93087014
56545166, 26664066, 40468429
56545166, 26664066, 21571832
56545166, 26664066, 80585875
56545166, 26664066, 50568150
56545166, 26664066, 58840401
56545166, 26664066, 99000570
56545166, 26664066, 76447340
21571832, 26664066, 49277359
21571832, 26664066, 57699076
21571832, 26664066, 59897011
21571832, 26664066, 8203104
21571832, 26664066,

70030742, 49105396, 26664066
70030742, 49105396, 23161772
70030742, 49105396, 9797485
70030742, 49105396, 11047790
70030742, 49105396, 27718800
70030742, 49105396, 9238901
70030742, 49105396, 32347333
70030742, 49105396, 44103732
32347333, 49105396, 26664066
32347333, 49105396, 23161772
32347333, 49105396, 9797485
32347333, 49105396, 11047790
32347333, 49105396, 27718800
32347333, 49105396, 70030742
32347333, 49105396, 9238901
32347333, 49105396, 44103732
61447196, 49105396, 26664066
61447196, 49105396, 23161772
61447196, 49105396, 9797485
61447196, 49105396, 11047790
61447196, 49105396, 27718800
61447196, 49105396, 70030742
61447196, 49105396, 9238901
61447196, 49105396, 32347333
61447196, 49105396, 44103732
46199099, 49105396, 26664066
46199099, 49105396, 23161772
46199099, 49105396, 9797485
46199099, 49105396, 11047790
46199099, 49105396, 27718800
46199099, 49105396, 70030742
46199099, 49105396, 9238901
46199099, 49105396, 32347333
46199099, 49105396, 44103732
77010466, 49105396, 26

29189306, 77781247, 23301214
29189306, 77781247, 65360779
29189306, 77781247, 9797485
29189306, 77781247, 11047790
29189306, 77781247, 27718800
29189306, 77781247, 9679962
29189306, 77781247, 45081767
91575829, 77781247, 23301214
91575829, 77781247, 65360779
91575829, 77781247, 9797485
91575829, 77781247, 11047790
91575829, 77781247, 27718800
91575829, 77781247, 9679962
91575829, 77781247, 29189306
91575829, 77781247, 45081767
99033080, 77781247, 23301214
99033080, 77781247, 65360779
99033080, 77781247, 9797485
99033080, 77781247, 11047790
99033080, 77781247, 27718800
99033080, 77781247, 9679962
99033080, 77781247, 29189306
99033080, 77781247, 45081767
87561379, 93087014, 23301214
87561379, 93087014, 56750275
87561379, 93087014, 26664066
87561379, 93087014, 65360779
87561379, 93087014, 81096492
87561379, 93087014, 9797485
87561379, 93087014, 11047790
87561379, 93087014, 27718800
87561379, 93087014, 70030742
87561379, 93087014, 32347333
87561379, 93087014, 61447196
87561379, 93087014, 6

23378934, 18091259, 64134451
23378934, 18091259, 9679962
23378934, 18091259, 44729031
23378934, 18091259, 6715962
23378934, 18091259, 4091042
23378934, 18091259, 99033080
23378934, 18091259, 48740665
23378934, 18091259, 45081767
23378934, 18091259, 46111203
23378934, 18091259, 55266091
58213454, 18091259, 23301214
58213454, 18091259, 65360779
58213454, 18091259, 81096492
58213454, 18091259, 9797485
58213454, 18091259, 11047790
58213454, 18091259, 27718800
58213454, 18091259, 93087014
58213454, 18091259, 64134451
58213454, 18091259, 9679962
58213454, 18091259, 44729031
58213454, 18091259, 6715962
58213454, 18091259, 4091042
58213454, 18091259, 99033080
58213454, 18091259, 48740665
58213454, 18091259, 45081767
58213454, 18091259, 46111203
58213454, 18091259, 55266091
93087014, 18091259, 23301214
93087014, 18091259, 65360779
93087014, 18091259, 81096492
93087014, 18091259, 9797485
93087014, 18091259, 11047790
93087014, 18091259, 27718800
93087014, 18091259, 64134451
93087014, 18091259, 96

45586263, 44729031, 23301214
45586263, 44729031, 65360779
45586263, 44729031, 9797485
45586263, 44729031, 11047790
45586263, 44729031, 27718800
45586263, 44729031, 61447196
45586263, 44729031, 93087014
45586263, 44729031, 18091259
45586263, 44729031, 4091042
45586263, 44729031, 29189306
45586263, 44729031, 48740665
45586263, 44729031, 46111203
94855809, 44729031, 23301214
94855809, 44729031, 65360779
94855809, 44729031, 9797485
94855809, 44729031, 11047790
94855809, 44729031, 27718800
94855809, 44729031, 61447196
94855809, 44729031, 93087014
94855809, 44729031, 18091259
94855809, 44729031, 4091042
94855809, 44729031, 29189306
94855809, 44729031, 48740665
94855809, 44729031, 46111203
30667185, 44729031, 23301214
30667185, 44729031, 65360779
30667185, 44729031, 9797485
30667185, 44729031, 11047790
30667185, 44729031, 27718800
30667185, 44729031, 61447196
30667185, 44729031, 93087014
30667185, 44729031, 18091259
30667185, 44729031, 4091042
30667185, 44729031, 29189306
30667185, 44729031, 

#Calculating support of combined sets

In [6]:
def calc_support_together(items):
    support = 0
    for sequence in sequence_list:
        for array in sequence:
            if (items[0] in array) and (items[1] in array):
                if np.where(array == items[0])[0][0] < np.where(array == items[1])[0][0]:
                    support += 1
            
    return support

supports_t = {}
print('Calculating combinations.')
combinations_t = list(itertools.combinations(df['StockCode'].unique(), 2))
print('Calculating supports for combinations.')

count = 0
total = len(combinations_t)

for pair in combinations_t:
    score = calc_support_together(pair)
    
    if score >= 7:
        supports_t[str(pair)] = [score]
    count += 1
    if not (count % 100): print('%d/%d' % (count, total), end='\r')
    
print('%d/%d' % (count, total), end='\r')
supports_df_t = pd.DataFrame.from_dict(supports_t).T

# supports_df has the supports calculated for this figure: http://4.bp.blogspot.com/-_HU7wXo6x44/VPkGT-3DXnI/AAAAAAAAB6Y/gdrGf3UAHyU/s1600/GSP%2B2%2BItem%2BSequences%2B(2).jpg
# in the article

Calculating combinations.
Calculating supports for combinations.
31626/31626

# Printing 3seq data using GSP code 

In [7]:
table = list(supports_df_t.index)
first = [x.split(',')[0][1:] for x in table ]
last = [x.split(',')[-1][:-1] for x in table ]

table_df = pd.DataFrame({
    '2-seq': table,
    'first': first,
    'last': last
})

table_df['first'] = table_df['first'].astype('int')
table_df['last'] = table_df['last'].astype('int')


# table_df
for index, row in table_df.iterrows():
    for first in table_df[table_df['last'] == row['first']]['first']:
        if (first not in [row['last'], row['first']]) and (row['last'] != row['first']): 
            print('(%d, %d)%d' % (row['last'], row['first'], first))
    
    
    for last in table_df[table_df['first'] == row['last']]['last']:
        if (last not in [row['first'], row['last']]) and (row['first'] != row['last']):
            print('(%d, %d)%d' % (row['first'], row['last'], last))

# Matching first and last to make patterns of length 3 from remaining length 2 patterns.
# Output of this is stored in tog.txt

(87561379, 93087014)94333967
(87561379, 93087014)93744785
(87561379, 93087014)94855809
(23301214, 39205831)57699076
(23301214, 39205831)56750275
(23301214, 39205831)53151389
(23301214, 39205831)77781247
(23301214, 39205831)83569177
(23301214, 39205831)91270448
(23301214, 39205831)58356343
(23301214, 39205831)56545166
(23301214, 39205831)64134451
(23301214, 39205831)67358289
(23301214, 39205831)61488090
(23301214, 39205831)91400379
(23301214, 26664066)65360779
(23301214, 26664066)81096492
(23301214, 26664066)27718800
(23301214, 26664066)49105396
(23301214, 26664066)70030742
(23301214, 26664066)32347333
(23301214, 26664066)61447196
(23301214, 26664066)46199099
(23301214, 26664066)58213454
(23301214, 26664066)35088343
(23301214, 26664066)77781247
(23301214, 26664066)93087014
(23301214, 26664066)32484091
(23301214, 26664066)83569177
(23301214, 26664066)91270448
(23301214, 26664066)58356343
(23301214, 26664066)32022942
(23301214, 26664066)29527402
(23301214, 26664066)56545166
(23301214, 266

(57699076, 83569177)93744785
(57699076, 83569177)94855809
(57699076, 83569177)83699629
(57699076, 83569177)91400379
(91270448, 57699076)5408882
(91270448, 57699076)47113206
(91270448, 57699076)39205831
(58356343, 57699076)5408882
(58356343, 57699076)47113206
(58356343, 57699076)39205831
(57699076, 58356343)87796319
(57699076, 58356343)89827900
(57699076, 58356343)67358289
(57699076, 58356343)75231635
(57699076, 58356343)61488090
(57699076, 58356343)98145684
(77352160, 57699076)5408882
(77352160, 57699076)47113206
(77352160, 57699076)39205831
(73165405, 57699076)5408882
(73165405, 57699076)47113206
(73165405, 57699076)39205831
(93744785, 57699076)5408882
(93744785, 57699076)47113206
(93744785, 57699076)39205831
(64134451, 57699076)5408882
(64134451, 57699076)47113206
(64134451, 57699076)39205831
(57699076, 64134451)70781138
(68362270, 57699076)5408882
(68362270, 57699076)47113206
(68362270, 57699076)39205831
(88916855, 57699076)5408882
(88916855, 57699076)47113206
(88916855, 57699076)39

(27718800, 23161772)5408882
(27718800, 23161772)2613471
(23161772, 27718800)49105396
(23161772, 27718800)70030742
(23161772, 27718800)32347333
(23161772, 27718800)61447196
(23161772, 27718800)58213454
(23161772, 27718800)35088343
(23161772, 27718800)77781247
(23161772, 27718800)93087014
(23161772, 27718800)58356343
(23161772, 27718800)29527402
(23161772, 27718800)57564916
(23161772, 27718800)93744785
(23161772, 27718800)64134451
(23161772, 27718800)44729031
(23161772, 27718800)35648532
(23161772, 27718800)63789474
(23161772, 27718800)61488090
(49105396, 23161772)5408882
(49105396, 23161772)2613471
(23161772, 49105396)70030742
(23161772, 49105396)79470637
(23161772, 49105396)61447196
(23161772, 49105396)77781247
(23161772, 49105396)90306960
(23161772, 49105396)63789474
(23161772, 49105396)94855809
(70030742, 23161772)5408882
(70030742, 23161772)2613471
(23161772, 70030742)79470637
(32347333, 23161772)5408882
(32347333, 23161772)2613471
(23161772, 32347333)61447196
(23161772, 32347333)46

(61488090, 27718800)23301214
(61488090, 27718800)5408882
(61488090, 27718800)26664066
(61488090, 27718800)23161772
(61488090, 27718800)23378934
(61488090, 27718800)9797485
(61488090, 27718800)11047790
(70030742, 49105396)26664066
(70030742, 49105396)23161772
(70030742, 49105396)9797485
(70030742, 49105396)11047790
(70030742, 49105396)27718800
(49105396, 70030742)79470637
(79470637, 49105396)26664066
(79470637, 49105396)23161772
(79470637, 49105396)9797485
(79470637, 49105396)11047790
(79470637, 49105396)27718800
(61447196, 49105396)26664066
(61447196, 49105396)23161772
(61447196, 49105396)9797485
(61447196, 49105396)11047790
(61447196, 49105396)27718800
(49105396, 61447196)77781247
(49105396, 61447196)93087014
(49105396, 61447196)93744785
(49105396, 61447196)64134451
(49105396, 61447196)63789474
(77781247, 49105396)26664066
(77781247, 49105396)23161772
(77781247, 49105396)9797485
(77781247, 49105396)11047790
(77781247, 49105396)27718800
(49105396, 77781247)93087014
(49105396, 77781247)

(55543427, 58356343)87796319
(55543427, 58356343)89827900
(55543427, 58356343)67358289
(55543427, 58356343)75231635
(55543427, 58356343)61488090
(55543427, 58356343)98145684
(55543427, 67358289)75231635
(55543427, 67358289)98145684
(58356343, 18091259)5408882
(58356343, 18091259)9797485
(58356343, 18091259)11047790
(18091259, 58356343)87796319
(18091259, 58356343)89827900
(18091259, 58356343)67358289
(18091259, 58356343)75231635
(18091259, 58356343)61488090
(18091259, 58356343)98145684
(32022942, 18091259)5408882
(32022942, 18091259)9797485
(32022942, 18091259)11047790
(18091259, 32022942)56545166
(18091259, 32022942)57564916
(18091259, 32022942)64134451
(18091259, 32022942)44729031
(18091259, 32022942)49652513
(18091259, 32022942)63789474
(18091259, 32022942)75911404
(18091259, 32022942)67358289
(18091259, 32022942)94855809
(18091259, 32022942)87003549
(18091259, 32022942)34720519
(29527402, 18091259)5408882
(29527402, 18091259)9797485
(29527402, 18091259)11047790
(18091259, 29527402)

(50780934, 33313570)29527402
(4091042, 48740665)70467675
(4091042, 48740665)75605615
(4091042, 48740665)68886206
(4091042, 48740665)79075745
(4091042, 31144073)79075745
(94855809, 29189306)26664066
(94855809, 29189306)18091259
(1951254, 63789474)99033080
(1951254, 63789474)78780012
(64694275, 35648532)27718800
(64694275, 35648532)29527402
(14855959, 46225241)71863938
(14855959, 34620826)71863938
(99033080, 63789474)57699076
(99033080, 63789474)26664066
(99033080, 63789474)23161772
(99033080, 63789474)27718800
(99033080, 63789474)49105396
(99033080, 63789474)32347333
(99033080, 63789474)61447196
(99033080, 63789474)35088343
(99033080, 63789474)40468429
(99033080, 63789474)18091259
(99033080, 63789474)32022942
(99033080, 63789474)9679962
(99033080, 63789474)1951254
(78780012, 63789474)57699076
(78780012, 63789474)26664066
(78780012, 63789474)23161772
(78780012, 63789474)27718800
(78780012, 63789474)49105396
(78780012, 63789474)32347333
(78780012, 63789474)61447196
(78780012, 63789474)350

# Print 2nd seq data sets

In [8]:
for index, row in supports_df.iterrows():
#     print(index, row[0])
     print(index)

(67211117, 9679962)
(49277359, 26664066)
(87561379, 39205831)
(87561379, 57699076)
(87561379, 32022942)
(23301214, 5408882)
(23301214, 57699076)
(23301214, 56750275)
(23301214, 53151389)
(23301214, 23378934)
(23301214, 61447196)
(23301214, 77781247)
(23301214, 93087014)
(23301214, 18091259)
(23301214, 35585359)
(23301214, 44729031)
(23301214, 23140564)
(23301214, 80585875)
(23301214, 483430)
(23301214, 99000570)
(5408882, 56750275)
(5408882, 80585875)
(5408882, 483430)
(5408882, 99000570)
(39205831, 39205831)
(39205831, 57699076)
(39205831, 56750275)
(39205831, 53151389)
(39205831, 32022942)
(39205831, 56545166)
(39205831, 22655197)
(57699076, 47113206)
(57699076, 39205831)
(57699076, 57699076)
(57699076, 56750275)
(57699076, 26664066)
(57699076, 35088343)
(57699076, 77010466)
(57699076, 83569177)
(57699076, 91270448)
(57699076, 32022942)
(57699076, 56545166)
(57699076, 7828144)
(57699076, 1951254)
(57699076, 88916855)
(57699076, 91575829)
(57699076, 36124265)
(53077914, 23140564)
(598

(70781138, 70781138)
(89392033, 9679962)
(99000570, 56750275)
(99000570, 26664066)
(99000570, 6941666)
(76447340, 26664066)
(76447340, 9679962)
(59405574, 56545166)
(34335202, 32347333)
(34335202, 29189306)
(87253856, 56750275)


#results were stored in a seperate txt file for clear look