In [239]:
import pandas as pd
import numpy as np
import itertools

In [240]:
# Load the tab-separated supermarket export.
df = pd.read_csv('new_customer_supermarket.csv', sep='\t', encoding='utf-8')

# Discard the "Unnamed: 0" column (presumably a leftover index column from a
# previous export -- confirm against the CSV).
df = df.drop(columns=["Unnamed: 0"])

# One group per BasketID, so each basket's rows can be processed together.
grouped = df.groupby('BasketID')

In [241]:
def extract_frequent_sequences(sequences,sup,all_basket):
    """Return the sequences whose relative support reaches ``sup``.

    Relative support is the count stored for a sequence divided by the
    number of entries in ``all_basket``.

    Parameters
    ----------
    sequences : mapping
        Maps each sequence (an item or a tuple of items) to its count.
    sup : float
        Minimum relative support; a sequence is frequent when
        ``count / len(all_basket) >= sup``.
    all_basket : sized collection
        Collection of all baskets; only its length is used.

    Returns
    -------
    list
        The frequent sequences, in the iteration order of ``sequences``.

    Side effects
    ------------
    Prints the first five frequent and the first five infrequent sequences.
    """
    total_baskets = len(all_basket)
    frequent_sequences = []
    infrequent_sequences = []

    for sequence, count in sequences.items():
        # With no baskets there is nothing to divide by; treat the support as
        # zero instead of raising ZeroDivisionError.
        support = count / total_baskets if total_baskets else 0.0
        if support >= sup:
            frequent_sequences.append(sequence)
        else:
            infrequent_sequences.append(sequence)

    print("Frequent sequences: ",frequent_sequences[:5])
    print("Infrequent sequences: ",infrequent_sequences[:5])

    return frequent_sequences

# Minimum-support thresholds (fraction of all baskets) and the largest itemset
# size that is mined.
# NOTE(review): the level-2 threshold is lower than the one used for level 3
# and above, so the thresholds are not monotone across levels -- confirm this
# is intended.
MIN_SUPPORT_LVL1 = 0.05
MIN_SUPPORT_LVL2 = 0.01
MIN_SUPPORT_LVLK = 0.03
MAX_ITEMSET_SIZE = 5


def _count_support(candidates, basket_sets):
    """Count, for each candidate tuple, the baskets that contain all its items.

    Candidates that occur in no basket are omitted from the returned dict.
    """
    counts = {}
    for candidate in candidates:
        needed = set(candidate)
        hits = sum(1 for basket in basket_sets if needed.issubset(basket))
        if hits:
            counts[candidate] = hits
    return counts


def _join_candidates(frequent_itemsets):
    """Join frequent k-item tuples that share their first k-1 items into (k+1)-item tuples."""
    candidates = []
    for pos, first in enumerate(frequent_itemsets):
        for second in frequent_itemsets[pos + 1:]:
            if first[:-1] == second[:-1]:
                # Extend with the *last* item of the second itemset; appending
                # its shared prefix would only repeat items already present.
                candidates.append(first + (second[-1],))
    return candidates


all_baskets = {}
all_lvl1_items = {}

for name, group in grouped:
    # Sort into a new frame: sorting a groupby group in place mutates a slice
    # of the parent DataFrame and can raise SettingWithCopyWarning.
    items = group.sort_values(by=['BasketDate'])['ProdDescr'].tolist()
    all_baskets[name] = items

    # Count each product once per basket so that the level-1 support is the
    # fraction of baskets containing it, matching the set-containment counting
    # used for larger itemsets. dict.fromkeys de-duplicates in first-seen order.
    for item in dict.fromkeys(items):
        all_lvl1_items[item] = all_lvl1_items.get(item, 0) + 1

# Build every basket's item set once instead of once per candidate check.
basket_sets = [set(items) for items in all_baskets.values()]

lvl1 = extract_frequent_sequences(all_lvl1_items, MIN_SUPPORT_LVL1, all_baskets)

# Level 2: every pair of frequent single items is a candidate.
subsets_count = _count_support(itertools.combinations(lvl1, 2), basket_sets)

print("Level 2")
lvl2 = extract_frequent_sequences(subsets_count, MIN_SUPPORT_LVL2, all_baskets)
lvlk = lvl2.copy()

# Levels 3..MAX_ITEMSET_SIZE: grow candidates from the previous frequent level.
for i in range(3, MAX_ITEMSET_SIZE + 1):
    candidate_itemsets = _join_candidates(lvlk)

    if len(candidate_itemsets) == 0:
        print("No more candidate itemsets")
        break

    subsets_count = _count_support(candidate_itemsets, basket_sets)
    print("Level ",i)
    lvlk = extract_frequent_sequences(subsets_count, MIN_SUPPORT_LVLK, all_baskets)

# Final Sequences
for i in lvlk:
    print(i)

Frequent sequences:  ['WHITE HANGING HEART TLIGHT HOLDER', 'JUMBO BAG RED RETROSPOT', 'SET OF CAKE TINS PANTRY DESIGN', 'NATURAL SLATE HEART CHALKBOARD', 'REGENCY CAKESTAND TIER']
Infrequent sequences:  ['JUMBO BAG PINK POLKADOT', 'SET OF TLIGHTS EASTER CHICKS', 'COFFEE MUG PEARS DESIGN', 'COFFEE MUG APPLES DESIGN', 'PEG BAG APPLES DESIGN']
Level 2
Frequent sequences:  [('WHITE HANGING HEART TLIGHT HOLDER', 'JUMBO BAG RED RETROSPOT'), ('WHITE HANGING HEART TLIGHT HOLDER', 'SET OF CAKE TINS PANTRY DESIGN'), ('WHITE HANGING HEART TLIGHT HOLDER', 'NATURAL SLATE HEART CHALKBOARD'), ('WHITE HANGING HEART TLIGHT HOLDER', 'REGENCY CAKESTAND TIER'), ('WHITE HANGING HEART TLIGHT HOLDER', 'ASSORTED COLOUR BIRD ORNAMENT')]
Infrequent sequences:  [('WHITE HANGING HEART TLIGHT HOLDER', 'PACK OF RETROSPOT CAKE CASES'), ('WHITE HANGING HEART TLIGHT HOLDER', 'FRENCH BLUE METAL DOOR SIGN'), ('WHITE HANGING HEART TLIGHT HOLDER', 'BAKING SET PIECE RETROSPOT'), ('WHITE HANGING HEART TLIGHT HOLDER', 'SET O