In [None]:
# Import packages for the entire program
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules

In [None]:
# Code Development for Apriori Algorithm (for both frequent itemset generation and rule generation)

## Module-level thresholds and shared state
MINSUP = 0.1   # minimum support, as a fraction of the number of transactions
MINCONF = 0.2  # minimum confidence a rule must reach to be kept
support_count = dict()  # support count of every itemset counted so far
rules = list()          # generated rules, each stored as [antecedent, consequent]

## Function Declaration

# Candidate Generation Function
# Generate a set of candidate itemsets that is complete and non-redundant
def candidate_gen(frequent_itemset):
    """Build the candidate (k+1)-itemsets from the frequent k-itemsets.

    Two k-itemsets are joined when they agree on their first k-1 items; the
    candidate is the first itemset followed by the last item of the second.
    For k = 1 the shared prefix is empty, so every pair is joined.

    Args:
        frequent_itemset: list of itemsets (each a list of items), all of the
            same length k.  The join is only complete and free of duplicates
            when the itemsets, and the list itself, are in lexicographic
            order; that ordering is assumed here, not checked.

    Returns:
        A new list of candidate itemsets, each a new list of length k+1.
        The input is not modified.  An empty input yields an empty list.
    """
    # Nothing to join (and nothing to take a length from).
    if not frequent_itemset:
        return []

    candidate_itemset = []
    for i, first in enumerate(frequent_itemset):
        prefix = first[:-1]
        # Pair each itemset only with the ones after it, so every candidate
        # is generated once and keeps the order of its two parents.
        for second in frequent_itemset[i + 1:]:
            if second[:-1] == prefix:
                candidate_itemset.append(first + [second[-1]])

    return candidate_itemset


# Candidate Pruning Function
# Prune the generated candidate itemsets based on whether or not their subsets are frequent
def candidate_prune(candidate_itemset, frequent_itemset):
    """Drop every candidate that has a subset which is not frequent.

    Args:
        candidate_itemset: list of candidate (k+1)-itemsets (lists of items).
            The list is pruned in place.
        frequent_itemset: list of the frequent k-itemsets the candidates were
            generated from.  Must be non-empty when there are candidates.

    Returns:
        The same `candidate_itemset` list object, now holding only the
        candidates whose checked k-subsets all appear in `frequent_itemset`.
    """
    # Nothing to prune.
    if not candidate_itemset:
        return candidate_itemset

    # Candidates of size 2 are pairs of frequent items, so their subsets are
    # already known to be frequent.
    if len(frequent_itemset[0]) == 1:
        return candidate_itemset

    def has_only_frequent_subsets(candidate):
        # The subset obtained by dropping the last item is not checked: for a
        # candidate built by candidate_gen it is the itemset the candidate
        # was extended from.
        return all(
            candidate[:position] + candidate[position + 1:] in frequent_itemset
            for position in range(len(candidate) - 1)
        )

    # Build the surviving list first and only then replace the contents.
    # Removing entries while looping over the same list would skip the
    # candidate that follows each removed one.
    candidate_itemset[:] = [
        candidate for candidate in candidate_itemset
        if has_only_frequent_subsets(candidate)
    ]
    return candidate_itemset


# Apriori Algorithm for Frequent Itemset Generation
def Apriori_frequent(transaction):
    """Find every frequent itemset in `transaction` with the Apriori algorithm.

    Itemsets are built level by level: the frequent k-itemsets are joined
    into candidate (k+1)-itemsets, candidates with an infrequent subset are
    pruned, the remaining ones are counted, and those reaching the minimum
    support become the next level.  The search stops at the first level with
    no frequent itemset.

    Side effects:
        The module-level `support_count` dictionary is emptied and then
        filled with the support count of every itemset counted in this run,
        keyed by the itemset's items joined with ','.  Rule generation reads
        these counts.

    Args:
        transaction: list of transactions, each a list whose first element is
            skipped and whose remaining elements are the items.
            NOTE(review): the first element is presumably a transaction id /
            row index -- confirm against the input file.

    Returns:
        A list whose entry i holds the frequent (i+1)-itemsets, each itemset
        being a list of items.  The 1-itemsets are sorted.  The list is empty
        when no item is frequent.
    """
    # Drop counts left over from an earlier run.  The dictionary is cleared
    # in place because rule generation shares the same object.
    support_count.clear()

    # MINSUP is a fraction of the whole data set, so the threshold is fixed
    # here, before the list of baskets is shortened below.
    min_count = len(transaction) * MINSUP

    # One set of items per transaction: an item repeated inside a transaction
    # supports an itemset only once, and the first field is left out of
    # every level's counting, not just the first.
    baskets = [set(t[1:]) for t in transaction]

    # Calculate the support count for the 1-itemsets
    item_count = {}
    for basket in baskets:
        for item in basket:
            item_count[item] = item_count.get(item, 0) + 1
    for item, count in item_count.items():
        support_count[str(item)] = count

    # Generate the frequent 1-itemsets
    frequent_itemset = [
        sorted([item] for item, count in item_count.items() if count >= min_count)
    ]

    # Repeatedly generate the following levels
    k = 0
    while frequent_itemset[k]:  # stop at the first empty level
        k = k + 1
        previous = frequent_itemset[k - 1]
        candidate_itemset = candidate_prune(candidate_gen(previous), previous)

        # A basket with fewer than k+1 items cannot contain a (k+1)-itemset
        baskets = [basket for basket in baskets if len(basket) >= k + 1]

        frequent_k = []
        for candidate in candidate_itemset:
            count = sum(1 for basket in baskets if basket.issuperset(candidate))
            if count == 0:
                # Never seen together: not stored and never frequent
                continue
            c_str = ','.join(str(item) for item in candidate)
            support_count[c_str] = count
            if count >= min_count:
                frequent_k.append(candidate)

        frequent_itemset.append(frequent_k)

    # Remove the final empty level and return
    frequent_itemset.pop(k)
    return frequent_itemset
            
    
# Algorithm for rule generation
def ap_genrules(k_itemset, k_consequent):
    """Generate the rules of one frequent itemset with larger consequents.

    The consequents of size m in `k_consequent` are joined into candidate
    consequents of size m+1.  For each candidate the rule
    (k_itemset minus consequent) -> consequent is scored with

        confidence = support(k_itemset) / support(antecedent)

    using the module-level `support_count`.  Rules reaching MINCONF are
    appended to the module-level `rules` list as [antecedent, consequent],
    and their consequents are carried into the next recursion level.

    Args:
        k_itemset: a frequent itemset (list of items).
        k_consequent: list of consequents (lists of items), all of the same
            size m, each a subset of `k_itemset`.

    Returns:
        None.  Results are appended to `rules`.
    """
    if not k_consequent:
        return

    k = len(k_itemset)
    m = len(k_consequent[0])
    # A consequent of size m+1 must leave at least one item in the antecedent
    if k <= m + 1:
        return

    candidates = candidate_prune(candidate_gen(k_consequent), k_consequent)
    itemset_support = support_count[','.join(str(item) for item in k_itemset)]

    # Consequents that pass are collected in a separate list.  Removing the
    # failing ones from the list being looped over would skip the entry after
    # each removal.
    confident = []
    for consequent in candidates:
        antecedent = [item for item in k_itemset if item not in consequent]
        antecedent_str = ','.join(str(item) for item in antecedent)

        conf = itemset_support / support_count[antecedent_str]
        if conf >= MINCONF:
            rules.append([antecedent, consequent])
            confident.append(consequent)

    if confident:
        ap_genrules(k_itemset, confident)
        
    
    
def Apriori_rules(frequent_itemset):
    """Generate association rules from the frequent itemsets.

    For every frequent itemset with at least two items, the rules with a
    single-item consequent are scored first with

        confidence = support(itemset) / support(antecedent)

    using the module-level `support_count`.  Those reaching MINCONF are
    recorded, and their consequents are handed to `ap_genrules`, which
    derives the rules with larger consequents.

    Side effects:
        The module-level `rules` list is emptied and then filled, so a
        repeated call does not return the rules of an earlier call again.

    Args:
        frequent_itemset: list of levels, each level a list of frequent
            itemsets (lists of items), as returned by `Apriori_frequent`.
            `support_count` must hold the counts of these itemsets and of
            their subsets.

    Returns:
        The module-level `rules` list; each rule is [antecedent, consequent].
    """
    rules.clear()

    for level in frequent_itemset:
        for k_itemset in level:
            if len(k_itemset) < 2:
                continue

            itemset_support = support_count[','.join(str(item) for item in k_itemset)]

            # Rules with a single-item consequent.  ap_genrules only scores
            # consequents one item larger than the ones it is given, so
            # these have to be scored here.
            confident = []
            for item in k_itemset:
                antecedent = [other for other in k_itemset if other != item]
                antecedent_str = ','.join(str(other) for other in antecedent)
                conf = itemset_support / support_count[antecedent_str]
                if conf >= MINCONF:
                    rules.append([antecedent, [item]])
                    confident.append([item])

            # Only confident consequents are extended: enlarging a consequent
            # shrinks the antecedent, which cannot raise the confidence.
            if confident:
                ap_genrules(k_itemset, confident)
    return rules
            

In [None]:
# Test my Apriori Algorithm on datasets

file_name = 'data/Assoc_Analysis_Vidhya.dat.csv'
data = []

# Read one transaction per line.  The with-block closes the file on exit, so
# no explicit close() is needed.
with open(file_name, 'r') as data_file_ptr:
    for in_item in data_file_ptr:
        # Split on "," and drop every empty field.  A new list is built
        # because removing entries from a list while looping over it skips
        # the field after each removal, which leaves some empty fields
        # behind when several occur in a row.
        in_item_list = [item for item in in_item.strip().split(',') if item != '']
        data.append(in_item_list)

# Frequent Itemset Generation
frequent_itemset = Apriori_frequent(data)
print("The generated frequent itemsets are: ")
print(frequent_itemset)

# Rule Generation
rules = Apriori_rules(frequent_itemset)
print("The generated rules are: ")
print(rules)

The generated frequent itemsets are: 
[[['Bagel'], ['Bread'], ['Cheese'], ['Diaper'], ['Eggs'], ['Meat'], ['Milk'], ['Pencil'], ['Wine']], [['Bagel', 'Bread'], ['Bagel', 'Cheese'], ['Bagel', 'Diaper'], ['Bagel', 'Eggs'], ['Bagel', 'Meat'], ['Bagel', 'Milk'], ['Bagel', 'Pencil'], ['Bagel', 'Wine'], ['Bread', 'Cheese'], ['Bread', 'Diaper'], ['Bread', 'Eggs'], ['Bread', 'Meat'], ['Bread', 'Milk'], ['Bread', 'Pencil'], ['Bread', 'Wine'], ['Cheese', 'Diaper'], ['Cheese', 'Eggs'], ['Cheese', 'Meat'], ['Cheese', 'Milk'], ['Cheese', 'Pencil'], ['Cheese', 'Wine'], ['Diaper', 'Eggs'], ['Diaper', 'Meat'], ['Diaper', 'Milk'], ['Diaper', 'Pencil'], ['Diaper', 'Wine'], ['Eggs', 'Meat'], ['Eggs', 'Milk'], ['Eggs', 'Pencil'], ['Eggs', 'Wine'], ['Meat', 'Milk'], ['Meat', 'Pencil'], ['Meat', 'Wine'], ['Milk', 'Pencil'], ['Milk', 'Wine'], ['Pencil', 'Wine']], [['Bagel', 'Bread', 'Cheese'], ['Bagel', 'Bread', 'Diaper'], ['Bagel', 'Bread', 'Meat'], ['Bagel', 'Bread', 'Milk'], ['Bagel', 'Bread', 'Wine'], ['

In [None]:
# Frequent itemsets and rules with the 'mlxtend' package

file_name = "data/Assoc_Analysis_Vidhya_pd.dat.csv"

# Read data: one transaction per line, items separated by ","
with open(file_name, 'r') as data_file_ptr:
    raw_data = [line.strip().split(',') for line in data_file_ptr]

# One-hot encode the transactions into a boolean DataFrame
te = TransactionEncoder()
te.fit(raw_data)
te_ary = te.transform(raw_data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Run the three miners with the same minimum support
frequent_itemsets_1 = fpgrowth(df, min_support=0.1, use_colnames=True)
frequent_itemsets_2 = apriori(df, min_support=0.1, use_colnames=True)
frequent_itemsets_3 = fpmax(df, min_support=0.1, use_colnames=True)

for heading, itemsets in (
    ('Frequent Item Generation using fpgrowth algorithm: ', frequent_itemsets_1),
    ('Frequent Item Generation using Apriori algorithm: ', frequent_itemsets_2),
    ('Frequent Item Generation using fpmax algorithm: ', frequent_itemsets_3),
):
    print(heading)
    print(itemsets)
    print('')

# Filter the Apriori result by itemset length
frequent_itemsets_2['length'] = frequent_itemsets_2['itemsets'].apply(len)
has_two_items = frequent_itemsets_2['length'] == 2
len_2plus = frequent_itemsets_2[has_two_items]
print('Frequent itemsets with length 2: ')
print(len_2plus)
print('')

# Filter by itemset, comparing against a plain set
is_eggs = frequent_itemsets_2['itemsets'] == {'Eggs'}
egg_itemset = frequent_itemsets_2[is_eggs]
print('Frequent itemsets with only egg: ')
print(egg_itemset)
print('')

# Same filter, comparing against a frozenset
is_eggs_frozen = frequent_itemsets_2['itemsets'] == frozenset({'Eggs'})
egg_itemset2 = frequent_itemsets_2[is_eggs_frozen]
print('Filter only egg itemset using frozenset: ')
print(egg_itemset2)
print('')

# Rules from the Apriori itemsets, with the antecedent size added as a column
rules = association_rules(frequent_itemsets_2, metric="confidence", min_threshold=0.3)
rules["antecedent_len"] = rules["antecedents"].apply(len)
print('The rules generated are: ')
print(rules)

Frequent Item Generation using fpgrowth algorithm: 
     support                itemsets
0   0.503165                 (Bread)
1   0.500000                (Cheese)
2   0.474684                  (Meat)
3   0.436709                  (Wine)
4   0.436709                  (Eggs)
..       ...                     ...
95  0.107595   (Bagel, Wine, Cheese)
96  0.104430     (Meat, Wine, Bagel)
97  0.104430  (Bread, Bagel, Cheese)
98  0.120253   (Cheese, Meat, Bagel)
99  0.113924    (Bread, Meat, Bagel)

[100 rows x 2 columns]

Frequent Item Generation using Apriori algorithm: 
     support                    itemsets
0   0.424051                     (Bagel)
1   0.503165                     (Bread)
2   0.500000                    (Cheese)
3   0.405063                    (Diaper)
4   0.436709                      (Eggs)
..       ...                         ...
95  0.113924        (Pencil, Meat, Wine)
96  0.151899  (Cheese, Eggs, Meat, Milk)
97  0.110759  (Eggs, Meat, Wine, Cheese)
98  0.104430  (Che

In [None]:
# Working with stacked data (2 columns only)

file_name = 'data/GroceryStoreStacked.csv'
raw_data = []
# Read data: each line is "<transaction id>,<item>", and consecutive lines
# with the same id form one transaction.
with open(file_name, 'r') as data_file_ptr:
    temp = []  # [transaction id, item, item, ...] of the transaction being read
    for in_item in data_file_ptr:
        in_item = in_item.strip()
        if not in_item:
            # A blank line carries no id or item; do not treat it as a
            # transaction.
            continue
        in_item_list = in_item.split(',')

        if temp and in_item_list[0] == temp[0]:
            # Same transaction: collect the item
            temp.append(in_item_list[1])
        else:
            # New transaction id: store the finished transaction (items
            # only) and start collecting the next one.
            if temp:
                raw_data.append(temp[1:])
            temp = in_item_list.copy()

    # The last transaction has no following id change to trigger the store
    # above, so it is stored here.
    if temp:
        raw_data.append(temp[1:])

te = TransactionEncoder()
te_ary = te.fit(raw_data).transform(raw_data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Generate Frequent Itemset and Rules
frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)
print('The frequent itemsets with a min_support of 0.2: ')
print(frequent_itemsets)
print('')

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules["antecedent_len"] = rules["antecedents"].apply(len)
print('The rules generated are: ')
print(rules)

The frequent itemsets with a min_support of 0.2: 
     support                    itemsets
0   0.313313                    (apples)
1   0.363363                   (avocado)
2   0.314314                   (chicken)
3   0.295295                      (coke)
4   0.488488                  (crackers)
5   0.312312                    (gelato)
6   0.305305                       (ham)
7   0.305305                    (hummus)
8   0.472472                    (olives)
9   0.295295                   (peppers)
10  0.598599                    (peroni)
11  0.295295                  (pretzels)
12  0.389389                (prosciutto)
13  0.392392                   (risotto)
14  0.402402              (sarsaparilla)
15  0.318318                      (soda)
16  0.226226                     (steak)
17  0.485485                      (tuna)
18  0.283283                    (turkey)
19  0.211211           (hummus, avocado)
20  0.249249           (peroni, avocado)
21  0.215215          (risotto, avocado)
22  0.2

In [None]:
# Association analysis on my own data (tweets)

file_name = 'data/FIFA.csv'
raw_data = pd.read_csv(file_name)

# Keep the ID and text of the English tweets, without rows that miss either
is_english = raw_data['lang'] == 'en'
clean_data = raw_data[is_english][['ID', 'Tweet']].dropna(axis=0)

# Work on a random sample of five tweets
clean_data = clean_data.sample(5)
temp = clean_data.values.tolist()

# One transaction per tweet: its text split on single spaces
tweet_data = [tweet.strip().split(' ') for _tweet_id, tweet in temp]

te = TransactionEncoder()
te.fit(tweet_data)
te_ary = te.transform(tweet_data)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = fpgrowth(df, min_support=0.4, use_colnames=True)
print(frequent_itemsets)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.01)
print(rules)

   support   itemsets
0      0.6      (the)
1      0.4       (in)
2      0.4     (this)
3      0.4  (the, in)
  antecedents consequents  antecedent support  consequent support  support  \
0       (the)        (in)                 0.6                 0.4      0.4   
1        (in)       (the)                 0.4                 0.6      0.4   

   confidence      lift  leverage  conviction  
0    0.666667  1.666667      0.16         1.8  
1    1.000000  1.666667      0.16         inf  
