# Data training using efficient_apriori package

## Import packages

In [21]:
import pandas as pd
import numpy as np
from efficient_apriori import apriori
# apriori?

### Usage and examples of apriori

**Signature:**  
apriori(  
    transactions: Union[List[tuple], Callable],  
    min_support: float = 0.5,  
    min_confidence: float = 0.5,  
    max_length: int = 8,  
    verbosity: int = 0,  
    output_transaction_ids: bool = False,  
)  
  
**Examples:**  
transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]  
itemsets, rules = apriori(transactions, min_confidence=1)  
rules  
[{a} -> {b}]  

## Data load

In [22]:
# Notice: the CSV has no header row, so header=None keeps row 0 as data.
dataset = pd.read_csv(
    './Market_Basket_Optimisation.csv',
    header=None,
)
dataset.head(n=5)  # preview the first 5 rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [23]:
dataset.shape  # (number of rows, number of columns) of the loaded frame

(7501, 20)

## Data clean

In [24]:
# Turn the wide frame into one list of items per row (one row = one basket).
# Rows shorter than the frame width are padded with NaN by read_csv, so the
# padding is filtered out with pd.notna rather than by comparing str(value)
# against the text 'nan'.
# dataset.values is evaluated once here; the previous nested index loop
# re-evaluated it for every single cell.
# NOTE(review): the raw file appears to contain at least one item with leading
# whitespace (' asparagus'); consider stripping item names — confirm first.
basket_matrix = dataset.values  # dataset.to_numpy() is equivalent
transactions = [
    [item for item in basket if pd.notna(item)]
    for basket in basket_matrix
]

# transactions 
# view the dataset transactions

## Data training using Apriori package

In [25]:
# Mine frequent itemsets and association rules from the cleaned baskets.
itemsets, rules = apriori(
    transactions,
    min_support=0.02,
    min_confidence=0.3,
)

print('Frequent Set: ', itemsets)  # controlled by min_support
print('=============================================')
print('Association Rules: ', rules)  # controlled by min_confidence

# itemsets 
# rules

Frequent Set:  {1: {('mineral water',): 1788, ('shrimp',): 536, ('cottage cheese',): 239, ('energy drink',): 200, ('low fat yogurt',): 574, ('green tea',): 991, ('avocado',): 250, ('salmon',): 319, ('olive oil',): 494, ('almonds',): 153, ('frozen smoothie',): 475, ('vegetables mix',): 193, ('honey',): 356, ('tomato juice',): 228, ('meatballs',): 157, ('eggs',): 1348, ('burgers',): 654, ('turkey',): 469, ('energy bar',): 203, ('whole wheat rice',): 439, ('milk',): 972, ('whole wheat pasta',): 221, ('french fries',): 1282, ('soup',): 379, ('frozen vegetables',): 715, ('spaghetti',): 1306, ('cookies',): 603, ('cooking oil',): 383, ('champagne',): 351, ('chocolate',): 1229, ('chicken',): 450, ('oil',): 173, ('fresh tuna',): 167, ('tomatoes',): 513, ('pepper',): 199, ('red wine',): 211, ('ham',): 199, ('pancakes',): 713, ('grated cheese',): 393, ('fresh bread',): 323, ('ground beef',): 737, ('escalope',): 595, ('herb & pepper',): 371, ('strawberries',): 160, ('cake',): 608, ('hot dogs',): 2

# Data training using Mlxtend package
## import packages

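**!Notice:   
Import the mlxtend package here rather than in the first import cell. Both packages expose a function named `apriori`, and Python binds the name to whichever one was imported last, so from this cell on `apriori` refers to the mlxtend version.**  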

In [26]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# apriori?

### Usage and examples of apriori

**Signature:**    
apriori(  
    df,  
    min_support=0.5,  
    use_colnames=False,  
    max_len=None,  
    verbose=0,  
    low_memory=False,  
)  

**The allowed values are either 0/1 or True/False.  
For example**

```
         Apple  Bananas   Beer  Chicken   Milk   Rice
    0     True    False   True     True  False   True
    1     True    False   True    False  False   True
    2     True    False   True    False  False  False
    3     True     True  False    False  False  False
    4    False    False   True     True   True   True
    5    False    False   True    False   True   True
    6    False    False   True    False   True  False
    7     True     True  False    False  False  False
```

## Data load

In [27]:
# Notice: no header row in the CSV, hence header=None (row 0 is already data).
data_df = pd.read_csv(
    './Market_Basket_Optimisation.csv',
    header=None,
)
data_df.head(n=5)  # preview the first 5 rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


## Data clean

In [28]:
# Serialise every row as one comma-separated string. Each value goes through
# str(), so the NaN padding of short rows becomes the literal text 'nan'.
data_list = [
    ','.join(str(cell) for cell in basket)
    for basket in data_df.values.tolist()
]

# One-hot encode the items, then discard the indicator column produced by the
# 'nan' padding.
df2 = pd.DataFrame({'item': data_list})
df3 = df2['item'].str.get_dummies(sep=',').drop(columns='nan')
df3.head(n=5)

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


**Here we get the one-hot (sparse) matrix `df3`. One of the generated columns is named `nan` (it comes from the empty cells that pad short baskets), so we have to use `.drop('nan',axis=1,inplace=False)` to remove it.  
Using the code below, we can see that `nan` is the column at index 77 of the frame before the drop.**

In [29]:
# Rebuild the one-hot frame WITHOUT dropping 'nan' so it can be compared
# against df3 (which has that column removed).
df = df2.item.str.get_dummies(',')
print(df3.shape)
print(df.shape)

# Walk both column lists in parallel and report the first position at which
# they diverge. zip stops at the shorter list, so no column count has to be
# hard-coded and the loop cannot index past the end of either frame.
for i, (kept_col, full_col) in enumerate(zip(df3.columns, df.columns)):
    if kept_col != full_col:
        print(i, kept_col, full_col)
        break

(7501, 120)
(7501, 121)
77 napkins nan


## Data training using Mlxtend package

In [30]:
# mlxtend's apriori accepts 0/1 integers, but recent releases emit a
# deprecation warning for non-bool frames and run slower on them, so the
# one-hot matrix is cast to bool first (the mined itemsets are unchanged).
frequent_itemsets = apriori(df3.astype(bool), min_support=0.02, use_colnames=True)

# Generate candidate rules with lift >= 0.5, strongest lift first.
rules = (
    association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    .sort_values(by="lift", ascending=False)
)

print("frequent sets：", frequent_itemsets)
# Only show rules with lift >= 1 and confidence >= 0.3.
print("association rules：", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.3)])

frequent sets：       support                           itemsets
0    0.020397                          (almonds)
1    0.033329                          (avocado)
2    0.033729                         (brownies)
3    0.087188                          (burgers)
4    0.030129                           (butter)
..        ...                                ...
98   0.020131  (mineral water, whole wheat rice)
99   0.022930             (olive oil, spaghetti)
100  0.025197              (spaghetti, pancakes)
101  0.021197                (spaghetti, shrimp)
102  0.020931              (spaghetti, tomatoes)

[103 rows x 2 columns]
association rules：             antecedents      consequents  antecedent support  \
71        (ground beef)      (spaghetti)            0.098254   
92          (olive oil)      (spaghetti)            0.065858   
85               (soup)  (mineral water)            0.050527   
1             (burgers)           (eggs)            0.087188   
79          (olive oil)  (mineral 