# Apriori

## Install the apyori

In [1]:
!pip install apyori



## Import the packages

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import datetime
import time

from apyori import apriori

In [4]:
data = pd.read_csv('Market_Basket_Optimisation.csv', header = None)
print('Data loaded on', datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

Data loaded on 2021-09-06 14:32:21


In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


## Data Exploration

In [6]:
print('number of transaction: ', data.shape[0])
print('max item in one transaction: ', data.shape[1])

number of transaction:  7501
max item in one transaction:  20


In [7]:
cnt_item = data.count(axis=1)
stat = pd.DataFrame(cnt_item,columns=['cnt_item'])

stat.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cnt_item,7501.0,3.914545,2.90554,1.0,2.0,3.0,5.0,20.0


In [8]:
item = []
for i in data.columns:
    for j in data[i].unique():
        if j in item:
            continue
        else:
            item.append(j)
            
print('number of unique item:', len(item))
print('\nlist of item:\n', item)

number of unique item: 121

list of item:
 ['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water', 'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables', 'french fries', 'eggs', 'cookies', 'spaghetti', 'meatballs', 'red wine', 'rice', 'parmesan cheese', 'ground beef', 'sparkling water', 'herb & pepper', 'pickles', 'energy bar', 'fresh tuna', 'escalope', 'avocado', 'tomato sauce', 'clothes accessories', 'energy drink', 'chocolate', 'grated cheese', 'yogurt cake', 'mint', 'asparagus', 'champagne', 'ham', 'muffins', 'french wine', 'chicken', 'pasta', 'tomatoes', 'pancakes', 'frozen smoothie', 'carrots', 'yams', 'shallot', 'butter', 'light mayo', 'pepper', 'candy bars', 'cooking oil', 'milk', 'green tea', 'bug spray', 'oil', 'olive oil', 'salmon', 'cake', 'almonds', 'salt', 'strong cheese', 'hot dogs', 'pet food', 'whole wheat rice', 'antioxydant juice', 'honey', 'sandwich', 'salad', 'magazines', 'protein bar', 'mayonnaise', 'cider', 'burger sauce', 'green grapes', 'vegeta

In [9]:
2**121-1

2658455991569831745807614120560689151

## Data Preprocessing

In [10]:
# change dataframe into list of list
transactions = []
for i in range(0, 7501):
    transactions.append([str(data.values[i,j]) for j in range(0, 20)])

In [11]:
# checking the result
print(transactions[0])
print(transactions[5])

['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil']
['low fat yogurt', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']


## Training the Apriori model on the dataset

In [12]:
rules = apriori(transactions=transactions,min_support=0.003,min_confidence=0.2, 
                min_lift=3,min_length=2,max_length=2)

**Parameter input:**

`transactions` list of array/list. Indexed transaction data

`min_support` float. Minimum support threshold to include the itemset into the final result

`min_confident` float. Minimum confident threshold to include the rule into the final result

`min_lift` float. Minimum lift threshold to include the rule into final result

`min_length` int. Minimum itemset count

`max_length` int. Maximum itemset count

## Visualize the results

### Displaying the output of the apriori function

In [13]:
results = list(rules)

In [14]:
results[0]

RelationRecord(items=frozenset({'light cream', 'chicken'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)])

### Transform into a Pandas DataFrame

In [15]:
def inspect(results):
    X         = [tuple(result[2][0][0])[0] for result in results]
    Y         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(X, Y, supports, confidences, lifts))

### Displaying the results

In [16]:
resultsinDataFrame = pd.DataFrame(inspect(results), 
                                  columns=['X','Y','Support','Confidence','Lift'])
resultsinDataFrame

Unnamed: 0,X,Y,Support,Confidence,Lift
0,light cream,chicken,0.004533,0.290598,4.843951
1,mushroom cream sauce,escalope,0.005733,0.300699,3.790833
2,pasta,escalope,0.005866,0.372881,4.700812
3,fromage blanc,honey,0.003333,0.245098,5.164271
4,herb & pepper,ground beef,0.015998,0.32345,3.291994
5,tomato sauce,ground beef,0.005333,0.377358,3.840659
6,light cream,olive oil,0.0032,0.205128,3.11471
7,whole wheat pasta,olive oil,0.007999,0.271493,4.12241
8,pasta,shrimp,0.005066,0.322034,4.506672


### Displaying the results sorted by lifts

In [17]:
resultsinDataFrame.nlargest(n=10, columns='Lift')

Unnamed: 0,X,Y,Support,Confidence,Lift
3,fromage blanc,honey,0.003333,0.245098,5.164271
0,light cream,chicken,0.004533,0.290598,4.843951
2,pasta,escalope,0.005866,0.372881,4.700812
8,pasta,shrimp,0.005066,0.322034,4.506672
7,whole wheat pasta,olive oil,0.007999,0.271493,4.12241
5,tomato sauce,ground beef,0.005333,0.377358,3.840659
1,mushroom cream sauce,escalope,0.005733,0.300699,3.790833
4,herb & pepper,ground beef,0.015998,0.32345,3.291994
6,light cream,olive oil,0.0032,0.205128,3.11471
