# Apriori Notebook

#### *Author: Kunyu He*
#### *University of Chicago, CAPP'20*

In [47]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from apyori import apriori

### Load Data

Each line refers to a transaction record and we do not have titles for the columns. **Each transaction corresponds to a customer's basket of a day in a specific week.**

In [48]:
basket = pd.read_csv("Market_Basket_Optimisation.csv", header=None)
basket.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [49]:
basket.shape

(7501, 20)

### Data Cleaning

In [50]:
basket.isnull().sum(axis=None)

0        0
1     1754
2     3112
3     4156
4     4972
5     5637
6     6132
7     6520
8     6847
9     7106
10    7245
11    7347
12    7414
13    7454
14    7476
15    7493
16    7497
17    7497
18    7498
19    7500
dtype: int64

There are a lot of missing values. However, `Apriori` expects a list of lists as input matrix.

In [51]:
transactions = []

for i in range(basket.shape[0]):
    transactions.append([str(basket.values[i, j]) for j in range(basket.shape[1])
                         if isinstance(basket.values[i, j], str)])

### Model Training

Consider products purchased at least 3 times a day as support, which are considered as popular products.

In [130]:
rules = apriori(transactions, min_support=0.003, min_confidence=0.2, min_lift=3)

In [131]:
def clean_apriori_results(rules, n=10):
    results = list(rules)
    output = {'Rule': [], 'Support': [], 'Confidence': [], 'Lift': []}
    
    for result in results:
        output['Rule'].append(result[0])
        output['Support'].append(round(result[1], 5))
        output['Confidence'].append(round(result[2][0][2], 5))
        output['Lift'].append(round(result[2][0][3], 3))

    return pd.DataFrame(output).sort_values('Lift', ascending=False).head(n)

In [132]:
clean_apriori_results(rules, 6)

Unnamed: 0,Rule,Support,Confidence,Lift
51,"(olive oil, mineral water, whole wheat pasta)",0.00387,0.40278,6.116
69,"(frozen vegetables, mineral water, soup, milk)",0.00307,0.27711,5.484
3,"(fromage blanc, honey)",0.00333,0.2451,5.164
0,"(light cream, chicken)",0.00453,0.2906,4.844
2,"(pasta, escalope)",0.00587,0.37288,4.701
24,"(ground beef, french fries, herb & pepper)",0.0032,0.23077,4.666
