<a href="https://colab.research.google.com/github/MarkusStefan/Data_Analytics/blob/main/Exercise2/AprioriAlgorithm_AssociateRuleLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

In [3]:
retail = pd.read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise2/retail2.csv", sep=';')
retail.describe()

Unnamed: 0,id,Bread,Yogurt,Egg,Dog_Food,Flowers
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,0.67,0.6,0.42,0.22,0.15
std,29.011492,0.472582,0.492366,0.496045,0.416333,0.35887
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,0.0,0.0,0.0,0.0,0.0
50%,50.5,1.0,1.0,0.0,0.0,0.0
75%,75.25,1.0,1.0,1.0,0.0,0.0
max,100.0,1.0,1.0,1.0,1.0,1.0


## Support
It indicates how frequently an item set appears in the data set.

Let $frq(A)$ be the number of occurrences (frequency) from the total number of transactions $frq(T) = T$:

$$
supp(A) = \frac{frq(A)}{T}
$$

In [4]:
def support(df, cols):
  if type(cols) != list:
    cols = [cols]
  bool_mask = df[cols].isin([1]).all(axis=1)
  frqA = len(df[bool_mask])
  T = len(df[cols])
  return frqA/T

support(retail, ["Bread", "Yogurt"])
#support(retail,'Bread')
  

0.55

## Confidence
It says how likely item set B is purchased when item set A is purchased

$$
conf(A \to B) = \frac{supp(A,B)}{supp(A)}
$$

with $supp(A,B) = \frac{frq(A,B)}{frq(T)}$ and $supp(A) = \frac{frq(A)}{frq(T)}$ it holds that:

$$
conf(A \to B) = \frac{supp(A,B)}{supp(A)} = 
\frac{\frac{frq(A,B)}{frq(T)}}{\frac{frq(A)}{frq(T)}} = 
\frac{frq(A,B)}{frq(A)}
$$

In [5]:
def confidence(df, A, B):
  return support(df, [A, B]) / support(df, A)

confidence(retail, "Bread", "Egg")

0.40298507462686567

## Lift
Lift: It says how likely item set B is purchased when item set A is purchased while controlling for how popular item set B is.

Lift is the ratio of the observed support to that expected if A and B were independent or equivalently the ratio of the confidence of the rule to the expected confidence of the RHS item set by independence.

$$
lift(A \to B) = \frac{conf(A,B)}{supp(B)} = \frac{supp(A,B)}{supp(A) \times supp(B)}
$$

Note:

$$
lift(A \to B) == lift(B \to A)
$$

In [6]:
def lift(df, A, B):
  return confidence(df, A, B) / support(df, B)

lift(retail, "Bread", "Egg")

0.9594882729211087

## Apriori Algorithm

In [7]:
retail.columns.tolist()

['id', 'Bread', 'Yogurt', 'Egg', 'Dog_Food', 'Flowers']

In [8]:
def apriori(df, min_supp, min_conf, min_lift):
  import itertools
  # get the list of column names in the DataFrame
  cols = df.columns.tolist()

  # initialize the frequent itemsets dictionary
  frequent_itemsets = {}

  # generate the frequent itemsets of size 1
  frequent_itemsets[1] = {}
  for col in cols:
      sup = support(df, col)
      if sup >= min_supp:
          frequent_itemsets[1][(col,)] = sup

  # generate the frequent itemsets of size 2 and higher
  k = 2
  while True:
      frequent_itemsets[k] = {}
      for itemset in itertools.combinations(frequent_itemsets[k-1], 2):
          itemset = tuple(sorted(set(itemset[0] + itemset[1])))
          # temporarily convert the itemset tuple to a list 
          # as the support function takes in a list
          sup = support(df, list(itemset))
          if sup >= min_supp:
              conf = confidence(df, itemset[0], itemset[1])
              lft = lift(df, itemset[0], itemset[1])  #conf / support(df, itemset[1])
              if conf >= min_conf and lft >= min_lift:
                  frequent_itemsets[k][itemset] = sup


      # If no frequent itemsets of size k were found, stop
      if not frequent_itemsets[k]:
        # remove empty key-value pair of the dictionary
        frequent_itemsets.popitem()
        break
      
      # increment k by one to incorporate higher sized itemsets 
      k += 1

      

  return frequent_itemsets

apriori(retail, min_supp=0.05, min_conf=0.8, min_lift=0.3)


{1: {('Bread',): 0.67,
  ('Yogurt',): 0.6,
  ('Egg',): 0.42,
  ('Dog_Food',): 0.22,
  ('Flowers',): 0.15},
 2: {('Bread', 'Yogurt'): 0.55}}

Apriori algorithm much faster using Numpy which is built on top of C

In [61]:
# numpy faster & efficient

import numpy as np
import itertools

def apriori(df, min_supp, min_conf, min_lift):
    # get the list of column names in the DataFrame
    cols = df.columns.tolist()
    n_cols = len(cols)

    # initialize the frequent itemsets dictionary
    frequent_itemsets = {}

    # generate the frequent itemsets of size 1
    frequent_itemsets[1] = {}
    for i in range(n_cols):
        col = df.iloc[:, i]
        sup = np.sum(col) / len(col)
        if sup >= min_supp:
            frequent_itemsets[1][(cols[i],)] = sup

    # generate the frequent itemsets of size 2 and higher
    k = 2
    while True:
        frequent_itemsets[k] = {}
        itemset_combinations = itertools.combinations(frequent_itemsets[k-1], 2)
        for itemset in itemset_combinations:
            itemset = tuple(sorted(set(itemset[0] + itemset[1])))
            # temporarily convert the itemset tuple to a list 
            # as the support function takes in a list
            cols_indices = [cols.index(item) for item in itemset]
            sup = np.sum(df.iloc[:, cols_indices].all(axis=1)) / len(df)
            if sup >= min_supp:
                conf = np.sum(df.iloc[:, cols_indices].all(axis=1)) / np.sum(df.iloc[:, cols_indices[:-1]].all(axis=1))
                lft = conf / sup
                if conf >= min_conf and lft >= min_lift:
                    frequent_itemsets[k][itemset] = sup

        # If no frequent itemsets of size k were found, stop
        if not frequent_itemsets[k]:
            # remove empty key-value pair of the dictionary
            frequent_itemsets.popitem()
            break

        # increment k by one to incorporate higher sized itemsets 
        k += 1

    return frequent_itemsets


apriori(retail, min_supp=0.05, min_conf=0.8, min_lift=0.3)


{1: {('id',): 50.5,
  ('Bread',): 0.67,
  ('Yogurt',): 0.6,
  ('Egg',): 0.42,
  ('Dog_Food',): 0.22,
  ('Flowers',): 0.15},
 2: {('Bread', 'id'): 0.67,
  ('Yogurt', 'id'): 0.6,
  ('Egg', 'id'): 0.42,
  ('Dog_Food', 'id'): 0.22,
  ('Flowers', 'id'): 0.15,
  ('Bread', 'Yogurt'): 0.55},
 3: {('Bread', 'Yogurt', 'id'): 0.55,
  ('Bread', 'Egg', 'id'): 0.27,
  ('Bread', 'Dog_Food', 'id'): 0.12,
  ('Bread', 'Flowers', 'id'): 0.06,
  ('Egg', 'Yogurt', 'id'): 0.22,
  ('Dog_Food', 'Yogurt', 'id'): 0.09,
  ('Dog_Food', 'Egg', 'id'): 0.07,
  ('Bread', 'Egg', 'Yogurt', 'id'): 0.22,
  ('Bread', 'Dog_Food', 'Yogurt', 'id'): 0.09},
 4: {('Bread', 'Egg', 'Yogurt', 'id'): 0.22,
  ('Bread', 'Dog_Food', 'Yogurt', 'id'): 0.09}}

### Implementaton using the Adults dataset from 
[Adults](http://archive.ics.uci.edu/ml/datasets/Adult)

In [70]:
names = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 
               'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 
               'Capital Gain', 'Capital Loss', 'Hours per week', 'Country', 
               'Income']

'''adults.columns = ["age", "workclass", "fnlwgt", "education", "education-num",
            "merital-status", "occupation", "relationship", "race"
            ,"sex", "capital-gain", "capital-loss","hours-per-week", 
            "native-country", "income"]'''

# retrieve data from my GitHub repo
adults = pd.read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise2/adult.data", 
                     names=names, header=None)

# retrieve data from official web archive
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
adults = pd.read_csv(url, header=None, names=names)

adults.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [71]:
# drop the target variable income (last column)
adults = adults.iloc[:,:-1]

# select the categorical columns
cat_cols = adults.select_dtypes(include=['object']).columns

# apply one-hot encoding to the categorical columns
adults = pd.get_dummies(adults, columns=cat_cols)

# convert all columns to binary values (0 or 1)
adults[adults > 0] = 1

In [72]:
adults

Unnamed: 0,Age,fnlwgt,Education-Num,Capital Gain,Capital Loss,Hours per week,Workclass_ ?,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,...,Country_ Portugal,Country_ Puerto-Rico,Country_ Scotland,Country_ South,Country_ Taiwan,Country_ Thailand,Country_ Trinadad&Tobago,Country_ United-States,Country_ Vietnam,Country_ Yugoslavia
0,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [68]:
apriori(adults, min_supp=0.8, min_conf=0.9, min_lift=0.95)

{1: {('age',): 1.0,
  ('fnlwgt',): 1.0,
  ('education-num',): 1.0,
  ('hours-per-week',): 1.0,
  ('race_ White',): 0.8542735173981143,
  ('native-country_ United-States',): 0.895857006848684},
 2: {('age', 'fnlwgt'): 1.0,
  ('age', 'education-num'): 1.0,
  ('age', 'hours-per-week'): 1.0,
  ('education-num', 'fnlwgt'): 1.0,
  ('fnlwgt', 'hours-per-week'): 1.0,
  ('education-num', 'hours-per-week'): 1.0},
 3: {('age', 'education-num', 'fnlwgt'): 1.0,
  ('age', 'fnlwgt', 'hours-per-week'): 1.0,
  ('age', 'education-num', 'fnlwgt', 'hours-per-week'): 1.0,
  ('age', 'education-num', 'hours-per-week'): 1.0,
  ('education-num', 'fnlwgt', 'hours-per-week'): 1.0},
 4: {('age', 'education-num', 'fnlwgt', 'hours-per-week'): 1.0}}