<a href="https://colab.research.google.com/github/MarkusStefan/Data_Analytics/blob/main/AssociateRuleLearning_Functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the retail transactions from GitHub; the file is semicolon-separated.
retail = pd.read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise2/retail2.csv", sep=';')
# Summary statistics of the loaded frame, shown as the cell output.
retail.describe()

Unnamed: 0,id,Bread,Yogurt,Egg,Dog_Food,Flowers
0,False,True,True,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,True,False,False,False,False
4,False,False,True,False,False,False
...,...,...,...,...,...,...
95,False,True,True,False,False,False
96,False,False,False,False,False,False
97,False,True,True,False,False,False
98,False,True,True,False,False,False


## Support
It indicates how frequently an item set appears in the data set.

Let $frq(A)$ be the number of transactions that contain item set $A$, and let $T = frq(T)$ be the total number of transactions:

$$
supp(A) = \frac{frq(A)}{T}
$$

In [30]:
def support(df, cols):
  """Return the support of an itemset: the share of rows containing every item.

  A row contains the itemset when each of the given columns holds the value 1
  in that row (the check is ``isin([1])`` on the selected columns).

  Args:
    df: pandas DataFrame with one row per transaction and one column per item.
    cols: a single column label, or a list/tuple/set of column labels.
      A tuple that is itself a column label of ``df`` is treated as one label.

  Returns:
    float: ``frq(cols) / T`` where ``T`` is the number of rows of ``df``;
    ``0.0`` when ``df`` has no rows.

  Raises:
    KeyError: if a label is not a column of ``df``.
  """
  if isinstance(cols, (list, set, frozenset)):
    cols = list(cols)
  elif isinstance(cols, tuple) and cols not in df.columns:
    # A tuple is an itemset unless it is literally one of the column labels.
    cols = list(cols)
  else:
    cols = [cols]

  # True for the rows in which every selected column equals 1.
  contains_all = df[cols].isin([1]).all(axis=1)

  total = len(df)
  if total == 0:
    # No transactions at all: nothing can contain the itemset.
    return 0.0
  # Count the matching rows directly instead of building a filtered frame.
  return int(contains_all.sum()) / total

# Share of transactions that contain both Bread and Yogurt.
support(retail, ["Bread", "Yogurt"])
# Single-item variant, kept for reference: support(retail, 'Bread')
  

0.55

## Confidence
It indicates how likely item set B is to be purchased given that item set A is purchased.

$$
conf(A \to B) = \frac{supp(A,B)}{supp(A)}
$$

with $supp(A,B) = \frac{frq(A,B)}{frq(T)}$ and $supp(A) = \frac{frq(A)}{frq(T)}$ it holds that:

$$
conf(A \to B) = \frac{supp(A,B)}{supp(A)} = 
\frac{\frac{frq(A,B)}{frq(T)}}{\frac{frq(A)}{frq(T)}} = 
\frac{frq(A,B)}{frq(A)}
$$

In [6]:
def confidence(df, A, B):
  """Return the confidence of the rule ``A -> B``: ``supp(A, B) / supp(A)``.

  Args:
    df: pandas DataFrame, passed through to ``support``.
    A: antecedent - a single column label or a list/tuple/set of labels.
    B: consequent - a single column label or a list/tuple/set of labels.
      A tuple that is itself a column label of ``df`` counts as one label.

  Returns:
    float: the support of the combined itemset divided by the support of ``A``.

  Raises:
    ZeroDivisionError: if ``A`` has zero support in ``df``.
  """
  def as_items(itemset):
    # Normalise a label or a collection of labels to a flat list of labels.
    if isinstance(itemset, (list, set, frozenset)):
      return list(itemset)
    if isinstance(itemset, tuple) and itemset not in df.columns:
      return list(itemset)
    return [itemset]

  antecedent = as_items(A)
  consequent = as_items(B)
  # supp(A, B) is taken over the union of both itemsets; building one flat
  # list keeps the column selection inside support() one-dimensional.
  union = antecedent + [item for item in consequent if item not in antecedent]
  return support(df, union) / support(df, antecedent)

# Confidence of the rule Bread -> Egg.
confidence(retail, "Bread", "Egg")

0.40298507462686567

## Lift
Lift indicates how likely item set B is to be purchased when item set A is purchased, while controlling for how popular item set B is.

Lift is the ratio of the observed support to that expected if A and B were independent or equivalently the ratio of the confidence of the rule to the expected confidence of the RHS item set by independence.

$$
lift(A \to B) = \frac{conf(A \to B)}{supp(B)} = \frac{supp(A,B)}{supp(A) \times supp(B)}
$$

Note:

$$
lift(A \to B) = lift(B \to A)
$$

In [8]:
def lift(df, A, B):
  """Return the lift of the rule ``A -> B``: ``conf(A -> B) / supp(B)``.

  Args:
    df: pandas DataFrame, forwarded to ``confidence`` and ``support``.
    A: antecedent of the rule, forwarded to ``confidence``.
    B: consequent of the rule, forwarded to ``confidence`` and ``support``.

  Returns:
    The confidence of the rule divided by the support of ``B``.
  """
  rule_confidence = confidence(df, A, B)
  consequent_support = support(df, B)
  return rule_confidence / consequent_support

# Lift of the rule Bread -> Egg: the rule's confidence divided by the support of Egg.
lift(retail, "Bread", "Egg")

0.9594882729211087

## Apriori Algorithm

In [15]:
# Column labels of the retail frame. apriori() below iterates over every
# column of the frame it is given, so this is the list of candidate items.
retail.columns.tolist()

['id', 'Bread', 'Yogurt', 'Egg', 'Dog_Food', 'Flowers']

In [36]:
def apriori(df, min_supp, min_conf, min_lift):
  """Mine itemsets level by level, filtered by support, confidence and lift.

  Every column of ``df`` is treated as an item. Level 1 holds the single
  items whose support reaches ``min_supp``. Each further level ``k`` is built
  by joining pairs of itemsets kept at level ``k-1`` and keeping the unions
  that

  * contain exactly ``k`` items,
  * reach ``min_supp``, and
  * whose rule ``(first k-1 items, in sorted order) -> (last item)`` reaches
    both ``min_conf`` and ``min_lift``.

  The search stops at the first level that yields no itemset. Because the
  confidence/lift filter is applied while the itemsets are grown, an itemset
  is only extended if it passed that filter at the previous level.

  Args:
    df: pandas DataFrame, passed to ``support`` for every itemset.
    min_supp: minimum support an itemset must reach.
    min_conf: minimum confidence of the itemset's rule (levels >= 2).
    min_lift: minimum lift of the itemset's rule (levels >= 2).

  Returns:
    dict: ``{k: {itemset: support}}`` where each itemset is a sorted tuple of
    ``k`` column labels. Level 1 is always present (possibly empty); higher
    levels are present only when non-empty.
  """
  import itertools

  # Each itemset's support is computed once and reused for the confidence and
  # lift of every rule it takes part in.
  support_cache = {}

  def cached_support(items):
    if items not in support_cache:
      # support() takes a list of column labels, itemsets are stored as tuples.
      support_cache[items] = support(df, list(items))
    return support_cache[items]

  # Level 1: single items that are frequent on their own.
  frequent_itemsets = {1: {}}
  for col in df.columns.tolist():
    sup = cached_support((col,))
    if sup >= min_supp:
      frequent_itemsets[1][(col,)] = sup

  k = 2
  while True:
    # Join step: unions of two (k-1)-itemsets. Only unions with exactly k
    # items belong to this level; a dict drops duplicates and keeps the
    # first-seen order.
    candidates = {}
    for left, right in itertools.combinations(frequent_itemsets[k - 1], 2):
      candidate = tuple(sorted(set(left + right)))
      if len(candidate) == k:
        candidates[candidate] = None

    level = {}
    for candidate in candidates:
      sup = cached_support(candidate)
      if sup >= min_supp:
        antecedent_sup = cached_support(candidate[:-1])
        consequent_sup = cached_support(candidate[-1:])
        # A rule whose antecedent or consequent never occurs has no defined
        # confidence/lift, so it cannot satisfy the thresholds.
        if antecedent_sup == 0 or consequent_sup == 0:
          continue
        conf = sup / antecedent_sup
        lft = conf / consequent_sup
        if conf >= min_conf and lft >= min_lift:
          level[candidate] = sup

    # If no itemsets of size k were kept, stop without recording the level.
    if not level:
      break

    frequent_itemsets[k] = level
    # Move on to itemsets that are one item larger.
    k += 1

  return frequent_itemsets

# Mine the retail data with minimum support 0.05, minimum confidence 0.8 and minimum lift 0.3.
apriori(retail, min_supp=0.05, min_conf=0.8, min_lift=0.3)


{1: {('Bread',): 0.67,
  ('Yogurt',): 0.6,
  ('Egg',): 0.42,
  ('Dog_Food',): 0.22,
  ('Flowers',): 0.15},
 2: {('Bread', 'Yogurt'): 0.55}}

### Implementation using the Adults dataset from
[Adults](http://archive.ics.uci.edu/ml/datasets/Adult)

In [46]:
# Load the Adult data file from GitHub. header=None: no row of the file is used
# as a header, the column labels are assigned explicitly below.
adults = pd.read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise2/adult.data", header=None)
# NOTE(review): "merital-status" is presumably a typo for "marital-status"; it is
# left unchanged here because the string is a runtime column label.
adults.columns = ["age", "workclass", "fnlwgt", "education", "education-num",
            "merital-status", "occupation", "relationship", "race"
            ,"sex", "capital-gain", "capital-loss","hours-per-week", 
            "native-country", "income"]
# Preview the first rows as the cell output.
adults.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,merital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [48]:
# NOTE(review): `adults` is passed as loaded, without one-hot encoding.
# support() only counts cells equal to 1, so the result reflects rows where a
# column happens to hold the value 1 rather than category membership.
# Presumably the frame should be one-hot encoded first - confirm.
apriori(adults, min_supp=0.0001, min_conf=0.08, min_lift=0.01)

{1: {('education-num',): 0.001566290961579804,
  ('hours-per-week',): 0.0006142317496391389}}