In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

def get_data(which = 'book'):
    '''
    Load one of the notebook's CSV datasets into a DataFrame.
    -------------------------
    Parameters:
        which: dataset key, one of 'book', 'movie' or 'retail'
    Returns:
        DataFrame read from the CSV file registered under `which`
    Raises:
        KeyError if `which` is not one of the registered keys
    '''
    csv_by_name = {
        'book': 'data/bookstore_transactions.csv',
        'movie': 'data/movielens_movies.csv',
        'retail': 'data/online_retail.csv',
    }
    # Resolve the key first so an unknown name fails before any file access
    csv_path = csv_by_name[which]
    return pd.read_csv(csv_path)

def get_books_transactions():
    '''
    Load the bookstore dataset as a list of transactions.

    Each value of the `Transaction` column is split on commas, so every
    transaction becomes a list of item strings.
    -------------------------
    Returns:
        list of lists of item strings, one inner list per row
    '''
    raw = get_data('book')
    return [basket.split(',') for basket in raw.Transaction]

def get_retail_transacations():
    '''
    Load the online-retail dataset as a list of transactions.

    Rows are grouped by `InvoiceNo`; the `StockCode` values of each invoice
    form one transaction.
    -------------------------
    Returns:
        list of lists of stock codes, one inner list per invoice
    '''
    retail = get_data('retail')
    codes_by_invoice = retail.groupby('InvoiceNo')['StockCode']
    baskets = codes_by_invoice.apply(list)
    return baskets.tolist()

*Compute the metrics by counting directly over the raw transaction lists*

In [6]:
def confidence(transactions, sub, parent):
    '''
    Confidence of the rule parent -> sub, computed by plain counting.

    confidence = (# transactions containing `parent` and every item of `sub`)
                 / (# transactions containing `parent`)
    -------------------------
    Parameters:
        transactions: list of transactions, each a list of items
        sub: a single item (string) or a collection of items
        parent: the antecedent item
    Returns:
        confidence as a float between 0 and 1;
        0.0 when no transaction contains `parent`
    '''
    # A bare string is one item, not a collection of characters
    wanted = {sub} if isinstance(sub, str) else set(sub)
    with_parent = [tran for tran in transactions if parent in tran]
    if not with_parent:
        # The rule is never triggered; avoid dividing by zero
        return 0.0
    # Containment test instead of list equality, so item order and
    # extra items in a transaction do not hide a match
    with_both = sum(1 for tran in with_parent if wanted.issubset(tran))
    return with_both / len(with_parent)
    
def support(transactions, item):
    '''
    Support of `item`: the fraction of transactions that contain it.
    -------------------------
    Parameters:
        transactions: list of transactions, each a list of items
        item: a single item (string) or a collection of items that must
              all be present in a transaction
    Returns:
        support as a float between 0 and 1; 0.0 for an empty transaction list
    '''
    if not transactions:
        return 0.0
    # A bare string is one item, not a collection of characters
    wanted = {item} if isinstance(item, str) else set(item)
    hits = sum(1 for tran in transactions if wanted.issubset(tran))
    return hits / len(transactions)

# Bookstore baskets as a list of item lists
transactions = get_books_transactions()
# Confidence of the rule Bookmark -> (History, Bookmark)
confidence(transactions, sub=['History', 'Bookmark'], parent='Bookmark')

0.25252525252525254

#### Market Basket Calculator

*To use association-rule algorithms, you first need to understand the probability calculations behind their metrics*

In [167]:
class BasketCalculator():
    '''
        Compute association-rule metrics for a rule u -> v over a list of transactions.

        The transactions are one-hot encoded once at construction time
        (one row per transaction, one column per item) and every metric is
        derived from that frame.

        Use case:
        ```python
            # define a calculator object with dataset
            market = BasketCalculator(get_retail_transacations())

            # select 2 items
            u,v = market.random_items(2)

            # calculate metrics
            support_u = market.support(u)
            support_uv = market.support([u,v])
            confidence_uv = market.confidence(u,v)
            lift_uv = market.lift(u,v)
            leverage_uv = market.leverage(u,v)
            conviction_uv = market.conviction(u,v)
            zhang_uv = market.zhang(u,v)
        ```

        How to read the metrics:
            For all metrics, the larger the value, the stronger the rule.
            It is just a game of probability.

        The range of the metrics varies, e.g. leverage can never exceed 1,
        while lift has no upper bound.

        When filtering rules, consider the magnitude of each metric:

            support: interpreted as a probability
            confidence: interpreted as a (conditional) probability
            lift: ratio of observation to expectation, greater than 1 for good rules
            leverage: difference of observation and expectation, greater than 0 for good rules
            conviction: ratio of expected to observed support of (u and not v), greater than 1 for good rules
            zhang: (positive confidence - negative confidence) / normalization, -1 for dissociation, 1 for association
    '''

    def __init__(self, transactions):
        '''
        Parameters:
            transactions: list of transactions, each a list of items
        '''
        self.transactions = transactions
        self._to_onehot(self.transactions)

    def _to_onehot(self, transactions):
        '''
            Convert the transactions to a one-hot dataframe and store it in `self.onehot`
            (rows are transactions, columns are the items found by the encoder)
        '''
        encoder = TransactionEncoder().fit(transactions)
        onehot = encoder.transform(transactions)
        self.onehot = pd.DataFrame(onehot, columns = encoder.columns_)

    def support(self, item):
        '''
        Calculate support for item.
        Support measures the fraction of transactions that contain a certain pattern.
        -------------------------
        Parameters:
            item: a single item (column label), or a list / tuple / set / array
                  of items that must all be present in a transaction
        Returns:
            support value
        '''
        # Decide on "is it a collection" rather than "is it a string", so that
        # non-string item labels (e.g. integer stock codes) are treated as single items
        if isinstance(item, (list, tuple, set, frozenset, np.ndarray, pd.Index)):
            return self.onehot[list(item)].all(axis = 1).mean()
        return self.onehot[item].mean()

    def support_not(self, item):
        '''
        Calculate support for not item
        -------------------------------
        Parameters:
            item: single item

        Returns:
            support of not-item, i.e. the fraction of transactions without it
        '''
        # `~` is a logical negation here; assumes the one-hot columns are boolean
        return (~self.onehot[item]).mean()

    def support_u_not_v(self, u, v):
        '''Calculate support of (u and not v), where u, v are both single items'''
        return (self.onehot[u] & ~self.onehot[v]).mean()

    def confidence(self, u, v):
        '''
        Calculate confidence of the rule u -> v:

            Confidence is actually a conditional probability

            support(u & v) = P(u & v)
            support(u) = P(u)
            confidence = P(u & v) / P(u)
            P(u & v) = P(u) * P(v|u)
            confidence = P(v|u)

            ranges between 0 and 1

        '''
        return self.support([u,v]) / self.support(u)

    def lift(self, u, v):
        '''
            Calculate lift:

            support_u = fraction of transactions containing u -> probability of choosing u
            support_v = fraction of transactions containing v -> probability of choosing v
            support_uv has to be calculated from data.
            However, assuming u and v are independent,
            support_uv_independent = support_u * support_v
            If support_uv > support_uv_independent, then
            support_uv / support_uv_independent > 1, indicating u and v are dependent:
            that is a rule.

            This value ranges from 0 to +inf
            The threshold for good rules is 1

            Parameters:
                u: item
                v: item

            Returns:
                lift of u & v

        '''
        return self.support([u,v]) / (self.support(u) * self.support(v))

    def leverage(self, u, v):
        '''
            Calculate leverage of u and v.
            Similar to lift, but the joint probability and the independent
            expectation are compared by subtraction instead of division.

            This value is bounded by -1 and 1

            The threshold for good rules is 0

            Parameters:
                u: item
                v: item

            Returns:
                leverage of u & v
        '''
        return self.support([u,v]) - self.support(u) * self.support(v)

    def conviction(self, u, v):
        '''
        Calculate the conviction of u and v. Greater than 1 for a good rule
        -----------------------------------

        conviction = support(u) * support(~v) / support(u & ~v)

        If u and not-v are independent:  support(u & ~v) = support(u) * support(~v)
        If u and not-v occur together more than expected:
            support(u & ~v) > support(u) * support(~v), so conviction < 1

        In that case, whenever you buy u you tend not to buy v, so u and v are
        negatively correlated and u -> v is not a rule.

        The larger the conviction, the stronger the rule u -> v.

        Returns:
            conviction of u -> v; +inf when u never occurs without v
        '''
        support_u_not_v = self.support_u_not_v(u,v)
        if support_u_not_v == 0:
            # The rule is never violated in the data: report +inf explicitly
            # instead of relying on a floating-point division by zero
            return float('inf')
        return self.support(u) * self.support_not(v) / support_u_not_v

    def zhang(self, u, v):
        '''
        zhang = (P(B|A) - P(B|~A)) / max(P(B|A), P(B|~A))
        zhang = (confidence(A -> B) - confidence(~A -> B)) / max(confidence(A -> B), confidence(~A -> B))

        confidence(A -> B):  if you buy A, you also buy B
        confidence(~A -> B): if you don't buy A, you still buy B
        Subtracting the second from the first rewards A and B occurring together.

        if confidence(A -> B) > confidence(~A -> B), there is a positive rule
        if confidence(A -> B) < confidence(~A -> B), there is a negative rule,
        which means that if you don't buy A, you probably buy B.

        Zhang's metric is more comprehensive, and ranges between -1 and 1.

        The implementation multiplies numerator and denominator of the formula
        above by P(A) * P(~A), which gives the algebraically equivalent form

            (P(AB) - P(A)P(B)) / max(P(AB) * (1 - P(A)), P(A) * (P(B) - P(AB)))

        so no intermediate confidence has to be divided by a possibly-zero support.

        Returns:
            Zhang's metric of u -> v; 0.0 when the normalizer is 0
        '''
        support_u = self.support(u)
        support_v = self.support(v)
        support_uv = self.support([u,v])
        numerator = support_uv - support_u * support_v
        denominator = max(support_uv * (1 - support_u),
                          support_u * (support_v - support_uv))
        if denominator == 0:
            # Degenerate case (e.g. u is in every transaction): report "no association"
            return 0.0
        return numerator / denominator

    def random_items(self, size):
        '''
        Pick `size` distinct items at random from the one-hot columns.

        Sampling is done without replacement so that the selected items
        never coincide (a rule u -> u is meaningless).
        -------------------------
        Parameters:
            size: number of items to draw
        Returns:
            numpy array with the chosen items
        Raises:
            ValueError if `size` exceeds the number of distinct items
        '''
        return np.random.choice(self.onehot.columns, size = size, replace = False)

In [168]:
# Build the calculator from the online-retail invoices
market = BasketCalculator(get_retail_transacations())

# Draw two items to play the roles of u and v in the rule u -> v
u, v = market.random_items(2)

# Rule metrics for the selected pair
support_u, support_uv = market.support(u), market.support([u, v])
confidence_uv = market.confidence(u, v)
lift_uv = market.lift(u, v)
conviction_uv = market.conviction(u, v)
zhang_uv = market.zhang(u, v)

In [193]:
# Build the calculator from the bookstore baskets
market = BasketCalculator(get_books_transactions())

# Draw two items to play the roles of u and v in the rule u -> v
u, v = market.random_items(2)

print(u,v)

# Collect every metric of the rule in one mapping so the cell displays them together
{
    'support_u': market.support(u),
    'support_uv': market.support([u, v]),
    'confidence_uv': market.confidence(u, v),
    'lift_uv': market.lift(u, v),
    'conviction_uv': market.conviction(u, v),
    'leverage_uv': market.leverage(u, v),
    'zhang_uv': market.zhang(u, v),
}

Biography Poetry


{'support_u': 0.40404040404040403,
 'support_uv': 0.0,
 'confidence_uv': 0.0,
 'lift_uv': 0.0,
 'conviction_uv': 0.9090909090909091,
 'leverage_uv': -0.03673094582185491,
 'zhang_uv': -1.0}

In [203]:
gifts = pd.read_csv('data/online_retail.csv')

# Unique counts per column: Description and StockCode counts differ,
# so StockCode (not Description) is used as the item ID
tuple(gifts[column].nunique() for column in ('InvoiceNo', 'Description', 'StockCode'))

(9709, 3460, 3353)

In [204]:
gifts['Description']

0             IVORY STRING CURTAIN WITH POLE 
1               PINK AND BLACK STRING CURTAIN
2                       PSYCHEDELIC TILE HOOK
3                       ENAMEL COLANDER CREAM
4         SMALL FOLDING SCISSOR(POINTED EDGE)
                         ...                 
227755            SET OF 5 LUCKY CAT MAGNETS 
227756                       CARD DOLLY GIRL 
227757                TEA PARTY BIRTHDAY CARD
227758                VINTAGE KID DOLLY CARD 
227759                ELEPHANT BIRTHDAY CARD 
Name: Description, Length: 227760, dtype: object