# PCY Algorithm

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools as iter_t

retail_data = [i.strip().split() for i in open("retail.dat").readlines()]

retail_data = pd.DataFrame(retail_data)

# allocate a limited number of rows, for testing
retail_data = retail_data[:100]

In [4]:
def PCY(data, f_hash, min_support=0.05, min_confidence=0.10):
    def n_candidate(prev_L, n, hash_d = None):
        names_helper = np.array([], dtype=int)
        for i in range(n-1):
            names_helper = np.append(names_helper, prev_L[i].to_numpy(dtype=int))

        candidate         = iter_t.combinations(set(names_helper), n)
        candidate         = pd.DataFrame([i for i in candidate])
        candidate["freq"] = np.zeros(len(candidate[0]), dtype=int)

        candidate.is_copy = False

        if hash_d is not None:
            for i in range(len(candidate[0])):
                p = candidate.iloc[i, 0:n]
                h = f_hash(p[0], p[1])
                if hash_dict[h] == 0:
                    candidate.drop(candidate.index[i])
        
        for i in range(data.shape[0]):
            for j in range(candidate.shape[0]):
                if set(candidate.loc[j][0:n].to_numpy(dtype=int)).issubset(set(data.loc[i].dropna().to_numpy(dtype=int))):
                    candidate.loc[j, "freq"] += 1
        return candidate
    
    # get unique items
    # construct array of all items
    items     = []
    hash_dict = {}
    for i in range(data.shape[1]):
        for j in range(data.shape[0]):
            items.append(data[i][j])
        pairs = iter_t.combinations(set(data[i]), 2)
        for p in pairs:
            if (p[0] is not None) and (p[1] is not None):
                h = f_hash(p[0], p[1])
                v = hash_dict.get(h)
                if v is None:
                    hash_dict[h] = 1
                else:
                    hash_dict[h] += 1
                
    for k in hash_dict.keys():
        if hash_dict[k] >= min_support:
            hash_dict[k] = 1
        else:
            hash_dict[k] = 0
        
    unique_items = list(set(items))
    
    # construct candidate sets
    C = []
    L = []
    
    C1 = []
    for val in unique_items:
        C1.append((val, items.count(val)))
    
    total_transactions = len(data)
    
    del(items)
    
    C1 = pd.DataFrame(C1, columns=[0, "freq"], dtype=int)
    C1 = C1.dropna()
    
    C1["conf"] = np.ones(len(C1[0]), dtype=float)
    C1["sup"]  = C1["freq"] / total_transactions
    
    L1 = C1[C1["sup"] >= min_support]
    L1 = L1.astype({0: int})
    
    C.append(C1)
    L.append(L1)
    
    def conf(_L, prevL, n):
        # Build confidence
        pd.set_option('mode.chained_assignment', 'warn')
        _L["conf"] = np.zeros(len(_L[0]), dtype=float)

        _L.is_copy = False
        for i in range(len(_L["freq"])):
            oldSup = _L["freq"].iloc[i]                                        #(prevL["freq"][prevL.iloc[:, 0:n] == L_cur[0:n]]).iat[0]
            for j in range(len(prevL[0])):
                if set(prevL.iloc[j, 0:n-1]) == set(_L.iloc[i, 0:n-1]):
                    oldSup = prevL["freq"].iloc[j]
                    break
            _L["conf"].iloc[i] = (_L["freq"].iloc[i] / oldSup)
        
        return _L
        
    
    i = 2
    while True:
        cand         = n_candidate(L[i-2], i, hash_dict)
        hash_dict    = None
        cand["sup"]  = cand["freq"] / total_transactions
        L_cur        = cand[cand["sup"] >= min_support]
        
        if len(L_cur[0]) == 0:
            break
        
        L_cur = conf(L_cur, L[i-2], i)
        
        L_cur = L_cur[L_cur["conf"] >= min_confidence]
        
        C.append(cand)
        L.append(L_cur)
        i += 1
    
    return L

In [6]:
res = PCY(retail_data, lambda x, y: (int(x)*int(y))**3 % 300)
res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _L["conf"] = np.zeros(len(_L[0]), dtype=float)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:/

[      0  freq  conf   sup
 29   41    24   1.0  0.24
 88   48    47   1.0  0.47
 191  38    26   1.0  0.26
 286  32    10   1.0  0.10
 357  65     5   1.0  0.05
 412  36     8   1.0  0.08
 451  39    57   1.0  0.57,
      0   1  freq   sup      conf
 3   32  39     6  0.06  0.600000
 5   32  48     6  0.06  0.600000
 11  36  38     8  0.08  1.000000
 12  36  39     6  0.06  0.750000
 15  38  39    19  0.19  0.730769
 16  38  41    10  0.10  0.384615
 17  38  48    15  0.15  0.576923
 18  39  41    16  0.16  0.280702
 19  39  48    34  0.34  0.596491
 20  41  48    13  0.13  0.541667,
      0   1   2  freq   sup      conf
 8   32  39  48     6  0.06  1.000000
 10  36  38  39     6  0.06  0.750000
 16  38  39  41     6  0.06  0.315789
 17  38  39  48    11  0.11  0.578947
 18  38  41  48     5  0.05  0.500000
 19  39  41  48    11  0.11  0.687500]