In [1]:
import numpy as np
import pandas as pd
import shutil, os
import json
from io import StringIO
from IPython.display import display, HTML

#!pip install apyori
from apyori import apriori
import apyori

In [2]:
df = pd.read_csv('data.csv', header=None, usecols=np.arange(0,7))

In [3]:
df.fillna(method='ffill', axis=1, inplace=True)

In [4]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,Lassi,Coffee Powder,Butter,Yougurt,Ghee,Cheese,Cheese
1,Ghee,Coffee Powder,Coffee Powder,Coffee Powder,Coffee Powder,Coffee Powder,Coffee Powder
2,Lassi,Tea Powder,Butter,Cheese,Cheese,Cheese,Cheese
3,Cheese,Tea Powder,Panner,Coffee Powder,Butter,Bread,Bread
4,Cheese,Yougurt,Coffee Powder,Sugar,Butter,Sweet,Sweet
...,...,...,...,...,...,...,...
12521,Panner,Sugar,Bread,Milk,Cheese,Cheese,Cheese
12522,Sugar,Bread,Coffee Powder,Cheese,Cheese,Cheese,Cheese
12523,Bread,Cheese,Yougurt,Milk,Milk,Milk,Milk
12524,Ghee,Bread,Yougurt,Sugar,Cheese,Cheese,Cheese


# Apriori

In [6]:
# создаим из них матрицу
transactions = []
for i in range(0, df.shape[0]): 
    transactions.append([str(df.values[i,j]) for j in range(0, df.shape[1])])

In [14]:
%%time
result = list(apriori(transactions, min_support = 0.004, min_confidence = 0.2, min_lift = 1, min_length = 2)) 
# строим ассоциативные правила

CPU times: user 12.6 s, sys: 48.2 ms, total: 12.6 s
Wall time: 12.6 s


In [15]:
output = []

# преобразуем в датафрейм, чтобы посмотреть
for record in result:
    o = StringIO()
    apyori.dump_as_json(record, o)
    output.append(json.loads(o.getvalue())) 

data_df = pd.DataFrame(output)
data_df

Unnamed: 0,items,support,ordered_statistics
0,[Bread],0.422401,"[{'items_base': [], 'items_add': ['Bread'], 'c..."
1,[Butter],0.423280,"[{'items_base': [], 'items_add': ['Butter'], '..."
2,[Cheese],0.421044,"[{'items_base': [], 'items_add': ['Cheese'], '..."
3,[Coffee Powder],0.422801,"[{'items_base': [], 'items_add': ['Coffee Powd..."
4,[Ghee],0.422322,"[{'items_base': [], 'items_add': ['Ghee'], 'co..."
...,...,...,...
157,"[Lassi, Milk, Panner, Sweet]",0.031055,"[{'items_base': ['Milk', 'Panner', 'Sweet'], '..."
158,"[Lassi, Panner, Sweet, Tea Powder]",0.032812,"[{'items_base': ['Lassi', 'Panner', 'Sweet'], ..."
159,"[Milk, Panner, Sugar, Tea Powder]",0.029459,"[{'items_base': ['Milk', 'Panner', 'Tea Powder..."
160,"[Bread, Cheese, Milk, Panner, Tea Powder]",0.010778,"[{'items_base': ['Cheese', 'Milk', 'Panner', '..."


# Eclat

In [29]:
class Eclat:
    #инициализация объекта класса
    def __init__(self, min_support = 0.01, max_items = 5, min_items = 2):
        self.min_support = min_support
        self.max_items = max_items
        self.min_items = min_items
        self.item_lst = list()
        self.item_len = 0
        self.item_dict = dict()
        self.final_dict = dict()
        self.data_size = 0
    
    #создание словаря из ненулевых объектов из всех транзакций (вертикальный датасет)
    def read_data(self, dataset):
        for index, row in dataset.iterrows():
            row_wo_na = set(row[0])
            for item in row_wo_na:
                item = item.strip()
                if item in self.item_dict:
                    self.item_dict[item][0] += 1
                else:
                    self.item_dict.setdefault(item, []).append(1)
                self.item_dict[item].append(index)
        #задаем переменные экземпляра (instance variables)
        self.data_size = dataset.shape[0]
        self.item_lst = list(self.item_dict.keys())
        self.item_len = len(self.item_lst)
        self.min_support = self.min_support * self.data_size
        #print ("min_supp", self.min_support)
        
    #рекурсивный метод для поиска всех ItemSet по алгоритму Eclat
    #структура данных: {Item: [Supp number, tid1, tid2, tid3, ...]}
    def recur_eclat(self, item_name, tids_array, minsupp, num_items, k_start):
        if tids_array[0] >= minsupp and num_items <= self.max_items:
            for k in range(k_start+1, self.item_len):
                if self.item_dict[self.item_lst[k]][0] >= minsupp:
                    new_item = item_name + " | " + self.item_lst[k]
                    new_tids = np.intersect1d(tids_array[1:], self.item_dict[self.item_lst[k]][1:])
                    new_tids_size = new_tids.size
                    new_tids = np.insert(new_tids, 0, new_tids_size)
                    if new_tids_size >= minsupp:
                        if num_items >= self.min_items: self.final_dict.update({new_item: new_tids})
                        self.recur_eclat(new_item, new_tids, minsupp, num_items+1, k)
    
    #последовательный вызов функций определенных выше
    def fit(self, dataset):
        i = 0
        self.read_data(dataset)
        for w in self.item_lst:
            self.recur_eclat(w, self.item_dict[w], self.min_support, 2, i)
            i+=1
        return self
        
    #вывод в форме словаря {ItemSet: support(ItemSet)}
    def transform(self):
        return {k: "{0:.2f}%".format((v[0]+0.0)/self.data_size*100) for k, v in self.final_dict.items()}

In [30]:
model = Eclat(min_support = 0.01, max_items = 4, min_items = 3)
model.fit(df)

<__main__.Eclat at 0x142e87820>

In [31]:
model.transform()

{'s | i | a': '7.95%',
 's | i | a | L': '7.95%',
 's | i | L': '7.95%',
 's | a | L': '7.95%',
 's | e | h': '8.40%',
 's | e | h | C': '8.40%',
 's | e | C': '8.40%',
 's | h | C': '8.40%',
 'i | a | L': '7.95%',
 'i | k | l': '8.52%',
 'i | k | l | M': '8.52%',
 'i | k | M': '8.52%',
 'i | l | M': '8.52%',
 'a | e | r': '24.51%',
 'a | e | r | w': '8.24%',
 'a | e | r | B': '8.04%',
 'a | e | r | n': '8.23%',
 'a | e | r | P': '16.47%',
 'a | e | r | d': '16.28%',
 'a | e | r | ': '8.24%',
 'a | e | r | o': '8.24%',
 'a | e | r | T': '8.24%',
 'a | e | w': '8.24%',
 'a | e | w | P': '8.24%',
 'a | e | w | d': '8.24%',
 'a | e | w | ': '8.24%',
 'a | e | w | o': '8.24%',
 'a | e | w | T': '8.24%',
 'a | e | B': '8.04%',
 'a | e | B | d': '8.04%',
 'a | e | n': '8.23%',
 'a | e | n | P': '8.23%',
 'a | e | P': '16.47%',
 'a | e | P | d': '8.24%',
 'a | e | P | ': '8.24%',
 'a | e | P | o': '8.24%',
 'a | e | P | T': '8.24%',
 'a | e | d': '16.28%',
 'a | e | d | ': '8.24%',
 'a | e | 

# FP-Growth

In [16]:
#!pip install pyfpgrowth
import pyfpgrowth

In [17]:
#Сгенериуем паттерны
patterns = pyfpgrowth.find_frequent_patterns(transactions, 2)
#Выучим правила
rules = pyfpgrowth.generate_association_rules(patterns, 1);
#Покажем
rules

{('Panner', 'Panner', 'Sugar', 'Sweet', 'Tea Powder', 'Yougurt'): (('Bread',),
  1.0),
 ('Ghee', 'Lassi', 'Panner', 'Panner', 'Sugar', 'Tea Powder'): (('Milk',),
  1.0),
 ('Cheese', 'Ghee', 'Milk', 'Panner', 'Panner', 'Yougurt'): (('Tea Powder',),
  1.0),
 ('Milk', 'Panner', 'Panner', 'Panner', 'Tea Powder', 'Yougurt'): (('Cheese',),
  1.0),
 ('Coffee Powder',
  'Lassi',
  'Panner',
  'Panner',
  'Tea Powder',
  'Yougurt'): (('Sugar',), 1.0),
 ('Cheese',
  'Coffee Powder',
  'Panner',
  'Panner',
  'Sweet',
  'Yougurt'): (('Tea Powder',), 1.0),
 ('Coffee Powder',
  'Panner',
  'Panner',
  'Sweet',
  'Tea Powder',
  'Yougurt'): (('Cheese',), 1.0),
 ('Bread',
  'Cheese',
  'Milk',
  'Panner',
  'Tea Powder',
  'Tea Powder'): (('Sweet',), 1.0),
 ('Cheese',
  'Milk',
  'Panner',
  'Sweet',
  'Tea Powder',
  'Tea Powder'): (('Bread',), 1.0),
 ('Butter',
  'Panner',
  'Sugar',
  'Sweet',
  'Tea Powder',
  'Tea Powder'): (('Lassi',), 1.0),
 ('Ghee', 'Panner', 'Sugar', 'Sweet', 'Tea Powder', '