# Laboratorium 6 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab6`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [80]:
from collections import defaultdict

# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [81]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.001
#EPSILON = 0.1

In [82]:
# wczytujemy dane o koszykach

def read_baskets(path):
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets):
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)
print(baskets)
print(products)

[{'pastry', 'salty snack', 'whole milk'}, {'yogurt', 'semi-finished bread', 'sausage', 'whole milk'}, {'pickled vegetables', 'soda'}, {'canned beer', 'misc. beverages'}, {'hygiene articles', 'sausage'}, {'rolls/buns', 'sausage', 'whole milk'}, {'soda', 'whole milk'}, {'whipped/sour cream', 'frankfurter', 'soda'}, {'frankfurter', 'curd'}, {'beef', 'white bread'}, {'butter', 'whole milk'}, {'other vegetables', 'frozen vegetables'}, {'tropical fruit', 'sugar'}, {'specialty chocolate', 'butter milk'}, {'frozen meals', 'dental care'}, {'rolls/buns'}, {'detergent', 'root vegetables'}, {'rolls/buns', 'sausage'}, {'dish cleaner', 'cling film/bags'}, {'canned beer', 'frozen fish'}, {'tropical fruit', 'pip fruit', 'whole milk'}, {'pastry', 'root vegetables', 'whole milk'}, {'red/blush wine', 'chocolate', 'rolls/buns'}, {'shopping bags', 'other vegetables'}, {'packaged fruit/vegetables', 'chocolate', 'rolls/buns', 'whole milk'}, {'other vegetables', 'hygiene articles'}, {'whipped/sour cream', 'ma

## Część 2. - obliczanie wskaźników

In [83]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`
# dict posortowanie kluczy, zamienic na liczby

class SupportCounter:
    def __init__(self, baskets, products, epsilon):
        self.curr_prods=[]
        self.supports = defaultdict(int)
        self.baskets = baskets
        self.products = products
        self.epsilon = epsilon

    def __support(self, products,baskets):
        counter=0
        products=set(products)
        for basket in baskets:
            if products.issubset(basket):
                counter+=1
        return counter/len(baskets)

    def traverse(self):
        for prod in products:
            if prod not in self.curr_prods:
                self.curr_prods.append(prod)
                current_support = self.__support(self.curr_prods, baskets)
                if current_support > self.epsilon:
                    self.supports[str(sorted(self.curr_prods))] = current_support
                    self.traverse()
                self.curr_prods.remove(prod)

def get_supports(baskets, all_products, epsilon):
    support_counter = SupportCounter(baskets,all_products, epsilon)
    support_counter.traverse()
    return support_counter.supports
    
supports = get_supports(baskets, products, EPSILON)

print(supports)

defaultdict(<class 'int'>, {"['abrasive cleaner']": 0.0014702933903628951, "['artif. sweetener']": 0.0019381140145692708, "['baking powder']": 0.008086613646995923, "['bathroom cleaner']": 0.0011361358016440553, "['beef']": 0.03395041101383412, "['beef', 'bottled beer']": 0.0010693042839002875, "['beef', 'bottled water']": 0.0013366303548753592, "['beef', 'brown bread']": 0.0015371249081066632, "['beef', 'butter']": 0.0011361358016440553, "['beef', 'canned beer']": 0.0010024727661565194, "['beef', 'citrus fruit']": 0.001804450979081735, "['beef', 'curd']": 0.0012697988371315912, "['beef', 'domestic eggs']": 0.0011361358016440553, "['beef', 'frankfurter']": 0.0010024727661565194, "['beef', 'frozen vegetables']": 0.0012697988371315912, "['beef', 'fruit/vegetable juice']": 0.0010693042839002875, "['beef', 'margarine']": 0.001403461872619127, "['beef', 'newspapers']": 0.001670787943594199, "['beef', 'other vegetables']": 0.002806923745238254, "['beef', 'pastry']": 0.0012029673193878234, "[

In [84]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products):
    return supports[str(sorted(products))]

def confidence(supports, prior_products, following_products):
    set_sum = prior_products.union(following_products)
    numerator = support(supports,set_sum)
    denominator = support(supports,prior_products)
    if denominator == 0:
        return 0
    return numerator/denominator
    
def lift(supports, prior_products, following_products):
    numerator = confidence(supports,prior_products,following_products)
    denominator = support(supports,following_products)
    if denominator == 0 :
        return 0
    return numerator/denominator

In [85]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448514


## Część 3. - generowanie rekomendacji

In [110]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket, products, supports):
    result = []
    for subbasket in powerset(basket):
        for product in products:
            confidence_ = confidence(supports, set(subbasket), {product})
            lift_ = lift(supports, set(subbasket), {product})
            if lift_ > 1:
                result.append((product, set(subbasket), confidence_, lift_))
    return sorted(result,key=lambda x: -x[2])
# return [(item, subbasket, confidence, lift)]


In [111]:
print(baskets[1])
generate_next_product_candidates({'whole milk', 'rolls/buns'}, products, supports)

{'yogurt', 'semi-finished bread', 'sausage', 'whole milk'}


[('rolls/buns', {'rolls/buns'}, 1.0, 9.09052247873633),
 ('whole milk', {'whole milk'}, 1.0, 6.332204824375794),
 ('rolls/buns', {'rolls/buns', 'whole milk'}, 1.0, 9.09052247873633),
 ('whole milk', {'rolls/buns', 'whole milk'}, 1.0, 6.332204824375794),
 ('yogurt',
  {'rolls/buns', 'whole milk'},
  0.09569377990430622,
  1.1142926293448514),
 ('sausage',
  {'rolls/buns', 'whole milk'},
  0.08133971291866028,
  1.3478251654506244),
 ('fruit/vegetable juice',
  {'rolls/buns'},
  0.034021871202916165,
  1.0001360683874942),
 ('chocolate', {'rolls/buns'}, 0.02551640340218712, 1.0815919096513482),
 ('ham', {'whole milk'}, 0.017350825222175202, 1.0141421789039355),
 ('ice cream', {'rolls/buns'}, 0.015795868772782502, 1.0412052178288307),
 ('hard cheese', {'rolls/buns'}, 0.015188335358444716, 1.0330139180382194),
 ('processed cheese',
  {'rolls/buns'},
  0.013365735115431349,
  1.3157335166592057),
 ('red/blush wine', {'rolls/buns'}, 0.012150668286755772, 1.1580283412402967),
 ('packaged frui

In [112]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'yogurt', 'white wine', 'tropical fruit', 'soda', 'photo/film', 'root vegetables', 'domestic eggs'}


[('yogurt', {'yogurt'}, 1.0, 11.644357976653696),
 ('white wine', {'white wine'}, 1.0, 85.50285714285714),
 ('tropical fruit', {'tropical fruit'}, 1.0, 14.756410256410257),
 ('soda', {'soda'}, 1.0, 10.298004129387476),
 ('photo/film', {'photo/film'}, 1.0, 189.40506329113924),
 ('root vegetables', {'root vegetables'}, 1.0, 14.37367915465898),
 ('domestic eggs', {'domestic eggs'}, 1.0, 26.96036036036036),
 ('tropical fruit', {'tropical fruit', 'yogurt'}, 1.0, 14.756410256410257),
 ('yogurt', {'tropical fruit', 'yogurt'}, 1.0, 11.644357976653696),
 ('soda', {'soda', 'yogurt'}, 1.0, 10.298004129387476),
 ('yogurt', {'soda', 'yogurt'}, 1.0, 11.644357976653696),
 ('root vegetables', {'root vegetables', 'yogurt'}, 1.0, 14.37367915465898),
 ('yogurt', {'root vegetables', 'yogurt'}, 1.0, 11.644357976653696),
 ('domestic eggs', {'domestic eggs', 'yogurt'}, 1.0, 26.96036036036036),
 ('yogurt', {'domestic eggs', 'yogurt'}, 1.0, 11.644357976653696),
 ('soda', {'soda', 'tropical fruit'}, 1.0, 10.298