# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz i aktywuj wirtualne środowisko
 `python3 -m venv ./recsyslab1 && source ./recsyslab1/bin/activate`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [3]:
# define the constants

PATH = './basket.csv'  # path of the basket CSV file handed to read_baskets
EPSILON = 0.0001  # support threshold handed to get_supports (itemsets must exceed it)

In [4]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line becomes one basket: the line is split on commas,
    each field is stripped of surrounding whitespace and lower-cased, and
    empty fields (for example padding produced by trailing commas) are dropped.

    Args:
        path: Path of the CSV file to read.

    Returns:
        One set of product names per data line, in file order.
    """
    baskets = []
    # Explicit encoding so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        next(f, None)  # skip the header line (no-op for an empty file)
        for line in f:
            if not line.strip():
                continue
            fields = (field.strip().lower() for field in line.split(','))
            baskets.append({field for field in fields if field})
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in any basket, sorted ascending."""
    return sorted({product for basket in baskets for product in basket})

# Load every basket from the CSV file, then derive the sorted list of
# distinct products that occur in them.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [8]:
def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float,
                 max_size: int = 4) -> dict[frozenset[str], float]:
    """Compute the support of every frequent itemset of at most ``max_size`` products.

    The support of an itemset is the fraction of baskets that contain all of
    its products. Itemsets are grown depth-first, one product at a time, in
    sorted product order. An itemset is recorded, and extended further, only
    when its support is strictly greater than ``epsilon``; a superset can
    never have a larger support than its subset, so nothing frequent is lost
    by this pruning.

    Args:
        baskets: Collection of baskets; each basket is any iterable of
            product names.
        all_products: Products to build itemsets from. Duplicates are ignored.
        epsilon: Exclusive lower bound on the support of a recorded itemset.
        max_size: Largest itemset size to record (defaults to 4).

    Returns:
        Mapping from each recorded itemset (as a ``frozenset``) to its
        support. Empty when there are no baskets.
    """
    ordered = sorted(set(all_products))
    basket_sets = [set(basket) for basket in baskets]
    total = len(basket_sets)
    result: dict[frozenset[str], float] = {}
    if total == 0:
        # No baskets: every support would be 0/0, so nothing can be frequent.
        return result

    def extend(prefix: frozenset[str], matching: list[set[str]], start: int) -> None:
        """Record frequent one-item extensions of ``prefix`` and recurse into them."""
        if len(prefix) >= max_size:
            return
        # Walk candidates from the last product to ``start``; recursion depth
        # is bounded by ``max_size`` instead of by the number of products.
        for index in range(len(ordered) - 1, start - 1, -1):
            item = ordered[index]
            # ``item in basket`` suffices: every basket in ``matching``
            # already contains all products of ``prefix`` by construction.
            narrowed = [basket for basket in matching if item in basket]
            supp = len(narrowed) / total
            if supp > epsilon:
                itemset = prefix | {item}
                result[itemset] = supp
                extend(itemset, narrowed, index + 1)

    extend(frozenset(), basket_sets, 0)
    return result
    
    
# Precompute the supports of all itemsets whose support exceeds EPSILON,
# then report how many itemsets were found.
supports = get_supports(baskets, products, EPSILON)
print(len(supports))
# Bare expression: in a notebook cell this displays the mapping as output.
supports

8322


{frozenset({'zwieback'}): 0.004009891064626078,
 frozenset({'yogurt'}): 0.08587850030074183,
 frozenset({'yogurt', 'zwieback'}): 0.00026732607097507187,
 frozenset({'whole milk'}): 0.15792287642852368,
 frozenset({'whole milk', 'zwieback'}): 0.00046782062420637575,
 frozenset({'whole milk', 'yogurt'}): 0.011160863463209249,
 frozenset({'white wine'}): 0.011695515605159393,
 frozenset({'white wine', 'yogurt'}): 0.0005346521419501437,
 frozenset({'white wine', 'whole milk'}): 0.0012697988371315912,
 frozenset({'white bread'}): 0.023992514870012697,
 frozenset({'white bread', 'zwieback'}): 0.00013366303548753594,
 frozenset({'white bread', 'yogurt'}): 0.0010693042839002875,
 frozenset({'white bread', 'whole milk'}): 0.003141081333957094,
 frozenset({'white bread', 'whole milk', 'yogurt'}): 0.00026732607097507187,
 frozenset({'white bread', 'white wine'}): 0.00026732607097507187,
 frozenset({'whisky'}): 0.0005346521419501437,
 frozenset({'whipped/sour cream'}): 0.043707812604424245,
 froze

In [9]:
import itertools

def support(supports, products: tuple[str]) -> float:
    """Look up the precomputed support of an itemset.

    ``products`` is converted to a ``frozenset`` and used as the key into
    ``supports``; 0 is returned when no entry exists for that itemset.
    """
    return supports.get(frozenset(products), 0)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    Confidence is the support of the combined itemset divided by the support
    of ``prior_products``. When ``prior_products`` has zero support (including
    when it has no entry in ``supports``), 0 is returned instead of dividing.
    """
    joint_support = support(supports, (*prior_products, *following_products))
    prior_support = support(supports, prior_products)
    if prior_support != 0:
        return joint_support / prior_support
    return 0
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    Lift is the support of the combined itemset divided by the support of
    ``prior_products`` and then by the support of ``following_products``.
    When either side has zero support (including when it has no entry in
    ``supports``), 0 is returned instead of dividing.
    """
    joint_support = support(supports, (*prior_products, *following_products))
    prior_support = support(supports, prior_products)
    following_support = support(supports, following_products)
    if prior_support != 0 and following_support != 0:
        return joint_support / prior_support / following_support
    return 0

In [10]:
# Try the three metrics on a sample rule: {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448514


## Część 3. - generowanie rekomendacji

In [11]:
def generate_next_product_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Suggest products to add to ``basket`` based on association rules.

    For every subset of the basket (the antecedent) and every product not yet
    in the basket, the rule ``subset -> product`` is scored with ``confidence``
    and ``lift``. Rules whose lift is greater than 1 are kept.

    Args:
        basket: Products already chosen.
        products: All known products; those already in ``basket`` are skipped.
        supports: Mapping from itemsets (``frozenset``) to their support.

    Returns:
        A list of ``(product, subbasket, confidence, lift)`` tuples sorted by
        confidence, highest first. The same product may appear several times,
        once per qualifying subbasket.
    """
    in_basket = set(basket)  # constant-time membership tests
    candidates = tuple(product for product in products if product not in in_basket)
    recommendations = []
    for subbasket in powerset(basket):
        # confidence() and lift() both return 0 when the antecedent has zero
        # support, so such a subbasket can never yield lift > 1; skip it
        # without scanning every candidate product.
        if support(supports, subbasket) == 0:
            continue
        for item in candidates:
            lft = lift(supports, subbasket, (item,))
            if lft > 1:
                conf = confidence(supports, subbasket, (item,))
                recommendations.append((item, subbasket, conf, lft))
    return sorted(recommendations, key=lambda x: x[2], reverse=True)

In [12]:
# Show the basket at index 1 and the recommendations generated for it.
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'semi-finished bread', 'yogurt', 'sausage', 'whole milk'}


[('rolls/buns',
  ('yogurt', 'sausage', 'whole milk'),
  0.2272727272727273,
  2.066027836076439),
 ('pork',
  ('yogurt', 'sausage', 'whole milk'),
  0.18181818181818185,
  4.901883701883703),
 ('soda',
  ('yogurt', 'sausage', 'whole milk'),
  0.18181818181818185,
  1.8723643871613593),
 ('other vegetables',
  ('yogurt', 'sausage', 'whole milk'),
  0.13636363636363635,
  1.1168084788774444),
 ('rolls/buns',
  ('sausage', 'whole milk'),
  0.12686567164179105,
  1.1532752398396837),
 ('domestic eggs',
  ('semi-finished bread', 'whole milk'),
  0.12,
  3.2352432432432434),
 ('rolls/buns',
  ('yogurt', 'whole milk'),
  0.11976047904191618,
  1.0886853267947703),
 ('soda', ('sausage', 'whole milk'), 0.11940298507462688, 1.2296124333596987),
 ('soda', ('yogurt', 'sausage'), 0.11627906976744186, 1.1974423406264505),
 ('soda', ('sausage',), 0.09856035437430785, 1.0149749363405152),
 ('bottled beer',
  ('yogurt', 'sausage', 'whole milk'),
  0.09090909090909093,
  2.0063019576293915),
 ('butter'

In [13]:
# Show the basket at index 33 and the recommendations generated for it.
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'root vegetables', 'tropical fruit', 'soda', 'white wine', 'yogurt', 'domestic eggs', 'photo/film'}


[('whole milk', ('yogurt', 'photo/film'), 0.5, 3.166102412187897),
 ('onions',
  ('root vegetables', 'tropical fruit', 'yogurt'),
  0.42857142857142855,
  21.164073550212162),
 ('pastry',
  ('root vegetables', 'tropical fruit', 'soda'),
  0.4,
  7.732816537467701),
 ('frozen vegetables',
  ('white wine', 'yogurt'),
  0.37499999999999994,
  13.391706443914078),
 ('bottled beer',
  ('tropical fruit', 'white wine'),
  0.3333333333333333,
  7.356440511307767),
 ('whole milk',
  ('root vegetables', 'tropical fruit', 'yogurt'),
  0.28571428571428575,
  1.8092013783930843),
 ('whole milk',
  ('tropical fruit', 'white wine'),
  0.22222222222222224,
  1.4071566276390655),
 ('beef', ('soda', 'white wine'), 0.21428571428571427, 6.311726659167604),
 ('shopping bags',
  ('soda', 'white wine'),
  0.21428571428571427,
  4.503310593900482),
 ('whole milk',
  ('soda', 'domestic eggs'),
  0.1842105263157895,
  1.1664587834376463),
 ('rolls/buns',
  ('root vegetables', 'tropical fruit'),
  0.163636363636