# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety
%pip install more_itertools

from more_itertools import powerset

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.001

In [3]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [4]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def do_get_supports(baskets, products, epsilon, items, supports):
    for (i, item) in enumerate(products):
        new_products = products[i+1:]
        new_items = items + [item]

        itemset = frozenset(new_items)
        support = sum(itemset.issubset(basket) for basket in baskets) / len(baskets)

        if support > epsilon:
            supports[itemset] = support
            do_get_supports(baskets, new_products, epsilon, new_items, supports)
    
def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    supports = {}
    do_get_supports(baskets, all_products, epsilon, [], supports)

    return supports
    
supports = get_supports(baskets, products, EPSILON)
supports

{frozenset({'abrasive cleaner'}): 0.0014702933903628951,
 frozenset({'artif. sweetener'}): 0.0019381140145692708,
 frozenset({'baking powder'}): 0.008086613646995923,
 frozenset({'bathroom cleaner'}): 0.0011361358016440553,
 frozenset({'beef'}): 0.03395041101383412,
 frozenset({'beef', 'bottled beer'}): 0.0010693042839002875,
 frozenset({'beef', 'bottled water'}): 0.0013366303548753592,
 frozenset({'beef', 'brown bread'}): 0.0015371249081066632,
 frozenset({'beef', 'butter'}): 0.0011361358016440553,
 frozenset({'beef', 'canned beer'}): 0.0010024727661565194,
 frozenset({'beef', 'citrus fruit'}): 0.001804450979081735,
 frozenset({'beef', 'curd'}): 0.0012697988371315912,
 frozenset({'beef', 'domestic eggs'}): 0.0011361358016440553,
 frozenset({'beef', 'frankfurter'}): 0.0010024727661565194,
 frozenset({'beef', 'frozen vegetables'}): 0.0012697988371315912,
 frozenset({'beef', 'fruit/vegetable juice'}): 0.0010693042839002875,
 frozenset({'beef', 'margarine'}): 0.001403461872619127,
 frozen

In [24]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    return supports.get(frozenset(products), 0)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    prior_or_following = prior_products | following_products
    l = support(supports, prior_products)
    if l == 0: return 0
    return support(supports, prior_or_following)/l
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    prior_or_following = prior_products | following_products
    l = support(supports, prior_products) * support(supports, following_products)
    if l == 0: return 0
    return support(supports, prior_or_following)/l

In [25]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [34]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    recommendations = []
    for subbasket in powerset(basket):
        if len(subbasket) == 0: continue
        subbasket = frozenset(subbasket)
        for item in (item for item in products if item not in basket):
            lift_v = lift(supports, subbasket, {item})
            if lift_v > 1:
                conf_v = confidence(supports, subbasket, {item})
                recommendations.append((item, subbasket, conf_v, lift_v))
                
    recommendations.sort(key=lambda x: x[2], reverse=true)
    return recommendations

In [35]:
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'semi-finished bread', 'yogurt', 'whole milk', 'sausage'}


NameError: name 'true' is not defined

In [36]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'yogurt', 'soda', 'tropical fruit', 'domestic eggs', 'root vegetables', 'photo/film', 'white wine'}


NameError: name 'true' is not defined