# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz i aktywuj wirtualne środowisko:
 `python3 -m venv ./recsyslab1 && source ./recsyslab1/bin/activate`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

In [1]:
!pip install more-itertools

Collecting more-itertools
  Downloading more_itertools-10.1.0-py3-none-any.whl (55 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: more-itertools
Successfully installed more-itertools-10.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Część 1. - przygotowanie danych

In [2]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [3]:
# Notebook-wide constants.

PATH = './basket.csv'  # basket file handed to read_baskets
EPSILON = 0.001  # minimum support handed to get_supports; rarer itemsets are dropped

In [4]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line of the file is treated as a header and skipped. Every
    remaining line that is not blank becomes one basket: its comma-separated
    fields are stripped of surrounding whitespace and lower-cased, empty
    fields (e.g. from trailing commas) are dropped, and the rest are
    collected into a set, so duplicates within a line collapse.

    Args:
        path: Location of the basket file.

    Returns:
        One set of product names per non-blank data line, in file order.
    """
    baskets = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        next(f, None)  # skip the header row
        for line in f:
            line = line.strip()
            if not line:
                continue
            names = (field.strip().lower() for field in line.split(','))
            baskets.append({name for name in names if name})
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in any basket, in sorted order."""
    seen = {product for basket in baskets for product in basket}
    return sorted(seen)

# Load the dataset once; the cells below reuse both values.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [62]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`
from itertools import combinations


def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float) -> dict[tuple[str, ...], float]:
    """Compute the support of every itemset that occurs in the baskets.

    An itemset is any non-empty subset of a single basket, represented as a
    tuple of product names in sorted order. Its support is the fraction of
    baskets that contain it.

    Args:
        baskets: The baskets to analyse; each is an iterable of product names.
        all_products: Not used by the computation; kept so existing calls
            keep working.
        epsilon: Minimum support. Itemsets with a lower support are omitted.

    Returns:
        Mapping from sorted itemset tuple to its support, containing only
        itemsets whose support is at least ``epsilon``.
    """
    from collections import Counter

    counts = Counter()
    for basket in baskets:
        # Sorting once per basket makes every generated combination already
        # sorted, so subsets need no individual re-sorting.
        items = sorted(basket)
        # Starting at size 1 skips the empty subset by construction.
        for size in range(1, len(items) + 1):
            counts.update(combinations(items, size))

    total = len(baskets)
    supports = {}
    for itemset, count in counts.items():
        share = count / total
        if share >= epsilon:
            supports[itemset] = share
    return supports
    
# Build the support table for the whole dataset and show it as the cell result.
supports = get_supports(baskets, products, EPSILON)
supports

{('salty snack',): 0.018779656485998796,
 ('pastry',): 0.0517275947336764,
 ('whole milk',): 0.15792287642852368,
 ('salty snack', 'whole milk'): 0.0019381140145692708,
 ('pastry', 'whole milk'): 0.006482657221145492,
 ('yogurt',): 0.08587850030074183,
 ('semi-finished bread',): 0.009490075519615051,
 ('sausage',): 0.06034886052262247,
 ('sausage', 'yogurt'): 0.005747510525964045,
 ('whole milk', 'yogurt'): 0.011160863463209249,
 ('semi-finished bread', 'whole milk'): 0.001670787943594199,
 ('sausage', 'whole milk'): 0.008955423377664907,
 ('sausage', 'whole milk', 'yogurt'): 0.0014702933903628951,
 ('soda',): 0.09710619528169484,
 ('pickled vegetables',): 0.008955423377664907,
 ('canned beer',): 0.04691572545612511,
 ('misc. beverages',): 0.01577223818752924,
 ('hygiene articles',): 0.013700461137472432,
 ('rolls/buns',): 0.11000467820624206,
 ('rolls/buns', 'sausage'): 0.005346521419501437,
 ('rolls/buns', 'whole milk'): 0.013967787208447505,
 ('rolls/buns', 'sausage', 'whole milk'):

In [63]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: set[str]) -> float:
    """Look up the support of an itemset.

    The products are sorted into a tuple to form the lookup key; 0 is
    returned when the table has no entry for that key.
    """
    itemset = tuple(sorted(products))
    if itemset in supports:
        return supports[itemset]
    return 0

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Confidence of the rule ``prior_products -> following_products``.

    This is the support of both product groups taken together divided by
    the support of ``prior_products``; 0 when the latter is not positive.
    """
    combined = set(prior_products) | set(following_products)
    prior_support = support(supports, prior_products)
    if prior_support > 0:
        return support(supports, combined) / prior_support
    return 0
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Lift of the rule ``prior_products -> following_products``.

    This is the rule's confidence divided by the support of
    ``following_products``; 0 when that support is not positive.
    """
    following_support = support(supports, following_products)
    if following_support > 0:
        rule_confidence = confidence(supports, prior_products, following_products)
        return rule_confidence / following_support
    return 0

In [73]:
# Spot-check the three metrics on the rule {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448514


## Część 3. - generowanie rekomendacji

In [74]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket, products, supports):
    """List products outside ``basket`` that are associated with parts of it.

    For every non-empty subset of ``basket`` and every product that is not
    already in the basket, the confidence and lift of the rule
    ``subset -> product`` are computed from ``supports``. Only rules with a
    lift above 1 are kept.

    Args:
        basket: Product names currently in the basket.
        products: Product names to consider as recommendations.
        supports: Support table as consumed by ``support`` and ``confidence``.

    Returns:
        A list of ``(item, subbasket, confidence, lift)`` tuples sorted by
        ascending confidence, so the highest-confidence candidates are at
        the END of the list.
    """
    # The candidate items and their supports do not depend on the subset, so
    # they are computed once. An item with zero support always yields lift 0
    # and therefore can never pass the lift > 1 filter.
    candidates = []
    for item in products:
        if item in basket:
            continue
        item_support = support(supports, {item})
        if item_support > 0:
            candidates.append((item, item_support))

    recommendation = []
    for subset in powerset(basket):
        if not subset:
            continue
        subbasket = set(subset)
        # A subset without positive support gives confidence 0 for every item,
        # hence lift 0; skip it instead of scanning all candidates.
        if not support(supports, subbasket) > 0:
            continue
        for item, item_support in candidates:
            confidence_value = confidence(supports, subbasket, {item})
            # Same formula as lift(), reusing the confidence computed above
            # rather than deriving it a second time.
            lift_value = confidence_value / item_support
            if lift_value > 1:
                recommendation.append((item, subbasket, confidence_value, lift_value))

    # NOTE(review): ascending order is kept as-is; callers that want the best
    # candidates first should reverse the result or read it from the end.
    return sorted(recommendation, key=lambda x: x[2])

In [75]:
# Print all baskets, then show the candidates for the basket at index 1.
print(baskets)
generate_next_product_candidates(baskets[1], products, supports)

[{'salty snack', 'pastry', 'whole milk'}, {'yogurt', 'semi-finished bread', 'sausage', 'whole milk'}, {'soda', 'pickled vegetables'}, {'canned beer', 'misc. beverages'}, {'hygiene articles', 'sausage'}, {'rolls/buns', 'sausage', 'whole milk'}, {'soda', 'whole milk'}, {'whipped/sour cream', 'soda', 'frankfurter'}, {'curd', 'frankfurter'}, {'white bread', 'beef'}, {'butter', 'whole milk'}, {'other vegetables', 'frozen vegetables'}, {'tropical fruit', 'sugar'}, {'specialty chocolate', 'butter milk'}, {'dental care', 'frozen meals'}, {'rolls/buns'}, {'detergent', 'root vegetables'}, {'rolls/buns', 'sausage'}, {'cling film/bags', 'dish cleaner'}, {'canned beer', 'frozen fish'}, {'pip fruit', 'tropical fruit', 'whole milk'}, {'whole milk', 'pastry', 'root vegetables'}, {'rolls/buns', 'chocolate', 'red/blush wine'}, {'other vegetables', 'shopping bags'}, {'rolls/buns', 'packaged fruit/vegetables', 'chocolate', 'whole milk'}, {'other vegetables', 'hygiene articles'}, {'whipped/sour cream', 'ma

[('detergent', {'whole milk'}, 0.008887008040626322, 1.0308240411774545),
 ('detergent', {'yogurt'}, 0.012451361867704281, 1.4442614544686756),
 ('herbs', {'yogurt'}, 0.013229571984435798, 1.2528739595133724),
 ('specialty bar', {'yogurt'}, 0.014007782101167316, 1.002863366410366),
 ('hard cheese', {'yogurt'}, 0.014785992217898832, 1.0056490979837283),
 ('soft cheese', {'yogurt'}, 0.014785992217898832, 1.4749520103761349),
 ('chewing gum', {'yogurt'}, 0.016342412451361865, 1.3585084306095978),
 ('ham', {'whole milk'}, 0.017350825222175202, 1.0141421789039355),
 ('grapes', {'sausage'}, 0.017718715393134, 1.2274311964234448),
 ('misc. beverages', {'sausage'}, 0.017718715393134, 1.1234116035062036),
 ('salty snack', {'sausage'}, 0.018826135105204873, 1.002474945121639),
 ('sliced cheese', {'sausage'}, 0.018826135105204873, 1.34140695037705),
 ('frozen meals', {'sausage'}, 0.02104097452934662, 1.2543270991339182),
 ('dessert', {'sausage'}, 0.024363233665559245, 1.0327112332514532),
 ('beve

In [76]:
# Print the basket at index 33, then show its candidates.
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'photo/film', 'yogurt', 'soda', 'root vegetables', 'tropical fruit', 'white wine', 'domestic eggs'}


[('detergent', {'yogurt'}, 0.012451361867704281, 1.4442614544686756),
 ('herbs', {'yogurt'}, 0.013229571984435798, 1.2528739595133724),
 ('specialty bar', {'yogurt'}, 0.014007782101167316, 1.002863366410366),
 ('hygiene articles',
  {'root vegetables'},
  0.01440922190201729,
  1.0517326210726083),
 ('grapes', {'soda'}, 0.014452856159669649, 1.0011948459126712),
 ('hard cheese', {'yogurt'}, 0.014785992217898832, 1.0056490979837283),
 ('soft cheese', {'yogurt'}, 0.014785992217898832, 1.4749520103761349),
 ('cat food', {'tropical fruit'}, 0.014792899408284025, 1.250543242068666),
 ('grapes', {'root vegetables'}, 0.015369836695485112, 1.064716974419184),
 ('processed cheese',
  {'root vegetables'},
  0.015369836695485112,
  1.5130188583851563),
 ('flour', {'tropical fruit'}, 0.01577909270216963, 1.6171408500175628),
 ('chewing gum', {'yogurt'}, 0.016342412451361865, 1.3585084306095978),
 ('sliced cheese',
  {'root vegetables'},
  0.01729106628242075,
  1.2320296418279129),
 ('oil', {'soda