# <u>Introducción al Market Basket Analysis</u>

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns

sns.set()

## Lectura de datos

In [2]:
groceries = pd.read_csv('Groceries_dataset.csv')

In [3]:
groceries.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [4]:
groceries.shape

(38765, 3)

In [5]:
groceries.columns = ['id_cliente', 'fecha', 'producto']

In [6]:
groceries.head()

Unnamed: 0,id_cliente,fecha,producto
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


## Productos más frecuentes

¿cuántos productos hay?

In [7]:
groceries['producto'].nunique()

167

In [8]:
groceries['id_cliente'].nunique()

3898

In [9]:
groceries['producto'].value_counts()

whole milk               2502
other vegetables         1898
rolls/buns               1716
soda                     1514
yogurt                   1334
                         ... 
rubbing alcohol             5
bags                        4
baby cosmetics              3
kitchen utensil             1
preservation products       1
Name: producto, Length: 167, dtype: int64

In [10]:
groceries['producto'].value_counts().head(20)

whole milk            2502
other vegetables      1898
rolls/buns            1716
soda                  1514
yogurt                1334
root vegetables       1071
tropical fruit        1032
bottled water          933
sausage                924
citrus fruit           812
pastry                 785
pip fruit              744
shopping bags          731
canned beer            717
bottled beer           687
whipped/sour cream     662
newspapers             596
frankfurter            580
brown bread            571
pork                   566
Name: producto, dtype: int64

## Preparación del conjunto de datos para las reglas de asociación

In [11]:
groceries.head()

Unnamed: 0,id_cliente,fecha,producto
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [12]:
trx = groceries.groupby(['id_cliente', 'fecha']).agg(lista_productos = ('producto', lambda x: x.tolist()))

In [13]:
trx.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lista_productos
id_cliente,fecha,Unnamed: 2_level_1
1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1000,24-06-2014,"[whole milk, pastry, salty snack]"
1000,24-07-2015,"[canned beer, misc. beverages]"
1000,25-11-2015,"[sausage, hygiene articles]"
1000,27-05-2015,"[soda, pickled vegetables]"


In [14]:
trx = trx['lista_productos'].tolist()

In [15]:
trx

[['sausage', 'whole milk', 'semi-finished bread', 'yogurt'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables'],
 ['frankfurter', 'curd'],
 ['sausage', 'whole milk', 'rolls/buns'],
 ['whole milk', 'soda'],
 ['beef', 'white bread'],
 ['frankfurter', 'soda', 'whipped/sour cream'],
 ['frozen vegetables', 'other vegetables'],
 ['butter', 'whole milk'],
 ['tropical fruit', 'sugar'],
 ['butter milk', 'specialty chocolate'],
 ['sausage', 'rolls/buns'],
 ['root vegetables', 'detergent'],
 ['frozen meals', 'dental care'],
 ['rolls/buns', 'rolls/buns'],
 ['dish cleaner', 'cling film/bags'],
 ['canned beer', 'frozen fish'],
 ['other vegetables', 'hygiene articles'],
 ['pip fruit', 'whole milk', 'tropical fruit'],
 ['rolls/buns', 'red/blush wine', 'chocolate'],
 ['other vegetables', 'shopping bags'],
 ['whole milk', 'chocolate', 'packaged fruit/vegetables', 'rolls/buns'],
 ['root vegetables', 'whole milk'

## Reglas de asociación

In [19]:
!pip install mlxtend



In [20]:
from mlxtend.preprocessing import TransactionEncoder

In [21]:
te = TransactionEncoder()

In [22]:
transformadas = te.fit(trx).transform(trx)

In [23]:
transformadas

array([[False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [24]:
df = pd.DataFrame(transformadas, columns=te.columns_)
df.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [25]:
from mlxtend.frequent_patterns import apriori

frecuencias = apriori(df, min_support=0.001, use_colnames=True)

In [26]:
frecuencias.shape

  and should_run_async(code)


(750, 2)

In [27]:
frecuencias

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.004010,(Instant food products)
1,0.021386,(UHT-milk)
2,0.001470,(abrasive cleaner)
3,0.001938,(artif. sweetener)
4,0.008087,(baking powder)
...,...,...
745,0.001136,"(sausage, whole milk, rolls/buns)"
746,0.001002,"(whole milk, rolls/buns, soda)"
747,0.001337,"(whole milk, yogurt, rolls/buns)"
748,0.001069,"(whole milk, sausage, soda)"


In [28]:
from mlxtend.frequent_patterns import association_rules

  and should_run_async(code)


In [29]:
reglas = association_rules(frecuencias, metric = 'confidence', min_threshold=0.05, )

  and should_run_async(code)


In [30]:
reglas

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(UHT-milk),(bottled water),0.021386,0.060683,0.001069,0.050000,0.823954,-0.000228,0.988755,-0.179204
1,(UHT-milk),(other vegetables),0.021386,0.122101,0.002139,0.100000,0.818993,-0.000473,0.975443,-0.184234
2,(UHT-milk),(rolls/buns),0.021386,0.110005,0.001804,0.084375,0.767013,-0.000548,0.972009,-0.236873
3,(UHT-milk),(sausage),0.021386,0.060349,0.001136,0.053125,0.880298,-0.000154,0.992371,-0.121998
4,(UHT-milk),(soda),0.021386,0.097106,0.001270,0.059375,0.611444,-0.000807,0.959887,-0.393704
...,...,...,...,...,...,...,...,...,...,...
445,"(whole milk, soda)",(sausage),0.011629,0.060349,0.001069,0.091954,1.523708,0.000368,1.034806,0.347750
446,"(sausage, soda)",(whole milk),0.005948,0.157923,0.001069,0.179775,1.138374,0.000130,1.026642,0.122281
447,"(whole milk, yogurt)",(sausage),0.011161,0.060349,0.001470,0.131737,2.182917,0.000797,1.082219,0.548014
448,"(whole milk, sausage)",(yogurt),0.008955,0.085879,0.001470,0.164179,1.911760,0.000701,1.093681,0.481231


In [31]:
reglas.sort_values(by = 'confidence', ascending = False).head(10)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
449,"(yogurt, sausage)",(whole milk),0.005748,0.157923,0.00147,0.255814,1.619866,0.000563,1.131541,0.384877
436,"(rolls/buns, sausage)",(whole milk),0.005347,0.157923,0.001136,0.2125,1.345594,0.000292,1.069304,0.258214
446,"(sausage, soda)",(whole milk),0.005948,0.157923,0.001069,0.179775,1.138374,0.00013,1.026642,0.122281
383,(semi-finished bread),(whole milk),0.00949,0.157923,0.001671,0.176056,1.114825,0.000172,1.022008,0.103985
443,"(yogurt, rolls/buns)",(whole milk),0.007819,0.157923,0.001337,0.17094,1.082428,0.000102,1.015701,0.076751
448,"(whole milk, sausage)",(yogurt),0.008955,0.085879,0.00147,0.164179,1.91176,0.000701,1.093681,0.481231
156,(detergent),(whole milk),0.008621,0.157923,0.001403,0.162791,1.030824,4.2e-05,1.005814,0.030162
205,(ham),(whole milk),0.017109,0.157923,0.00274,0.160156,1.014142,3.8e-05,1.002659,0.014188
35,(bottled beer),(whole milk),0.045312,0.157923,0.007151,0.157817,0.99933,-5e-06,0.999874,-0.000702
173,(frozen fish),(whole milk),0.006817,0.157923,0.001069,0.156863,0.993287,-7e-06,0.998743,-0.006759


In [32]:
reglas.sort_values(by = 'lift', ascending = False).head(40)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
447,"(whole milk, yogurt)",(sausage),0.011161,0.060349,0.00147,0.131737,2.182917,0.000797,1.082219,0.548014
448,"(whole milk, sausage)",(yogurt),0.008955,0.085879,0.00147,0.164179,1.91176,0.000701,1.093681,0.481231
121,(specialty chocolate),(citrus fruit),0.015973,0.053131,0.001403,0.087866,1.653762,0.000555,1.038081,0.401735
449,"(yogurt, sausage)",(whole milk),0.005748,0.157923,0.00147,0.255814,1.619866,0.000563,1.131541,0.384877
165,(flour),(tropical fruit),0.009757,0.067767,0.001069,0.109589,1.617141,0.000408,1.046969,0.385385
23,(beverages),(sausage),0.016574,0.060349,0.001537,0.092742,1.536764,0.000537,1.035704,0.355168
445,"(whole milk, soda)",(sausage),0.011629,0.060349,0.001069,0.091954,1.523708,0.000368,1.034806,0.34775
257,(napkins),(pastry),0.022121,0.051728,0.001738,0.07855,1.518529,0.000593,1.029109,0.349192
333,(processed cheese),(root vegetables),0.010158,0.069572,0.001069,0.105263,1.513019,0.000363,1.039891,0.342549
215,(hard cheese),(pip fruit),0.014703,0.049054,0.001069,0.072727,1.482586,0.000348,1.02553,0.33036
