## Análisis del carrito de compras para identificar las combinaciones de productos que se compran con más frecuencia.

In [1]:
# Importar
import pandas as pd

In [2]:
# Load the raw grocery purchases from CSV and preview the first rows
groceries_csv = './dataset/Groceries_dataset.csv'
df = pd.read_csv(groceries_csv)
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


<strong>Resultado:</strong><br>
Este dataframe contiene información sobre las compras realizadas en una tienda de comestibles, incluida la fecha de la transacción, la descripción del producto y un ID de cliente.

### Pre-procesamiento de datos:
Convertir los datos a un formato adecuado para el algoritmo A priori: una estructura tabular de unos y ceros.

In [3]:
# First, build a transaction key from member number and date, so that every
# product bought by the same member on the same day shares one identifier.
member_id = df['Member_number'].astype(str)
purchase_date = df['Date'].astype(str)
df['single_transaction'] = member_id + '_' + purchase_date
df.head()

Unnamed: 0,Member_number,Date,itemDescription,single_transaction
0,1808,21-07-2015,tropical fruit,1808_21-07-2015
1,2552,05-01-2015,whole milk,2552_05-01-2015
2,2300,19-09-2015,pip fruit,2300_19-09-2015
3,1187,12-12-2015,other vegetables,1187_12-12-2015
4,3037,01-02-2015,whole milk,3037_01-02-2015


In [4]:
# Second, pivot the data: one row per transaction, one column per product,
# each cell holding how many times the product appears in that transaction.
df2 = pd.crosstab(
    index=df['single_transaction'],
    columns=df['itemDescription'],
)
df2.head()

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
single_transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000_15-03-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1000_24-06-2014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000_24-07-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000_25-11-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000_27-05-2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<strong>Resultado:</strong><br>
Una tabla que nos dice cuántas veces se ha comprado cada producto en una transacción.

In [5]:
# Third, encode every value of the previous dataframe as one or zero
def codificar(item_freq):
    """Return 1 when ``item_freq`` is greater than zero, otherwise 0."""
    return 1 if item_freq > 0 else 0
    
# Binarize the purchase counts: 1 where the product appears in the transaction,
# 0 otherwise. The vectorized comparison yields the same table as applying
# codificar() to every cell, without relying on DataFrame.applymap (deprecated
# since pandas 2.1) and without one Python call per cell.
carrito_input = (df2 > 0).astype('int64')

### Usar el algoritmo A Priori
Importar el algoritmo A Priori del paquete MLXtend

In [6]:
# Importar
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [7]:
# Mine the itemsets present in at least 0.1% of the transactions, then derive
# association rules filtered by lift. The 0.8 threshold is mlxtend's default
# min_threshold, spelled out here; note that it also keeps rules whose lift
# is below 1.
MIN_SUPPORT = 0.001
MIN_LIFT = 0.8
frequent_itemsets = apriori(carrito_input, min_support=MIN_SUPPORT, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(UHT-milk),(bottled water),0.021386,0.060683,0.001069,0.05,0.823954,-0.000228,0.988755
1,(bottled water),(UHT-milk),0.060683,0.021386,0.001069,0.017621,0.823954,-0.000228,0.996168
2,(UHT-milk),(other vegetables),0.021386,0.122101,0.002139,0.1,0.818993,-0.000473,0.975443
3,(other vegetables),(UHT-milk),0.122101,0.021386,0.002139,0.017515,0.818993,-0.000473,0.99606
4,(UHT-milk),(sausage),0.021386,0.060349,0.001136,0.053125,0.880298,-0.000154,0.992371


<strong>Resultado:</strong><br>
Las columnas "antecedents" y "consequents" muestran productos que se compran juntos con frecuencia.

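En el ejemplo anterior, la primera fila nos dice que, cuando una persona compra leche UHT, también compra agua embotellada en el 5 % de los casos (confidence = 0.05). Como el lift es menor que 1 (0.82), esta combinación ocurre algo menos de lo que cabría esperar si ambos productos se compraran de forma independiente.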

In [10]:
# Rank the rules from highest to lowest support, breaking ties by confidence
# and then by lift
sort_keys = ["support", "confidence", "lift"]
tabla_final = rules.sort_values(by=sort_keys, ascending=False)
tabla_final.head(8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
623,(rolls/buns),(whole milk),0.110005,0.157923,0.013968,0.126974,0.804028,-0.003404,0.96455
622,(whole milk),(rolls/buns),0.157923,0.110005,0.013968,0.088447,0.804028,-0.003404,0.97635
695,(yogurt),(whole milk),0.085879,0.157923,0.011161,0.129961,0.82294,-0.002401,0.967861
694,(whole milk),(yogurt),0.157923,0.085879,0.011161,0.070673,0.82294,-0.002401,0.983638
551,(soda),(other vegetables),0.097106,0.122101,0.009691,0.099794,0.817302,-0.002166,0.975219
550,(other vegetables),(soda),0.122101,0.097106,0.009691,0.079365,0.817302,-0.002166,0.980729
649,(sausage),(whole milk),0.060349,0.157923,0.008955,0.148394,0.939663,-0.000575,0.988811
648,(whole milk),(sausage),0.157923,0.060349,0.008955,0.056708,0.939663,-0.000575,0.99614


<strong>Resultado:</strong><br>
Se obtienen las combinaciones de productos más frecuentes en todo el conjunto de datos, ordenadas de mayor a menor support.

### Exportar tabla_final como un archivo CSV

In [9]:
# Write the sorted rules to disk as CSV
rules_csv = './dataset/data.csv'
tabla_final.to_csv(rules_csv)

Esto nos servirá para hacer una visualización de datos. Ver en <a href="./3-visualization.ipynb">jupyter de visualización.</a>

Ir al <a href="./4-conclusion.ipynb">jupyter de conclusión</a> para visualizar los resultados.

<strong>Resultado:</strong><br>
La tabla ordenada por support (`tabla_final`) muestra que las cuatro combinaciones de productos que se compran juntas con más frecuencia son:
+ panecillos y leche 
+ yogur y leche
+ salchichas y leche
+ refrescos y verduras