# Notebook de Jupyter para algoritmo Apriori - Nicole Góngora

## Importando las librerías y la base de datos

### Importando librerías

In [1]:
import numpy as np
import pandas as pd

! pip install mlxtend --user



In [2]:
from mlxtend.frequent_patterns import apriori, association_rules

### Importando y leyendo la base de datos

In [3]:
# Read the transactions workbook into a DataFrame
data = pd.read_excel('Online Retail.xlsx')

# Preview the first rows
data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
data.columns  # column labels of the dataset

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [5]:
data.shape # size of the dataset as (rows, columns)

(541909, 8)

### Verificando la existencia de datos nulos para el preproceso

In [6]:
# True when at least one cell of the table is missing
data.isna().to_numpy().any()

True

In [7]:
# Number of missing values in each column
data.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

## Preprocesando los datos

### Eliminación de filas y espacios en datos de celdas

In [8]:
# Clean the raw transactions with a non-mutating chain: each step returns a new
# frame, so the cell never edits a filtered slice in place and can be re-run on
# its own output without chained-assignment warnings.
data = (
    data
    # Trim surrounding whitespace from the product descriptions
    .assign(Description=lambda df: df['Description'].str.strip())
    # Drop the rows that have no invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    # Store invoice numbers as strings so they can be searched as text
    .assign(InvoiceNo=lambda df: df['InvoiceNo'].astype('str'))
)

# Drop the credit transactions, i.e. invoices whose number contains 'C'
data = data[~data['InvoiceNo'].str.contains('C')]

### Obteniendo los países de la base de datos

In [9]:
# Distinct countries present in the transactions
data['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Israel', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

### Agrupando la base de datos de acuerdo a un país para reducción de tamaño

In [10]:
# Restrict the data to a single region of transaction (France)
france_orders = data[data['Country'] == "France"]

# Total quantity of each product within each invoice
qty_by_invoice_item = france_orders.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()

# One row per invoice, one column per product; products absent from an invoice become 0
basket = (
    qty_by_invoice_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [11]:
# Realizando una función de codificación binaria para mejor rendimiento  

def hot_encode(x):
    """Binarize a quantity for the basket matrix.

    Args:
        x: numeric quantity (summed units of one product in one invoice).

    Returns:
        1 when ``x`` is at least 1, otherwise 0. Zero, negative, fractional
        (0 < x < 1) and NaN inputs all map to 0, so the function never falls
        through and returns ``None``.
    """
    # Any comparison with NaN is False, so NaN lands in the 0 branch as well.
    return 1 if x >= 1 else 0

In [12]:
# Apply the binary encoding to every cell of the basket matrix.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever element-wise method the installed pandas provides.
_elementwise = basket.map if hasattr(pd.DataFrame, "map") else basket.applymap

basket_encoded = _elementwise(hot_encode)
basket = basket_encoded


In [13]:
# Preview the first rows of the encoded basket matrix
basket.head(n=5)

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preparando el modelo de reglas de asociación

In [14]:
# Mine the frequent itemsets from the encoded basket with Apriori,
# requiring a minimum support of 0.1
frq_items = apriori(basket, min_support=0.1, use_colnames=True)

# Derive the association rules from the frequent itemsets (lift >= 1) and
# order them by confidence, then by lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(['confidence', 'lift'], ascending=[False, False])
)

In [15]:
# Show the top rules as plain text
print(rules.head(n=5))

                                 antecedents                      consequents  \
40           (SET/6 RED SPOTTY PAPER PLATES)    (SET/6 RED SPOTTY PAPER CUPS)   
42  (SET/6 RED SPOTTY PAPER PLATES, POSTAGE)    (SET/6 RED SPOTTY PAPER CUPS)   
35       (STRAWBERRY LUNCH BOX WITH CUTLERY)                        (POSTAGE)   
27      (ROUND SNACK BOXES SET OF4 WOODLAND)                        (POSTAGE)   
41             (SET/6 RED SPOTTY PAPER CUPS)  (SET/6 RED SPOTTY PAPER PLATES)   

    antecedent support  consequent support   support  confidence      lift  \
40            0.127551            0.137755  0.122449    0.960000  6.968889   
42            0.107143            0.137755  0.102041    0.952381  6.913580   
35            0.122449            0.765306  0.114796    0.937500  1.225000   
27            0.158163            0.765306  0.147959    0.935484  1.222366   
41            0.137755            0.127551  0.122449    0.888889  6.968889   

    leverage  conviction  
40  0.104878   21