In [1]:
# imports the pandas and numpy libraries for data processing and manipulation
import numpy as np   # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd  # linear algebra

# imports the Apriori algorithm and association rules from the mlxtend library 
# for frequent pattern mining and association rule generation
from mlxtend.frequent_patterns import apriori, association_rules

# Import the portugal csv dataset into Jupyter using pandas
portugal = pd.read_csv("Portugal_online_retail.csv")

# Print the dataset to observe columns and rows
portugal

Unnamed: 0,InvoiceNo,10 COLOUR SPACEBOY PEN,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,20 DOLLY PEGS RETROSPOT,3 PIECE SPACEBOY COOKIE CUTTER SET,3 STRIPEY MICE FELTCRAFT,...,WRAP FLOWER SHOP,WRAP GINGHAM ROSE,WRAP PAISLEY PARK,WRAP PINK FAIRY CAKES,WRAP RED APPLES,WRAP RED VINTAGE DOILY,WRAP SUKI AND FRIENDS,YOU'RE CONFUSING ME METAL SIGN,ZINC FOLKART SLEIGH BELLS,ZINC WIRE KITCHEN ORGANISER
0,536990,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,537246,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,537818,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,537915,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,538311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,539353,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,540519,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,540546,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,541430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,542147,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
#InvoiceNo column is not required, hence it is dropped in the dataset
portugal = portugal.drop('InvoiceNo', axis=1)

#Print the Portugal dataset head (first ten rows) to confirm if invoice column has been dropped
portugal.head(10)

#Define the Confidence Levels as 70%
confidence_level = 0.7

#Convert the dataset to One-hot encoded format and name it portugal_encoded
portugal_encoded = pd.get_dummies(portugal, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value as 0.05
frq_items = apriori(portugal_encoded, min_support = 0.05, use_colnames = True)

# Generate Association Rules
rules = association_rules(frq_items, metric ="confidence", min_threshold= confidence_level)

#Sort rules by Confidence in Descending Order(from the best to the least)
rules = rules.sort_values(by='confidence', ascending=False)

#Select the Top Three Rules and Print
portugal_major3 = rules.head(3)
portugal_major3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
120046,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...","(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, LUNCH BA...",0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf,1.0
112311,"(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, RED RETR...",(JUMBO BAG PINK VINTAGE PAISLEY),0.051724,0.155172,0.051724,1.0,6.444444,0.043698,inf,0.890909
201035,"(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, RED RETR...","(LUNCH BAG SPACEBOY DESIGN, JUMBO BAG PINK VIN...",0.051724,0.068966,0.051724,1.0,14.5,0.048157,inf,0.981818


In [3]:
#Define the Confidence Levels to be 80%
confidence_level = 0.8

#Convert the dataset to One-hot encoded format and name it portugal
portugal_encoded = pd.get_dummies(portugal, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value as 0.05
frequent_items = apriori(portugal_encoded, min_support = 0.05, use_colnames = True)

#Generate Association Rules
rules = association_rules(frequent_items, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=False)

#Select and Print the Top Three rules for this parameter
portugal_top3 = rules.head(3)
portugal_top3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(50'S CHRISTMAS GIFT BAG LARGE),(WOODEN TREE CHRISTMAS SCANDINAVIAN),0.051724,0.068966,0.051724,1.0,14.5,0.048157,inf,0.981818
100163,"(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, LUNCH BA...","(LUNCH BAG PINK POLKADOT, JUMBO BAG RED RETROS...",0.051724,0.086207,0.051724,1.0,11.6,0.047265,inf,0.963636
100147,"(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, RED RETR...","(LUNCH BAG PINK POLKADOT, LUNCH BAG CARS BLUE)",0.051724,0.103448,0.051724,1.0,9.666667,0.046373,inf,0.945455


In [4]:
#Define the Confidence Levels to be 90%
confidence_level = 0.9

#Convert the dataset to One-hot encoded format and name it portugal
portugal_encoded = pd.get_dummies(portugal, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value as 0.05
frq_products = apriori(portugal_encoded, min_support = 0.05, use_colnames = True)

#Generate Association Rules
rules = association_rules(frq_products, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=False)

#Select and Print the Top Three rules for this parameter
portugal_top3 = rules.head(3)
portugal_top3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(50'S CHRISTMAS GIFT BAG LARGE),(WOODEN TREE CHRISTMAS SCANDINAVIAN),0.051724,0.068966,0.051724,1.0,14.5,0.048157,inf,0.981818
99938,"(RED RETROSPOT CHARLOTTE BAG, JUMBO BAG RED RE...","(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, JUMBO SH...",0.051724,0.086207,0.051724,1.0,11.6,0.047265,inf,0.963636
99922,"(LUNCH BAG DOLLY GIRL DESIGN, LUNCH BAG RED RE...","(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, JUMBO SH...",0.051724,0.137931,0.051724,1.0,7.25,0.04459,inf,0.909091


# Sweden Dataset

In [5]:
# Import the sweden csv dataset into Jupyter using pandas
sweden = pd.read_csv("Sweden_online_retail.csv")

# Print Sweden dataset to observe the columns and rows
sweden

#InvoiceNo column is not required, hence it is dropped in the dataset
sweden = sweden.drop('InvoiceNo', axis=1)

#Print the first 20 rows of the Dataset to confirm if invoice column has been dropped
sweden.head(10)

Unnamed: 0,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE WOODLAND,3 PIECE SPACEBOY COOKIE CUTTER SET,3 RAFFIA RIBBONS 50'S CHRISTMAS,3 RAFFIA RIBBONS VINTAGE CHRISTMAS,3 TIER CAKE TIN RED AND CREAM,3 TRADITIONAl BISCUIT CUTTERS SET,36 DOILIES DOLLY GIRL,...,WOODEN STAR CHRISTMAS SCANDINAVIAN,WOODEN TREE CHRISTMAS SCANDINAVIAN,WOODLAND CHARLOTTE BAG,WOODLAND SMALL RED FELT HEART,WORLD WAR 2 GLIDERS ASSTD DESIGNS,WRAP VINTAGE DOILY,WRAP ALPHABET DESIGN,WRAP DOLLY GIRL,WRAP RED VINTAGE DOILY,ZINC WILLIE WINKIE CANDLE STICK
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#Define the Confidence Levels to be 70%
confidence_level = 0.7

#Convert the dataset to One-hot encoded format and name it portugal
sweden_encoded = pd.get_dummies(sweden, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value as 0.05
frq_products = apriori(sweden_encoded, min_support = 0.05, use_colnames = True)

#Generate Association Rules
rules = association_rules(frq_products, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=True)

#Select and Print the Top Three rules for this parameter
sweden_major3 = rules.head(3)
sweden_major3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
459,(60 CAKE CASES DOLLY GIRL DESIGN),"(PACK OF 60 SPACEBOY CAKE CASES, SET OF 3 CAKE...",0.111111,0.083333,0.083333,0.75,9.0,0.074074,3.666667,1.0
123,(WOODEN BOX OF DOMINOES),(MINI PAINT SET VINTAGE),0.111111,0.166667,0.083333,0.75,4.5,0.064815,3.333333,0.875
452,(60 CAKE CASES DOLLY GIRL DESIGN),"(PACK OF 60 SPACEBOY CAKE CASES, PACK OF 72 RE...",0.111111,0.111111,0.083333,0.75,6.75,0.070988,3.555556,0.958333


In [7]:
#Define the Confidence Levels to be 80%
confidence_level = 0.8

#Convert the dataset to One-hot encoded format and name it portugal
sweden_encoded = pd.get_dummies(sweden, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value of the items as 0.04
frq_items = apriori(sweden_encoded, min_support = 0.04, use_colnames = True)

#Generate Association Rules
rules = association_rules(frq_items, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=True)

#Select and Print the Top Three rules for this parameter
sweden_top3 = rules.head(3)
sweden_top3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf,1.0
57825,"(SET OF 3 CAKE TINS PANTRY DESIGN, 60 CAKE CAS...",(MINI PAINT SET VINTAGE),0.055556,0.166667,0.055556,1.0,6.0,0.046296,inf,0.882353
57824,"(SET OF 3 CAKE TINS PANTRY DESIGN, MINI PAINT ...",(60 CAKE CASES DOLLY GIRL DESIGN),0.055556,0.111111,0.055556,1.0,9.0,0.049383,inf,0.941176


In [8]:
#Define the Confidence Levels to be 90%
confidence_level = 0.9

#Convert the dataset to One-hot encoded format and name it portugal
sweden_encoded = pd.get_dummies(sweden, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value of the items as 0.05
frequent_items = apriori(sweden_encoded, min_support = 0.05, use_colnames = True)

#Generate Association Rules
rules = association_rules(frequent_items, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=True)

#Select and Print the Top Three rules for this parameter
sweden_TopRank3 = rules.head(3)
sweden_TopRank3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf,1.0
57825,"(SET OF 3 CAKE TINS PANTRY DESIGN, 60 CAKE CAS...",(MINI PAINT SET VINTAGE),0.055556,0.166667,0.055556,1.0,6.0,0.046296,inf,0.882353
57824,"(SET OF 3 CAKE TINS PANTRY DESIGN, MINI PAINT ...",(60 CAKE CASES DOLLY GIRL DESIGN),0.055556,0.111111,0.055556,1.0,9.0,0.049383,inf,0.941176


# UK Dataset

In [9]:
# Import the UK csv dataset into Jupyter using pandas and name it UK
UK = pd.read_csv("UK_online_retail.csv")

#Print UK dataset for observations of columns, rows and datatype
UK

# Dropping the column InvoiceNo
UK = UK.drop('InvoiceNo',axis=1)

# Convert dataset to One-Hot encoded format
UK_encoded = pd.get_dummies(UK, prefix='', prefix_sep='')

# Finding the frequent itemsets
frq_items3 = apriori(UK, min_support = 0.07, use_colnames = True)

# Display the first nine freuqent bought items with their support value
frq_items3.sort_values(by=['support'],ascending=False)

  UK = pd.read_csv("UK_online_retail.csv")


Unnamed: 0,support,itemsets
5,0.116034,(WHITE HANGING HEART T-LIGHT HOLDER)
1,0.10382,(JUMBO BAG RED RETROSPOT)
4,0.090266,(REGENCY CAKESTAND 3 TIER)
3,0.085391,(PARTY BUNTING)
2,0.07457,(LUNCH BAG RED RETROSPOT)
0,0.073445,(ASSORTED COLOUR BIRD ORNAMENT)


In [10]:
# Extracting 10k randomly from the workshop6 data and name it dataset
dataset = UK.sample(n=5000, random_state = 40)

#Reset the indexing of the dataset
dataset.reset_index(drop=True, inplace=True)

dataset

#Define the Confidence Levels to be 60%
confidence_level = 0.6

#Convert the dataset to One-hot encoded format and name it portugal
dataset_encoded = pd.get_dummies(dataset, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value of the items as 0.02
frequent_items = apriori(dataset_encoded, min_support = 0.02, use_colnames = True)

#Generate Association Rules
rules = association_rules(frequent_items, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=True)

#Select and Print the Top Three rules for this parameter
UK_TopRank3 = rules.head(3)
UK_TopRank3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
18,(JUMBO BAG BAROQUE BLACK WHITE),(JUMBO BAG RED RETROSPOT),0.0516,0.109,0.031,0.600775,5.511699,0.025376,2.231825,0.863104
25,(JUMBO STORAGE BAG SKULLS),(JUMBO BAG RED RETROSPOT),0.0392,0.109,0.0236,0.602041,5.52331,0.019327,2.238923,0.852362
7,(CHARLOTTE BAG SUKI DESIGN),(RED RETROSPOT CHARLOTTE BAG),0.0458,0.049,0.0276,0.60262,12.298369,0.025356,2.393176,0.962784


In [11]:
#Define the Confidence Levels to be 70%
confidence_level = 0.7

#Convert the dataset to One-hot encoded format and name it portugal
dataset_encoded = pd.get_dummies(dataset, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value of the items as 0.02
frequent_items = apriori(dataset_encoded, min_support = 0.02, use_colnames = True)

#Generate Association Rules
rules = association_rules(frequent_items, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=True)

#Select and Print the Top Three rules for this parameter
UK_TopRank3 = rules.head(3)
UK_TopRank3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
8,"(ROSES REGENCY TEACUP AND SAUCER, GREEN REGENC...",(PINK REGENCY TEACUP AND SAUCER),0.0398,0.039,0.0284,0.713568,18.296611,0.026848,3.35507,0.984529
3,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.0556,0.0538,0.0398,0.715827,13.305341,0.036809,3.329666,0.979291
0,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),0.035,0.049,0.0254,0.725714,14.810496,0.023685,3.467187,0.966301


In [12]:
#Define the Confidence Levels to be 80%
confidence_level = 0.8

#Convert the dataset to One-hot encoded format and name it portugal
dataset_encoded = pd.get_dummies(dataset, prefix='', prefix_sep ='')

# Apply Apriori Algorithm to the encoded dataset
# set the minimum support value of the items as 0.02
frequent_items = apriori(dataset_encoded, min_support = 0.02, use_colnames = True)

#Generate Association Rules
rules = association_rules(frequent_items, metric = "confidence", min_threshold= confidence_level)

#Sort Rules by Confidence in Descending Order
rules = rules.sort_values(by='confidence', ascending=True)

#Select and Print the Top Three rules for this parameter
UK_TopRank3 = rules.head(3)
UK_TopRank3



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(PINK REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.039,0.0556,0.0314,0.805128,14.480723,0.029232,4.846263,0.968723
3,"(PINK REGENCY TEACUP AND SAUCER, GREEN REGENCY...",(ROSES REGENCY TEACUP AND SAUCER),0.0336,0.0556,0.0284,0.845238,15.202124,0.026532,6.102277,0.966701
0,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.039,0.0538,0.0336,0.861538,16.013726,0.031502,6.833667,0.975602
