Loading data from dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from apyori import apriori

# Read with header=None so the file's own header line is kept as row 0
# and the columns are labeled by position.
data = pd.read_csv('groceries - groceries.csv', sep=',', header=None)

# Remove, in one call, the first column (number of items per transaction)
# and the first row (the item1, item2, ... header line).
data = data.drop(index=data.index[0], columns=data.columns[0])
data.head()


Transform transactions into lists of items and one-hot encode them with TransactionEncoder

In [None]:
# Build one list of item names per transaction (one DataFrame row each).
# The row/column counts come from the frame itself instead of hard-coded
# sizes, and empty padding cells (NaN) are skipped before string conversion
# rather than being matched afterwards as the text 'nan'.
transactions = [
    [str(item) for item in row if pd.notna(item)]
    for row in data.to_numpy()
]

# use transaction encoder
te = TransactionEncoder()
transactions_data = te.fit_transform(transactions)
# label each encoded column with the item name learned by the encoder
transactions_data = pd.DataFrame(transactions_data, columns=te.columns_)
transactions_data.head()


Find number of purchases for each product

In [None]:
# find number of purchases
# Each column flags, per transaction, whether the product was bought, so the
# column sum is the number of True entries. Unlike filtering the frame and
# reading value_counts()[0] per product, this is a single vectorized pass and
# yields 0 (instead of an IndexError) for a column with no True values.
purchases = transactions_data.sum(axis=0).tolist()

# single-row frame: one column per product, row 0 holds the counts
purchases_number = pd.DataFrame([purchases], columns=transactions_data.columns)

# sort products by number of purchases
purchases_most = purchases_number.sort_values(by=0, ascending=False, axis=1)
purchases_least = purchases_number.sort_values(by=0, ascending=True, axis=1)
purchases_least.head()


Plot most and least purchased products

In [None]:
# Bar charts of the first ten columns of each ranking.
# Every entry is (sorted counts frame, RGB bar color, chart title):
# red for the most purchased products, blue for the least purchased.
top_ten_charts = (
    (purchases_most, (1.0, 0.0, 0.0), "Most purchased products"),
    (purchases_least, (0.0, 0.0, 1.0), "Least purchased products"),
)

for ranking, bar_color, chart_title in top_ten_charts:
    plt.figure(figsize=(18, 7))
    plt.ylabel("Frequency")
    # row 0 of the single-row frame holds the purchase counts
    plt.bar(ranking.columns[:10], ranking.values[0, :10], color=bar_color)
    plt.xticks(rotation=90)
    plt.title(chart_title)
    plt.show()


Frequency of purchased products

In [None]:
# Bar charts of every product's purchase count, most purchased first, split
# over several figures so the x-axis labels stay readable.
PRODUCTS_PER_PLOT = 45

# Take the product count from the data rather than hard-coding it, so the
# charts still cover every product if the dataset changes.
n_products = purchases_most.shape[1]

# one colormap position per product: the gradient runs across all figures
vals = np.linspace(0, 1, n_products)

for part, start in enumerate(range(0, n_products, PRODUCTS_PER_PLOT), start=1):
    stop = start + PRODUCTS_PER_PLOT  # slicing clamps the last, shorter chunk
    color = plt.cm.autumn(vals[start:stop])

    plt.figure(figsize=(18, 7))
    plt.ylabel("Frequency")
    plt.bar(purchases_most.columns[start:stop], purchases_most.values[0, start:stop],
            color=color)
    plt.xticks(rotation=90)
    plt.title("Frequency of purchased products part " + str(part))
    plt.show()


Applying apriori algorithm to find association rules

In [None]:
# Mine association rules from the transaction lists and materialize the
# result so it can be iterated more than once.
# NOTE(review): apyori's documented keyword arguments are min_support,
# min_confidence, min_lift and max_length -- confirm that min_length has
# any effect here.
association_rules = list(apriori(transactions, min_support=0.01, min_confidence=0.2,
                                 min_lift=2.5, min_length=2))

# Each record is (itemset, support, rules). One itemset can yield several
# rules that pass the thresholds (e.g. A -> B and B -> A), so report every
# rule instead of only the first one.
for record in association_rules:
    support = record[1]
    for rule in record[2]:
        items_base, items_add, confidence, lift = rule[:4]

        # sort the item sets so the printed order is stable between runs
        print("Rule: " + str(sorted(items_base)) + " -> " +
              str(sorted(items_add)))

        print("Support: " + str(support))

        print("Confidence: " + str(confidence))
        print("Lift: " + str(lift))
        print("....................................................")
