Imports

In [None]:
import sqlite3
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import apriori

Gather data

In [None]:
def getDbAsDF(conn: sqlite3.Connection, sql: str) -> pd.DataFrame:
    """Run a SQL query and return the complete result set as a DataFrame.

    Columns are named after ``cursor.description`` and filled by position, so
    a query that yields the same column name more than once (for example
    ``SELECT *`` over a join) keeps each column's own values instead of the
    later column overwriting the earlier one.

    Args:
        conn: Open SQLite connection the query is executed on.
        sql: SQL statement to execute.

    Returns:
        A DataFrame with one row per fetched record. It is empty when the
        query matches no rows or the statement produces no result set.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
        # description is None for statements that return no result set.
        columns = [column[0] for column in cursor.description or ()]
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    return pd.DataFrame(rows, columns=columns)


# Connect to SQLite database.
# Forward slashes resolve on Windows, Linux and macOS alike; with backslashes
# a non-Windows host treats the whole string as a single file name and
# sqlite3 silently creates a new, empty database instead of opening this one.
connection = sqlite3.connect('../../Data/merged.sqlite')

# order_details rows joined with their order_header and product rows.
SQL = r"""
SELECT *
FROM order_details as OD
INNER JOIN order_header as OH ON OH.ORDER_NUMBER = OD.ORDER_NUMBER
INNER JOIN product as P ON P.PRODUCT_NUMBER = OD.PRODUCT_NUMBER
"""

Dataframe = getDbAsDF(connection, SQL)
print(Dataframe.columns)
Dataframe

Select & Format data

In [None]:
# Keep only the order number and product name needed to build the baskets.
# SELECT * over the joins yields ORDER_NUMBER more than once, so keep the
# first column per name. This replaces transposing the frame twice, which
# copies every row and turns every column into object dtype.
Data = Dataframe[["ORDER_NUMBER", "PRODUCT_NAME"]]
Data = Data.loc[:, ~Data.columns.duplicated()].dropna()
OrderNrs = Data["ORDER_NUMBER"].unique()
Data

In [None]:
# Build one basket (list of product names) per order. A single groupby pass
# replaces re-filtering the whole frame once per order number; sort=False
# keeps the orders in order of first appearance, matching Series.unique().
dataset = [
    products.tolist()
    for _, products in Data.groupby("ORDER_NUMBER", sort=False)["PRODUCT_NAME"]
]
# Show a preview only; printing every basket floods the notebook output.
print(f"{len(dataset)} transactions, first 3: {dataset[:3]}")

Apply apriori

In [None]:
# One-hot encode the baskets: one row per transaction, one column per item.
tr = TransactionEncoder()
tr.fit(dataset)
tr_arr = tr.transform(dataset)
FormatDF = pd.DataFrame(data=tr_arr, columns=tr.columns_)
FormatDF

In [None]:
# Mine the frequent itemsets and list them from most to least supported.
# Author's note on min_support: 0.1, 0.001 barely works w/ 32 GB of RAM.
frequent_itemsets = apriori(
    FormatDF,
    min_support=0.01,
    use_colnames=True,
    verbose=1,
).sort_values(by=['support'], ascending=False)
print(frequent_itemsets)

# Derive rules with confidence of at least 0.4, strongest lift first.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.4,
).sort_values(by='lift', ascending=False)
rules

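- Support(item) = (transactions containing the item) / (total transactions)
- Confidence(item1 → item2) = (transactions containing both item1 and item2) / (transactions containing item1)
- Lift(item1 → item2) = Confidence(item1 → item2) / Support(item2); the rules above are sorted by this value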

Apriori is traag wanneer grote datasets in het geheugen geladen moeten worden en de min_support klein is; gelukkig is deze dataset niet al te groot.

### per bedrijf

In [None]:
# Same selection as for orders, but with the retailer as the transaction key.
# Keep the first column per name. This replaces transposing the frame twice,
# which copies every row and turns every column into object dtype.
Data = Dataframe[["RETAILER_NAME", "PRODUCT_NAME"]]
Data = Data.loc[:, ~Data.columns.duplicated()].dropna()
# NOTE: despite its name, OrderNrs holds the unique retailer names here.
OrderNrs = Data["RETAILER_NAME"].unique()
Data

In [None]:
# Build one basket (list of product names) per retailer. A single groupby
# pass replaces re-filtering the whole frame once per retailer; sort=False
# keeps the retailers in order of first appearance, matching Series.unique().
dataset = [
    products.tolist()
    for _, products in Data.groupby("RETAILER_NAME", sort=False)["PRODUCT_NAME"]
]
# Show a preview only; printing every basket floods the notebook output.
print(f"{len(dataset)} transactions, first 3: {dataset[:3]}")

In [None]:
# One-hot encode the per-retailer baskets: one row per retailer, one column per item.
tr = TransactionEncoder()
tr.fit(dataset)
tr_arr = tr.transform(dataset)
FormatDF = pd.DataFrame(data=tr_arr, columns=tr.columns_)
FormatDF

In [13]:
# Mine the frequent itemsets per retailer and list them from most to least supported.
# Author's note on min_support: 0.1, 0.001 barely works w/ 32 GB of RAM.
frequent_itemsets = apriori(
    FormatDF,
    min_support=0.3,
    use_colnames=True,
    verbose=1,
).sort_values(by=['support'], ascending=False)
print(frequent_itemsets)

# Derive rules with confidence of at least 0.4, strongest lift first.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.4,
).sort_values(by='lift', ascending=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4965403,"(Firefly Multi-light, Firefly Extreme, Hiberna...","(Firefly 2, EverGlow Butane, Glacier Deluxe, E...",0.302752,0.302752,0.302752,1.000000,3.303030,0.211093,inf,1.000000
7915331,"(Firefly 2, EverGlow Butane, Glacier Deluxe, S...","(Firefly Multi-light, Hibernator, Hibernator P...",0.302752,0.302752,0.302752,1.000000,3.303030,0.211093,inf,1.000000
10778376,"(Edge Extreme, Hibernator Pad, TrailChef Doubl...","(Bear Survival Edge, Firefly Extreme, Firefly ...",0.302752,0.302752,0.302752,1.000000,3.303030,0.211093,inf,1.000000
10778375,"(Edge Extreme, Hibernator Pad, TrailChef Doubl...","(Bear Survival Edge, Firefly Extreme, Firefly ...",0.302752,0.302752,0.302752,1.000000,3.303030,0.211093,inf,1.000000
10778374,"(Edge Extreme, Hibernator Pad, TrailChef Doubl...","(Bear Survival Edge, Firefly Extreme, Firefly ...",0.302752,0.302752,0.302752,1.000000,3.303030,0.211093,inf,1.000000
...,...,...,...,...,...,...,...,...,...,...
18838538,(Polar Sports),(Canyon Mule Weekender Backpack),0.449541,0.513761,0.302752,0.673469,1.310860,0.071795,1.489106,0.430808
18838509,(Canyon Mule Climber Backpack),(Glacier GPS),0.504587,0.458716,0.302752,0.600000,1.308000,0.071290,1.353211,0.475309
18838508,(Glacier GPS),(Canyon Mule Climber Backpack),0.458716,0.504587,0.302752,0.660000,1.308000,0.071290,1.457097,0.435028
18838454,(Star Gazer 2),(Polar Sports),0.522936,0.449541,0.302752,0.578947,1.287863,0.067671,1.307339,0.468531
