# Input Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Name of the CSV file that holds the dataset to analyse
file_name = "D1.csv"

In [3]:
# The CSV file must sit in the same folder as this notebook, because
# file_name is given without any directory part.
df = pd.read_csv(filepath_or_buffer=file_name)

In [4]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only adds a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131706 entries, 0 to 131705
Data columns (total 7 columns):
Date            131706 non-null object
Customer_ID     131706 non-null int64
Sales_ID        131706 non-null int64
SKU_Category    131706 non-null object
SKU             131706 non-null object
Quantity        131706 non-null float64
Sales_Amount    131706 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 7.0+ MB
None


### Need to add first:
- Measures of the relationships between variables, to identify which relationships are interesting. This will allow us to provide more information on what to do in terms of association mining.
- Data appears clean at first glance (`df.info()` reports no null values in any column).
- Data types appear appropriate at first glance (note: `Date` is loaded as `object`, not as a datetime).
- Are there interesting correlations between specific values?

In [5]:
# The remaining cells are Mel's adaptation of the tutorial 8 code to the
# assignment data (13/9/22).
# Build one "basket" per customer: every SKU_Category value recorded against a
# Customer_ID is collected into a Python list (repeated categories are kept).
purchases_by_customer = df.groupby('Customer_ID')
transactions = purchases_by_customer['SKU_Category'].apply(list)

print(transactions.head())

Customer_ID
1                   [0H2, N8U]
2                   [TVL, F9B]
3              [LPF, TW8, TW8]
4    [69B, YMJ, 29A, N8U, JR5]
5    [P42, P42, P42, P42, LGI]
Name: SKU_Category, dtype: object


In [6]:
# apyori is a third-party package; install it once from a terminal with
#   pip install apyori
from apyori import apriori

# Convert the pandas Series of per-customer category lists into a plain Python
# list of lists, then run apriori on it with a minimum support and a minimum
# confidence of 0.055 each.
transaction_list = list(transactions)
results = list(apriori(transaction_list, min_support = 0.055, min_confidence = 0.055))

# Print only the first 5 records; printing the whole list floods the cell
# output with one very long line.
print(results[:5])

[RelationRecord(items=frozenset({'0H2'}), support=0.1403756906077348, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0H2'}), confidence=0.1403756906077348, lift=1.0)]), RelationRecord(items=frozenset({'1VL'}), support=0.06227624309392265, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1VL'}), confidence=0.06227624309392265, lift=1.0)]), RelationRecord(items=frozenset({'29A'}), support=0.0976353591160221, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'29A'}), confidence=0.0976353591160221, lift=1.0)]), RelationRecord(items=frozenset({'2ML'}), support=0.0576353591160221, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'2ML'}), confidence=0.0576353591160221, lift=1.0)]), RelationRecord(items=frozenset({'8HU'}), support=0.058740331491712705, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'8HU'}), confidence=0.058740

In [7]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori relation records into a DataFrame of rules.

    Parameters
    ----------
    results : iterable
        Records produced by ``apriori``. Each record must expose ``support``
        and ``ordered_statistics``; each ordered statistic must expose
        ``items_base``, ``items_add``, ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered statistic, with the columns ``Left_side``,
        ``Right_side``, ``Support``, ``Confidence`` and ``Lift``. The two
        side columns hold the items joined by commas in sorted order. An
        empty ``results`` gives an empty frame with the same columns.
    """
    def _join_items(items):
        # The item collections are frozensets, whose iteration order is
        # arbitrary; sorting keeps labels such as "A,B" identical between
        # runs. str() lets non-string items (e.g. numeric codes) be joined.
        return ','.join(sorted(map(str, items)))

    # items_base = left side of the rule, items_add = right side;
    # support comes from the enclosing record, confidence and lift from the rule.
    rules = [
        [
            _join_items(rule.items_base),
            _join_items(rule.items_add),
            rule_set.support,
            rule.confidence,
            rule.lift,
        ]
        for rule_set in results
        for rule in rule_set.ordered_statistics
    ]

    # Typecast to a pandas DataFrame
    return pd.DataFrame(rules, columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])

# Turn the first apriori run (results) into a rules table and show it
result_df = convert_apriori_results_to_pandas_df(results=results)

print(result_df)

   Left_side Right_side   Support  Confidence      Lift
0                   0H2  0.140376    0.140376  1.000000
1                   1VL  0.062276    0.062276  1.000000
2                   29A  0.097635    0.097635  1.000000
3                   2ML  0.057635    0.057635  1.000000
4                   8HU  0.058740    0.058740  1.000000
5                   FEW  0.095337    0.095337  1.000000
6                   FU5  0.069260    0.069260  1.000000
7                   H15  0.101613    0.101613  1.000000
8                   IEV  0.106785    0.106785  1.000000
9                   J4R  0.069525    0.069525  1.000000
10                  JPI  0.059006    0.059006  1.000000
11                  LPF  0.120840    0.120840  1.000000
12                  LSD  0.061525    0.061525  1.000000
13                  N8U  0.274077    0.274077  1.000000
14                  OXH  0.075492    0.075492  1.000000
15                  P42  0.143558    0.143558  1.000000
16                  Q4N  0.086365    0.086365  1

In [8]:
# Rank the rules from highest to lowest lift and show the top ten
result_df = result_df.sort_values('Lift', ascending=False)
print(result_df.iloc[:10])

   Left_side Right_side   Support  Confidence      Lift
28       LPF        IEV  0.066077    0.546818  5.120759
27       IEV        LPF  0.066077    0.618791  5.120759
30       LPF        N8U  0.056796    0.470007  1.714871
31       N8U        LPF  0.056796    0.207225  1.714871
24       0H2        N8U  0.056751    0.404282  1.475066
25       N8U        0H2  0.056751    0.207063  1.475066
34       P42        N8U  0.056619    0.394397  1.438997
33       N8U        P42  0.056619    0.206580  1.438997
37       R6E        N8U  0.055249    0.367647  1.341399
36       N8U        R6E  0.055249    0.201580  1.341399


In [9]:
# apyori is a third-party package; install it once from a terminal with
#   pip install apyori
from apyori import apriori

# Convert the pandas Series of per-customer category lists into a plain Python
# list of lists, then run apriori again with a lower minimum support of 0.02.
# No min_confidence is passed here, so the library default applies.
transaction_list2 = list(transactions)
results2 = list(apriori(transaction_list2, min_support = 0.02))

# Print only the first 5 records; printing the whole list floods the cell
# output with one very long line.
print(results2[:5])

[RelationRecord(items=frozenset({'01F'}), support=0.0416353591160221, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'01F'}), confidence=0.0416353591160221, lift=1.0)]), RelationRecord(items=frozenset({'0H2'}), support=0.1403756906077348, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0H2'}), confidence=0.1403756906077348, lift=1.0)]), RelationRecord(items=frozenset({'0WT'}), support=0.025502762430939227, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0WT'}), confidence=0.025502762430939227, lift=1.0)]), RelationRecord(items=frozenset({'1EO'}), support=0.025767955801104973, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1EO'}), confidence=0.025767955801104973, lift=1.0)]), RelationRecord(items=frozenset({'1L6'}), support=0.023646408839779004, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1L6'}), confidence=0.

In [10]:
# NOTE(review): this function is already defined in an earlier cell of this
# notebook; this later definition is the one in effect once this cell runs.
# Consider keeping a single definition.
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori relation records into a DataFrame of rules.

    Parameters
    ----------
    results : iterable
        Records produced by ``apriori``. Each record must expose ``support``
        and ``ordered_statistics``; each ordered statistic must expose
        ``items_base``, ``items_add``, ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered statistic, with the columns ``Left_side``,
        ``Right_side``, ``Support``, ``Confidence`` and ``Lift``. The two
        side columns hold the items joined by commas in sorted order. An
        empty ``results`` gives an empty frame with the same columns.
    """
    def _join_items(items):
        # The item collections are frozensets, whose iteration order is
        # arbitrary; sorting keeps labels such as "A,B" identical between
        # runs. str() lets non-string items (e.g. numeric codes) be joined.
        return ','.join(sorted(map(str, items)))

    # items_base = left side of the rule, items_add = right side;
    # support comes from the enclosing record, confidence and lift from the rule.
    rules = [
        [
            _join_items(rule.items_base),
            _join_items(rule.items_add),
            rule_set.support,
            rule.confidence,
            rule.lift,
        ]
        for rule_set in results
        for rule in rule_set.ordered_statistics
    ]

    # Typecast to a pandas DataFrame
    return pd.DataFrame(rules, columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])

# Turn the second apriori run (results2) into a rules table and show it.
# Note: this rebinds result_df, replacing the table built from the first run.
result_df = convert_apriori_results_to_pandas_df(results=results2)

print(result_df)

       Left_side       Right_side   Support  Confidence      Lift
0                             01F  0.041635    0.041635  1.000000
1                             0H2  0.140376    0.140376  1.000000
2                             0WT  0.025503    0.025503  1.000000
3                             1EO  0.025768    0.025768  1.000000
4                             1L6  0.023646    0.023646  1.000000
5                             1TS  0.026519    0.026519  1.000000
6                             1VL  0.062276    0.062276  1.000000
7                             29A  0.097635    0.097635  1.000000
8                             2ML  0.057635    0.057635  1.000000
9                             69K  0.025680    0.025680  1.000000
10                            6BZ  0.051801    0.051801  1.000000
11                            8HU  0.058740    0.058740  1.000000
12                            9ZX  0.032398    0.032398  1.000000
13                            A38  0.042077    0.042077  1.000000
14        