# Input Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# CSV dataset file name
file_name = 'D1.csv'

In [3]:
# Please put csv file in the same folder with this jupyter notebook
df = pd.read_csv(file_name)

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131706 entries, 0 to 131705
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    131706 non-null  int64  
 1   Date          131706 non-null  object 
 2   Customer_ID   131706 non-null  int64  
 3   Sales_ID      131706 non-null  int64  
 4   SKU_Category  131706 non-null  object 
 5   SKU           131706 non-null  object 
 6   Quantity      131706 non-null  int64  
 7   Sales_Amount  131706 non-null  float64
dtypes: float64(1), int64(4), object(3)
memory usage: 8.0+ MB
None


# 1. Data pre-processing

In [5]:
# Checking for '?' as 'Invalid/Unknown'
for col in df.columns:
    print(col,df[col][df[col] == 'Invalid/Unknown'].count())

Unnamed: 0 0
Date 0
Customer_ID 0
Sales_ID 0
SKU_Category 0
SKU 0
Quantity 0
Sales_Amount 0


##### Quantity variable should be in int64 data type. Before converting to integer, this variable is rounded to become integer.

In [6]:
df['Quantity'] = df['Quantity'].apply(np.ceil)

In [7]:
print(df)

        Unnamed: 0       Date  Customer_ID  Sales_ID SKU_Category    SKU  \
0                0  2/01/2016         2547         1          X52  0EM7L   
1                1  2/01/2016          822         2          2ML  68BRQ   
2                2  2/01/2016         3686         3          0H2  CZUZX   
3                3  2/01/2016         3719         4          0H2  549KK   
4                4  2/01/2016         9200         5          0H2  K8EHH   
...            ...        ...          ...       ...          ...    ...   
131701      131701  4/07/2016        20203     32900          IEV  FO112   
131702      131702  4/07/2016        20203     32900          N8U  I36F2   
131703      131703  4/07/2016        20203     32900          U5F  4X8P4   
131704      131704  4/07/2016        20203     32900          0H2  ZVTO4   
131705      131705  4/07/2016        20203     32900          Q4N  QM9BP   

        Quantity  Sales_Amount  
0            1.0          3.13  
1            1.0     

In [8]:
df['Quantity'] = df['Quantity'].astype('int64')

In [9]:
# # Load formarted dataframe into dataset file
# df.to_csv(file_name)
# print(df.info())

# 2. Perform Association mining

### Need to add first:
-measures of relationships between variables to identify interestingness of relationship. This will allow us to provide more info on what to do in terms of association mining. 
-data appears clean on first glance. 
-data types appear appropriate on first glance
-is there interestingness in the correlations in specific values?

In [10]:
#Remaining cells are Mel's version of using the tutorial 8 code with the assignment data. 13/9/22
#group by account, then list all services
transactions = df.groupby(['Sales_ID'])['SKU_Category'].apply(list)

print(transactions.head(5))

Sales_ID
1    [X52]
2    [2ML]
3    [0H2]
4    [0H2]
5    [0H2]
Name: SKU_Category, dtype: object


In [11]:
#need to install apyori at home using terminal command pip install apyori
from apyori import apriori

#type cast the transactions from panadas into normal list format and 
#run apriori
transaction_list = list(transactions)
results = list(apriori(transaction_list, min_support = 0.015, min_confidence = 0.015))
# 0.07 => loose Lift more than 1 (item with very lift is interesting)

#print first 5 rules
print(results)

[RelationRecord(items=frozenset({'01F'}), support=0.02682353668717727, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'01F'}), confidence=0.02682353668717727, lift=1.0)]), RelationRecord(items=frozenset({'0H2'}), support=0.06429918679076095, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0H2'}), confidence=0.06429918679076095, lift=1.0)]), RelationRecord(items=frozenset({'1VL'}), support=0.027658390278593736, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1VL'}), confidence=0.027658390278593736, lift=1.0)]), RelationRecord(items=frozenset({'29A'}), support=0.044587365882316564, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'29A'}), confidence=0.044587365882316564, lift=1.0)]), RelationRecord(items=frozenset({'2ML'}), support=0.023422281314739803, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'2ML'}), confidenc

In [12]:
# Define new min_support and min_confidence to looking for '01F' product category
results_3 = list(apriori(transaction_list, min_support = 0.005, min_confidence = 0.005))
# 0.07 => loose Lift more than 1 (item with very lift is interesting)

#print first 5 rules
print(results_3)

[RelationRecord(items=frozenset({'01F'}), support=0.02682353668717727, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'01F'}), confidence=0.02682353668717727, lift=1.0)]), RelationRecord(items=frozenset({'0H2'}), support=0.06429918679076095, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0H2'}), confidence=0.06429918679076095, lift=1.0)]), RelationRecord(items=frozenset({'0KX'}), support=0.006802510744874927, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0KX'}), confidence=0.006802510744874927, lift=1.0)]), RelationRecord(items=frozenset({'0WT'}), support=0.010296527627469776, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'0WT'}), confidence=0.010296527627469776, lift=1.0)]), RelationRecord(items=frozenset({'1EO'}), support=0.013697782999907239, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1EO'}), confidenc

In [13]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            #items_base = left side of rules, items_add = right side
            #support, confidence, lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add), rule_set.support, 
                          rule.confidence, rule.lift])
        
    #typecaset to pandas df
    return pd.DataFrame(rules, columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])


In [14]:
result_df = convert_apriori_results_to_pandas_df(results)

print(result_df)

   Left_side Right_side   Support  Confidence      Lift
0                   01F  0.026824    0.026824  1.000000
1                   0H2  0.064299    0.064299  1.000000
2                   1VL  0.027658    0.027658  1.000000
3                   29A  0.044587    0.044587  1.000000
4                   2ML  0.023422    0.023422  1.000000
5                   6BZ  0.028045    0.028045  1.000000
6                   8HU  0.035868    0.035868  1.000000
7                   9ZX  0.018985    0.018985  1.000000
8                   A38  0.022356    0.022356  1.000000
9                   C8Z  0.023206    0.023206  1.000000
10                  EKM  0.016434    0.016434  1.000000
11                  FEW  0.045407    0.045407  1.000000
12                  FU5  0.041789    0.041789  1.000000
13                  H15  0.042809    0.042809  1.000000
14                  IEV  0.063712    0.063712  1.000000
15                  J4R  0.033317    0.033317  1.000000
16                  JPI  0.024752    0.024752  1

In [15]:

result_df_3 = convert_apriori_results_to_pandas_df(results_3)

print(result_df_3)

    Left_side Right_side   Support  Confidence      Lift
0                    01F  0.026824    0.026824  1.000000
1                    0H2  0.064299    0.064299  1.000000
2                    0KX  0.006803    0.006803  1.000000
3                    0WT  0.010297    0.010297  1.000000
4                    1EO  0.013698    0.013698  1.000000
..        ...        ...       ...         ...       ...
313       N8U    OXH,LPF  0.006973    0.045519  2.268294
314       OXH    N8U,LPF  0.006973    0.167161  9.919540
315   N8U,LPF        OXH  0.006973    0.413761  9.919540
316   OXH,LPF        N8U  0.006973    0.347458  2.268294
317   OXH,N8U        LPF  0.006973    0.626389  8.715011

[318 rows x 5 columns]


In [16]:
result_df = result_df.sort_values(by='Lift', ascending = False)
print(result_df.head(10))

   Left_side Right_side   Support  Confidence      Lift
35       8HU        U5F  0.015476    0.431466  7.739338
36       U5F        8HU  0.015476    0.277593  7.739338
42       LPF        FU5  0.020670    0.287589  6.881914
41       FU5        LPF  0.020670    0.494636  6.881914
44       IEV        LPF  0.031152    0.488959  6.802935
45       LPF        IEV  0.031152    0.433427  6.802935
57       OXH        LPF  0.020067    0.481097  6.693552
56       LPF        OXH  0.020067    0.279200  6.693552
50       IEV        OXH  0.017656    0.277117  6.643623
51       OXH        IEV  0.017656    0.423277  6.643623


In [17]:
# printing the frequntly items 
result_df[(result_df['Lift'] > 1) &
                   (result_df['Support'] >= 0.015) ]

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
35,8HU,U5F,0.015476,0.431466,7.739338
36,U5F,8HU,0.015476,0.277593,7.739338
42,LPF,FU5,0.02067,0.287589,6.881914
41,FU5,LPF,0.02067,0.494636,6.881914
44,IEV,LPF,0.031152,0.488959,6.802935
45,LPF,IEV,0.031152,0.433427,6.802935
57,OXH,LPF,0.020067,0.481097,6.693552
56,LPF,OXH,0.020067,0.2792,6.693552
50,IEV,OXH,0.017656,0.277117,6.643623
51,OXH,IEV,0.017656,0.423277,6.643623


### Performing Sequential Rule Mining Using SPMF

In [18]:
transactions = df.groupby(['Sales_ID'])['SKU_Category'].apply(list)
sequences = transactions.values.tolist()

# show the first 5 sequences
print(sequences[:10])

[['X52'], ['2ML'], ['0H2'], ['0H2'], ['0H2'], ['JPI'], ['XG4', 'FEW'], ['0H2'], ['N5F', 'H8O'], ['N8U', 'JR5']]


In [19]:
from collections import defaultdict
import subprocess
import re

#print first 5 rules
print(results2)

In [10]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            #items_base = left side of rules, items_add = right side
            #support, confidence, lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add), rule_set.support, 
                          rule.confidence, rule.lift])
        
    #typecaset to pandas df
    return pd.DataFrame(rules, columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'])

result_df = convert_apriori_results_to_pandas_df(results2)

print(result_df)

       Left_side       Right_side   Support  Confidence      Lift
0                             01F  0.041635    0.041635  1.000000
1                             0H2  0.140376    0.140376  1.000000
2                             0WT  0.025503    0.025503  1.000000
3                             1EO  0.025768    0.025768  1.000000
4                             1L6  0.023646    0.023646  1.000000
5                             1TS  0.026519    0.026519  1.000000
6                             1VL  0.062276    0.062276  1.000000
7                             29A  0.097635    0.097635  1.000000
8                             2ML  0.057635    0.057635  1.000000
9                             69K  0.025680    0.025680  1.000000
10                            6BZ  0.051801    0.051801  1.000000
11                            8HU  0.058740    0.058740  1.000000
12                            9ZX  0.032398    0.032398  1.000000
13                            A38  0.042077    0.042077  1.000000
14        

In [20]:
get_association_rules(sequences, 0.055, 0.055)

Unnamed: 0,Left_rule,Right_rule,Support,Confidence
0,[0H2],[N8U],0.005643,0.087761
1,[P42],[N8U],0.007421,0.107575
2,[N8U],[R6E],0.007684,0.050161
3,[N8U],[OXH],0.007344,0.047941
4,[N8U],[29A],0.005736,0.037444
5,[N8U],[FU5],0.007282,0.047537
6,[N8U],[LPF],0.013188,0.086092
7,[N8U],"[LPF, IEV]",0.00671,0.043803
8,[N8U],[IEV],0.012956,0.084578
9,[OXH],[FU5],0.006431,0.154188


### Question 3: Identify the top-5 common product categories that customers bought with the product category ‘01F’.

In [21]:
# LPF, IEV, N8U, OXH, FU5
result_df_3[(result_df_3['Right_side'] == '01F') | (result_df_3['Left_side'] == '01F')].sort_values(by='Support', ascending = False)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
0,,01F,0.026824,0.026824,1.0
87,01F,IEV,0.012909,0.481268,7.553841
88,IEV,01F,0.012909,0.202621,7.553841
90,01F,LPF,0.012198,0.454755,6.327052
91,LPF,01F,0.012198,0.169714,6.327052
96,01F,OXH,0.008163,0.304323,7.295851
97,OXH,01F,0.008163,0.195701,7.295851
85,FU5,01F,0.007498,0.17943,6.689284
84,01F,FU5,0.007498,0.279539,6.689284
93,01F,N8U,0.007421,0.276657,1.806089


In [22]:
# LPF, IEV, N8U, OXH, FU5
result_df_3[(result_df_3['Right_side'] == '01F') | (result_df_3['Left_side'] == '01F')].sort_values(by='Confidence', ascending = False)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
87,01F,IEV,0.012909,0.481268,7.553841
90,01F,LPF,0.012198,0.454755,6.327052
96,01F,OXH,0.008163,0.304323,7.295851
84,01F,FU5,0.007498,0.279539,6.689284
93,01F,N8U,0.007421,0.276657,1.806089
249,01F,"IEV,LPF",0.006447,0.240346,7.715161
99,01F,U5F,0.005859,0.218444,3.918298
81,01F,6BZ,0.005612,0.209222,7.460249
254,"IEV,LPF",01F,0.006447,0.206948,7.715161
88,IEV,01F,0.012909,0.202621,7.553841
