# Input Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Name of the CSV dataset file that is read below
file_name = 'D1.csv'

In [None]:
# Load the dataset. The path is relative, so the CSV file is expected to be in
# the same folder as this Jupyter notebook.
df = pd.read_csv(file_name)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

# 1. Data pre-processing

In [None]:
# Count missing values per column.
# pd.read_csv parses 'NaN' (and empty) cells as real missing values by
# default, so comparing against the string 'NaN' alone can never find them.
# Literal 'NaN' strings are still counted in case a column was read as text.
for col in df.columns:
    missing = df[col].isna() | (df[col] == 'NaN')
    print(col, missing.sum())

##### The Quantity variable should have the int64 data type. Before it is converted to integer, its values are rounded up to the next whole number (ceiling).

In [None]:
# Round every Quantity value up to the next whole number
df['Quantity'] = np.ceil(df['Quantity'])

In [None]:
# Display the DataFrame to inspect the current Quantity values
print(df)

In [None]:
# Store Quantity as 64-bit integers
df['Quantity'] = df['Quantity'].astype(np.int64)

In [None]:
# Write the formatted DataFrame back to the dataset file.
# index=False keeps the row index out of the CSV; without it, re-running the
# notebook would read the index back as an extra 'Unnamed: 0' column.
df.to_csv(file_name, index=False)
# info() prints its report itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
df.info()

# 2. Perform Association mining

### Need to add first:
- Measures of the relationships between variables, to identify how interesting each relationship is. This will let us give more guidance on what to do for association mining.
- The data appears clean at first glance.
- The data types appear appropriate at first glance.
- Are there interesting correlations between specific values?

In [None]:
# Remaining cells are Mel's version of the tutorial 8 code applied to the
# assignment data. 13/9/22
# Group the rows by customer and collect each customer's SKU categories
# into one list
sku_by_customer = df.groupby(['Customer_ID'])['SKU_Category']
transactions = sku_by_customer.apply(list)

print(transactions.head())

In [None]:
# apyori has to be installed first (terminal command: pip install apyori)
from apyori import apriori

# Convert the pandas Series of per-customer lists into a plain list of lists
# and run apriori on it
transaction_list = transactions.tolist()
results = list(
    apriori(transaction_list, min_support=0.055, min_confidence=0.055)
)
# Original author's note: "0.07 => loose Lift more than 1" -- presumably the
# rules with lift above 1 are lost at a 0.07 threshold (rules with a high
# lift are the interesting ones). TODO confirm.

# Print all mined rules
print(results)

In [None]:
# Lower min_support and min_confidence so that rules involving the '01F'
# product category can be found
results_3 = list(
    apriori(transaction_list, min_support=0.015, min_confidence=0.015)
)
# Original author's note: "0.07 => loose Lift more than 1" -- presumably the
# rules with lift above 1 are lost at a 0.07 threshold. TODO confirm.

# Print all mined rules
print(results_3)

In [None]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori results into a DataFrame with one row per rule.

    Parameters
    ----------
    results : iterable
        Records that expose ``support`` and ``ordered_statistics``; every
        entry of ``ordered_statistics`` exposes ``items_base``,
        ``items_add``, ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_side``, ``Right_side``, ``Support``, ``Confidence``
        and ``Lift``. Each side is the comma-joined item names of the rule.
        An empty ``results`` yields an empty frame with the same columns.
    """
    columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']
    rows = []

    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of the rule, items_add = right side.
            # The item collections are sorted before joining so a multi-item
            # side always produces the same string (set iteration order is
            # not guaranteed to be stable between runs).
            rows.append([
                ','.join(sorted(rule.items_base)),
                ','.join(sorted(rule.items_add)),
                rule_set.support,
                rule.confidence,
                rule.lift,
            ])

    return pd.DataFrame(rows, columns=columns)


In [None]:
# Turn the rules in `results` into a DataFrame (one row per rule) and show it
result_df = convert_apriori_results_to_pandas_df(results)

print(result_df)

In [None]:

# Turn the rules in `results_3` into a DataFrame (one row per rule) and show it
result_df_3 = convert_apriori_results_to_pandas_df(results_3)

print(result_df_3)

In [None]:
# Order the rules from highest to lowest lift and show the top five
result_df = result_df.sort_values('Lift', ascending=False)
print(result_df.head())

In [None]:
# Show the rules with lift above 1 and support of at least 0.05
result_df.loc[result_df['Lift'].gt(1) & result_df['Support'].ge(0.05)]

### Performing Sequential Rule Mining Using SPMF

In [None]:
# Collect each customer's SKU categories into a list, then convert the
# resulting Series into a plain nested list
transactions = df.groupby(['Customer_ID'])['SKU_Category'].apply(list)
sequences = transactions.tolist()

# show the first 10 sequences
print(sequences[0:10])

In [None]:
from collections import defaultdict
import subprocess
import re

def get_association_rules(sequences, min_sup, min_conf):
    """Use SPMF's RuleGrowth algorithm to find rules in the supplied sequences.

    The sequences are written to ``seq_rule_input.txt`` with every distinct
    item replaced by an integer ID, ``spmf.jar`` (looked up in the working
    directory) is run through ``java``, and the rules it writes to
    ``seq_rule_output.txt`` are read back and mapped to the original items.

    Parameters
    ----------
    sequences : list
        One entry per sequence. Each element of a sequence is either a
        single item or a list of items that together form one itemset.
    min_sup : float
        Minimum support as a fraction (0.055 is passed to SPMF as "5.5%").
    min_conf : float
        Minimum confidence as a fraction, passed the same way.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_rule`` and ``Right_rule`` (lists of items),
        ``Support`` (rule count divided by ``len(sequences)``) and
        ``Confidence``.

    Raises
    ------
    FileNotFoundError
        If the ``java`` executable or the SPMF output file cannot be found.
    subprocess.CalledProcessError
        If the ``java`` process exits with a non-zero status.
    """
    # step 1: create the required input for SPMF

    # item -> int ID for the input file, and str(ID) -> item for decoding
    item_to_id = {}
    id_to_item = {}

    def encode(item):
        # Assign the next free ID the first time an item is seen. Both maps
        # are filled for every item, including items inside multi-item
        # itemsets, so every ID in the output can be decoded.
        if item not in item_to_id:
            new_id = len(item_to_id) + 1
            item_to_id[item] = new_id
            id_to_item[str(new_id)] = item
        return item_to_id[item]

    # write the sequences in SPMF format: -1 ends an itemset, -2 ends a
    # sequence
    with open('seq_rule_input.txt', 'w') as f:
        for sequence in sequences:
            tokens = []
            for itemset in sequence:
                items = itemset if isinstance(itemset, list) else [itemset]
                tokens.extend(encode(item) for item in items)
                tokens.append(-1)
            tokens.append(-2)
            f.write(' '.join(str(token) for token in tokens))
            f.write('\n')

    # step 2: run SPMF with the supplied parameters
    # '{:g}' keeps fractional percentages (0.055 -> '5.5%'); int() would
    # truncate them to '5%'.
    supp_param = '{:g}%'.format(min_sup * 100)
    conf_param = '{:g}%'.format(min_conf * 100)
    # The arguments are passed as a list without a shell: combined with
    # shell=True, POSIX systems would run only 'java' and drop the rest.
    subprocess.run(
        ['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth',
         'seq_rule_input.txt', 'seq_rule_output.txt',
         supp_param, conf_param],
        check=True,
    )

    # step 3: read back the output rules
    # The confidence may carry an exponent suffix (e.g. '5.0E-4').
    rule_pattern = re.compile(
        r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) '
        r'#CONF: ([0-9\.]+(?:[eE][-+]?[0-9]+)?)'
    )
    output_rules = []
    with open('seq_rule_output.txt', 'r') as f:
        for line in f:
            match = rule_pattern.search(line)
            if match is None:
                # blank line (e.g. no rules were found) or a non-rule line
                continue
            left, right, sup, conf = match.groups()
            output_rules.append([
                [id_to_item[x] for x in left.split(',')],
                [id_to_item[x] for x in right.split(',')],
                int(sup) / len(sequences),
                float(conf),
            ])

    # return pandas DataFrame
    return pd.DataFrame(
        output_rules,
        columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'],
    )

In [None]:
# Mine rules from the customer sequences, passing 0.055 for both the minimum
# support and the minimum confidence
get_association_rules(sequences, min_sup=0.055, min_conf=0.055)

### Question 3: Identify the top-5 common product categories that customers bought with the product category ‘01F’.

In [None]:
# Rules with '01F' alone on either side, highest support first
# Categories noted by the original author: LPF, IEV, N8U, OXH, FU5
result_df_3[
    result_df_3[['Right_side', 'Left_side']].eq('01F').any(axis=1)
].sort_values(by='Support', ascending=False)

In [None]:
# Rules with '01F' alone on either side, highest confidence first
# Categories noted by the original author: LPF, IEV, N8U, OXH, FU5
result_df_3[
    result_df_3[['Right_side', 'Left_side']].eq('01F').any(axis=1)
].sort_values(by='Confidence', ascending=False)