In [None]:
import numpy as np
from tqdm import tqdm
from numba import njit
import pandas as pd
# from project import Project

In [None]:
dataSet = pd.read_csv('card_transdata.csv')
# dataSet = pd.read_csv('Book1.csv', header=None)

## Remove missing data

In [None]:
dataSet.dropna(inplace=True)

## Handle Outliers

In [None]:
#Handle outliers
def handle_outliers(df):
    # Select numerical columns only
    num_cols = df.select_dtypes(include=[np.number])
    
    # Compute the 1st and 99th percentile of each numerical column
    percentiles = np.nanpercentile(num_cols, [1, 99], axis=0)

    # Winsorize the numerical columns
    num_cols = np.clip(num_cols, percentiles[0], percentiles[1])

    # Replace the original numerical columns in the dataframe with the winsorized ones
    df[num_cols.columns] = num_cols

# handle_outliers(dataSet)

# for col in NUM_COL:
#     plotting(dataSet, col)

# dataSet.describe()
(dataSet.head(5))


In [None]:
##convert the distance from home where >40 is Far and 15<distance<40 is Medium and <15 is Close
#use function cut

dataSet['distance_from_home'] = pd.qcut(dataSet['distance_from_home'], q=3, labels=['Close_from_home', 'Medium_from_home', 'Far_from_home'])
dataSet['distance_from_last_transaction'] = pd.qcut(dataSet['distance_from_last_transaction'], q=3, labels=['Close_from_lt', 'Medium_from_lt', 'Far_from_lt'])
dataSet['ratio_to_median_purchase_price'] = pd.qcut(dataSet['ratio_to_median_purchase_price'], q=4, labels=['Low_ratio', 'Medium_ratio', 'High_ratio','Extreme_ratio'])
dataSet['repeat_retailer'] = pd.cut(dataSet['repeat_retailer'], bins=[-0.5, 0.9, np.inf], labels=['no_repeat', 'repeat'])
dataSet['used_chip'] = pd.cut(dataSet['used_chip'], bins=[-0.5, 0.9, np.inf], labels=['no_chip', 'chip'])
dataSet['used_pin_number'] = pd.cut(dataSet['used_pin_number'], bins=[-0.5, 0.9, np.inf], labels=['no_pin', 'pin'])
dataSet['online_order'] = pd.cut(dataSet['online_order'], bins=[-0.5, 0.9, np.inf], labels=['offline', 'online'])
dataSet['fraud'] = pd.cut(dataSet['fraud'], bins=[-0.5, 0.9, np.inf], labels=['not_fraud', 'fraud'])
(dataSet.head(5))


In [None]:
# Convert the dataframe into a list of transactions
transactions = []
for i in tqdm(range(len(dataSet))):
    transactions.append([str(item) for item in dataSet.iloc[i]])
    
print(transactions[0:5])

# Set the minimum support and confidence thresholds
min_support = 2/9
min_confidence = 0.5





In [None]:
# Generate a list of frequent 1-itemsets
def generate_frequent_1_itemsets(transactions, min_support):
    #key = item, value = count
    item_counts = {}
    frequent_items = []
    for transaction in transactions:
        for item in transaction:
            if item in item_counts:
                item_counts[item] += 1
            else:
                item_counts[item] = 1
    print(item_counts)
    for item, count in item_counts.items():
        # if item == 'nan':
        #     continue
        support = count / len(transactions)
        if support >= min_support:
            frequent_items.append({item})
    return frequent_items
frequent_itemsets = generate_frequent_1_itemsets(transactions, min_support)

print((frequent_itemsets))


In [None]:
# Generate frequent k-itemsets
from itertools import combinations
k = 2
final_itemSets = []
while len(frequent_itemsets) > 0:
    candidate_itemsets :list[list] = []
    # Generate candidate itemsets of size k
    for i in range(len(frequent_itemsets)):
        for j in range(i+1, len(frequent_itemsets)):
            itemset1 = set(frequent_itemsets[i])
            itemset2 = set(frequent_itemsets[j])
            # print("itemset1", itemset1, "itemset2", itemset2)
            candidate_itemset = list(itemset1.union(itemset2))
            # Prune the candidate itemsets
            if all([set(itemset) in frequent_itemsets for itemset in combinations(candidate_itemset, k-1)]):
                candidate_itemsets.append(candidate_itemset)
            # print(frequent_itemsets)

                # if itemset in frequent_itemsets:
    
                    
    # Count the support of each candidate itemset
    item_counts = {}
    for transaction in tqdm(transactions):
        for candidate_itemset in candidate_itemsets:
            if set(candidate_itemset).issubset(set(transaction)):
                item = sorted(candidate_itemset)
                if str(item) in item_counts:
                    item_counts[str(item)] += 1
                else:
                    item_counts[str(item)] = 1

    # print("item_counts", item_counts)
    # Generate a list of frequent k-itemsets
    
    frequent_itemsets = []
    for itemset, count in item_counts.items():
        support = count / len(transactions)
        if support >= min_support:
            # print("itemset", itemset)
            tempSet = set()
            for item in eval(itemset):
                tempSet.add(item)
            frequent_itemsets.append(tempSet)
    if len(frequent_itemsets) > 0:
        final_itemSets = frequent_itemsets
    
    k += 1
    

In [None]:
print(final_itemSets)
frequent_itemsets = final_itemSets

In [None]:
# Generate association rules from frequent itemsets
def generate_association_rules(frequent_itemsets, min_confidence):
    rules = []
    for itemset in tqdm(frequent_itemsets):
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                for left in combinations(itemset, i):
                    right = list(set(itemset) - set(left))
                    left_support = 0
                    right_support = 0
                    itemset_support = 0
                    for transaction in transactions:
                        if set(left).issubset(set(transaction)):
                            left_support += 1
                        if set(right).issubset(set(transaction)):
                            right_support += 1
                        if set(itemset).issubset(set(transaction)):
                            itemset_support += 1
                    left_support /= len(transactions)
                    right_support /= len(transactions)
                    itemset_support /= len(transactions)
                    confidence = itemset_support / left_support
                    lift = confidence / right_support
                    if confidence >= min_confidence:
                        rules.append((left, right, confidence, lift))
    return rules

association_rules = generate_association_rules(frequent_itemsets, min_confidence)

# Print the frequent itemsets and association rules
print("Frequent itemsets:")
print(frequent_itemsets)
print("Association rules:")
for rule in association_rules:
    print("{} => {} (confidence: {:.2f}, lift: {:.2f})".format(rule[0], rule[1], rule[2], rule[3]))

In [185]:
print("Frequent itemsets:")
print(frequent_itemsets)
print("Association rules:")
for rule in association_rules:
    print("{} => {} (confidence: {:.2f}, lift: {:.2f})".format(rule[0], rule[1], rule[2], rule[3]))

Frequent itemsets:
[{'not_fraud', 'offline', 'no_pin', 'repeat', 'no_chip'}, {'not_fraud', 'online', 'no_pin', 'repeat', 'no_chip'}, {'chip', 'not_fraud', 'online', 'no_pin', 'repeat'}]
Association rules:
('offline',) => ['repeat', 'no_chip', 'no_pin', 'not_fraud'] (confidence: 0.51, lift: 1.12)
('not_fraud', 'offline') => ['repeat', 'no_chip', 'no_pin'] (confidence: 0.52, lift: 1.01)
('offline', 'no_pin') => ['repeat', 'no_chip', 'not_fraud'] (confidence: 0.57, lift: 1.10)
('offline', 'repeat') => ['no_chip', 'no_pin', 'not_fraud'] (confidence: 0.58, lift: 1.12)
('offline', 'no_chip') => ['repeat', 'no_pin', 'not_fraud'] (confidence: 0.79, lift: 1.10)
('not_fraud', 'offline', 'no_pin') => ['repeat', 'no_chip'] (confidence: 0.58, lift: 1.01)
('not_fraud', 'offline', 'repeat') => ['no_chip', 'no_pin'] (confidence: 0.58, lift: 1.00)
('not_fraud', 'offline', 'no_chip') => ['repeat', 'no_pin'] (confidence: 0.80, lift: 1.01)
('offline', 'no_pin', 'repeat') => ['no_chip', 'not_fraud'] (confi

## Frequent itemsets:
[{'not_fraud', 'offline', 'no_pin', 'repeat', 'no_chip'}, {'not_fraud', 'online', 'no_pin', 'repeat', 'no_chip'},</br> 
{'chip', 'not_fraud', 'online', 'no_pin', 'repeat'}]
## Association rules:
('offline',) => ['repeat', 'no_chip', 'no_pin', 'not_fraud'] (confidence: 0.51, lift: 1.12)</br>
('not_fraud', 'offline') => ['repeat', 'no_chip', 'no_pin'] (confidence: 0.52, lift: 1.01)</br>
('offline', 'no_pin') => ['repeat', 'no_chip', 'not_fraud'] (confidence: 0.57, lift: 1.10)</br>
('offline', 'repeat') => ['no_chip', 'no_pin', 'not_fraud'] (confidence: 0.58, lift: 1.12)</br>
('offline', 'no_chip') => ['repeat', 'no_pin', 'not_fraud'] (confidence: 0.79, lift: 1.10)</br>
('not_fraud', 'offline', 'no_pin') => ['repeat', 'no_chip'] (confidence: 0.58, lift: 1.01)</br>
('not_fraud', 'offline', 'repeat') => ['no_chip', 'no_pin'] (confidence: 0.58, lift: 1.00)</br>
('not_fraud', 'offline', 'no_chip') => ['repeat', 'no_pin'] (confidence: 0.80, lift: 1.01)</br>
('offline', 'no_pin', 'repeat') => ['no_chip', 'not_fraud'] (confidence: 0.65, lift: 1.10)</br>
('offline', 'no_pin', 'no_chip') => ['repeat', 'not_fraud'] (confidence: 0.88, lift: 1.09)</br>
('offline', 'repeat', 'no_chip') => ['no_pin', 'not_fraud'] (confidence: 0.89, lift: 1.10)</br>
('not_fraud', 'offline', 'no_pin', 'repeat') => ['no_chip'] (confidence: 0.65, lift: 1.00)</br>
('not_fraud', 'offline', 'no_pin', 'no_chip') => ['repeat'] (confidence: 0.89, lift: 1.01)</br>
('not_fraud', 'offline', 'repeat', 'no_chip') => ['no_pin'] (confidence: 0.90, lift: 1.00)</br>
('offline', 'no_pin', 'repeat', 'no_chip') => ['not_fraud'] (confidence: 0.99, lift: 1.09)</br>
('online', 'no_chip') => ['repeat', 'no_pin', 'not_fraud'] (confidence: 0.66, lift: 0.92)</br>
('not_fraud', 'online', 'no_pin') => ['repeat', 'no_chip'] (confidence: 0.56, lift: 0.97)</br>
('not_fraud', 'online', 'repeat') => ['no_chip', 'no_pin'] (confidence: 0.56, lift: 0.96)</br>
('not_fraud', 'online', 'no_chip') => ['repeat', 'no_pin'] (confidence: 0.77, lift: 0.97)</br>
('not_fraud', 'no_pin', 'no_chip') => ['repeat', 'online'] (confidence: 0.54, lift: 0.94)</br>
('not_fraud', 'repeat', 'no_chip') => ['no_pin', 'online'] (confidence: 0.54, lift: 0.93)</br>
('online', 'no_pin', 'repeat') => ['no_chip', 'not_fraud'] (confidence: 0.54, lift: 0.93)</br>
('online', 'no_pin', 'no_chip') => ['repeat', 'not_fraud'] (confidence: 0.73, lift: 0.91)</br>
('online', 'repeat', 'no_chip') => ['no_pin', 'not_fraud'] (confidence: 0.75, lift: 0.92)</br>
('no_pin', 'repeat', 'no_chip') => ['not_fraud', 'online'] (confidence: 0.54, lift: 0.95)</br>
('not_fraud', 'online', 'no_pin', 'repeat') => ['no_chip'] (confidence: 0.63, lift: 0.97)</br>
('not_fraud', 'online', 'no_pin', 'no_chip') => ['repeat'] (confidence: 0.87, lift: 0.99)</br>
('not_fraud', 'online', 'repeat', 'no_chip') => ['no_pin'] (confidence: 0.88, lift: 0.98)</br>
('not_fraud', 'no_pin', 'repeat', 'no_chip') => ['online'] (confidence: 0.61, lift: 0.94)</br>
('online', 'no_pin', 'repeat', 'no_chip') => ['not_fraud'] (confidence: 0.83, lift: 0.91)</br>
('chip', 'online') => ['repeat', 'no_pin', 'not_fraud'] (confidence: 0.71, lift: 0.99)</br>
('chip', 'no_pin') => ['repeat', 'not_fraud', 'online'] (confidence: 0.51, lift: 1.03)</br>
('chip', 'repeat') => ['not_fraud', 'no_pin', 'online'] (confidence: 0.53, lift: 1.05)</br>
('chip', 'not_fraud', 'online') => ['repeat', 'no_pin'] (confidence: 0.79, lift: 0.99)</br>
('chip', 'not_fraud', 'no_pin') => ['repeat', 'online'] (confidence: 0.55, lift: 0.97)</br>
('chip', 'not_fraud', 'repeat') => ['no_pin', 'online'] (confidence: 0.56, lift: 0.96)</br>
('chip', 'online', 'no_pin') => ['repeat', 'not_fraud'] (confidence: 0.79, lift: 0.98)</br>
('chip', 'online', 'repeat') => ['no_pin', 'not_fraud'] (confidence: 0.81, lift: 0.99)</br>
('chip', 'no_pin', 'repeat') => ['not_fraud', 'online'] (confidence: 0.58, lift: 1.03)</br>
('chip', 'not_fraud', 'online', 'no_pin') => ['repeat'] (confidence: 0.88, lift: 1.00)</br>
('chip', 'not_fraud', 'online', 'repeat') => ['no_pin'] (confidence: 0.89, lift: 0.99)</br>
('chip', 'not_fraud', 'no_pin', 'repeat') => ['online'] (confidence: 0.63, lift: 0.97)</br>
('chip', 'online', 'no_pin', 'repeat') => ['not_fraud'] (confidence: 0.90, lift: 0.98)</br>