## Import-Moudle

In [3]:
from apyori import apriori
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.frequent_patterns import fpgrowth
import pandas as pd
import numpy as np 
import time

## 讀取檔案

In [None]:
df = pd.read_excel(r'交易資料集.xlsx')

# 數量為零或負值的交易代表退貨或註銷，應於前置處理中剔除
df_clear = df.drop(df[df['QUANTITY']<=0].index)

df_clear = df_clear.drop(columns=['ITEM_ID'])
df_clear = df_clear.drop(columns=['ITEM_NO'])

df_clear = df_clear.drop(columns=['CUST_ID'])
df_clear = df_clear.drop(columns=['TRX_DATE'])

# 聚合
df_clear = (df_clear
          .groupby(['INVOICE_NO', 'PRODUCT_TYPE'])['QUANTITY']
          .sum().unstack().reset_index().fillna(0)
          .set_index('INVOICE_NO'))

# 清理只有一個item的交易

# 先挑出滿足條件的行
mask = (df_clear.iloc[:, 1:] > 0).sum(axis=1) == 1
# 刪除滿足條件的行
df_clear = df_clear.drop(df_clear[mask].index)

df_clear.to_csv("pretreatment.csv", encoding="utf-8")
# df_clear

print("欄位" + str(df_clear.columns))
print("數據量" + str(df_clear.shape))

In [2]:
def hot_encode(x):
    if(x<= 0):
        return False
    if(x>= 1):
        return True
    
# Encoding the datasets
df_clear_encoded = df_clear.applymap(hot_encode)

data = df_clear_encoded

# data.columns
data

PRODUCT_TYPE,CHIPSET / ASP,CPU / MPU,DISCRETE,LINEAR IC,LOGIC IC,MEMORY_EMBEDED,MEMORY_SYSTEM,OPTICAL AND SENSOR,OTHERS,PEMCO
INVOICE_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5153,False,True,False,False,False,False,False,False,True,False
216070003,False,False,True,False,True,False,False,False,False,False
216070004,False,False,True,False,True,False,False,False,False,False
216070011,False,False,True,True,True,False,False,False,False,False
216070018,False,False,False,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
W016079115,False,True,False,True,False,False,False,False,False,False
W016079116,False,False,True,True,False,False,False,False,True,False
W016079120,False,False,True,True,False,False,False,False,False,False
W016079121,False,False,True,True,True,True,False,False,False,False


In [3]:
fpg = fpgrowth(data, min_support=0.00395, use_colnames = True)
fpg

Unnamed: 0,support,itemsets
0,0.130384,(OTHERS)
1,0.110700,(CPU / MPU)
2,0.411616,(LOGIC IC)
3,0.406938,(DISCRETE)
4,0.501657,(LINEAR IC)
...,...,...
87,0.004483,"(MEMORY_SYSTEM, LOGIC IC)"
88,0.029234,"(OTHERS, MEMORY_SYSTEM)"
89,0.014617,"(MEMORY_SYSTEM, MEMORY_EMBEDED)"
90,0.004288,"(LINEAR IC, MEMORY_SYSTEM, DISCRETE)"


In [4]:
# 從頻繁項集中生成關聯規則

# confidence 
# metric 感興趣的指標  'support'、'confidence'
# min threshold  該感性指標的最低標準
rules = association_rules(fpg, metric='confidence', min_threshold=0.8)
#rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
rules_df = pd.DataFrame(rules)
rules_df.to_csv("fpg_rules.csv", encoding="utf-8", index=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(OTHERS, DISCRETE, LOGIC IC)",(LINEAR IC),0.019684,0.501657,0.015786,0.80198,1.598664,0.005912,2.516634,0.381997
1,"(DISCRETE, CPU / MPU, LOGIC IC)",(LINEAR IC),0.014812,0.501657,0.012083,0.815789,1.626191,0.004653,2.705293,0.390855
2,"(OTHERS, LINEAR IC, MEMORY_EMBEDED)",(DISCRETE),0.005067,0.406938,0.004288,0.846154,2.079318,0.002226,3.854902,0.521717
3,"(OTHERS, DISCRETE, MEMORY_EMBEDED)",(LINEAR IC),0.004483,0.501657,0.004288,0.956522,1.906726,0.002039,11.461898,0.477682
4,"(LINEAR IC, CPU / MPU, MEMORY_EMBEDED)",(DISCRETE),0.006431,0.406938,0.005652,0.878788,2.159512,0.003035,4.89276,0.540408
5,"(DISCRETE, CPU / MPU, MEMORY_EMBEDED)",(LINEAR IC),0.006431,0.501657,0.005652,0.878788,1.751772,0.002426,4.111333,0.431927
6,"(CPU / MPU, LOGIC IC, MEMORY_EMBEDED)",(DISCRETE),0.005847,0.406938,0.005262,0.9,2.211638,0.002883,5.930618,0.551068
7,"(DISCRETE, CPU / MPU, MEMORY_EMBEDED)",(LOGIC IC),0.006431,0.411616,0.005262,0.818182,1.987732,0.002615,3.236114,0.500131
8,"(CPU / MPU, LOGIC IC, MEMORY_EMBEDED)",(LINEAR IC),0.005847,0.501657,0.004872,0.833333,1.661163,0.001939,2.99006,0.400353
9,"(DISCRETE, CPU / MPU, LOGIC IC, MEMORY_EMBEDED)",(LINEAR IC),0.005262,0.501657,0.004677,0.888889,1.771907,0.002038,4.485091,0.437941


In [5]:
# 現在時間
to = time()
fpg = fpgrowth(data, min_support = 0.00395, use_colnames = True)
rules = association_rules(fpg)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
# 結束時間
time = time() - to
print("花費時間"+ str(time))

花費時間0.037996530532836914


In [None]:
# 定義一個函數，將字符串轉換為列表
def string_to_list(string):
    string = string.replace("frozenset({'", "")
    string = string.replace("'})", "")
    string = string.replace("'", "")
    string = string.split(", ")
    return string
rules_df = pd.read_csv(r'fpg_rules.csv')

user_input = input("請輸入您想購買的產品名稱：")
item = user_input.split(',')
prediction_items = list(map(str, item))

# 集合
item = []
item = set(item)

for index, row in rules_df.iterrows():
    rules = string_to_list(row['antecedents'])
    if set(prediction_items).issubset(rules):
        item.update(string_to_list(row['consequents']))
print("為您推薦其他的產品:" + str(item))

In [None]:
def fpgrowth_test(ms, mt,data): 
    fpg = fpgrowth(data, min_support = ms, use_colnames = True)
    rules = association_rules(fpg, metric='confidence', min_threshold=mt)
    rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
    dd = pd.DataFrame(rules)
    return len(dd)


data = df_clear_encoded
min_support =  np.arange(0.00195, 0.004, 0.00025)
min_confidence = np.arange(0.6, 0.8, 0.05)
times = []
min_support_l = []
min_confidence_l = []
rule = []

for ms in min_support:
    for mt in min_confidence:
        to = time.time()
        e = fpgrowth_test(ms, mt,data)
        time_d = time.time() - to
        min_support_l.append(ms)
        min_confidence_l.append(mt)
        times.append(time_d)
        rule.append(e)
df = {
    "minSupport": min_support_l,
    "minConfidence": min_confidence_l,
    "time(s)": times,
    "count": rule
}
df_a = pd.DataFrame(df)
df_a