In [1]:
from apyori import apriori
import pandas as pd
import numpy as np 

df = pd.read_excel(r'交易資料集.xlsx')

#數量為零或負值的交易代表退貨或註銷在前置處理中刪除
df_clear = df.drop(df[df['QUANTITY']<=0].index)

df_clear = df_clear.drop(columns=['ITEM_ID'])
df_clear = df_clear.drop(columns=['ITEM_NO'])
#df_clear = df_clear.drop(columns=['PRODUCT_TYPE'])
df_clear = df_clear.drop(columns=['CUST_ID'])
df_clear = df_clear.drop(columns=['TRX_DATE'])

df_clear = (df_clear
          .groupby(['INVOICE_NO', 'PRODUCT_TYPE'])['QUANTITY']
          .sum().unstack().reset_index().fillna(0)
          .set_index('INVOICE_NO'))

#清理只有一個item的交易

# 先挑出滿足條件的行
mask = (df_clear.iloc[:, 1:] > 0).sum(axis=1) == 1
# 刪除滿足條件的行
df_clear = df_clear.drop(df_clear[mask].index)

df_clear.to_csv("pretreatment.csv", encoding="utf-8")


print("欄位" + str(df_clear.columns))
print("數據量" + str(df_clear.shape))


欄位Index(['CHIPSET / ASP', 'CPU / MPU', 'DISCRETE', 'LINEAR IC', 'LOGIC IC',
       'MEMORY_EMBEDED', 'MEMORY_SYSTEM', 'OPTICAL AND SENSOR', 'OTHERS',
       'PEMCO'],
      dtype='object', name='PRODUCT_TYPE')
數據量(5131, 10)


In [2]:
def hot_encode(x):
    if(x<= 0):
        return False
    if(x>= 1):
        return True
    
# Encoding the datasets
df_clear_encoded = df_clear.applymap(hot_encode)

data = df_clear_encoded

#data.columns
data

PRODUCT_TYPE,CHIPSET / ASP,CPU / MPU,DISCRETE,LINEAR IC,LOGIC IC,MEMORY_EMBEDED,MEMORY_SYSTEM,OPTICAL AND SENSOR,OTHERS,PEMCO
INVOICE_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5153,False,True,False,False,False,False,False,False,True,False
216070003,False,False,True,False,True,False,False,False,False,False
216070004,False,False,True,False,True,False,False,False,False,False
216070011,False,False,True,True,True,False,False,False,False,False
216070018,False,False,False,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
W016079115,False,True,False,True,False,False,False,False,False,False
W016079116,False,False,True,True,False,False,False,False,True,False
W016079120,False,False,True,True,False,False,False,False,False,False
W016079121,False,False,True,True,True,True,False,False,False,False


In [3]:
from mlxtend.frequent_patterns import apriori, association_rules
#min_support最低支持度
#use_colnames轉換對應的物品(itemsets)

frq_items = apriori(data, min_support = 0.00395, use_colnames = True)
frq_items


Unnamed: 0,support,itemsets
0,0.256090,(CHIPSET / ASP)
1,0.110700,(CPU / MPU)
2,0.406938,(DISCRETE)
3,0.501657,(LINEAR IC)
4,0.411616,(LOGIC IC)
...,...,...
87,0.007211,"(PEMCO, DISCRETE, LOGIC IC, LINEAR IC)"
88,0.004288,"(DISCRETE, MEMORY_EMBEDED, LINEAR IC, OTHERS)"
89,0.004093,"(DISCRETE, OPTICAL AND SENSOR, LINEAR IC, OTHERS)"
90,0.004288,"(DISCRETE, OPTICAL AND SENSOR, LOGIC IC, OTHERS)"


In [4]:

#找規則
#confidence 
#metric感興趣的指標'support'、'confidence'
#min threshold感性指標的最低標準
rules = association_rules(frq_items, metric='confidence', min_threshold=0.8)
rules_df = pd.DataFrame(rules)
rules_df.to_csv("apriori_rules.csv", encoding="utf-8", index=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(OPTICAL AND SENSOR),(DISCRETE),0.049113,0.406938,0.039563,0.805556,1.979552,0.019577,3.050032,0.520394
1,"(DISCRETE, CHIPSET / ASP)",(LINEAR IC),0.008575,0.501657,0.007601,0.886364,1.766873,0.003299,4.385422,0.437783
2,"(LOGIC IC, CHIPSET / ASP)",(LINEAR IC),0.007991,0.501657,0.006626,0.829268,1.65306,0.002618,2.918869,0.398243
3,"(MEMORY_SYSTEM, LINEAR IC)",(DISCRETE),0.005262,0.406938,0.004288,0.814815,2.002306,0.002146,3.202534,0.503224
4,"(LOGIC IC, MEMORY_SYSTEM)",(DISCRETE),0.004483,0.406938,0.004288,0.956522,2.350533,0.002464,13.640421,0.577152
5,"(OPTICAL AND SENSOR, LOGIC IC)",(DISCRETE),0.017346,0.406938,0.015592,0.898876,2.208877,0.008533,5.864722,0.556942
6,"(PEMCO, LOGIC IC)",(DISCRETE),0.012278,0.406938,0.010134,0.825397,2.02831,0.005138,3.396627,0.513281
7,"(DISCRETE, LOGIC IC, CHIPSET / ASP)",(LINEAR IC),0.005262,0.501657,0.004872,0.925926,1.845737,0.002233,6.727636,0.460635
8,"(DISCRETE, LOGIC IC, CPU / MPU)",(LINEAR IC),0.014812,0.501657,0.012083,0.815789,1.626191,0.004653,2.705293,0.390855
9,"(DISCRETE, MEMORY_EMBEDED, CPU / MPU)",(LINEAR IC),0.006431,0.501657,0.005652,0.878788,1.751772,0.002426,4.111333,0.431927


In [5]:
from time import time

#現在時間
to = time()
frq_items = apriori(data, min_support = 0.00395, use_colnames = True)
rules = association_rules(frq_items)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
#結束時間
time = time() - to
print("花費時間"+ str(time))

花費時間0.01499629020690918


In [6]:
# 定義一個函數，將字符串轉換為列表
def string_to_list(string):
    string = string.replace("frozenset({'", "")
    string = string.replace("'})", "")
    string = string.replace("'", "")
    string = string.split(", ")
    return string

rules_df = pd.read_csv(r'apriori_rules.csv')
user_input = input("請輸入您想購買的產品名稱：")
item = user_input.split(',')
prediction_items = list(map(str, item))

#集合
item = []
item = set(item)

for index, row in rules_df.iterrows():
    #處理規則(轉成list)
    rules = string_to_list(row['antecedents'])
    if set(prediction_items).issubset(rules):
        item.update(string_to_list(row['consequents']))
print("為您推薦其他的產品:" + str(item))

請輸入您想購買的產品名稱：DISCRETE
為您推薦其他的產品:{'LOGIC IC', 'LINEAR IC'}


In [7]:
import time
def apriori_test(ms, mt,data): 
   
    frq_items = apriori(data, min_support = ms, use_colnames = True)
    rules = association_rules(frq_items, metric='confidence', min_threshold=mt)
    return rules


data = df_clear_encoded
# print(data)
min_support =  np.arange(0.00195, 0.004, 0.00025)
min_confidence = np.arange(0.6, 0.8, 0.05)
times = []
min_support_l = []
min_confidence_l = []
rule = []

for ms in min_support:
    for mt in min_confidence:
        #現在時間
        to = time.time()
        rules = apriori_test(ms, mt,data)
        #結束時間
#         print(e)
        time_d = time.time() - to
        dd = pd.DataFrame(rules)
        e = len(dd)
        min_support_l.append(ms)
        min_confidence_l.append(mt)
        times.append(time_d)
        rule.append(e)
df = {
    "minSupport": min_support_l,
    "minConfidence": min_confidence_l,
    "time(s)": times,
    "count": rule
}
df = pd.DataFrame(df)
df        

Unnamed: 0,minSupport,minConfidence,time(s),count
0,0.00195,0.6,0.016002,140
1,0.00195,0.65,0.013028,124
2,0.00195,0.7,0.012998,106
3,0.00195,0.75,0.012995,81
4,0.00195,0.8,0.012007,57
5,0.0022,0.6,0.012001,127
6,0.0022,0.65,0.013,113
7,0.0022,0.7,0.011993,97
8,0.0022,0.75,0.011006,72
9,0.0022,0.8,0.012971,51


In [8]:
import time
from mlxtend.frequent_patterns import fpgrowth
def fpgrowth_test(ms, mt,data): 
    #現在時間
    #to = time()
    fpg = fpgrowth(data, min_support = ms, use_colnames = True)
    rules = association_rules(fpg, metric='confidence', min_threshold=mt)
    return rules


data = df_clear_encoded
#print(data)
min_support =  np.arange(0.00195, 0.004, 0.00025)
min_confidence = np.arange(0.6, 0.8, 0.05)
times = []
min_support_l = []
min_confidence_l = []
rule = []

for ms in min_support:
    for mt in min_confidence:
        #現在時間
        to = time.time()
        rules = fpgrowth_test(ms, mt,data)
        #結束時間
#        print(e)
        time_d = time.time() - to
        dd = pd.DataFrame(rules)
        e = len(dd)
        min_support_l.append(ms)
        min_confidence_l.append(mt)
        times.append(time_d)
        rule.append(e)
df = {
    "minSupport": min_support_l,
    "minConfidence": min_confidence_l,
    "time(s)": times,
    "count": rule
}
df_a = pd.DataFrame(df)
df_a


Unnamed: 0,minSupport,minConfidence,time(s),count
0,0.00195,0.6,0.043996,140
1,0.00195,0.65,0.040001,124
2,0.00195,0.7,0.034,106
3,0.00195,0.75,0.046001,81
4,0.00195,0.8,0.037999,57
5,0.0022,0.6,0.035026,127
6,0.0022,0.65,0.034997,113
7,0.0022,0.7,0.032979,97
8,0.0022,0.75,0.033027,72
9,0.0022,0.8,0.03297,51
