In [3]:
#import random
import datetime
import time
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
#import jieba
#import jieba.analyse
#import gensim
#from sklearn.manifold import TSNE

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split

%matplotlib notebook

plt.rcParams['font.sans-serif']=['SimHei'] # chinese

In [7]:
import apriori as ap
import freq_patt_tree as fpt
from fastFM import als
from scipy import sparse

In [None]:
# Reload the local helper modules so that edits to their source files are
# picked up without restarting the kernel.
# The `imp` module is deprecated and was removed in Python 3.12;
# importlib.reload is its replacement. The alias keeps the later
# `imp.reload(...)` cells of this notebook working unchanged.
import importlib as imp
# The FP-tree module is imported as `fpt`; the name `fp` is never bound.
fpt = imp.reload(fpt)
ap = imp.reload(ap)

In [5]:
# Time one Apriori run on the toy basket file, then print itemsets and rules.
# NOTE(review): 2 and 0.8 are presumably the minimum support (an absolute
# count, judging by the "support: 2.000" lines in the output) and the minimum
# confidence -- confirm against apriori.run.
time_s = time.time()
large_itemsets, recomm_rules = ap.run(ap.load_file('../data/sample_simple.csv'), 2, 0.8,
                                      use_fp_tree=False, output_support_only=True)
time_e = time.time()
print('time diff:', time_e - time_s)
ap.dump(large_itemsets, recomm_rules)

time diff: 0.002992868423461914
Itemset: ('mango',) support: 2.000
Itemset: ('chicken',) support: 2.000
Itemset: ('chicken', 'beer') support: 2.000
Itemset: ('apple', 'rice') support: 2.000
Itemset: ('milk', 'rice') support: 2.000
Itemset: ('chicken', 'rice') support: 2.000
Itemset: ('rice', 'apple', 'beer') support: 2.000
Itemset: ('milk', 'rice', 'beer') support: 2.000
Itemset: ('chicken', 'rice', 'beer') support: 2.000
Itemset: ('apple', 'beer') support: 3.000
Itemset: ('milk', 'beer') support: 3.000
Itemset: ('rice',) support: 4.000
Itemset: ('milk',) support: 4.000
Itemset: ('apple',) support: 4.000
Itemset: ('rice', 'beer') support: 4.000
Itemset: ('beer',) support: 6.000
Rule: ('rice',) => ('beer',) confidence: 1.000
Rule: ('chicken',) => ('beer',) confidence: 1.000
Rule: ('chicken',) => ('rice',) confidence: 1.000
Rule: ('apple', 'rice') => ('beer',) confidence: 1.000
Rule: ('rice', 'milk') => ('beer',) confidence: 1.000
Rule: ('chicken',) => ('rice', 'beer') confidence: 1.000


In [9]:
# Time building an FP-tree from the toy basket file and mining its large
# itemsets, then print the itemsets and the tree itself.
# NOTE(review): the trailing 2 is presumably the minimum support count, the
# same threshold used for the Apriori run on this file -- confirm against
# freq_patt_tree.
time_s = time.time()
root, header = fpt.create_tree(fpt.load_file('../data/sample_simple.csv'), 2)
large_itemsets = fpt.compute_large_itemsets(root, header, 2)
time_e = time.time()
print('time diff:', time_e - time_s)
print(large_itemsets)
root.dump()

time diff: 0.002887248992919922
defaultdict(<class 'int'>, {frozenset({'beer'}): 6, frozenset({'rice'}): 4, frozenset({'rice', 'beer'}): 4, frozenset({'apple'}): 4, frozenset({'apple', 'beer'}): 3, frozenset({'apple', 'rice'}): 2, frozenset({'rice', 'apple', 'beer'}): 2, frozenset({'chicken'}): 2, frozenset({'chicken', 'rice'}): 2, frozenset({'chicken', 'beer'}): 2, frozenset({'chicken', 'rice', 'beer'}): 2, frozenset({'mango'}): 2, frozenset({'milk'}): 4, frozenset({'milk', 'beer'}): 3, frozenset({'milk', 'rice'}): 2, frozenset({'milk', 'rice', 'beer'}): 2})
-ROOT:1
--beer:6
---rice:4
----apple:2
-----chicken:1
----milk:2
-----chicken:1
---apple:1
---milk:1
--apple:1
---mango:1
--milk:1
---mango:1


In [None]:
def create_item_set_from_file(fn):
    """Collect the distinct items found in a comma-separated basket file.

    Every line is treated as one basket: trailing whitespace is stripped and
    the remainder is split on ','.

    Args:
        fn: Path of the text file to read.

    Returns:
        set of str: every distinct token that appears in the file.
    """
    itemset = set()
    # The context manager closes the file even when reading raises.
    with open(fn, 'r') as fd:
        for line in fd:
            itemset.update(line.rstrip().split(','))
    return itemset

In [None]:
def create_item_set(df):
    """Collect the distinct non-missing cell values of a DataFrame.

    Args:
        df: DataFrame whose cells hold item names; missing cells (NaN/None)
            are ignored.

    Returns:
        set: every distinct non-missing value found in ``df``.
    """
    itemset = set()
    for index, row in df.iterrows():
        for e in row.tolist():
            # Filter missing values while collecting. Removing np.nan from the
            # set afterwards raises KeyError when nothing is missing and does
            # not match NaN objects other than the np.nan singleton.
            if not pd.isnull(e):
                itemset.add(e)
    return itemset

In [None]:
# NOTE(review): `df` is not created in any visible cell of this notebook, so
# this fails on a fresh kernel -- presumably it was loaded from the sample
# basket file in a cell that has since been removed; confirm and restore.
itemset = create_item_set(df)
itemset

In [None]:
def create_onehot_vec(itemset):
    """Map every element of ``itemset`` to its own one-hot sparse row.

    The elements are first materialised into a list; the element at position
    ``k`` is paired with row ``k`` of an identity matrix of matching size.

    Args:
        itemset: Iterable of hashable keys.

    Returns:
        dict: key -> sparse row taken from the identity matrix.
    """
    keys = list(itemset)
    identity = sparse.eye(len(keys))
    return {key: identity.getrow(pos) for pos, key in enumerate(keys)}

In [None]:
itemvec = create_onehot_vec(itemset)
itemvec

In [None]:
(itemvec["beer"] + itemvec["mango"]).todense()

In [None]:
uservec = create_onehot_vec(df.index)

In [None]:
uservec

In [None]:
def create_user_item_matrix(df, uservec, itemvec, mimic="SVD++"):
    """Stack one sparse feature row per (user, item) pair found in ``df``.

    Every non-missing cell of a DataFrame row is treated as an item of the
    user identified by that row's index label. The feature row of a pair is
    the horizontal concatenation of

    * ``uservec[index]`` and ``itemvec[item]`` when ``mimic == "MF"``;
    * the same two blocks plus the sum of all of the user's item vectors,
      divided by the square root of that sum's total, when
      ``mimic == "SVD++"``.

    Args:
        df: DataFrame with one row per user; cells are item names or missing.
        uservec: Mapping from index label to a sparse row vector.
        itemvec: Mapping from item name to a sparse row vector.
        mimic: ``"SVD++"`` (default) or ``"MF"``.

    Returns:
        The vertically stacked sparse matrix, or ``None`` when ``df`` yields
        no (user, item) pair.

    Raises:
        ValueError: If ``mimic`` is neither ``"SVD++"`` nor ``"MF"``.
        KeyError: If a user or item is absent from ``uservec`` / ``itemvec``.
    """
    if mimic not in ("SVD++", "MF"):
        raise ValueError('mimic must be "SVD++" or "MF", got %r' % (mimic,))

    rows = []
    for index, row in df.iterrows():
        # pd.isnull also catches None and NaN objects that are not the np.nan
        # singleton, which an `is np.nan` identity test lets through.
        item_names = [name for name in row.tolist() if not pd.isnull(name)]
        if not item_names:
            # A user without items contributes no rows (and has no item sum
            # to normalise).
            continue
        vu = uservec[index]
        if mimic == "SVD++":
            vi_sum = itemvec[item_names[0]]
            for name in item_names[1:]:
                vi_sum = vi_sum + itemvec[name]
            # Out-of-place division: with a single item the running sum is the
            # very object stored in itemvec, and an in-place `/=` could write
            # into that shared vector.
            vi_sum = vi_sum / np.sqrt(np.sum(vi_sum))
            rows.extend(sparse.hstack((vu, itemvec[name], vi_sum))
                        for name in item_names)
        else:
            rows.extend(sparse.hstack((vu, itemvec[name]))
                        for name in item_names)
    # One vstack at the end instead of re-stacking the growing matrix per row.
    return sparse.vstack(rows) if rows else None

In [None]:
def create_user_item_matrix_compacted_sid(df, uservec, itemvec, mimic="SVD++"):
    """Stack one sparse feature row per (user, item) pair of a compacted frame.

    Each row of ``df`` describes one user; its ``'sid'`` cell is a
    comma-joined string of item ids. The feature row of a pair is the
    horizontal concatenation of

    * ``uservec[index]`` and ``itemvec[item]`` when ``mimic == "MF"``;
    * the same two blocks plus the sum of all of the user's item vectors,
      divided by the square root of that sum's total, when
      ``mimic == "SVD++"``.

    Args:
        df: DataFrame indexed by user with a string column ``'sid'``.
        uservec: Mapping from index label to a sparse row vector.
        itemvec: Mapping from item id to a sparse row vector.
        mimic: ``"SVD++"`` (default) or ``"MF"``.

    Returns:
        The vertically stacked sparse matrix, or ``None`` when ``df`` has no
        rows.

    Raises:
        ValueError: If ``mimic`` is neither ``"SVD++"`` nor ``"MF"``.
        KeyError: If a user or item is absent from ``uservec`` / ``itemvec``.
    """
    if mimic not in ("SVD++", "MF"):
        raise ValueError('mimic must be "SVD++" or "MF", got %r' % (mimic,))

    rows = []
    for index, row in df.iterrows():
        vu = uservec[index]
        # str.split only yields strings, so no missing-value filtering is
        # needed here.
        item_names = row['sid'].split(',')
        if mimic == "SVD++":
            vi_sum = itemvec[item_names[0]]
            for name in item_names[1:]:
                vi_sum = vi_sum + itemvec[name]
            # Out-of-place division: with a single item the running sum is the
            # very object stored in itemvec, and an in-place `/=` could write
            # into that shared vector.
            vi_sum = vi_sum / np.sqrt(np.sum(vi_sum))
            rows.extend(sparse.hstack((vu, itemvec[name], vi_sum))
                        for name in item_names)
        else:
            rows.extend(sparse.hstack((vu, itemvec[name]))
                        for name in item_names)
    # One vstack at the end instead of re-stacking the growing matrix per row.
    return sparse.vstack(rows) if rows else None

In [None]:
def create_user_item_predict_mat_compacted_sid(uid, df_sid, uservec, itemvec, mimic="SVD++"):
    """Build the feature rows pairing one user with every known item.

    One row is produced per key of ``itemvec`` (NaN keys are skipped), in the
    mapping's iteration order. Each row is the horizontal concatenation of

    * ``uservec[uid]`` and ``itemvec[item]`` when ``mimic == "MF"``;
    * the same two blocks plus the sum of the item vectors listed in
      ``df_sid.loc[uid]['sid']`` (a comma-joined string), divided by the
      square root of that sum's total, when ``mimic == "SVD++"``.

    Args:
        uid: Index label of the user in ``df_sid`` and key in ``uservec``.
        df_sid: DataFrame indexed by user with a string column ``'sid'``;
            only read in ``"SVD++"`` mode.
        uservec: Mapping from user key to a sparse row vector.
        itemvec: Mapping from item id to a sparse row vector.
        mimic: ``"SVD++"`` (default) or ``"MF"``.

    Returns:
        The vertically stacked sparse matrix, or ``None`` when ``itemvec``
        offers no candidate item.

    Raises:
        ValueError: If ``mimic`` is neither ``"SVD++"`` nor ``"MF"``.
        KeyError: If the user or one of their items is unknown.
    """
    if mimic not in ("SVD++", "MF"):
        raise ValueError('mimic must be "SVD++" or "MF", got %r' % (mimic,))

    vu = uservec[uid]
    # Value-based NaN test; an `is np.nan` identity test misses NaN objects
    # other than the singleton.
    candidates = [name for name in itemvec.keys()
                  if not (isinstance(name, float) and np.isnan(name))]

    if mimic == "SVD++":
        history = df_sid.loc[uid]['sid'].split(',')
        vi_sum = itemvec[history[0]]
        for name in history[1:]:
            vi_sum = vi_sum + itemvec[name]
        # Out-of-place division: with a single item the running sum is the
        # very object stored in itemvec, and an in-place `/=` could write
        # into that shared vector.
        vi_sum = vi_sum / np.sqrt(np.sum(vi_sum))
        rows = [sparse.hstack((vu, itemvec[name], vi_sum)) for name in candidates]
    else:
        rows = [sparse.hstack((vu, itemvec[name])) for name in candidates]
    # One vstack at the end instead of re-stacking the growing matrix per row.
    return sparse.vstack(rows) if rows else None

In [None]:
mat = create_user_item_matrix(df, uservec, itemvec, mimic="SVD++")
#mat = create_user_item_matrix(df, uservec, itemvec, mimic="MF")
#mat.todense()

In [None]:
# Every (user, item) row of the design matrix is an observed pair, so all
# targets are 1.0 -- there are no negative examples in this set.
X = mat
y = np.ones(mat.shape[0])
# Convert to CSC before handing the matrix to the splitter and the FM model.
X_ = sparse.csc_matrix(X)
# Fixed seed so that Restart & Run All reproduces the same train/test split.
X_train, X_test, y_train, y_test = train_test_split(X_, y, random_state=42)

In [None]:
# Fit a fastFM factorization-machine regressor (ALS solver) on the training
# split and score the held-out rows.
# NOTE(review): rank is presumably the latent-factor dimension and
# l2_reg_w / l2_reg_V the L2 penalties on the linear and pairwise weights --
# confirm against the fastFM documentation.
fm = als.FMRegression(n_iter=200, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [None]:
y_pred

In [None]:
### apriori/fpgrowth

In [None]:
df_puffer = pd.read_excel('180419mark_satisfied_puffer.xlsx')

In [None]:
dfp = df_puffer.copy()

In [None]:
df1 = dfp[dfp['mark'] == 1].copy()

In [None]:
def print_apriori_rules(df_sidlookup, fn, df_rule=None):
    """Write a human-readable dump of association rules to a text file.

    For every rule, each sid of column 1 is written as
    ``<<< sid:content+slots`` and each sid of column 2 as
    ``>>> sid:content+slots``, followed by one blank line per rule. Sids are
    separated by ``'|'`` inside a cell; empty fragments are skipped.

    Args:
        df_sidlookup: DataFrame indexed by sid with string columns
            ``'content'`` and ``'slots'``. When a sid occurs several times
            its first row is used.
        fn: Path of the text file to (over)write.
        df_rule: DataFrame of rules whose columns ``1`` and ``2`` hold the
            ``'|'``-joined sids. When omitted, the notebook-global
            ``df_rule`` is used, as earlier versions of this function did.

    Raises:
        NameError: If ``df_rule`` is omitted and no global of that name exists.
        KeyError: If a sid is missing from ``df_sidlookup``.
    """
    if df_rule is None:
        try:
            df_rule = globals()['df_rule']
        except KeyError:
            raise NameError("name 'df_rule' is not defined")

    def get_sid_info(df, sid):
        # Selecting with a list always returns a DataFrame. With a scalar,
        # .loc returns a Series for a unique sid, and .iloc[0] would then be
        # a single field rather than the first matching row.
        first = df.loc[[sid]].iloc[0]
        return first['content'] + first['slots']

    # The context manager closes the file even when a lookup raises.
    with open(fn, 'w') as fd:
        for index, row in df_rule.iterrows():
            for marker, col in (("<<< ", 1), (">>> ", 2)):
                for sid in row[col].split('|'):
                    if sid == '':
                        continue
                    fd.write(marker + sid + ":" + get_sid_info(df_sidlookup, sid) + '\n')
            fd.write('\n')

In [None]:
def mark_sid_type(row):
    """Return ``row`` with its sid prefixed by a one-letter source tag.

    The prefix depends on ``row['source_type']``:

    * ``"audio_music"`` -> ``"m"``
    * ``"audio_unicast"`` -> ``"u"``
    * ``"ai.dueros.bot.short_video"`` -> ``"s"``
    * ``"ai.dueros.bot.video_on_demand"`` -> ``"v"``

    Rows with any other source type are returned unchanged.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``'source_type'`` and ``'sid'``; ``'sid'`` must be a string.

    Returns:
        A copy of ``row`` with the prefixed sid, or ``row`` itself when no
        prefix applies.
    """
    prefixes = {
        "audio_music": "m",
        "audio_unicast": "u",
        "ai.dueros.bot.short_video": "s",
        "ai.dueros.bot.video_on_demand": "v",
    }
    prefix = prefixes.get(row['source_type'])
    if prefix is None:
        return row
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    marked = row.copy()
    marked['sid'] = prefix + row['sid']
    return marked

In [None]:
dfs = df1.apply(mark_sid_type, axis=1)

In [None]:
df_sidlookup = dfs[['sid', 'slots', 'content', 'nlu']].set_index('sid')
df_sidlookup.to_excel('out_sid_lookup.xlsx')
df_sidlookup.head()

In [None]:
df1 = df_sidlookup.reset_index()

In [None]:
len(df1['sid'])

In [None]:
# One row per uid; the 'sid' cell becomes the comma-joined string of all of
# that user's (prefixed) sids.
df_sid = dfs[['uid', 'sid']].groupby('uid').agg(lambda x: ','.join(x))

In [None]:
# Write one basket per line, without the uid index or a header row.
# NOTE(review): sep='|' is presumably chosen so that pandas does not quote the
# comma-joined sid field, leaving each line a plain comma-separated basket --
# confirm by inspecting the output file.
df_sid.to_csv('out_uid_sid.csv', index=False, header=None, sep='|')

In [None]:
# NOTE(review): `build_fp_tree` is not defined or imported in any visible cell,
# so this raises NameError on a fresh kernel. Presumably superseded by
# fpt.create_tree(fpt.load_file(...), support), as used on the sample data --
# confirm and update.
tree, header = build_fp_tree('out_uid_sid.csv', support=10)

In [None]:
# NOTE(review): relies on `tree` from the build_fp_tree call; the FP-tree
# built on the sample data is printed with root.dump(), so confirm that a
# `disp` method still exists in freq_patt_tree.
tree.disp(fname='out_fptree.txt', df=df_sidlookup)

In [None]:
# NOTE(review): neither `run_apriori` nor the name `apriori` is defined in any
# visible cell (the module is imported as `ap`), so this raises NameError on a
# fresh kernel. Presumably superseded by ap.run(ap.load_file(...), ...) and
# ap.dump(...) -- confirm how the two CSV outputs should be produced now.
large_itemsets, recomm_rules = run_apriori('out_uid_sid.csv', 0.001, 0.2)
apriori.printResults(large_itemsets, recomm_rules, fn_items='out_large_itemsets.csv', fn_rules='out_recomm_rules.csv')

In [None]:
# Reload the apriori module, then time a run on the exported baskets with
# output_support_only=False and use_fp_tree left at its default.
# NOTE(review): 0.001 and 0.2 are presumably the minimum support and minimum
# confidence -- confirm against apriori.run.
ap = imp.reload(ap)
time_s = time.time()
large_itemsets, recomm_rules = ap.run(ap.load_file('out_uid_sid.csv'), 0.001, 0.2, output_support_only=False)
time_e = time.time()
print('time diff:', time_e - time_s)

In [None]:
# Same timing run on the exported baskets, this time with
# output_support_only=True (use_fp_tree still at its default).
ap = imp.reload(ap)
time_s = time.time()
large_itemsets, recomm_rules = ap.run(ap.load_file('out_uid_sid.csv'), 0.001, 0.2, output_support_only=True)
time_e = time.time()
print('time diff:', time_e - time_s)

In [None]:
# Same timing run on the exported baskets with the FP-tree path switched off
# (use_fp_tree=False) and output_support_only=False.
ap = imp.reload(ap)
time_s = time.time()
large_itemsets, recomm_rules = ap.run(ap.load_file('out_uid_sid.csv'), 0.001, 0.2, use_fp_tree=False, output_support_only=False)
time_e = time.time()
print('time diff:', time_e - time_s)

In [None]:
ap.dump(large_itemsets, recomm_rules)

In [None]:
dfs[dfs['sid'] == 'm1023606622'].head()[['resource', 'query']]

In [None]:
# Load the mined rules without a header row, so columns are labelled 0, 1, 2...
# NOTE(review): no visible cell writes 'out_recomm_rules_20%.csv' (the legacy
# export cell names 'out_recomm_rules.csv') -- presumably the file was renamed
# by hand; confirm where it comes from.
df_rule = pd.read_csv('out_recomm_rules_20%.csv', header=None)

In [None]:
print_apriori_rules(df_sidlookup, 'out_rules_display.txt')

In [None]:
### SVD++

In [None]:
df_sid.head()

In [None]:
itemset = create_item_set_from_file('out_uid_sid.csv')

In [None]:
itemvec = create_onehot_vec(itemset)

In [None]:
len(itemvec)

In [None]:
itemvec

In [None]:
uservec = create_onehot_vec(df_sid.index)

In [None]:
uservec

In [None]:
df_sid.loc['3F18061186542DB1']

In [None]:
mat = create_user_item_matrix_compacted_sid(df_sid, uservec, itemvec, mimic="SVD++")

In [None]:
itemvec['u57751445013']

In [None]:
# Every (user, item) row of the design matrix is an observed pair, so all
# targets are 1.0 -- there are no negative examples in this set.
X = mat
y = np.ones(mat.shape[0])
# Convert to CSC before handing the matrix to the splitter and the FM model.
X_ = sparse.csc_matrix(X)
# Fixed seed so that Restart & Run All reproduces the same train/test split.
X_train, X_test, y_train, y_test = train_test_split(X_, y, random_state=42)

In [None]:
# Fit a fastFM factorization-machine regressor (ALS solver) on the training
# split and score the held-out rows; rank is raised to 50 here, against 2 for
# the toy data.
# NOTE(review): rank is presumably the latent-factor dimension and
# l2_reg_w / l2_reg_V the L2 penalties on the linear and pairwise weights --
# confirm against the fastFM documentation.
fm = als.FMRegression(n_iter=200, init_stdev=0.1, rank=50, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [None]:
y_pred

In [None]:
X_eval = create_user_item_predict_mat_compacted_sid('3F180611863094C1', df_sid, uservec, itemvec, mimic="SVD++")

In [None]:
y_eval = fm.predict(X_eval)

In [None]:
y_eval[:200]

In [None]:
###