In [80]:
#import random
import datetime
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
#import jieba
#import jieba.analyse
#import gensim
#from sklearn.manifold import TSNE

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `model_selection` is its replacement. Keep the old path as a fallback so
# the notebook still runs on the environment it was originally written for.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib notebook

plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese text can be rendered in figures

In [78]:
import apriori as ap
import fpgrowth as fpg
from fastFM import als
from scipy import sparse

In [79]:
# Re-import the local `fpgrowth` / `apriori` helper modules so that edits made to
# them are picked up without restarting the kernel.
# `imp` is deprecated since Python 3.4 and removed in 3.12; `importlib.reload`
# is the supported replacement and has the same return value (the module).
import importlib
fpg = importlib.reload(fpg)
ap = importlib.reload(ap)

In [4]:
def build_fp_tree(file, support=2):
    """Load transactions from ``file`` and build an FP-tree over them.

    Args:
        file: Path handed unchanged to ``fpg.loadData``.
        support: Forwarded to ``fpg.createTree`` as ``minSup``
            (presumably an absolute occurrence count -- confirm in fpgrowth).

    Returns:
        tuple: ``(tree, header)`` exactly as produced by ``fpg.createTree``.
    """
    transactions = fpg.loadData(file)
    fp_tree, header_table = fpg.createTree(transactions, minSup=support)
    return fp_tree, header_table

In [101]:
# Run apriori on the toy basket file and print the itemsets / rules it finds.
# NOTE(review): 0.2 and 0.8 are presumably the minimum support and minimum
# confidence thresholds -- confirm against the signature of apriori.run.
sample_transactions = ap.load_file('../data/sample_simple.csv')
large_itemsets, recomm_rules = ap.run(sample_transactions, 0.2, 0.8)
ap.dump(large_itemsets, recomm_rules)

Itemset: ('chicken',) support: 0.250
Itemset: ('mango',) support: 0.250
Itemset: ('rice', 'apple') support: 0.250
Itemset: ('rice', 'chicken') support: 0.250
Itemset: ('chicken', 'beer') support: 0.250
Itemset: ('milk', 'rice') support: 0.250
Itemset: ('milk', 'rice', 'beer') support: 0.250
Itemset: ('rice', 'chicken', 'beer') support: 0.250
Itemset: ('rice', 'apple', 'beer') support: 0.250
Itemset: ('milk', 'beer') support: 0.375
Itemset: ('apple', 'beer') support: 0.375
Itemset: ('apple',) support: 0.500
Itemset: ('milk',) support: 0.500
Itemset: ('rice',) support: 0.500
Itemset: ('rice', 'beer') support: 0.500
Itemset: ('beer',) support: 0.750
Rule: ('chicken',) => ('rice',) support: 1.000
Rule: ('chicken',) => ('beer',) support: 1.000
Rule: ('rice',) => ('beer',) support: 1.000
Rule: ('milk', 'rice') => ('beer',) support: 1.000
Rule: ('chicken',) => ('rice', 'beer') support: 1.000
Rule: ('rice', 'chicken') => ('beer',) support: 1.000
Rule: ('chicken', 'beer') => ('rice',) support: 

In [None]:
tree, header = build_fp_tree('../data/sample_simple.csv')

In [None]:
tree.disp(fname='out_fptree.txt')

In [None]:
df = pd.read_csv('../data/sample_simple.csv', header=None)
df

In [8]:
def create_item_set_from_file(fn):
    """Collect the distinct comma-separated tokens found in a text file.

    Each line has its trailing whitespace stripped and is then split on ``','``;
    every resulting token is added to the result set.

    Args:
        fn: Path of the file to read (text mode, platform default encoding).

    Returns:
        set of str: All distinct tokens. Note that a blank line contributes the
        empty string ``''`` because ``''.split(',')`` is ``['']``.
    """
    itemset = set()
    # `with` closes the handle even if reading fails part-way through; the
    # previous open()/close() pair leaked it in that case.
    with open(fn, 'r') as fd:
        for line in fd:
            itemset.update(line.rstrip().split(','))
    return itemset

In [9]:
def create_item_set(df):
    """Collect the distinct non-missing cell values of a DataFrame.

    Args:
        df: DataFrame whose cells are item identifiers; rows may be padded
            with missing values when baskets have different lengths.

    Returns:
        set: Every distinct cell value for which ``pd.isnull`` is false
        (so NaN, and also ``None``/``NaT``, are left out).
    """
    itemset = set()
    for _, row in df.iterrows():
        # Drop missing cells while collecting instead of calling
        # itemset.remove(np.nan) afterwards: remove() raises KeyError when the
        # frame contains no NaN at all, and set membership only matches a NaN
        # that is the very same object as np.nan.
        itemset.update(cell for cell in row.tolist() if not pd.isnull(cell))
    return itemset

In [None]:
itemset = create_item_set(df)
itemset

In [10]:
def create_onehot_vec(itemset):
    """Map every element of ``itemset`` to a 1 x N one-hot sparse row vector.

    Args:
        itemset: Iterable of hashable labels (a set of item ids, a pandas
            index of user ids, ...). N is the number of elements it yields.

    Returns:
        dict: ``label -> 1 x N CSR row`` with a single 1.0 in the column equal
        to the label's position in ``list(itemset)``. For a ``set`` that
        position follows the set's iteration order, so the column assignment
        is not guaranteed to be the same across interpreter runs. If a label
        occurs more than once, its last position wins.
    """
    items = list(itemset)
    # Build the identity in CSR once. eye() defaults to DIA format, which is
    # not meant for row access, so calling getrow() on it N times was
    # needlessly expensive; CSR supports efficient row slicing.
    identity = sparse.eye(len(items), format='csr')
    return {item: identity.getrow(pos) for pos, item in enumerate(items)}

In [None]:
itemvec = create_onehot_vec(itemset)
itemvec

In [None]:
(itemvec["beer"] + itemvec["mango"]).todense()

In [None]:
uservec = create_onehot_vec(df.index)

In [None]:
uservec

In [6]:
def create_user_item_matrix(df, uservec, itemvec, mimic="SVD++"):
    """Build a sparse design matrix with one row per (user, item) cell of ``df``.

    Every row of ``df`` is treated as one user (keyed by its index label) and
    every non-missing cell as an item belonging to that user. Each output row
    is the horizontal concatenation of:

    * ``mimic="MF"``:    ``[user one-hot | item one-hot]``
    * ``mimic="SVD++"``: ``[user one-hot | item one-hot | implicit block]``,
      where the implicit block is the sum of all of the user's item one-hots
      divided by the square root of that sum's total (items repeated in a
      row are counted each time).

    Args:
        df: DataFrame of item labels; missing cells are skipped.
        uservec: Mapping ``df`` index label -> 1 x U sparse row.
        itemvec: Mapping item label -> 1 x I sparse row.
        mimic: ``"SVD++"`` or ``"MF"``.

    Returns:
        A scipy sparse matrix with one row per non-missing cell, in row-major
        order of ``df``; ``None`` if there is no such cell.

    Raises:
        ValueError: If ``mimic`` is not one of the supported layouts.
        KeyError: If a user or item label is missing from its mapping.
    """
    if mimic not in ("SVD++", "MF"):
        # Previously an unknown value fell through both branches and failed
        # later on an unbound local.
        raise ValueError("mimic must be 'SVD++' or 'MF', got %r" % (mimic,))

    blocks = []
    for index, row in df.iterrows():
        vu = uservec[index]
        # pd.isnull catches every NaN; `is np.nan` only matched one object.
        item_names = [name for name in row.tolist() if not pd.isnull(name)]
        if not item_names:
            # Nothing to emit for this user (and nothing to normalise).
            continue

        vi_sum = None
        if mimic == "SVD++":
            for item_name in item_names:
                vi = itemvec[item_name]
                vi_sum = vi if vi_sum is None else vi_sum + vi
            # Divide out of place: for a single-item row `vi_sum` IS the
            # object stored in `itemvec`, which must never be modified.
            vi_sum = vi_sum / np.sqrt(vi_sum.sum())

        for item_name in item_names:
            vi = itemvec[item_name]
            if mimic == "SVD++":
                blocks.append(sparse.hstack((vu, vi, vi_sum)))
            else:
                blocks.append(sparse.hstack((vu, vi)))

    if not blocks:
        return None
    # One vstack over all rows instead of re-stacking the growing matrix for
    # every new row, which copied the whole matrix each time (quadratic).
    return sparse.vstack(blocks)

In [7]:
def create_user_item_matrix_compacted_sid(df, uservec, itemvec, mimic="SVD++"):
    """Build a sparse design matrix from a frame holding comma-joined item ids.

    Each row of ``df`` is one user (keyed by its index label); its ``'sid'``
    column is a single string of item ids separated by ``','``. One output
    row is produced per id in that string, laid out as:

    * ``mimic="MF"``:    ``[user one-hot | item one-hot]``
    * ``mimic="SVD++"``: ``[user one-hot | item one-hot | implicit block]``,
      where the implicit block is the sum of the user's item one-hots divided
      by the square root of that sum's total (repeated ids count each time).

    Args:
        df: DataFrame with a string column ``'sid'``.
        uservec: Mapping ``df`` index label -> 1 x U sparse row.
        itemvec: Mapping item id -> 1 x I sparse row.
        mimic: ``"SVD++"`` or ``"MF"``.

    Returns:
        A scipy sparse matrix with one row per listed id, users in ``df``
        order; ``None`` if ``df`` has no rows.

    Raises:
        ValueError: If ``mimic`` is not one of the supported layouts.
        KeyError: If a user or item id is missing from its mapping.
    """
    if mimic not in ("SVD++", "MF"):
        # Previously an unknown value fell through both branches and failed
        # later on an unbound local.
        raise ValueError("mimic must be 'SVD++' or 'MF', got %r" % (mimic,))

    blocks = []
    for index, row in df.iterrows():
        vu = uservec[index]
        # str.split only yields strings, so the former `is np.nan` test on
        # each token could never fire and has been dropped.
        item_names = row['sid'].split(',')

        vi_sum = None
        if mimic == "SVD++":
            for item_name in item_names:
                vi = itemvec[item_name]
                vi_sum = vi if vi_sum is None else vi_sum + vi
            # Divide out of place: for a single-item user `vi_sum` IS the
            # object stored in `itemvec`, which must never be modified.
            vi_sum = vi_sum / np.sqrt(vi_sum.sum())

        for item_name in item_names:
            vi = itemvec[item_name]
            if mimic == "SVD++":
                blocks.append(sparse.hstack((vu, vi, vi_sum)))
            else:
                blocks.append(sparse.hstack((vu, vi)))

    if not blocks:
        return None
    # One vstack over all rows instead of re-stacking the growing matrix for
    # every new row, which copied the whole matrix each time (quadratic).
    return sparse.vstack(blocks)

In [37]:
def create_user_item_predict_mat_compacted_sid(uid, df_sid, uservec, itemvec, mimic="SVD++"):
    """Build the scoring matrix pairing one user with every known item.

    Row k pairs ``uid`` with the k-th key of ``itemvec`` (dict iteration
    order), so predictions made on the result line up with
    ``list(itemvec)``. Row layout:

    * ``mimic="MF"``:    ``[user one-hot | item one-hot]``
    * ``mimic="SVD++"``: ``[user one-hot | item one-hot | implicit block]``,
      where the implicit block is built from the comma-joined ids in
      ``df_sid.loc[uid, 'sid']``: the sum of their one-hots divided by the
      square root of that sum's total.

    Args:
        uid: Index label of the user in ``df_sid`` / key in ``uservec``.
        df_sid: DataFrame indexed by user id with a string column ``'sid'``
            (only read when ``mimic="SVD++"``).
        uservec: Mapping user id -> 1 x U sparse row.
        itemvec: Mapping item id -> 1 x I sparse row.
        mimic: ``"SVD++"`` or ``"MF"``.

    Returns:
        A scipy sparse matrix with one row per (non-NaN) key of ``itemvec``;
        ``None`` if ``itemvec`` is empty.

    Raises:
        ValueError: If ``mimic`` is not one of the supported layouts.
        KeyError: If ``uid`` or one of the user's item ids is unknown.
    """
    if mimic not in ("SVD++", "MF"):
        # Previously an unknown value fell through both branches and failed
        # later on an unbound local.
        raise ValueError("mimic must be 'SVD++' or 'MF', got %r" % (mimic,))

    vu = uservec[uid]

    vi_sum = None
    if mimic == "SVD++":
        for item_name in df_sid.loc[uid, 'sid'].split(','):
            vi = itemvec[item_name]
            vi_sum = vi if vi_sum is None else vi_sum + vi
        # Divide out of place: for a single-item user `vi_sum` IS the object
        # stored in `itemvec`, which must never be modified.
        vi_sum = vi_sum / np.sqrt(vi_sum.sum())

    blocks = []
    for item_name, vi in itemvec.items():
        # Skip a NaN key by value (NaN != NaN) rather than by identity.
        if isinstance(item_name, float) and item_name != item_name:
            continue
        if mimic == "SVD++":
            blocks.append(sparse.hstack((vu, vi, vi_sum)))
        else:
            blocks.append(sparse.hstack((vu, vi)))

    if not blocks:
        return None
    # One vstack over all candidate rows; re-stacking the growing matrix per
    # item copied it every time, which is quadratic in the catalogue size.
    return sparse.vstack(blocks)

In [None]:
mat = create_user_item_matrix(df, uservec, itemvec, mimic="SVD++")
#mat = create_user_item_matrix(df, uservec, itemvec, mimic="MF")
#mat.todense()

In [None]:
# Assemble features/targets for the factorization machine on the sample data.
X = mat
# Every row of `mat` is an observed (user, item) pair, so every target is 1.0.
# NOTE(review): with no negative examples a regressor can score everything
# close to 1; predictions from this setup do not rank items.
y = np.ones(mat.shape[0])
# Stacking yields a non-CSC sparse matrix; convert before fitting
# (presumably because the fastFM solver wants CSC input -- confirm).
X_ = sparse.csc_matrix(X)
# Fixed seed so Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X_, y, random_state=42)

In [None]:
# Fit a fastFM ALS factorization-machine regressor on the training split and
# score the held-out rows.
fm = als.FMRegression(
    n_iter=200,
    init_stdev=0.1,
    rank=2,
    l2_reg_w=0.1,
    l2_reg_V=0.5,
)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [None]:
y_pred

In [None]:
### Association-rule mining on the per-user sid baskets (Apriori / FP-growth)

In [13]:
df_puffer = pd.read_excel('180419mark_satisfied_puffer.xlsx')

In [14]:
dfp = df_puffer.copy()

In [15]:
df1 = dfp[dfp['mark'] == 1].copy()

In [11]:
def print_apriori_rules(df_sidlookup, fn, rules=None):
    """Write a human-readable dump of association rules to a text file.

    For every rule row, each ``'|'``-separated sid in column ``1`` is written
    as ``<<< sid:<content><slots>`` and each sid in column ``2`` as
    ``>>> sid:<content><slots>``; a blank line separates rules. (Columns 1
    and 2 are presumably the rule's antecedent and consequent -- confirm
    against the file the rules were loaded from.)

    Args:
        df_sidlookup: DataFrame indexed by sid with string columns
            ``'content'`` and ``'slots'``. A sid may occur several times; the
            first matching row is used.
        fn: Path of the text file to (over)write.
        rules: DataFrame of rules with integer column labels. When omitted,
            the notebook-level ``df_rule`` is used, which keeps the previous
            two-argument call working.
    """
    if rules is None:
        # Backward compatibility with the original implicit global.
        rules = df_rule

    def get_sid_info(df, sid):
        # The list selector `[[sid]]` always returns a DataFrame, so iloc[0]
        # is the first matching ROW whether the sid occurs once or many
        # times. `df.loc[sid]` returns a Series for a unique sid; iloc[0] on
        # that is a single cell and the ['content'] lookup then fails.
        first = df.loc[[sid]].iloc[0]
        return first['content'] + first['slots']

    # `with` closes the file even if a lookup raises part-way through.
    with open(fn, 'w') as fd:
        for _, row in rules.iterrows():
            for marker, column in (("<<< ", 1), (">>> ", 2)):
                for sid in row[column].split('|'):
                    if sid == '':
                        # Leading/trailing/double separators give empty tokens.
                        continue
                    fd.write(marker + sid + ":" + get_sid_info(df_sidlookup, sid) + '\n')
            fd.write('\n')

In [47]:
def mark_sid_type(row):
    """Prefix ``row['sid']`` with a one-letter tag chosen by ``row['source_type']``.

    Tags: ``audio_music`` -> ``m``, ``audio_unicast`` -> ``u``,
    ``ai.dueros.bot.short_video`` -> ``s``,
    ``ai.dueros.bot.video_on_demand`` -> ``v``. Any other source type leaves
    the sid unchanged.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``'source_type'``
            and ``'sid'`` entries. It is updated in place.

    Returns:
        The same ``row`` object.
    """
    tag_by_source = (
        ("audio_music", "m"),
        ("audio_unicast", "u"),
        ("ai.dueros.bot.short_video", "s"),
        ("ai.dueros.bot.video_on_demand", "v"),
    )
    source = row['source_type']
    raw_sid = row['sid']
    for known_source, tag in tag_by_source:
        if source == known_source:
            row['sid'] = tag + raw_sid
            break
    return row

In [16]:
dfs = df1.apply(mark_sid_type, axis=1)

In [17]:
df_sidlookup = dfs[['sid', 'slots', 'content', 'nlu']].set_index('sid')
df_sidlookup.to_excel('out_sid_lookup.xlsx')
df_sidlookup.head()

Unnamed: 0_level_0,slots,content,nlu
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
m1138580000,"{""hmm_song"": ""温柔"", ""unit"": ""歌曲"", ""song"": ""温柔""}","{""content"": ""五月天，温柔"", ""type"": ""Text"", ""_ttsOpt...","{""domain"": ""audio.music"", ""intent"": ""audio.mus..."
m1110896524,"{""hmm_song"": ""凉凉"", ""unit"": ""歌曲"", ""song"": ""凉凉""}","{""content"": ""玄觞，凉凉"", ""type"": ""Text"", ""_ttsOpti...","{""domain"": ""audio.music"", ""intent"": ""audio.mus..."
s498542084515321457,"{""status"": ""0"", ""type"": ""相声"", ""keyword"": ""郭德纲""}","{""content"": """", ""type"": ""Text""}","{""domain"": ""short_video"", ""intent"": ""ai.dueros..."
u57729179189,"{""program_name"": ""波波放那克"", ""keyword_1"": ""那克"", ""...","{""content"": ""暂时没有此资源，为你推荐，五岁萌娃脱口秀%20duang~绝不加特...","{""domain"": ""audio.unicast"", ""intent"": ""audio.u..."
m964804248,"{""unit"": ""歌曲""}","{""content"": ""深夜时候适合听听独处的音乐，推荐给你，宋冬野，安和桥"", ""typ...","{""domain"": ""audio.music"", ""intent"": ""audio.mus..."


In [18]:
df1 = df_sidlookup.reset_index()

In [19]:
len(df1['sid'])

42180

In [20]:
df_sid = dfs[['uid', 'sid']].groupby('uid').agg(lambda x: ','.join(x))

In [None]:
df_sid.to_csv('out_uid_sid.csv', index=False, header=None, sep='|')

In [None]:
tree, header = build_fp_tree('out_uid_sid.csv', support=10)

In [None]:
tree.disp(fname='out_fptree.txt', df=df_sidlookup)

In [None]:
# Mine the per-user sid baskets written to 'out_uid_sid.csv'.
# The helper module is imported as `ap` (`import apriori as ap`); the names
# `run_apriori` and `apriori` used here before are never bound in this
# notebook, so the cell raised NameError on a fresh kernel. The call now has
# the same shape as the sample-data cell: ap.run(ap.load_file(path), ...).
large_itemsets, recomm_rules = ap.run(ap.load_file('out_uid_sid.csv'), 0.001, 0.2)
# NOTE(review): assumes the module exposes printResults(..., fn_items=, fn_rules=);
# only ap.dump(...) is exercised elsewhere in this notebook -- confirm.
# NOTE(review): the rules are read back from 'out_recomm_rules_20%.csv' further
# down, which is not the file name written here.
ap.printResults(large_itemsets, recomm_rules, fn_items='out_large_itemsets.csv', fn_rules='out_recomm_rules.csv')

In [None]:
dfs[dfs['sid'] == 'm1023606622'].head()[['resource', 'query']]

In [None]:
df_rule = pd.read_csv('out_recomm_rules_20%.csv', header=None)

In [None]:
print_apriori_rules(df_sidlookup, 'out_rules_display.txt')

In [None]:
### SVD++-style factorization machine on the per-user sid data

In [21]:
df_sid.head()

Unnamed: 0_level_0,sid
uid,Unnamed: 1_level_1
3F180611863094C1,"s8563948906666203907,s15703777828732809183,u57..."
3F180611863465A0,"m993189565,m1019301623,m1131777838,m1017824831..."
3F180611863C5F1A,"m1012282237,s11584181565836183544,m1093750375,..."
3F180611863E1288,"m1138580000,m1110896524,s498542084515321457,u5..."
3F180611864E141B,"s3019915722752617984,s3019915722752617984,m101..."


In [22]:
itemset = create_item_set_from_file('out_uid_sid.csv')

In [23]:
itemvec = create_onehot_vec(itemset)

In [24]:
len(itemvec)

14429

In [25]:
itemvec

{'s15232494086522288041': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 's18198521607573341674': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'u57716089739': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'm995170077': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 's17248721292431073123': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 's4820830559391402438': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'u56509278399': <1x14429 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'm1037927150': <1x14429 sparse matr

In [26]:
uservec = create_onehot_vec(df_sid.index)

In [27]:
uservec

{'3F180611863094C1': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F180611863465A0': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F180611863C5F1A': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F180611863E1288': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F180611864E141B': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F18061186542DB1': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F1806118656072B': <1x3113 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 '3F180611865841F3': <1x3113 sparse matrix of ty

In [28]:
df_sid.loc['3F18061186542DB1']

sid    m1110896524,m1322431569,m1028177873,s501599680...
Name: 3F18061186542DB1, dtype: object

In [29]:
mat = create_user_item_matrix_compacted_sid(df_sid, uservec, itemvec, mimic="SVD++")

In [None]:
itemvec['u57751445013']

In [30]:
# Assemble features/targets for the factorization machine on the sid data.
X = mat
# Every row of `mat` is an observed (user, item) pair, so every target is 1.0.
# NOTE(review): with no negative examples a regressor can score everything
# close to 1 (as the predictions printed in this notebook show), so these
# scores do not rank items.
y = np.ones(mat.shape[0])
# Stacking yields a non-CSC sparse matrix; convert before fitting
# (presumably because the fastFM solver wants CSC input -- confirm).
X_ = sparse.csc_matrix(X)
# Fixed seed so Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X_, y, random_state=42)

In [41]:
# Fit a fastFM ALS factorization-machine regressor (rank 50 here) on the
# training split and score the held-out rows.
fm = als.FMRegression(
    n_iter=200,
    init_stdev=0.1,
    rank=50,
    l2_reg_w=0.1,
    l2_reg_V=0.5,
)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [32]:
y_pred

array([0.99991711, 0.99934209, 0.99962985, ..., 1.00008228, 0.99996201,
       0.99999391])

In [38]:
X_eval = create_user_item_predict_mat_compacted_sid('3F180611863094C1', df_sid, uservec, itemvec, mimic="SVD++")

In [42]:
y_eval = fm.predict(X_eval)

In [43]:
y_eval[:200]

array([0.99988672, 0.99996435, 0.9992056 , 0.99966375, 1.00001566,
       0.99966545, 0.99981872, 0.99965643, 1.00001566, 0.99948935,
       0.99958038, 0.99924043, 1.00001566, 0.99964825, 1.00001566,
       0.99996826, 0.99960466, 0.99938487, 0.99959003, 0.99942239,
       0.99992157, 0.99932322, 0.99956851, 0.9994798 , 0.99992652,
       0.99955839, 1.00001566, 0.9994239 , 0.99970987, 0.99954138,
       1.00001566, 1.00001566, 0.99975058, 0.99990231, 0.99955402,
       0.9995937 , 1.00001566, 0.99949254, 0.99866835, 0.99982106,
       0.99946327, 0.99992137, 0.99954837, 0.99955839, 1.00001566,
       1.00056117, 0.9993524 , 0.99955467, 0.9993766 , 0.9993876 ,
       0.99999495, 0.99971542, 0.99916557, 0.99960154, 0.99960939,
       1.00001566, 0.99948893, 0.99929827, 1.00001566, 0.99958986,
       1.00001566, 0.99945611, 0.99967142, 0.99993787, 0.9996865 ,
       0.99966972, 0.99977146, 0.9994482 , 0.99950452, 0.99889048,
       0.99944828, 0.99961169, 0.99996136, 0.99947821, 0.99945