In [141]:
import numpy as np
import pandas as pd
#import random
import datetime
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
#import jieba
#import jieba.analyse
#import gensim
#from sklearn.manifold import TSNE

from sklearn.cross_validation import train_test_split

%matplotlib notebook

plt.rcParams['font.sans-serif']=['SimHei'] # chinese

In [133]:
import apriori
import fpgrowth as fpg
from fastFM import als
from scipy import sparse

In [18]:
def build_fp_tree(file):
    data = fpg.loadData(file)
    tree, header = fpg.createTree(data, minSup=2)
    return tree, header

In [3]:
def run_apriori(file, support=0.1, confidence=0.6):
    data_gen = apriori.dataFromFile(file)
    items, rules = apriori.runApriori(data_gen, support, confidence)
    #apriori.printResults(items, rules)
    return items, rules

In [16]:
run_apriori('../data/sample_simple.csv', 0.2, 0.8)

([(('mango',), 0.25),
  (('beer',), 0.75),
  (('apple',), 0.5),
  (('milk',), 0.5),
  (('chicken',), 0.25),
  (('rice',), 0.5),
  (('apple', 'beer'), 0.375),
  (('rice', 'apple'), 0.25),
  (('milk', 'rice'), 0.25),
  (('milk', 'beer'), 0.375),
  (('rice', 'beer'), 0.5),
  (('chicken', 'beer'), 0.25),
  (('rice', 'chicken'), 0.25),
  (('rice', 'apple', 'beer'), 0.25),
  (('milk', 'rice', 'beer'), 0.25),
  (('rice', 'chicken', 'beer'), 0.25)],
 [((('rice',), ('beer',)), 1.0),
  ((('chicken',), ('beer',)), 1.0),
  ((('chicken',), ('rice',)), 1.0),
  ((('rice', 'apple'), ('beer',)), 1.0),
  ((('milk', 'rice'), ('beer',)), 1.0),
  ((('chicken',), ('rice', 'beer')), 1.0),
  ((('rice', 'chicken'), ('beer',)), 1.0),
  ((('chicken', 'beer'), ('rice',)), 1.0)])

In [5]:
#run_apriori('../data/sample_integrated.csv', 0.2, 0.8)

In [15]:
tree, header = build_fp_tree('../data/sample_simple.csv')

In [17]:
tree.disp()

- Null Set : 1
-- beer : 6
--- rice : 2
---- apple : 2
----- chicken : 1
--- apple : 1
--- milk : 3
---- rice : 2
----- chicken : 1
-- apple : 1
--- mango : 1
-- milk : 1
--- mango : 1


In [26]:
df = pd.read_csv('../data/sample_simple.csv', header=None)
df

Unnamed: 0,0,1,2,3
0,apple,beer,rice,chicken
1,apple,beer,rice,
2,apple,beer,,
3,apple,mango,,
4,milk,beer,rice,chicken
5,milk,beer,rice,
6,milk,beer,,
7,milk,mango,,


In [111]:
def create_item_set(df):
    itemset = set()
    for index, row in df.iterrows():
        for e in row.tolist():
            itemset.add(e)
    itemset.remove(np.nan)
    return itemset

In [112]:
itemset = create_item_set(df)
itemset

{'apple', 'beer', 'chicken', 'mango', 'milk', 'rice'}

In [206]:
def create_onehot_vec(itemset):
    items = list(itemset)
    itemvec = sparse.eye(len(items))
    itemvec_dict = {}
    for i in range(len(items)):
        itemvec_dict[items[i]] = itemvec.getrow(i)
    return itemvec_dict

In [207]:
itemvec = create_onehot_vec(itemset)
itemvec

{'apple': <1x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'beer': <1x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'chicken': <1x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'mango': <1x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'milk': <1x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 'rice': <1x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>}

In [209]:
(itemvec["beer"] + itemvec["mango"]).todense()

matrix([[1., 1., 0., 0., 0., 0.]])

In [212]:
uservec = create_onehot_vec(df.index)

In [213]:
uservec

{0: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 1: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 2: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 3: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 4: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 5: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 6: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 7: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>}

In [222]:
def create_user_item_matrix(df, uservec, itemvec, mimic="SVD++"):
    mat = None
    for index, row in df.iterrows():
        vu = uservec[index]
        vi_sum = None
        if mimic != "MF":
            for item_name in row.tolist():
                if item_name is np.nan:
                    continue
                vi = itemvec[item_name]
                if vi_sum is None:
                    vi_sum = vi
                else:
                    vi_sum += vi
        if mimic == "SVD++":
            vi_sum /= np.sqrt(np.sum(vi_sum))
        for item_name in row.tolist():
            if item_name is np.nan:
                continue
            vi = itemvec[item_name]
            if mimic == "SVD++":
                vuil = sparse.hstack((vu, vi, vi_sum))
            elif mimic == "MF":
                vuil = sparse.hstack((vu, vi))
            if mat is None:
                mat = vuil
            else:
                mat = sparse.vstack((mat, vuil))
    return mat

In [227]:
mat = create_user_item_matrix(df, uservec, itemvec, mimic="SVD++")
#mat = create_user_item_matrix(df, uservec, itemvec, mimic="MF")
mat.todense()

matrix([[1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 1.        , 0.5       ,
         0.        , 0.        , 0.5       , 0.5       , 0.5       ],
        [1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.5       ,
         0.        , 0.        , 0.5       , 0.5       , 0.5       ],
        [1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.5       ,
         0.        , 0.        , 0.5       , 0.5       , 0.5       ],
        [1.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 1.        

In [228]:
X = mat
y = np.ones(mat.shape[0])
X_ = sparse.csc_matrix(X)
X_train, X_test, y_train, y_test = train_test_split(X_, y)

In [239]:
fm = als.FMRegression(n_iter=200, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [240]:
y_pred

array([1.00002637, 1.00011393, 1.00006043, 1.00005607, 1.00004255,
       1.00007349])