In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

from sklearn.decomposition import NMF

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
N_COMP = 20 # number of NMF components; 20 gave the best ranking (presumably leaderboard rank — confirm)

In [21]:
# Card ids of the training set.
# NOTE(review): df_train is not referenced by any other cell visible here — confirm it is still needed.
df_train = pd.read_csv(
    "../input/train.csv",
    usecols=["card_id"],
)

# Historical transactions: only the columns needed to build per-card daily totals,
# with purchase_date parsed to datetimes at load time.
h_trs = pd.read_csv(
    "../input/historical_transactions.csv",
    usecols=["card_id", "purchase_date", "purchase_amount"],
    parse_dates=["purchase_date"],
)

In [23]:
# Truncate each timestamp to its calendar date so transactions can be grouped per day.
# The vectorised .dt accessor replaces a per-row Python lambda, which is very slow on
# a table of this size; the resulting values are the same datetime.date objects.
h_trs["purchase_date"] = h_trs["purchase_date"].dt.date

# Squash purchase amounts into (0, 1) with the logistic sigmoid 1 / (1 + exp(-x)),
# computed on the whole column at once instead of element by element.
# NOTE(review): presumably done so the matrix later fed to NMF is non-negative — confirm.
h_trs["purchase_amount"] = 1 / (1 + np.exp(-1 * h_trs["purchase_amount"]))

In [24]:
# Total (sigmoid-transformed) purchase amount per card per calendar day.
# as_index=False keeps the grouping keys as ordinary columns, so the result is a flat
# frame with columns card_id, purchase_date, purchase_amount.
df_purchase_summary = h_trs.groupby(
    ["card_id", "purchase_date"],
    as_index=False,
)["purchase_amount"].sum()

In [38]:
# Render every purchase date as its string form so the dates can be used as plain,
# sortable category labels further down.
df_purchase_summary["purchase_date"] = df_purchase_summary["purchase_date"].map(str)

In [42]:
# Preview only the first rows instead of rendering the whole per-card/per-day table.
df_purchase_summary.head(10)

Unnamed: 0,card_id,purchase_date,purchase_amount
0,C_ID_00007093c1,2017-02-14,0.894521
1,C_ID_00007093c1,2017-02-16,0.360637
2,C_ID_00007093c1,2017-02-20,0.417857
3,C_ID_00007093c1,2017-03-03,0.381321
4,C_ID_00007093c1,2017-03-06,0.360637
5,C_ID_00007093c1,2017-03-08,0.742471
6,C_ID_00007093c1,2017-03-11,1.788988
7,C_ID_00007093c1,2017-03-15,0.338099
8,C_ID_00007093c1,2017-04-17,2.024054
9,C_ID_00007093c1,2017-04-19,2.400781


In [45]:
# Fix a sorted, ordered category list for each axis so that every card id and every
# date string maps to a stable integer code.
card_c = CategoricalDtype(
    categories=sorted(df_purchase_summary["card_id"].unique()),
    ordered=True,
)
date_c = CategoricalDtype(
    categories=sorted(df_purchase_summary["purchase_date"].unique()),
    ordered=True,
)

# The category codes are the (row, column) coordinates of each daily total.
row = df_purchase_summary["card_id"].astype(card_c).cat.codes
col = df_purchase_summary["purchase_date"].astype(date_c).cat.codes

# Cards x dates matrix of daily totals; (card, date) pairs that never occur in the
# summary are simply not stored.
sparse_matrix = csr_matrix(
    (df_purchase_summary["purchase_amount"], (row, col)),
    shape=(len(card_c.categories), len(date_c.categories)),
)

# NMF

In [48]:
# Factorise the cards x dates matrix into N_COMP components per card.
# random_state is fixed so the random initialisation is reproducible.
model = NMF(n_components=N_COMP, init='random', random_state=0)
embedded = model.fit_transform(sparse_matrix)

# One row per card: component columns are numbered from 1 to N_COMP, and card_id is
# appended as the last column, taken from the category order used for the matrix rows.
df = pd.DataFrame(
    embedded,
    columns=list(map("NMF_purchase_comp_{}".format, range(1, N_COMP + 1))),
).assign(card_id=card_c.categories.values)

# Persist the embedding as a feature file.
df.to_csv("../input/purchase_nmf.csv", index=False)