In [12]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

from sklearn.decomposition import NMF

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [13]:
# Number of latent components requested from NMF; it also determines how many
# "new_NMF_comp_<i>" feature columns end up in the exported CSV.
N_COMP = 20

In [14]:
# Load the raw competition tables from ./input.
# NOTE(review): only `n_trs` is consumed by the cells visible here; `raw_train`
# and `h_trs` are loaded but not referenced below -- confirm whether later
# cells need them before dropping these (potentially expensive) reads.
raw_train = pd.read_csv(
    "./input/train.csv",
    parse_dates=["first_active_month"],
)
h_trs = pd.read_csv(
    "./input/historical_transactions.csv",
    parse_dates=["purchase_date"],
)
n_trs = pd.read_csv(
    "./input/new_merchant_transactions.csv",
)

In [15]:
# Count the number of transactions for every (card, merchant) pair.
# Rows missing either id are discarded first so they cannot form a group.
# Result columns: card_id, merchant_id, count_.
card_merchant = (
    n_trs[["card_id", "merchant_id"]]
    .dropna()
    .groupby(["card_id", "merchant_id"])
    .size()
    .reset_index(name="count_")
)

In [16]:
# Ordered categorical dtypes assign each card / merchant an integer code equal
# to its position in the sorted list of unique ids.
person_c = CategoricalDtype(categories=sorted(card_merchant["card_id"].unique()), ordered=True)
thing_c = CategoricalDtype(categories=sorted(card_merchant["merchant_id"].unique()), ordered=True)

# The codes serve as (row, column) coordinates of the count matrix.
row = card_merchant["card_id"].astype(person_c).cat.codes
col = card_merchant["merchant_id"].astype(thing_c).cat.codes

# cards x merchants matrix whose entries are the pair transaction counts.
sparse_matrix = csr_matrix(
    (card_merchant["count_"], (row, col)),
    shape=(len(person_c.categories), len(thing_c.categories)),
)

In [17]:
# Factorize the card x merchant count matrix into N_COMP non-negative
# components; random_state is fixed so the random init is repeatable.
model = NMF(n_components=N_COMP, init='random', random_state=0)
embedded = model.fit_transform(sparse_matrix)

# One row per card, in the same order as the categories of `person_c`
# (which defined the matrix row codes); component columns are numbered from 1.
df = pd.DataFrame(
    embedded,
    columns=["new_NMF_comp_{}".format(k + 1) for k in range(N_COMP)],
).assign(card_id=person_c.categories.values)
df.to_csv("./input/new_merchants_nmf.csv", index=False)