In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

from sklearn.decomposition import NMF

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Number of NMF components; this is also the number of
# "NMF_category_mix_comp_*" feature columns produced per card_id.
N_COMP = 10

In [3]:
# Columns loaded from both transaction files: the card key ("card_id") plus
# the categorical attributes that are later joined into a single token.
use_cols = [
    "authorized_flag",
    "card_id",
    "category_1",
    "category_3",
    "merchant_category_id",
    "category_2",
    "state_id",
    "city_id",
]

In [4]:
# Read the same column subset from the historical and the new-merchant
# transaction files, then stack them into one frame with a fresh 0..n-1 index.
transaction_files = (
    "../input/historical_transactions.csv",
    "../input/new_merchant_transactions.csv",
)
h_trs, n_trs = [pd.read_csv(path, usecols=use_cols) for path in transaction_files]

trs = pd.concat([h_trs, n_trs], ignore_index=True)

In [5]:
# Replace missing values in every loaded column with the literal "Null" so the
# string concatenation that builds category_str never encounters a NaN.
for column_name in use_cols:
    trs[column_name] = trs[column_name].fillna("Null")

In [6]:
# One token per transaction: seven categorical attributes joined with "_".
# Columns that are not already strings are converted element-wise with str().
trs["category_str"] = (
    trs["authorized_flag"] + "_"
    + trs["city_id"].map(str) + "_"
    + trs["category_1"] + "_"
    + trs["category_3"] + "_"
    + trs["merchant_category_id"].map(str) + "_"
    + trs["category_2"].map(str) + "_"
    + trs["state_id"].map(str)
)

In [7]:
# Constant weight of 1.0 per transaction row (float64); used as the values
# when the card x category sparse matrix is assembled.
trs["one"] = 1.0

In [8]:
# Preview the first rows to sanity-check the assembled category_str tokens
# and the constant "one" column.
trs.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,category_3,merchant_category_id,category_2,state_id,category_str,one
0,Y,C_ID_4e6213e9bc,88,N,A,80,1,16,Y_88_N_A_80_1.0_16,1.0
1,Y,C_ID_4e6213e9bc,88,N,A,367,1,16,Y_88_N_A_367_1.0_16,1.0
2,Y,C_ID_4e6213e9bc,88,N,A,80,1,16,Y_88_N_A_80_1.0_16,1.0
3,Y,C_ID_4e6213e9bc,88,N,A,560,1,16,Y_88_N_A_560_1.0_16,1.0
4,Y,C_ID_4e6213e9bc,88,N,A,80,1,16,Y_88_N_A_80_1.0_16,1.0


In [9]:
# Ordered categorical dtypes pin a sorted, reproducible mapping from every
# distinct card_id / category_str value to an integer position.
card_c = CategoricalDtype(categories=sorted(trs["card_id"].unique()), ordered=True)
category_c = CategoricalDtype(categories=sorted(trs["category_str"].unique()), ordered=True)

# The categorical codes are the matrix coordinates of each transaction.
row = trs["card_id"].astype(card_c).cat.codes
col = trs["category_str"].astype(category_c).cat.codes

# (data, (row, col)) construction sums duplicate coordinates, so each cell
# ends up holding how many transactions a card has for a category combination.
n_cards = card_c.categories.size
n_categories = category_c.categories.size
sparse_matrix = csr_matrix((trs["one"], (row, col)), shape=(n_cards, n_categories))

# NMF

In [10]:
# Factorise the card x category matrix; fit_transform yields one row of
# N_COMP component weights for each row (card) of sparse_matrix.
model = NMF(n_components=N_COMP, init='random', random_state=0)
embedded = model.fit_transform(sparse_matrix)

# Component columns are numbered from 1 to N_COMP.
component_names = [
    "NMF_category_mix_comp_{}".format(k) for k in range(1, N_COMP + 1)
]
df = pd.DataFrame(embedded, columns=component_names)

# Matrix rows were built from card_c's category order, so row i of the
# embedding belongs to the i-th card_id category.
df["card_id"] = card_c.categories.values

In [11]:
# Preview the per-card NMF component weights together with their card_id.
df.head()

Unnamed: 0,NMF_category_mix_comp_1,NMF_category_mix_comp_2,NMF_category_mix_comp_3,NMF_category_mix_comp_4,NMF_category_mix_comp_5,NMF_category_mix_comp_6,NMF_category_mix_comp_7,NMF_category_mix_comp_8,NMF_category_mix_comp_9,NMF_category_mix_comp_10,card_id
0,2e-06,0.020219,2.5e-05,0.00183,6e-05,3.5e-05,4.127115e-05,0.000384,4.1e-05,6.6e-05,C_ID_00007093c1
1,0.0,0.005095,0.0,0.023222,0.0,0.0,4.54155e-05,0.002876,0.0,0.0,C_ID_0001238066
2,0.0,5.4e-05,0.00496,0.0,0.004121,0.001549,0.0,0.0,0.001961,0.005672,C_ID_0001506ef0
3,0.010026,0.000329,0.020676,2.4e-05,0.005666,0.004019,5.681509e-08,5e-06,0.001251,0.008092,C_ID_0001793786
4,0.0,0.008895,0.0,0.002378,0.0,0.0,6.038946e-05,0.000396,0.0,0.0,C_ID_000183fdda


In [12]:
# Persist the per-card NMF features as CSV without the index column.
# NOTE: the file is written into ../input, the same directory the raw
# transaction files are read from.
df.to_csv("../input/trs_category_mix_nmf.csv", index=False)