In [2]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

from sklearn.decomposition import NMF

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Number of latent components requested from NMF in the cells below.
N_COMP = 20 # 20 gave the best ranking (presumably the competition leaderboard position -- confirm)

In [4]:
# Load the raw competition files.
raw_train = pd.read_csv("../input/train.csv", parse_dates=["first_active_month"])

# Only card_id and merchant_id are consumed from the historical transactions
# before h_trs is reloaded further down, so restrict the read to those two
# columns and skip date parsing. This mirrors the `usecols` reads used by the
# later cells and keeps this load cheap enough to finish.
h_trs = pd.read_csv(
    "../input/historical_transactions.csv",
    usecols=["card_id", "merchant_id"],
)

# NOTE(review): n_trs is not referenced by any visible cell; kept so that the
# name stays defined -- confirm whether this load can be dropped.
n_trs = pd.read_csv("../input/new_merchant_transactions.csv")

KeyboardInterrupt: 

In [4]:
# Count how many historical transactions each card made at each merchant.
# Rows missing either key are discarded before counting.
card_merchant = (
    h_trs[["card_id", "merchant_id"]]
    .dropna()
    .groupby(["card_id", "merchant_id"])
    .size()
    .reset_index(name="count_")
)

In [5]:
# Fix a sorted ordering of the ids so that row i of the matrix corresponds to
# person_c.categories[i] and column j to thing_c.categories[j].
person_c = CategoricalDtype(categories=sorted(card_merchant["card_id"].unique()), ordered=True)
thing_c = CategoricalDtype(categories=sorted(card_merchant["merchant_id"].unique()), ordered=True)

# Integer position of every (card, merchant) pair within those orderings.
row = card_merchant["card_id"].astype(person_c).cat.codes
col = card_merchant["merchant_id"].astype(thing_c).cat.codes

# card x merchant matrix whose entries are the transaction counts.
sparse_matrix = csr_matrix(
    (card_merchant["count_"], (row, col)),
    shape=(len(person_c.categories), len(thing_c.categories)),
)

# NMF

In [6]:
# Factorise the card x merchant matrix with NMF; each card gets N_COMP features.
model = NMF(n_components=N_COMP, init='random', random_state=0)
embedded = model.fit_transform(sparse_matrix)

# One column per component, numbered from 1.
nmf_columns = ["NMF_comp_{}".format(idx + 1) for idx in range(N_COMP)]
df = pd.DataFrame(data=embedded, columns=nmf_columns)

# Rows of `embedded` follow the category order of person_c, so the card ids
# can be attached positionally.
df["card_id"] = person_c.categories.values

# NOTE(review): this writes under ./input while the raw files are read from
# ../input -- confirm the output directory is intended.
df.to_csv("./input/merchants_nmf.csv", index=False)

# SVD

In [9]:
from sklearn.decomposition import TruncatedSVD

# Number of SVD components; also determines how many feature columns are named.
n_svd_components = 30

# Truncated SVD of the same card x merchant matrix.
svd = TruncatedSVD(n_components=n_svd_components, random_state=0)
svd_embedded = svd.fit_transform(sparse_matrix)

# One column per component, numbered from 1.
svd_columns = ["SVD_comp_{}".format(idx + 1) for idx in range(n_svd_components)]
df = pd.DataFrame(data=svd_embedded, columns=svd_columns)

# Rows follow the category order of person_c, so ids are attached positionally.
df["card_id"] = person_c.categories.values

# NOTE(review): this writes under ./input while the raw files are read from
# ../input -- confirm the output directory is intended.
df.to_csv("./input/merchants_svd.csv", index=False)

# LDA

In [5]:
import lda

# Number of LDA topics; also determines how many feature columns are named.
n_lda_topics = 30

# Fit LDA on the card x merchant count matrix (cards act as documents and
# merchants as the vocabulary).
model2 = lda.LDA(n_topics=n_lda_topics, n_iter=300, random_state=1, alpha=0.5, eta=0.5)
lda_embedded = model2.fit_transform(sparse_matrix)

# One column per topic, numbered from 1.
lda_columns = ["LDA_comp_{}".format(idx + 1) for idx in range(n_lda_topics)]
df = pd.DataFrame(data=lda_embedded, columns=lda_columns)

# Rows follow the category order of person_c, so ids are attached positionally.
df["card_id"] = person_c.categories.values

# NOTE(review): this writes under ./input while the raw files are read from
# ../input -- confirm the output directory is intended.
df.to_csv("./input/merchants_lda.csv", index=False)

INFO:lda:n_documents: 325540
INFO:lda:vocab_size: 326311
INFO:lda:n_words: 28973880
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -407613774
INFO:lda:<10> log likelihood: -256319316
INFO:lda:<20> log likelihood: -248469818
INFO:lda:<30> log likelihood: -247891683
INFO:lda:<40> log likelihood: -247679727
INFO:lda:<50> log likelihood: -247606067
INFO:lda:<60> log likelihood: -247512637
INFO:lda:<70> log likelihood: -247497785
INFO:lda:<80> log likelihood: -247457880
INFO:lda:<90> log likelihood: -247407190
INFO:lda:<100> log likelihood: -247368873
INFO:lda:<110> log likelihood: -247369144
INFO:lda:<120> log likelihood: -247349735
INFO:lda:<130> log likelihood: -247320236
INFO:lda:<140> log likelihood: -247321928
INFO:lda:<150> log likelihood: -247318480
INFO:lda:<160> log likelihood: -247321153
INFO:lda:<170> log likelihood: -247317246
INFO:lda:<180> log likelihood: -247318523
INFO:lda:<190> log likelihood: -247313524
INFO:lda:<200> log likelihood: -247184632
IN

In [6]:
# Extract the rows where category_3 == "A" and reduce their dimensionality with NMF

In [8]:
# Columns to load from the historical transactions file for this section.
use_col = ["card_id", "merchant_id", "category_3"]

In [9]:
# Reload the historical transactions, keeping only the columns in use_col.
h_trs = pd.read_csv(
    "../input/historical_transactions.csv",
    usecols=use_col,
)

In [10]:
# Keep only the transactions whose category_3 is "A".
h_trs = h_trs.loc[h_trs["category_3"] == "A"]

In [None]:
# Number of NMF components used for the category_3 == "A" embedding below.
N_COMP = 20

In [12]:
# Count the filtered transactions for every (card, merchant) pair.
card_merchant = (
    h_trs[["card_id", "merchant_id"]]
    .groupby(["card_id", "merchant_id"])
    .size()
    .reset_index(name="count_")
)

In [13]:
# Fix a sorted ordering of the ids so that row i of the matrix corresponds to
# person_c.categories[i] and column j to thing_c.categories[j].
person_c = CategoricalDtype(categories=sorted(card_merchant["card_id"].unique()), ordered=True)
thing_c = CategoricalDtype(categories=sorted(card_merchant["merchant_id"].unique()), ordered=True)

# Integer position of every (card, merchant) pair within those orderings.
row = card_merchant["card_id"].astype(person_c).cat.codes
col = card_merchant["merchant_id"].astype(thing_c).cat.codes

# card x merchant matrix whose entries are the transaction counts.
sparse_matrix = csr_matrix(
    (card_merchant["count_"], (row, col)),
    shape=(len(person_c.categories), len(thing_c.categories)),
)

In [18]:
# Factorise the category_3 == "A" card x merchant matrix with NMF.
model = NMF(n_components=N_COMP, init='random', random_state=0)
embedded = model.fit_transform(sparse_matrix)

# One column per component, numbered from 1.
nmf_columns = ["NMF_category3_comp_{}".format(idx + 1) for idx in range(N_COMP)]
df = pd.DataFrame(data=embedded, columns=nmf_columns)

# Rows follow the category order of person_c, so ids are attached positionally.
df["card_id"] = person_c.categories.values
df.to_csv("../input/merchants_category3_A_nmf.csv", index=False)

In [1]:
# Dimensionality reduction using merchant_id and purchase_amount

In [9]:
# Number of latent components requested from NMF in the cells below.
N_COMP = 20 # 20 gave the best ranking (presumably the competition leaderboard position -- confirm)

In [10]:
# Reload the training set and only the transaction columns this section needs.
raw_train = pd.read_csv(
    "../input/train.csv",
    parse_dates=["first_active_month"],
)
h_trs = pd.read_csv(
    "../input/historical_transactions.csv",
    usecols=['card_id', 'merchant_id', 'purchase_amount'],
)
# n_trs = pd.read_csv("../input/new_merchant_transactions.csv", usecols=['card_id', 'merchant_id', 'purchase_amount'])

In [11]:
# Keep only rows whose purchase_amount is below 1e+5 (rows with a missing
# amount fail the comparison and are dropped too), then renumber the index.
amount_below_cap = h_trs["purchase_amount"] < 1e+5
h_trs = h_trs.loc[amount_below_cap].reset_index(drop=True)

In [12]:
# Linearly rescale purchase_amount and round to two decimals.
# NOTE(review): the constants presumably undo a normalisation applied to the
# published data -- confirm their origin.
# Caution: this overwrites the column in place, so running the cell twice
# applies the rescaling twice.
rescaled_amount = h_trs['purchase_amount'] / 0.00150265118 + 497.06
h_trs['purchase_amount'] = rescaled_amount.round(2)

In [13]:
# Total purchase_amount for every (card, merchant) pair.
card_merchant = (
    h_trs
    .groupby(["card_id", "merchant_id"])["purchase_amount"]
    .sum()
    .reset_index(name="purchase_amount")
)

In [14]:
# Fix a sorted ordering of the ids so that row i of the matrix corresponds to
# card_c.categories[i] and column j to merchant_c.categories[j].
card_c = CategoricalDtype(categories=sorted(card_merchant["card_id"].unique()), ordered=True)
merchant_c = CategoricalDtype(categories=sorted(card_merchant["merchant_id"].unique()), ordered=True)

# Integer position of every (card, merchant) pair within those orderings.
row = card_merchant["card_id"].astype(card_c).cat.codes
col = card_merchant["merchant_id"].astype(merchant_c).cat.codes

# card x merchant matrix whose entries are the summed purchase amounts.
sparse_matrix = csr_matrix(
    (card_merchant["purchase_amount"], (row, col)),
    shape=(len(card_c.categories), len(merchant_c.categories)),
)

In [15]:
# Factorise the card x merchant purchase-amount matrix with NMF.
model = NMF(n_components=N_COMP, init='random', random_state=0)
embedded = model.fit_transform(sparse_matrix)

# One column per component, numbered from 1.
nmf_columns = ["NMF_purchase_amount_each_merchant_{}".format(idx + 1) for idx in range(N_COMP)]
df = pd.DataFrame(data=embedded, columns=nmf_columns)

# Rows follow the category order of card_c, so ids are attached positionally.
df["card_id"] = card_c.categories.values
df.to_csv("../input/purchase_amount_each_merchant_nmf.csv", index=False)

NameError: name 'person_c' is not defined