In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import os
from scipy.sparse import hstack
from fastFM import sgd
from sklearn import metrics

In [2]:
data_dir = '../../../data/raw/preliminary_contest_data/'
ad_cnt_dir = '../../../data/nlp_count/preliminary_contest_data/byAdFeatureName/'
user_cnt_dir = '../../../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
user_tfidf_dir = '../../../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'

In [3]:
def load(filename, **kw):
    return pd.read_csv(os.path.join(data_dir, filename), **kw)

In [4]:
def load_pickle(filepath):
    obj = None
    with open(filepath, "rb") as f:
        obj = pickle.load(f)
    return obj

In [5]:
def load_ad_cnt(feat_name):
    filename = "adFeature.[featureName='{}'].pkl".format(feat_name)
    filepath = os.path.join(ad_cnt_dir, filename)
    index, matrix = load_pickle(filepath)
    
    filename = "aid.pkl".format(feat_name)
    filepath = os.path.join(ad_cnt_dir, filename)
    uid = load_pickle(filepath)
    
    return uid, (index, matrix)

In [6]:
def load_user_cnt(feat_name):
    filename = "userFeature.[featureName='{}'].pkl".format(feat_name)
    filepath = os.path.join(user_cnt_dir, filename)
    index, matrix = load_pickle(filepath)
    
    filename = "uid.pkl".format(feat_name)
    filepath = os.path.join(user_cnt_dir, filename)
    uid = load_pickle(filepath)
    
    return uid, (index, matrix)

In [7]:
def load_user_tfidf(feat_name):
    filename = "userFeature.[featureName='{}'].pkl".format(feat_name)
    filepath = os.path.join(user_tfidf_dir, filename)
    index, idf, matrix = load_pickle(filepath)
    
    filename = "uid.pkl".format(feat_name)
    filepath = os.path.join(user_tfidf_dir, filename)
    uid = load_pickle(filepath)
    
    return uid, (index, idf, matrix)

In [8]:
def get_time_str():
    return time.strftime("%H:%M:%S", time.gmtime())

In [9]:
df_train = load("train.csv")

In [10]:
aid, (afeat_index, avec) = load_ad_cnt('aid')
aid_to_index = dict(zip(aid, list(range(len(aid)))))  # mapping from aids to distinct indices
a_index = df_train['aid'].map(aid_to_index).values  # list of indices for matrix joining

y = df_train['label'].values

In [11]:
one_feat_names = ['age', 'gender', 'education', 'consumptionAbility', 'LBS',
                  'carrier', 'house']  # one user has only one value

for feat_name in one_feat_names:
    uid, (ufeat_index, uvec) = load_user_cnt(feat_name)
    uid_to_index = dict(zip(uid, list(range(len(uid)))))  # mapping from uids to distinct indices
    u_index = df_train['uid'].map(uid_to_index).values  # list of indices for matrix joining
    
    X = hstack((avec[a_index,:], uvec[u_index,:])).tocsr()  # joined user and advertise matrix
    X_train, y_train = X[:6000000], y[:6000000]
    X_valid, y_valid = X[6000000:], y[6000000:]
    
    print("Training FM on [featName='{}']...".format(feat_name))
    fm = sgd.FMClassification(n_iter=1, init_stdev=0.1, 
                      rank=3, l2_reg_w=0.1, l2_reg_V=0.2)
    fm.fit(X_train, y_train)
    proba_valid = fm.predict_proba(X_valid)
    logloss = metrics.log_loss((y_valid+1)/2, proba_valid)
    auc = metrics.roc_auc_score((y_valid+1)/2, proba_valid)
    print("[{}] Iter{:>4}: AUC: {:.6f} logloss: {:.6f}".format(get_time_str(), 1, logloss, auc))
    print()

Training FM on [featName='age']...
[04:33:21] Iter   1: AUC: 0.611567 logloss: 0.525260

Training FM on [featName='gender']...
[04:33:56] Iter   1: AUC: 0.641550 logloss: 0.502200

Training FM on [featName='education']...
[04:34:34] Iter   1: AUC: 0.666281 logloss: 0.491522

Training FM on [featName='consumptionAbility']...
[04:35:12] Iter   1: AUC: 0.654675 logloss: 0.493479

Training FM on [featName='LBS']...
[04:35:52] Iter   1: AUC: 0.673006 logloss: 0.497145

Training FM on [featName='carrier']...
[04:36:27] Iter   1: AUC: 0.665553 logloss: 0.500678

Training FM on [featName='house']...
[04:37:06] Iter   1: AUC: 0.641100 logloss: 0.501819

