In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import os
from scipy.sparse import hstack
from fastFM import sgd
from sklearn import metrics

In [2]:
# Directory of the raw contest files; load() joins CSV file names onto it.
data_dir = '../data/raw/preliminary_contest_data/'
# Directory of the pickles read by load_ad_cnt() (per-ad-feature files plus aid.pkl).
ad_cnt_dir = '../data/nlp_count/preliminary_contest_data/byAdFeatureName/'
# Directory of the pickles read by load_user_cnt() (per-user-feature files plus uid.pkl).
user_cnt_dir = '../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
# Directory of the pickles read by load_user_tfidf() (per-user-feature files plus uid.pkl).
user_tfidf_dir = '../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'

In [3]:
def load(filename, **kw):
    """Read a CSV file that lives under ``data_dir``.

    Parameters
    ----------
    filename : str
        File name, joined onto the module-level ``data_dir``.
    **kw
        Passed through unchanged to ``pandas.read_csv``.

    Returns
    -------
    The result of ``pandas.read_csv`` for the resolved path.
    """
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
def load_pickle(filepath):
    """Unpickle and return the single object stored in ``filepath``.

    The file is opened in binary mode and closed again before returning.
    Any error raised by ``open`` or ``pickle.load`` propagates to the caller.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising,
    so this must only be pointed at files from a trusted source.
    """
    with open(filepath, "rb") as stream:
        return pickle.load(stream)

In [5]:
def load_ad_cnt(feat_name):
    """Load the pickled count data for one ad feature together with the ad ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``adFeature.[featureName='<name>'].pkl``
        under ``ad_cnt_dir``.

    Returns
    -------
    tuple
        ``(aid, (index, matrix))`` where ``aid`` is the object stored in
        ``aid.pkl`` and ``(index, matrix)`` is the two-element payload of the
        feature pickle. Presumably ``index`` is the feature vocabulary and
        ``matrix`` the count matrix -- confirm against the script that wrote
        the pickles.
    """
    feature_file = "adFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(ad_cnt_dir, feature_file))

    # The id file name does not depend on feat_name, so it is a plain literal
    # (the former "aid.pkl".format(feat_name) was a no-op).
    aid = load_pickle(os.path.join(ad_cnt_dir, "aid.pkl"))

    return aid, (index, matrix)

In [6]:
def load_user_cnt(feat_name):
    """Load the pickled count data for one user feature together with the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``userFeature.[featureName='<name>'].pkl``
        under ``user_cnt_dir``.

    Returns
    -------
    tuple
        ``(uid, (index, matrix))`` where ``uid`` is the object stored in
        ``uid.pkl`` and ``(index, matrix)`` is the two-element payload of the
        feature pickle. Presumably ``index`` is the feature vocabulary and
        ``matrix`` the count matrix -- confirm against the script that wrote
        the pickles.
    """
    feature_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(user_cnt_dir, feature_file))

    # The id file name does not depend on feat_name, so it is a plain literal
    # (the former "uid.pkl".format(feat_name) was a no-op).
    uid = load_pickle(os.path.join(user_cnt_dir, "uid.pkl"))

    return uid, (index, matrix)

In [7]:
def load_user_tfidf(feat_name):
    """Load the pickled TF-IDF data for one user feature together with the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``userFeature.[featureName='<name>'].pkl``
        under ``user_tfidf_dir``.

    Returns
    -------
    tuple
        ``(uid, (index, idf, matrix))`` where ``uid`` is the object stored in
        ``uid.pkl`` and ``(index, idf, matrix)`` is the three-element payload
        of the feature pickle. Presumably ``index`` is the feature vocabulary,
        ``idf`` the inverse-document-frequency weights and ``matrix`` the
        TF-IDF matrix -- confirm against the script that wrote the pickles.
    """
    feature_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, idf, matrix = load_pickle(os.path.join(user_tfidf_dir, feature_file))

    # The id file name does not depend on feat_name, so it is a plain literal
    # (the former "uid.pkl".format(feat_name) was a no-op).
    uid = load_pickle(os.path.join(user_tfidf_dir, "uid.pkl"))

    return uid, (index, idf, matrix)

In [8]:
def get_time_str():
    """Return the current time of day as an ``HH:MM:SS`` string.

    The timestamp is taken from ``time.gmtime()``, i.e. it is UTC rather
    than the machine's local time.
    """
    utc_now = time.gmtime()
    return time.strftime("%H:%M:%S", utc_now)

In [9]:
# Names of the user features whose count pickles are summarised below.
feat_names = ["age", "gender", "marriageStatus", "education", "consumptionAbility", "LBS",
              "interest1", "interest2", "interest3", "interest4", "interest5",
              "kw1", "kw2", "kw3", "topic1", "topic2", "topic3", "appIdInstall",
              "appIdAction", "ct", "os", "carrier", "house"]

# For every feature, print one summary line: the smallest and largest row
# total of its count matrix, the number of matrix columns (labelled "nunique")
# and the row totals themselves.
for feat_name in feat_names:
    # uid and ufeat_index are unpacked here but not used in this cell.
    uid, (ufeat_index, uvec) = load_user_cnt(feat_name)
    # Sum along axis 1 gives one total per row -- presumably one row per user
    # and one column per distinct feature value (TODO confirm against the
    # script that produced the pickles).
    uvec_sum = uvec.sum(axis=1).flatten()
    print("{:<20}: min:{:>2} max:{:>3} nunique:{:>6} values: {}".format(feat_name, uvec_sum.min(), uvec_sum.max(), uvec.shape[1], uvec_sum))

age                 : min: 1 max:  1 nunique:     6 values: [[1 1 1 ... 1 1 1]]
gender              : min: 1 max:  1 nunique:     3 values: [[1 1 1 ... 1 1 1]]
marriageStatus      : min: 1 max:  3 nunique:    13 values: [[1 2 2 ... 1 2 2]]
education           : min: 1 max:  1 nunique:     8 values: [[1 1 1 ... 1 1 1]]
consumptionAbility  : min: 1 max:  1 nunique:     3 values: [[1 1 1 ... 1 1 1]]
LBS                 : min: 1 max:  1 nunique:   856 values: [[1 1 1 ... 1 1 1]]
interest1           : min: 1 max: 38 nunique:   123 values: [[31  2 15 ...  2 24 12]]
interest2           : min: 1 max: 32 nunique:    81 values: [[ 4  1 10 ...  1  3  1]]
interest3           : min: 1 max: 10 nunique:    11 values: [[1 1 1 ... 1 1 1]]
interest4           : min: 1 max: 10 nunique:    11 values: [[1 1 1 ... 1 1 1]]
interest5           : min: 1 max: 86 nunique:   137 values: [[23  1 36 ...  1 33  1]]
kw1                 : min: 1 max:  5 nunique:259909 values: [[5 5 5 ... 3 5 5]]
kw2                 : 

In [10]:
# Features for which one user has only one value.
one_feat_names = [
    'age', 'gender', 'education', 'consumptionAbility', 'LBS', 'carrier', 'house',
]

# Features for which one user can have more than one value.
multi_feat_names = [
    'marriageStatus',
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
    'ct', 'os',
]

In [12]:
# The single-value and multi-value groups must together cover feat_names
# exactly once each. Comparing only the lengths would still pass if a name
# were misspelled or listed twice, so compare the names themselves.
assert sorted(one_feat_names + multi_feat_names) == sorted(feat_names), \
    "one_feat_names + multi_feat_names must contain exactly the names in feat_names"