In [1]:
import pandas as pd
import numpy as np
import operator
import pickle
import time
import os
import matplotlib.pyplot as plt

In [2]:
# Directory (relative to the notebook's working directory) that load() reads CSV files from.
data_dir = '../../../data/raw/preliminary_contest_data/'
# Directory that load_user_cnt() reads the pickled per-feature user count files and uid.pkl from.
user_cnt_dir = '../../../data/nlp_count/preliminary_contest_data/byUserFeatureName/'

In [3]:
def load(filename, **kw):
    """Read a CSV file that lives under ``data_dir``.

    Args:
        filename: Name of the CSV file, relative to ``data_dir``.
        **kw: Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (a DataFrame by default).
    """
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
def load_pickle(filepath):
    """Unpickle and return the object stored in ``filepath``.

    Args:
        filepath: Path of the pickle file to open in binary mode.

    Returns:
        The object produced by ``pickle.load``.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files you produced yourself or otherwise trust.
    """
    with open(filepath, "rb") as handle:
        return pickle.load(handle)

In [5]:
def load_user_cnt(feat_name):
    """Load the pickled count data for one user feature plus the uid list.

    Two files are read from ``user_cnt_dir``:

    * ``userFeature.[featureName='<feat_name>'].pkl`` -- unpacked into an
      ``(index, matrix)`` pair.
    * ``uid.pkl`` -- the same file regardless of ``feat_name``.

    Args:
        feat_name: Feature name substituted into the per-feature file name.

    Returns:
        ``(uid, (index, matrix))`` where ``uid`` is the content of ``uid.pkl``
        and ``(index, matrix)`` is the content of the per-feature pickle.
    """
    feat_filename = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(user_cnt_dir, feat_filename))

    # The uid file name has no placeholder, so it is not formatted with
    # feat_name (the previous "uid.pkl".format(feat_name) was a no-op).
    uid = load_pickle(os.path.join(user_cnt_dir, "uid.pkl"))

    return uid, (index, matrix)

In [6]:
def get_time_str():
    """Return the current UTC wall-clock time formatted as ``HH:MM:SS``."""
    utc_now = time.gmtime()  # gmtime() is UTC, not local time
    return time.strftime("%H:%M:%S", utc_now)

In [7]:
# Root output folder; the counting loop creates one sub-folder per feature name beneath it.
clickrate_folder = "../../../data/clickrate/byUserFeatureName/"
# Create the folder (and any missing parents) up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(clickrate_folder, exist_ok=True)

In [None]:
# Read the training table once; everything below is derived from it.
df_train = load("train.csv")
# Number of training rows, used later as a sanity check on the per-aid split.
train_size = len(df_train)
# Distinct ad ids, and for each ad id the index labels of its rows in df_train.
aids = df_train["aid"].unique()
aid_to_indices = df_train.groupby("aid").groups

In [None]:
# User features for which per-ad positive/negative counts are written out.
feat_names = ["age", "gender", "marriageStatus", "education", "consumptionAbility", "LBS",
              "interest1", "interest2", "interest3", "interest4", "interest5",
              "kw1", "kw2", "kw3", "topic1", "topic2", "topic3", "appIdInstall",
              "appIdAction", "ct", "os", "carrier", "house"]

for feat_name in feat_names:
    # Given a feature
    # load dictionary and the sparse matrix
    print("[{}] counting [featureName='{}']...".format(get_time_str(), feat_name))
    uids, (ufeat_index, uvec) = load_user_cnt(feat_name)
    # mapping from uids to distinct indices
    # (assumes row i of uvec belongs to uids[i] -- TODO confirm against the file producer)
    uid_to_index = {uid: row for row, uid in enumerate(uids)}
    df_train["user_index"] = df_train['uid'].map(uid_to_index)
    total_count = 0

    # Feature values ordered by their index in ufeat_index. This depends only
    # on the feature, so it is computed once here instead of once per aid.
    # (assumes those indices are the column positions of uvec -- TODO confirm)
    sorted_index = sorted(ufeat_index.items(), key=operator.itemgetter(1))
    values = [k for k, _ in sorted_index]

    # The output folder is also per-feature, so create it once.
    out_folder = os.path.join(clickrate_folder, "[featureName='{}']".format(feat_name))
    os.makedirs(out_folder, exist_ok=True)

    for aid in aids:
        # Given an aid
        # get indices and select DataFrame
        indices = aid_to_indices[aid]
        df_selected = df_train.loc[indices]
        total_count += df_selected.shape[0]

        # select positive and negative rows (labels are compared against 1 and -1)
        df_pos = df_selected[df_selected['label'] == 1]
        df_neg = df_selected[df_selected['label'] == -1]

        # get positive and negative indices to pick rows in the sparse matrix
        pos_uindex = df_pos['user_index'].values
        neg_uindex = df_neg['user_index'].values

        # pick up rows from the sparse matrix and sum them column-wise;
        # ravel() always yields a 1-D array, even when there is a single
        # column (np.squeeze would collapse that case to a 0-d array)
        pos_count = np.asarray(uvec[pos_uindex, :].sum(axis=0)).ravel()
        neg_count = np.asarray(uvec[neg_uindex, :].sum(axis=0)).ravel()

        # construct DataFrame: one row per feature value
        counter = pd.DataFrame({"value": values,
                                "positive": pos_count,
                                "negative": neg_count})

        # append aggregate row holding the number of positive/negative samples.
        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
        totals = pd.DataFrame([{"value": "all",
                                "positive": df_pos.shape[0],
                                "negative": df_neg.shape[0]}])
        counter = pd.concat([counter, totals], ignore_index=True)

        out_filename = "[aid='{}'].csv".format(aid)
        out_filepath = os.path.join(out_folder, out_filename)
        counter.to_csv(out_filepath, index=False)

    # every training row must have been visited exactly once across all aids
    assert total_count == train_size

[09:36:22] counting [featureName='age']...
[09:36:49] counting [featureName='gender']...
[09:37:16] counting [featureName='marriageStatus']...
[09:37:43] counting [featureName='education']...
[09:38:11] counting [featureName='consumptionAbility']...
[09:38:38] counting [featureName='LBS']...
[09:39:05] counting [featureName='interest1']...
[09:39:35] counting [featureName='interest2']...
[09:40:05] counting [featureName='interest3']...
[09:40:32] counting [featureName='interest4']...
[09:40:58] counting [featureName='interest5']...
[09:41:28] counting [featureName='kw1']...
[09:46:27] counting [featureName='kw2']...
[09:47:54] counting [featureName='kw3']...
[09:48:36] counting [featureName='topic1']...
[09:49:18] counting [featureName='topic2']...
[09:49:58] counting [featureName='topic3']...
[09:50:30] counting [featureName='appIdInstall']...
[09:52:08] counting [featureName='appIdAction']...
[09:52:44] counting [featureName='ct']...
[09:53:13] counting [featureName='os']...
[09:53:4