In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
DIR = "/content/drive/Shareddrives/SI650_Final_Project/ZhihuRec-1M"

In [3]:
impr = pd.read_csv(os.path.join(DIR, "impression.csv"), index_col=0)
impr["is_clicked"] = impr["click_ts"] != 0
impr["is_clicked"] = impr["is_clicked"].apply(lambda x: 1 if x else 0)
impr

Unnamed: 0,userID,answerID,imp_ts,click_ts,is_clicked
0,0,0,1525871588,1525871593,1
1,0,1,1525871638,1525871641,1
2,0,2,1525871638,1525871650,1
3,0,3,1525871649,1525871768,1
4,0,4,1525871662,0,0
...,...,...,...,...,...
999965,7973,3924,1526053400,1526053408,1
999966,7973,4992,1526053444,1526053452,1
999967,7973,7009,1526053444,0,0
999968,7973,2592,1526053444,0,0


For each user, the last click and impressions after the last click are treated as the **test set**, the click right before the last click, and the impressions that happened between the click right before the last click and the last click are treated as the **validation set**, and others are treated as the **training set**. 

In [4]:
# find the starting position of each user
start_pos = impr.groupby(by="userID").apply(lambda x: x.index[0])
start_pos = pd.DataFrame(start_pos)
start_pos.reset_index(inplace=True)
start_pos.columns = ['userID', 'start']

In [5]:
# find position of last click of each user
last_click = impr.groupby(by="userID").apply(lambda x: x[x.is_clicked == True].index[-1])
last_click = pd.DataFrame(last_click)
last_click.reset_index(inplace=True)
last_click.columns = ['userID', 'last_click']

In [7]:
# find the click right before the last click of each user
last_2_click = impr.groupby(by="userID").apply(lambda x: x[x.is_clicked == True].index[-2])
last_2_click = pd.DataFrame(last_2_click)
last_2_click.reset_index(inplace=True)
last_2_click.columns = ['userID', 'last_2_click']

In [8]:
test_df = pd.DataFrame(columns=impr.columns)
test_df = impr.merge(last_click)
test_df = test_df[test_df.index >= test_df.last_click]

In [9]:
val_df = pd.DataFrame(columns=impr.columns)
val_df = impr.merge(last_click).merge(last_2_click)
val_df = val_df[val_df.index < val_df.last_click]
val_df = val_df[val_df.index >= val_df.last_2_click]

In [10]:
train_df = pd.DataFrame(columns=impr.columns)
train_df = impr.merge(last_2_click)
train_df = train_df[train_df.index < train_df.last_2_click]

In [11]:
train_df[train_df.userID == 1]

Unnamed: 0,userID,answerID,imp_ts,click_ts,is_clicked,last_2_click
144,1,142,1525391039,1525391044,1,269
145,1,143,1525391155,1525391159,1,269
146,1,144,1525391155,1525391173,1,269
147,1,83,1525391199,0,0,269
148,1,145,1525391199,1525391209,1,269
...,...,...,...,...,...,...
264,1,260,1525588055,1525588058,1,269
265,1,261,1525588055,1525588068,1,269
266,1,262,1525588067,0,0,269
267,1,263,1525588067,0,0,269


In [12]:
val_df[val_df.userID == 1]

Unnamed: 0,userID,answerID,imp_ts,click_ts,is_clicked,last_click,last_2_click
269,1,265,1525619525,1525619539,1,270,269


In [13]:
test_df[test_df.userID == 1]

Unnamed: 0,userID,answerID,imp_ts,click_ts,is_clicked,last_click
270,1,266,1525619525,1525619561,1,270
271,1,267,1525619576,0,0,270
272,1,268,1525619579,0,0,270
273,1,269,1525619579,0,0,270


In [14]:
test_df[['userID', 'answerID', 'is_clicked', 'imp_ts', 'click_ts']].to_csv(os.path.join(DIR, "test-1M.csv"))
val_df[['userID', 'answerID', 'is_clicked', 'imp_ts', 'click_ts']].to_csv(os.path.join(DIR, "val-1M.csv"))
train_df[['userID', 'answerID', 'is_clicked', 'imp_ts', 'click_ts']].to_csv(os.path.join(DIR, "train-1M.csv"))

## If we need to merge user, item, and impression manully, then we should run the codes below; ignore this part otherwise.

In [None]:
def add_features(impr: pd.DataFrame, users_df, ans_df):
    # keep useful features
    user_features = ['userID', 'gender', 'login_freq', 'n_followers',
                 'n_topics_followed', 'n_questions_followed', 'n_answers', 'n_questions',
                 'n_comments', 'n_thanks_recv', 'n_comments_recv', 'n_likes_recv',
                 'n_dislikes_recv', 'reg_type', 'reg_platform', 'is_android',
                 'is_iphone', 'is_ipad', 'is_pc', 'is_mobile', 'device_model',
                 'device_brand', 'platform', 'province', 'city']
    ans_features = ['answerID', 'is_anonymous', 'is_hi_val',
                'is_editor_recom', 'has_pic', 'has_vid', 'n_thanks',
                'n_likes', 'n_comments', 'n_collections', 'n_dislikes', 'n_reports',
                'n_helpless']

    users_df, ans_df = users_df[user_features], ans_df[ans_features]
    for i in range(len(user_features)):
        user_features[i] = 'u_' + user_features[i]
    for i in range(len(ans_features)):
        ans_features[i] = 'i_' + ans_features[i]
    users_df.columns = user_features
    ans_df.columns = ans_features

    impr = impr[['userID', 'answerID', 'is_clicked']]
    impr.columns = ['uid', 'iid', 'label']
    
    # merge impr and user
    impr_user = impr.merge(users_df, how='inner', left_on='uid', right_on='u_userID')
    # merge impr_user and ans
    impr_user_ans = impr_user.merge(ans_df, how='inner', left_on='iid', right_on='i_answerID')
    impr_user_ans.drop(axis=1, columns=['u_userID', 'i_answerID'])
    return impr_user_ans

In [None]:
user_df = pd.read_csv(os.path.join(DIR, "user.csv"), index_col=0)
ans_df = pd.read_csv(os.path.join(DIR, "answer.csv"), index_col=0)

In [None]:
impr_test = add_features(test_df, user_df, ans_df)
impr_val = add_features(val_df, user_df, ans_df)
impr_train = add_features(train_df, user_df, ans_df)

In [None]:
impr_test.to_csv(os.path.join(DIR, "test-1M-with-feature.csv"))
impr_val.to_csv(os.path.join(DIR, "val-1M-with-feature.csv"))
impr_train.to_csv(os.path.join(DIR, "train-1M-with-feature.csv"))

In [None]:
CC_DIR = '/content/drive/Shareddrives/SI650_Final_Project/CC-CC/dataset/zhihu-1M'
impr_test.to_csv(os.path.join(CC_DIR, "zhihu-1M.test.csv"))
impr_val.to_csv(os.path.join(CC_DIR, "zhihu-1M.validation.csv"))
impr_train.to_csv(os.path.join(CC_DIR, "zhihu-1M.train.csv"))

In [None]:
impr_test

Unnamed: 0,uid,iid,label,u_userID,u_gender,u_login_freq,u_n_followers,u_n_topics_followed,u_n_questions_followed,u_n_answers,...,i_is_editor_recom,i_has_pic,i_has_vid,i_n_thanks,i_n_likes,i_n_comments,i_n_collections,i_n_dislikes,i_n_reports,i_n_helpless
0,0,141,1,0,0,0,0,1,0,0,...,0,0,0,254,3147,0,345,298,0,36
1,890,141,1,890,1,0,21,76,53,39,...,0,0,0,254,3147,0,345,298,0,36
2,3680,141,0,3680,1,0,135,37,41,26,...,0,0,0,254,3147,0,345,298,0,36
3,3700,141,0,3700,1,0,146,13,233,0,...,0,0,0,254,3147,0,345,298,0,36
4,4338,141,0,4338,0,0,0,0,0,0,...,0,0,0,254,3147,0,345,298,0,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36194,7966,21050,0,7966,0,0,2,7,9,0,...,0,1,0,78,946,164,171,25,0,4
36195,7968,26344,0,7968,0,0,5,16,0,0,...,0,0,0,2,58,11,0,0,0,0
36196,7968,29099,0,7968,0,0,5,16,0,0,...,0,0,0,9,224,30,36,6,0,1
36197,7970,13698,1,7970,2,1,7,6,2,1,...,0,0,0,18,252,37,8,1,0,1
