In [1]:
features = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engagee_follows_engager",
]

features_idx = dict(zip(features, range(len(features))))

labels_idx = {
    "reply_timestamp": 20,
    "retweet_timestamp": 21,
    "retweet_with_comment_timestamp": 22,
    "like_timestamp": 23,
}
names = features + list(labels_idx)

In [2]:
import xgboost as xgb
import pandas as pd

In [3]:
df = pd.read_csv('./data/raw/sample_200k_rows', names=names, sep='\x01')

In [4]:
df = df.assign(retweet_with_comment=df["retweet_with_comment_timestamp"].notna())

In [6]:
df = df[[
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "retweet_with_comment"
]]

In [7]:
df_train = df.iloc[0:int(len(df) * 0.6)]
df_train.head()

Unnamed: 0,engaged_with_user_follower_count,engaged_with_user_following_count,engaging_user_follower_count,engaging_user_following_count,retweet_with_comment
0,1062,2498,50,335,False
1,34662,281,38,638,False
2,68605,1056,404,178,False
3,28392,353,113,264,False
4,4490,27,1123,1220,False


In [8]:
df_test = df.iloc[int(len(df) * 0.8):]
df_test.head()

Unnamed: 0,engaged_with_user_follower_count,engaged_with_user_following_count,engaging_user_follower_count,engaging_user_following_count,retweet_with_comment
179279,323596,277,160,333,False
179280,244,501,595,1921,False
179281,974082,249,79,200,False
179282,21439,10674,6621,4814,False
179283,401899,7127,65,790,False


In [9]:
dtrain = xgb.DMatrix(df_train[[
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaging_user_follower_count",
    "engaging_user_following_count"
]], df_train["retweet_with_comment"])
dtest = xgb.DMatrix(df_test[[
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaging_user_follower_count",
    "engaging_user_following_count"
]])

In [10]:
model = xgb.train(
    {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
    },
    dtrain
)

In [11]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])
    ctr = positive/float(len(gt))
    return ctr

def relative_cross_entropy_score(gt, pred):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

def compute_score(y_true, y_score):
    ap = average_precision_score(y_true, y_score)
    rce = relative_cross_entropy_score(y_true, y_score)
    return ap, rce

In [12]:
compute_score(df_test.retweet_with_comment, model.predict(dtest))

(0.007511916700584641, -31.76040075312956)