In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'
sns.set()
import gc

In [2]:
import xgboost as xgb

In [3]:
import os

In [10]:
import os
import sys
# Put the parent of the current working directory on the import path (at most
# once) so that the `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [11]:
from src.scoring import compute_score

---

# Loading data

In [4]:
# Column groups read from the sample file: the tweet timestamp, the four
# follower/following count features, and the four engagement target columns.
timestamp = ['tweet_timestamp']
features = [
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaging_user_follower_count',
    'engaging_user_following_count',
]
targets = [
    'TARGET_reply',
    'TARGET_retweet',
    'TARGET_retweet_with_comment',
    'TARGET_like',
]

In [5]:
%%time
# Read only the timestamp, feature and target columns from the sample file,
# then order the rows by ascending tweet timestamp.
df = (
    pd.read_parquet(
        '../data/interim/sample2m.parquet',
        columns=timestamp + features + targets,
    )
    .sort_values('tweet_timestamp')
)

CPU times: user 958 ms, sys: 223 ms, total: 1.18 s
Wall time: 918 ms


---

# Timeseries holdout split

In [6]:
# Holdout split on the day-of-month of the tweet timestamp: days before the
# 20th go to training, the 20th and later go to validation.
# NOTE(review): `.dt.day` ignores month and year - confirm the sample spans a
# single calendar month, otherwise later months leak into the training split.
training_df = df.loc[lambda frame: frame['tweet_timestamp'].dt.day < 20]
validation_df = df.loc[lambda frame: frame['tweet_timestamp'].dt.day >= 20]

---

# Train models and serialize them

In [7]:
# Booster parameters shared by every per-target model: a logistic objective
# evaluated with log loss.
xgb_parameters = dict(
    objective='binary:logistic',
    eval_metric='logloss',
)

In [8]:
# Train one baseline booster per engagement target on the training split and
# serialize each one to ../models/<target>_baseline_model.model.
#
# The loop variable is called `booster` rather than `model` so that re-running
# this cell can never overwrite the `model()` prediction function that this
# notebook defines further down.
os.makedirs('../models', exist_ok=True)  # save_model needs the output directory to exist
for target in targets:
    dtrain = xgb.DMatrix(data=training_df[features], label=training_df[target])
    # The number of boosting rounds is left at the xgb.train default.
    booster = xgb.train(xgb_parameters, dtrain=dtrain)
    booster.save_model(f'../models/{target}_baseline_model.model')

---

# Define model function

In [9]:
def model(test_df):
    """Predict every engagement target for ``test_df`` with the saved baseline boosters.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame containing the four follower/following count feature columns
        listed in ``features`` below. Other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``targets`` (in that order) holding the
        corresponding booster's predictions. The frame carries
        ``test_df.index`` so that predictions stay aligned with the input
        rows in any label-based pandas operation.
    """
    features = ['engaged_with_user_follower_count', 'engaged_with_user_following_count', 'engaging_user_follower_count', 'engaging_user_following_count']
    targets = ['TARGET_reply', 'TARGET_retweet', 'TARGET_retweet_with_comment', 'TARGET_like']

    # Build the test DMatrix once from the needed features; every booster reuses it.
    dtest = xgb.DMatrix(data=test_df[features])

    # Load each saved booster and predict immediately; no need to keep the
    # boosters around once their predictions are computed.
    predictions = {}
    for target in targets:
        # Named `booster`, not `model`, to avoid shadowing this function's own name.
        booster = xgb.Booster()
        booster.load_model(f'../models/{target}_baseline_model.model')
        predictions[target] = booster.predict(dtest)

    # Reuse the caller's index instead of a fresh RangeIndex so that rows
    # line up with `test_df` when the two are compared or combined.
    return pd.DataFrame(predictions, index=test_df.index)

# Predict all targets for the validation (holdout) rows using the boosters
# that were serialized to disk earlier in this notebook.
predictions_df = model(validation_df)

---

# Evaluate model

In [16]:
# Report the two values returned by compute_score (printed as AP and RCE) for
# each predicted target, comparing the validation labels with the predictions.
for column in predictions_df.columns:
    score_line = 'AP {:.4f} and RCE {:.4f}'.format(
        *compute_score(validation_df[column], predictions_df[column])
    )
    print(f'{column:30}', score_line)

TARGET_reply                   AP 0.0445 and RCE -1.6484
TARGET_retweet                 AP 0.1164 and RCE 0.7508
TARGET_retweet_with_comment    AP 0.0095 and RCE -27.7191
TARGET_like                    AP 0.4501 and RCE 0.8516
