In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'
sns.set()
import gc

---

# Loading Data

In [8]:
%%time
df = pd.read_parquet('../data/interim/sample2m.parquet')

CPU times: user 3.5 s, sys: 1.09 s, total: 4.59 s
Wall time: 5.47 s


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2241793 entries, 0 to 2241792
Data columns (total 22 columns):
 #   Column                              Dtype         
---  ------                              -----         
 0   hashtags                            object        
 1   present_media                       object        
 2   present_links                       object        
 3   present_domains                     object        
 4   tweet_type                          category      
 5   language                            category      
 6   tweet_timestamp                     datetime64[ns]
 7   engaged_with_user_id                object        
 8   engaged_with_user_follower_count    int64         
 9   engaged_with_user_following_count   int64         
 10  engaged_with_user_is_verified       bool          
 11  engaged_with_user_account_creation  datetime64[ns]
 12  engaging_user_id                    object        
 13  engaging_user_follower_count        int64 

In [10]:
X = df[['engaged_with_user_follower_count', 'engaged_with_user_following_count', 'engaging_user_follower_count', 'engaging_user_following_count']].values
X.shape

(2241793, 4)

In [12]:
y = df['TARGET_reply'].values
y.shape

(2241793,)

---

# Holdout validation

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

---

This is a very imbalanced binary classification problem. Only 2.9% of the samples are positive.
This means that a classifier which always returns 0 will achieve 97.1% accuracy.

In [25]:
set(y_train)

{0, 1}

In [26]:
y_train.sum() / y_train.shape[0] * 100

2.91980701153363

In [27]:
y_test.sum() / y_test.shape[0] * 100

2.8973198274954544

---

# Scoring functions

In [100]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def relative_cross_entropy_score(gt, pred):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

def compute_score(y_true, y_score):
    ap = average_precision_score(y_test, y_score)
    rce = relative_cross_entropy_score(y_test, y_score)
    return ap, rce

---

# Baseline Models

## Most frequent class

In [121]:
print('{:.4f} and {:.4f}'.format(*compute_score(y_test, np.zeros(y_test.shape[0]))))

0.0290 and -662.9919


## Random

In [122]:
print('{:.4f} and {:.4f}'.format(*compute_score(y_test, np.random.rand(y_test.shape[0]))))

0.0289 and -661.8205


Almost no difference between predicting the most frequent class and just random predictions.

In line with the leaderboard's random predictions which are around `0.0227, -822.4109`

---

# XGBoost Model

In [171]:
from xgboost.sklearn import XGBClassifier

In [172]:
positive_instances_count = y_train.sum()
negative_instances_count = y_train.shape[0] - y_train.sum()

In [183]:
clf = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    scale_pos_weight=negative_instances_count / positive_instances_count,
)
clf

In [184]:
%%time
clf.fit(X_train, y_train)

CPU times: user 4min 1s, sys: 123 ms, total: 4min 2s
Wall time: 1min 1s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=33.248838914690786, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [185]:
print('{:.4f} and {:.4f}'.format(*compute_score(y_test, clf.predict(X_test))))

0.0373 and -11773.7249


## Critique

It performs exactly like random predictions.

Also, it almost always predicts 0

In [176]:
y_test.sum()

16238

In [182]:
pd.DataFrame(clf.predict(X_test)).value_counts()

0    302550
1    257899
dtype: int64