In [19]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, 
                              AdaBoostClassifier)
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier

In [20]:
# load data
df_train = pd.read_csv('../chk_output/train_processed.csv')
df_test = pd.read_csv('../chk_output/test_processed.csv')

In [21]:
display(df_train.head(3))
display(df_test.head(3))

Unnamed: 0,Popularity,popular_author,is_weekend,popular_month,popular_channel,1334,500,1303,542,1844,...,1005,1836,380,1204,217,569,18,1436,1298,1617
0,-1,0,0,0,-2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,1,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.316228,0.0,0.0,0.0,0.0
2,1,0,0,0,-2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,popular_author,is_weekend,popular_month,popular_channel,1334,500,1303,542,1844,736,...,1005,1836,380,1204,217,569,18,1436,1298,1617
0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# to numpy() array
y_train = df_train['Popularity'].values
X_train = df_train.drop(['Popularity'], axis=1).values

In [23]:
X_test = df_test.values

In [24]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(27643, 99)
(27643,)
(11847, 99)


In [25]:
# preprocess 

#data label from (-1, 1) -> (0, 1)
y_train[y_train == -1] = 0
print(y_train)

[0 1 1 ... 0 0 1]


In [76]:
# model function
ss = StandardScaler()
# do we need to fit on X_test?
# ref: https://stats.stackexchange.com/questions/202287/why-standardization-of-the-testing-set-has-to-be-performed-with-the-mean-and-sd
X_full = np.concatenate((X_train, X_test), axis = 0)
ss.fit(X_full)         # we fit for whole X, since the result of fitting on X_train is not so good
X_train = ss.transform(X_train)


X_test = ss.transform(X_test) 

In [77]:
import xgboost as xgb
from xgboost import XGBClassifier
from scipy import stats

In [78]:
# honyu's xgboost model

In [79]:
d_train = xgb.DMatrix(X_train, y_train)
# d_valid = xgb.DMatrix(x_val, y_val)
d_test = xgb.DMatrix(X_test)

In [80]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.model_selection import RandomizedSearchCV, KFold

kfold = StratifiedKFold(n_splits = 5, random_state = 2021 ,shuffle=True)


In [81]:
# ref: https://xgboost.readthedocs.io/en/latest/parameter.html
xgb_params = {'eta': 0.05, 
              'max_depth': 3, # the largest, the more overfitter. so far, 3 is the best
              'subsample': 0.75,  
              'colsample_bytree': 0.8,
              'min_child_weight' : 1.5,
              'objective': 'binary:logistic', 
              'eval_metric': 'auc', 
#               'lambda': 1.5,
#               'alpha': 0.6,
#               'n_estimators': 119,
             }
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_params = xgb_model.get_xgb_params()

In [82]:
cvresult = xgb.cv(xgb_params, d_train, num_boost_round=1000, verbose_eval=10, nfold=5, metrics=['auc'], \
     early_stopping_rounds=50, stratified=True)


[0]	train-auc:0.54465+0.00400	test-auc:0.53581+0.00587
[10]	train-auc:0.57882+0.00548	test-auc:0.57275+0.01005
[20]	train-auc:0.58737+0.00265	test-auc:0.57855+0.01027
[30]	train-auc:0.59186+0.00158	test-auc:0.57966+0.00975
[40]	train-auc:0.59722+0.00267	test-auc:0.58191+0.00912
[50]	train-auc:0.60310+0.00191	test-auc:0.58516+0.00912
[60]	train-auc:0.60570+0.00207	test-auc:0.58611+0.00877
[70]	train-auc:0.60882+0.00227	test-auc:0.58747+0.00864
[80]	train-auc:0.61223+0.00210	test-auc:0.58837+0.00877
[90]	train-auc:0.61477+0.00162	test-auc:0.58931+0.00891
[100]	train-auc:0.61684+0.00206	test-auc:0.58980+0.00904
[110]	train-auc:0.61916+0.00250	test-auc:0.59065+0.00883
[120]	train-auc:0.62090+0.00229	test-auc:0.59166+0.00927
[130]	train-auc:0.62302+0.00230	test-auc:0.59292+0.00906
[140]	train-auc:0.62481+0.00208	test-auc:0.59349+0.00946
[150]	train-auc:0.62623+0.00205	test-auc:0.59394+0.00939
[160]	train-auc:0.62757+0.00211	test-auc:0.59407+0.00937
[170]	train-auc:0.62887+0.00214	test-auc:0

In [83]:
xgb_model.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=3,
              min_child_weight=1.5, missing=nan, monotone_constraints=None,
              n_estimators=379, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=0.75,
              tree_method=None, validate_parameters=None, verbosity=None)

In [84]:
xgb_model.fit(X_train, y_train, eval_metric='auc', verbose=True)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=3,
              min_child_weight=1.5, missing=nan, monotone_constraints='()',
              n_estimators=379, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.75, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [85]:

y_pred = xgb_model.predict_proba(X_test)[:,1]
print(y_pred)
print(y_pred.shape)

[0.4663393  0.5548561  0.47420698 ... 0.47515517 0.5088756  0.46412808]
(11847,)


In [86]:
df_submission = pd.read_csv('../chk_output/sample_submission.csv')
df_submission['Popularity'] = y_pred


In [87]:
# version1: feature= 100, feature hashing = 1024
'''
file_name = '../chk_output/xgb_v1_feature100_hash1024.csv'
df_submission.to_csv(file_name, index=False)
'''


file_name = '../chk_output/xgb_v1_feature100_hash2048.csv'
df_submission.to_csv(file_name, index=False)
