In [67]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, 
                              AdaBoostClassifier)
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier

In [68]:
# load data
df_train = pd.read_csv('../chk_output/train_processed.csv')
df_test = pd.read_csv('../chk_output/test_processed.csv')

In [69]:
display(df_train.head(3))
display(df_test.head(3))

Unnamed: 0,Popularity,popular_author,is_weekend,popular_month,popular_channel,310,500,820,736,279,...,407,740,369,499,901,230,750,39,662,126
0,-1,0,0,0,-2,0.0,0.0,0.0,0.204124,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,1,2,0.0,0.0,0.0,0.158114,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,-2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,popular_author,is_weekend,popular_month,popular_channel,310,500,820,736,279,507,...,407,740,369,499,901,230,750,39,662,126
0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
# to numpy() array
y_train = df_train['Popularity'].values
X_train = df_train.drop(['Popularity'], axis=1).values

In [71]:
X_test = df_test.values

In [72]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(27643, 99)
(27643,)
(11847, 99)


In [73]:
# preprocess 

#data label from (-1, 1) -> (0, 1)
y_train[y_train == -1] = 0
print(y_train)

[0 1 1 ... 0 0 1]


In [74]:
# model function
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

In [75]:
import xgboost as xgb
from xgboost import XGBClassifier
from scipy import stats

In [76]:
# honyu's xgboost model

In [77]:
d_train = xgb.DMatrix(X_train, y_train)
# d_valid = xgb.DMatrix(x_val, y_val)
d_test = xgb.DMatrix(X_test)

In [78]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.model_selection import RandomizedSearchCV, KFold

kfold = StratifiedKFold(n_splits = 5, random_state = 2021 ,shuffle=True)


In [79]:
xgb_params = {'eta': 0.05, 
              'max_depth': 5, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8,
              'min_child_weight' : 1.5,
              'objective': 'binary:logistic', 
              'eval_metric': 'auc', 
#               'lambda': 1.5,
#               'alpha': 0.6,
#               'n_estimators': 119,
             }
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_params = xgb_model.get_xgb_params()

In [81]:
cvresult = xgb.cv(xgb_params, d_train, num_boost_round=1000, verbose_eval=10, nfold=5, metrics=['auc'], \
     early_stopping_rounds=50, stratified=True)


[0]	train-auc:0.55842+0.00417	test-auc:0.53960+0.00408
[10]	train-auc:0.60426+0.00264	test-auc:0.58036+0.00667
[20]	train-auc:0.61156+0.00279	test-auc:0.58266+0.00797
[30]	train-auc:0.61687+0.00242	test-auc:0.58388+0.00757
[40]	train-auc:0.62393+0.00110	test-auc:0.58493+0.00750
[50]	train-auc:0.62904+0.00164	test-auc:0.58607+0.00710
[60]	train-auc:0.63376+0.00122	test-auc:0.58688+0.00766
[70]	train-auc:0.63842+0.00082	test-auc:0.58873+0.00812
[80]	train-auc:0.64282+0.00044	test-auc:0.58909+0.00834
[90]	train-auc:0.64637+0.00055	test-auc:0.58912+0.00748
[100]	train-auc:0.64993+0.00069	test-auc:0.58967+0.00764
[110]	train-auc:0.65331+0.00052	test-auc:0.59032+0.00704
[120]	train-auc:0.65651+0.00071	test-auc:0.59035+0.00742
[130]	train-auc:0.65911+0.00062	test-auc:0.59108+0.00740
[140]	train-auc:0.66174+0.00036	test-auc:0.59132+0.00756
[150]	train-auc:0.66454+0.00079	test-auc:0.59193+0.00754
[160]	train-auc:0.66661+0.00111	test-auc:0.59212+0.00755
[170]	train-auc:0.66851+0.00133	test-auc:0

In [82]:
xgb_model.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=5,
              min_child_weight=1.5, missing=nan, monotone_constraints=None,
              n_estimators=260, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=0.8,
              tree_method=None, validate_parameters=None, verbosity=None)

In [83]:
xgb_model.fit(X_train, y_train, eval_metric='auc', verbose=True)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eta=0.05, eval_metric='auc', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=5,
              min_child_weight=1.5, missing=nan, monotone_constraints='()',
              n_estimators=260, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [87]:
file_name = '../chk_output/xgb_v1_feature100.csv'
y_pred = xgb_model.predict_proba(X_test)[:,1]
print(y_pred)
print(y_pred.shape)

[0.46186605 0.5442448  0.4730444  ... 0.47313458 0.49735275 0.41102085]
(11847,)


In [85]:
df_submission = pd.read_csv('../chk_output/sample_submission.csv')
df_submission['Popularity'] = y_pred
df_submission.to_csv(file_name, index=False)