In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, 
                              AdaBoostClassifier)
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# load data
df_train = pd.read_csv('../chk_output/train_processed.csv')
df_test = pd.read_csv('../chk_output/test_processed.csv')

In [None]:
display(df_train.head(3))
display(df_test.head(3))

In [None]:
y_train = df_train['Popularity']
X_train = df_train.drop(['Popularity'], axis=1)

In [5]:
X_test = df_test

In [6]:
print(X_train.shape)
print(y_train.shape)

(27643, 9)
(27643,)


In [7]:
# hold out testing set from training set
X_train, X_test_holdout, y_train, y_test_holdout = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [8]:
# define model
models = {'lr': LogisticRegression(max_iter=5_000, random_state=42, solver='saga'),
          'rf': RandomForestClassifier(random_state=42),
          'gb': GradientBoostingClassifier(random_state=42),
          'et': ExtraTreesClassifier(random_state=42),
          'ada': AdaBoostClassifier(random_state=42),
          'svc': SVC(random_state=42, probability=True)
          
          

          

        }
        
        

NameError: name 'LogisticRegression' is not defined

# model function set up

In [9]:
feature_names = X_train.columns

In [10]:
class CustomTransformer():
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

    # This function takes a dataframe as input and returns a scaled version
def scale_df(input_df):
    ss = StandardScaler()
    feature_names = input_df.columns
    input_df = ss.fit_transform(input_df)
    input_df = pd.DataFrame(input_df, columns=feature_names)
    return input_df

In [11]:
# Instantiate lists to store results
init_list = []
gs_list = []

# Function to run model -- input scaler and model
def run_model(mod, mod_params={}, grid_search=False):
    
    # Initial dictionary to hold model results
    results = {}
    
    pipe = Pipeline([
            ('ss', CustomTransformer(scale_df)), # this will first standarize
            (mod, models[mod]) # choose which model
            ])
    
    if grid_search:
        # Instantiate list to store gridsearch results
        gs = GridSearchCV(pipe, param_grid=mod_params, cv=3, verbose=1, scoring='roc_auc', n_jobs=-1)
        gs.fit(X_train, y_train)
        pipe = gs.best_estimator_
        
    else:
        pipe.fit(X_train, y_train)
    
    # K-fold cv
    scores = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=3, scoring='roc_auc')
    print('all validation cv scores:\n', scores)
    print('means: %.3f (+/- %.3f)' % (scores.mean(), scores.std()))
    
    # performance reporting
    y_pred_holdout = pipe.predict(X_test_holdout) 
    #print(y_pred_holdout) # 測試holdout test set要用predict_proba嗎？ yes...但我目前沒用
    print("Test_holdout accurarcy %f" % roc_auc_score(y_test_holdout, y_pred_holdout))
    
    '''
    # Retrieve metrics
    predictions = pipe.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    y_test_pred_prob = pipe.predict_proba(X_test)[:,1]
    y_train_pred_prob = pipe.predict_proba(X_train)[:,1]
    
    results['model'] = mod
    results['train_auc'] = roc_auc_score(y_train, y_train_pred_prob)
    results['test_auc'] = roc_auc_score(y_test, y_test_pred_prob)
    results['precision'] = precision_score(y_test, predictions)
    results['specificity'] = tn / (tn + fp)
    results['recall'] = recall_score(y_test, predictions)
    results['f_score'] = f1_score(y_test, predictions)
    
    if grid_search:
        gs_list.append(results)
        print('### BEST PARAMS ###')
        display(pipe[1])
        
    else:
        init_list.append(results)
    
    print('### METRICS ###')
    display(results)
    
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    '''
    
    return pipe

In [12]:
lr = run_model('lr')
print(lr)

all validation cv scores:
 [0.58226739 0.5893681  0.58188528]
means: 0.585 (+/- 0.003)
Test_holdout accurarcy 0.557806
Pipeline(steps=[('ss', <__main__.CustomTransformer object at 0x137d88be0>),
                ('lr',
                 LogisticRegression(max_iter=5000, random_state=42,
                                    solver='saga'))])


In [13]:
rf = run_model('rf')
print(rf)

all validation cv scores:
 [0.5522019  0.56375488 0.55819564]
means: 0.558 (+/- 0.005)
Test_holdout accurarcy 0.549134
Pipeline(steps=[('ss', <__main__.CustomTransformer object at 0x137d968e0>),
                ('rf', RandomForestClassifier(random_state=42))])


In [14]:
gb = run_model('gb')
print(gb)

all validation cv scores:
 [0.57586222 0.58532273 0.58083028]
means: 0.581 (+/- 0.004)
Test_holdout accurarcy 0.554844
Pipeline(steps=[('ss', <__main__.CustomTransformer object at 0x137dd6e20>),
                ('gb', GradientBoostingClassifier(random_state=42))])


In [15]:
et = run_model('et')
print(et)

all validation cv scores:
 [0.5463235  0.56365405 0.55441392]
means: 0.555 (+/- 0.007)
Test_holdout accurarcy 0.545866
Pipeline(steps=[('ss', <__main__.CustomTransformer object at 0x137dd7400>),
                ('et', ExtraTreesClassifier(random_state=42))])


In [16]:
ada = run_model('ada')
print(ada)

all validation cv scores:
 [0.57923172 0.582351   0.58212313]
means: 0.581 (+/- 0.001)
Test_holdout accurarcy 0.556448
Pipeline(steps=[('ss', <__main__.CustomTransformer object at 0x137dd6760>),
                ('ada', AdaBoostClassifier(random_state=42))])




In [17]:
#svc = run_model('svc')
#print(svc)

In [18]:
#xgb = run_model('xgb')
#print(xgb)

In [19]:
# houyu's predict model

In [20]:
import xgboost as xgb
from xgboost import XGBClassifier
from scipy import stats

In [None]:
d_train = xgb.DMatrix(X_train, y_train)
# d_valid = xgb.DMatrix(x_val, y_val)
d_test = xgb.DMatrix(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.model_selection import RandomizedSearchCV, KFold

kfold = StratifiedKFold(n_splits = 5, random_state = 2021 ,shuffle=True)


In [None]:
xgb_params = {'eta': 0.05, 
              'max_depth': 5, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8,
              'min_child_weight' : 1.5,
              'objective': 'binary:logistic', 
              'eval_metric': 'auc', 
#               'lambda': 1.5,
#               'alpha': 0.6,
#               'n_estimators': 119,
             }
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_params = xgb_model.get_xgb_params()

In [None]:
cvresult = xgb.cv(xgb_params, d_train, num_boost_round=10, verbose_eval=10, nfold=5, metrics=['auc'], \
     early_stopping_rounds=50, stratified=True)


In [None]:
xgb_model.set_params(n_estimators=cvresult.shape[0])

In [None]:
xgb_model.fit(X_train, y_train, eval_metric='auc', verbose=True)

In [None]:
file_name = '../chk_output/xgb_topic_hash.csv'
y_pred = xgb_model.predict_proba(X_test)[:,1]
df_submission = pd.read_csv('./sample_submission.csv')
df_submission['Popularity'] = y_pred
df_submission.to_csv(file_name, index=False)