In [12]:
# ============================================== [ setting ] ===================================================
import gc
import os
import random
import numpy as np
import pandas as pd

import lightgbm as lgb
from catboost import CatBoostClassifier

from time import time
from tqdm import tqdm_notebook
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
os.chdir("/Users/gimjiseong/Downloads/[ Project ]/[ Project ] Kaggle/T-Academy X Kakr")

def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED = 2109
seed_everything(SEED)

In [13]:
# =========================================== [ Pre-processing ] ===============================================
df_train = pd.read_csv('./data/train.csv')
df_test  = pd.read_csv('./data/test.csv')

df_train['income'] = np.where(df_train['income'] == '>50K', 1, 0)
# df_train.drop(['education'], axis=1, inplace = True)
# df_test.drop(['education'], axis=1, inplace = True)
df_train["fnlwgt"] = df_train["fnlwgt"].apply(lambda x : np.log(x))
df_test["fnlwgt"] = df_test["fnlwgt"].apply(lambda x : np.log(x))

workclass_other = ['Without-pay', 'Never-worked', '?']
df_train['workclass'] = df_train['workclass'].apply(lambda x: 'Other' if x in workclass_other else x)
df_test['workclass'] = df_test['workclass'].apply(lambda x: 'Other' if x in workclass_other else x)

def age_to_cat(x):
    if x < 20:
        return '10대 이하'
    elif x < 30:
        return '20대'
    elif x < 40:
        return '30대'
    elif x < 50:
        return '40대'
    elif x < 60:
        return '50대'
    elif x < 70:
        return '60대'
    elif x < 80:
        return '70대'
    else:
        return '80대 이상'
    
def age_to_level(x):
    if x < 35:
        return 'young_level'
    elif x < 65:
        return 'middle_level'
    else:
        return 'old_level'
    
df_train['age_to_cat'] = df_train['age'].apply(age_to_cat)
df_test['age_to_cat'] = df_test['age'].apply(age_to_cat)
df_train['age_to_level'] = df_train['age'].apply(age_to_level)
df_test['age_to_level'] = df_test['age'].apply(age_to_level)

def education_level(x):
    if (x == 'Doctorate') or (x == 'Prof_school') or (x == 'Masters'):
        return 'High_edu'
    elif (x == 'Bachelors') or (x == 'Assoc_acdm') or (x == 'Assoc_voc') or (x == 'Some-college'):
        return 'Middle-edu'
    else:
        return 'Low-edu'
    
df_train['education_level'] = df_train['education'].apply(education_level)
df_test['education_level'] = df_test['education'].apply(education_level)

df_train.loc[df_train['marital_status'] == 'Married-AF-spouse', 'marital_status'] = 'Married-civ-spouse'
df_test.loc[df_test['marital_status'] == 'Married-AF-spouse', 'marital_status'] = 'Married-civ-spouse'

def now_married(x):
    if x == 'Married-civ-spouse':
        return 1
    else:
        return 0
df_train['now_married'] = df_train['marital_status'].apply(now_married)
df_test['now_married'] = df_test['marital_status'].apply(now_married)

def public_officer(x):
    if (x == 'State-gov') or (x == 'Loacl-gov') or (x == 'Federal-gov'):
        return 1
    else:
        return 0
df_train['Public_worker'] = df_train['workclass'].apply(public_officer)
df_test['Public_worker'] = df_test['workclass'].apply(public_officer)

df_train.loc[df_train['occupation'].isin(['Armed-Forces', 'Priv-house-serv']), 'occupation'] = 'Priv-house-serv'
df_test.loc[df_test['occupation'].isin(['Armed-Forces', 'Priv-house-serv']), 'occupation'] = 'Priv-house-serv'

marital_status_data = df_train.groupby('marital_status').aggregate(np.mean)
marital_status_ranks = {}
rank = 1
for idx, row in marital_status_data.sort_values(by='age').iterrows():
    marital_status_ranks[idx] = rank
    rank += 1
for data in [df_train, df_test]:
    marital_status_feature = []
    for idx, row in data.iterrows():
        marital_status_feature.append(marital_status_ranks[row.marital_status])
    data['marital_status_ranks'] = marital_status_feature
    
occupation_data = df_train.groupby('occupation').aggregate(np.mean)
occupation_ranks = {}
rank = 1
for idx, row in occupation_data.sort_values(by='age').iterrows():
    occupation_ranks[idx] = rank
    rank += 1
for data in [df_train, df_test]:
    occupation_feature = []
    for idx, row in data.iterrows():
        occupation_feature.append(occupation_ranks[row.occupation])
    data['occupation_ranks'] = occupation_feature
    
workclass_data = df_train.groupby('workclass').aggregate(np.mean)
workclass_ranks = {}
rank = 1
for idx, row in workclass_data.sort_values(by='age').iterrows():
    workclass_ranks[idx] = rank
    rank += 1
for data in [df_train, df_test]:
    workclass_feature = []
    for idx, row in data.iterrows():
        workclass_feature.append(workclass_ranks[row.workclass])
    data['workclass_ranks'] = workclass_feature
    
df_train['capital_diff'] = df_train['capital_gain'] - df_train['capital_loss']
df_test['capital_diff'] = df_test['capital_gain'] - df_test['capital_loss']

df_train['hours_per_year'] = df_train['hours_per_week'] * 52
df_train['hours_per_month'] = df_train['hours_per_week'] * 4
df_train['hours_per_day'] = df_train['hours_per_week']/7

df_test['hours_per_year'] = df_test['hours_per_week'] * 52
df_test['hours_per_month'] = df_test['hours_per_week'] * 4
df_test['hours_per_day'] = df_test['hours_per_week']/7

df_train['Edu_CapGn_mean'] = df_train.groupby(['education'])['capital_gain'].transform('mean')
df_test['Edu_CapGn_mean'] = df_test.groupby(['education'])['capital_gain'].transform('mean')
# df_train['Edu_CapGn_std'] = df_train.groupby(['education'])['capital_gain'].transform('std')
# df_test['Edu_CapGn_std'] = df_test.groupby(['education'])['capital_gain'].transform('std')

df_train['Marr_CapGn_mean'] = df_train.groupby(['marital_status'])['capital_gain'].transform('mean')
df_test['Marr_CapGn_mean'] = df_test.groupby(['marital_status'])['capital_gain'].transform('mean')
# df_train['Marr_CapGn_std'] = df_train.groupby(['marital_status'])['capital_gain'].transform('std')
# df_test['Marr_CapGn_std'] = df_test.groupby(['marital_status'])['capital_gain'].transform('std')

df_train['Occ_CapGn_mean'] = df_train.groupby(['occupation'])['capital_gain'].transform('mean')
df_test['Occ_CapGn_mean'] = df_test.groupby(['occupation'])['capital_gain'].transform('mean')
# df_train['Occ_CapGn_std'] = df_train.groupby(['occupation'])['capital_gain'].transform('std')
# df_test['Occ_CapGn_std'] = df_test.groupby(['occupation'])['capital_gain'].transform('std')

df_train['Edu_Capdf_mean'] = df_train.groupby(['education'])['capital_diff'].transform('mean')
df_test['Edu_Capdf_mean'] = df_test.groupby(['education'])['capital_diff'].transform('mean')
# df_train['Edu_Capdf_std'] = df_train.groupby(['education'])['capital_diff'].transform('std')
# df_test['Edu_Capdf_std'] = df_test.groupby(['education'])['capital_diff'].transform('std')

df_train['Marr_Capdf_mean'] = df_train.groupby(['marital_status'])['capital_diff'].transform('mean')
df_test['Marr_Capdf_mean'] = df_test.groupby(['marital_status'])['capital_diff'].transform('mean')
# df_train['Marr_Capdf_std'] = df_train.groupby(['marital_status'])['capital_diff'].transform('std')
# df_test['Marr_Capdf_std'] = df_test.groupby(['marital_status'])['capital_diff'].transform('std')

df_train['Occ_Capdf_mean'] = df_train.groupby(['occupation'])['capital_diff'].transform('mean')
df_test['Occ_Capdf_mean'] = df_test.groupby(['occupation'])['capital_diff'].transform('mean')
# df_train['Occ_Capdf_std'] = df_train.groupby(['occupation'])['capital_diff'].transform('std')
# df_test['Occ_Capdf_std'] = df_test.groupby(['occupation'])['capital_diff'].transform('std')

# pos_key = df_train.loc[(df_train['income'] == 1) & (df_train['capital_diff'] > 0), 'capital_diff'].value_counts().sort_index().keys().tolist()
# neg_key = df_train.loc[(df_train['income'] == 0) & (df_train['capital_diff'] > 0), 'capital_diff'].value_counts().sort_index().keys().tolist()

# capital_diff_pos_key = [key for key in pos_key if key not in neg_key]
# capital_diff_neg_key = [key for key in neg_key if key not in pos_key]
# df_train['capital_diff_pos_key'] = df_train['capital_diff'].apply(lambda x: x in capital_diff_pos_key)
# df_train['capital_diff_neg_key'] = df_train['capital_diff'].apply(lambda x: x in capital_diff_neg_key)
# df_test['capital_diff_pos_key'] = df_test['capital_diff'].apply(lambda x: x in capital_diff_pos_key)
# df_test['capital_diff_neg_key'] = df_test['capital_diff'].apply(lambda x: x in capital_diff_neg_key)

# df_train.drop(['native_country'], axis=1, inplace = True)
# df_test.drop(['native_country'], axis=1, inplace = True)

y = df_train['income']
df_train = df_train.drop('income', axis=1)

for col in df_train.columns:
    if df_train[col].dtype.name == 'object' or df_test[col].dtype.name == 'object':
        le = LabelEncoder()
        le.fit(list(df_train[col].values) + list(df_test[col].values))
        df_train[col] = le.transform(list(df_train[col].values))
        df_test[col]  = le.transform(list(df_test[col].values))

In [14]:
# ======================================== [ memory optimizaiton] ==============================================
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2   
    
    for col in df.columns:
        col_type = df[col].dtypes
        
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
        
    return df

df_train = reduce_mem_usage(df_train)
df_test  = reduce_mem_usage(df_test)

Mem. usage decreased to  1.27 Mb (80.7% reduction)
Mem. usage decreased to  0.32 Mb (80.7% reduction)


In [19]:
# =========================================== [ ligthGBM model] ================================================
X_train = df_train.drop(['id'], axis=1)
X_test  = df_test.drop(['id'], axis=1) 
y_train = y

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat  = np.round(y_hat)
    
    return 'f1', f1_score(y_true, y_hat, average='weighted'), True

ligtGBM_params = {
          'objective': 'binary',
          'max_depth': -1,
          "max_bin" : 100,
          'n_jobs': -1,
          'learning_rate': 0.01,
#           'scale_pos_weight' : 1.25, # 1.25 = 0.8725
          'num_leaves': 50,
          'min_data_in_leaf': 25,
          'lambda_l1' : 0.21,
          'lambda_l2': 1.371190, 
          'feature_fraction': 0.749778, 
          'bagging_fraction': 0.893392, 
          'boosting_type': 'gbdt',
          'subsample_freq': 1,
#           'subsample': 0.80,    
          'colsample_bytree' : 0.8,
          'early_stopping_round' : 200,
          'n_estimators': 100000,
          'verbose': -1,
          'random_state': SEED,
          }

NFOLDS = 25
folds = KFold(n_splits = NFOLDS)

columns = X_train.columns
splits = folds.split(X_train, y_train)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X_train.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

oof_train = np.zeros((len(X_train), ))
oof_test = np.zeros((len(X_test), ))

for fold_n, (trn_idx, val_idx) in enumerate(splits):
    X_trn, X_val = X_train[columns].iloc[trn_idx], X_train[columns].iloc[val_idx]
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]
    
    dtrain = lgb.Dataset(X_trn, label=y_trn)
    dvalid = lgb.Dataset(X_val, label=y_val)
    
    clf = lgb.train(
        ligtGBM_params,
        dtrain,
        valid_sets = [dtrain, dvalid],
        verbose_eval = 200,
        early_stopping_rounds = 200,
        feval = lgb_f1_score
    )
    
    feature_importances[f'fold_{fold_n+1}'] = clf.feature_importance()
    
    y_pred_val = clf.predict(X_val) 
    oof_train[val_idx] = y_pred_val
    y_pred_val = [int(v >= 0.5) for v in y_pred_val]
    
    y_oof[val_idx] = y_pred_val
    print(f"Fold {fold_n + 1} | F1 Score: {f1_score(y_val, y_pred_val, average='weighted')}")
    
    score += f1_score(y_val, y_pred_val, average='weighted') / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    oof_test += clf.predict(X_test) / NFOLDS
          
    del X_trn, X_val, y_trn, y_val
    gc.collect()
    
print(f"\nMean F1 score = {score}")
print(f"OOF F1 score = {f1_score(y, y_oof, average='weighted')}")

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.297268	training's f1: 0.866005	valid_1's binary_logloss: 0.318286	valid_1's f1: 0.851723
[400]	training's binary_logloss: 0.26175	training's f1: 0.879357	valid_1's binary_logloss: 0.290934	valid_1's f1: 0.859855
[600]	training's binary_logloss: 0.246099	training's f1: 0.885818	valid_1's binary_logloss: 0.285142	valid_1's f1: 0.860904
Early stopping, best iteration is:
[537]	training's binary_logloss: 0.250149	training's f1: 0.883831	valid_1's binary_logloss: 0.285967	valid_1's f1: 0.86225
Fold 1 | F1 Score: 0.8622504093937868
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.297791	training's f1: 0.865437	valid_1's binary_logloss: 0.306322	valid_1's f1: 0.851427
[400]	training's binary_logloss: 0.261968	training's f1: 0.879464	valid_1's binary_logloss: 0.283975	valid_1's f1: 0.865115
[600]	training's binary_logloss: 0.246244	training's f1: 0.88

Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.298043	training's f1: 0.864507	valid_1's binary_logloss: 0.302877	valid_1's f1: 0.863411
[400]	training's binary_logloss: 0.262533	training's f1: 0.878812	valid_1's binary_logloss: 0.27689	valid_1's f1: 0.873344
Early stopping, best iteration is:
[325]	training's binary_logloss: 0.271629	training's f1: 0.874887	valid_1's binary_logloss: 0.282667	valid_1's f1: 0.875186
Fold 15 | F1 Score: 0.8751863002527356
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.297483	training's f1: 0.86525	valid_1's binary_logloss: 0.313576	valid_1's f1: 0.859798
[400]	training's binary_logloss: 0.261826	training's f1: 0.879734	valid_1's binary_logloss: 0.288705	valid_1's f1: 0.866517
[600]	training's binary_logloss: 0.246051	training's f1: 0.88552	valid_1's binary_logloss: 0.281402	valid_1's f1: 0.870548
Early stopping, best iteration is:
[419]	training's binary_lo

Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.296974	training's f1: 0.866181	valid_1's binary_logloss: 0.325498	valid_1's f1: 0.837805
[400]	training's binary_logloss: 0.261642	training's f1: 0.880273	valid_1's binary_logloss: 0.295206	valid_1's f1: 0.842669
[600]	training's binary_logloss: 0.24596	training's f1: 0.88548	valid_1's binary_logloss: 0.287658	valid_1's f1: 0.85308
[800]	training's binary_logloss: 0.235512	training's f1: 0.891029	valid_1's binary_logloss: 0.286165	valid_1's f1: 0.855977
[1000]	training's binary_logloss: 0.226753	training's f1: 0.895689	valid_1's binary_logloss: 0.285893	valid_1's f1: 0.858358
Early stopping, best iteration is:
[969]	training's binary_logloss: 0.227999	training's f1: 0.895525	valid_1's binary_logloss: 0.285932	valid_1's f1: 0.85967
Fold 29 | F1 Score: 0.8596699866210793
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.297987	training's f1: 0.86

In [16]:
# =========================================== [ CatBoost model] ================================================
df_train["ligtgbm_oof"] = oof_train
df_test["ligtgbm_oof"] = oof_test

X_train = df_train
X_test  = df_test
y_train = y

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat  = np.round(y_hat)
    
    return 'f1', f1_score(y_true, y_hat, average='weighted'), True

cat_params = {
              'n_estimators' : 300000,
              'learning_rate': 0.01,
              'random_seed': SEED,
              'metric_period' : 200,
              'od_wait' : 455, # od_wait : 300 = 0.87109 , 350 = 0.87135, 400 = 0.87159, 450 = 0.87160
              'depth': 50,
              'border_count' : 254,
              'l2_leaf_reg' : 30, # 'l2_leaf_reg' : 30 = 0.8177 
              'min_data_in_leaf' : 16,
              'bootstrap_type' : 'Bayesian',
              'bagging_temperature' : 0.9,
              'max_leaves' : 60, # max_leaves : 60 = 0.8719
              'grow_policy' : 'Lossguide',
#               'colsample_bylevel':0.7,
              'eval_metric' : 'F1',
              'task_type' : 'CPU'
                } 
NFOLDS = 10
folds = KFold(n_splits=NFOLDS)

columns = X_train.columns
splits = folds.split(X_train, y_train)
y_preds_CAT = np.zeros(X_test.shape[0])
y_oof = np.zeros(X_train.shape[0])
score = 0

oof_train_cat = np.zeros((len(X_train), ))
oof_test_cat = np.zeros((len(X_test), ))

for fold_n, (trn_idx, val_idx) in enumerate(splits):
    X_trn, X_val = X_train[columns].iloc[trn_idx], X_train[columns].iloc[val_idx]
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]
    
    dtrain = lgb.Dataset(X_trn, label = y_trn)
    dvalid = lgb.Dataset(X_val, label = y_val)
    
    estimator = CatBoostClassifier(**cat_params)        
    clf = estimator.fit(
        X_trn, y_trn,
        eval_set = (X_val, y_val),
        use_best_model = True)
    
    y_pred_val = clf.predict(X_val) 
    oof_train_cat[val_idx] = y_pred_val
    y_pred_val = [int(v >= 0.5) for v in y_pred_val]
    
    y_oof[val_idx] = y_pred_val
    print(f"Fold {fold_n + 1} | F1 Score: {f1_score(y_val, y_pred_val, average='weighted')}")
    
    score += f1_score(y_val, y_pred_val, average='weighted') / NFOLDS
    y_preds_CAT += clf.predict(X_test) / NFOLDS
    oof_test_cat += clf.predict(X_test) / NFOLDS
          
    del X_trn, X_val, y_trn, y_val
    gc.collect()
    
print(f"\nMean F1 score = {score}")
print(f"OOF F1 score = {f1_score(y, y_oof, average='weighted')}")

In [5]:
y_preds

array([0.01295086, 0.52241161, 0.00996503, ..., 0.05543319, 0.22476706,
       0.0164165 ])

In [17]:
submission = pd.read_csv('./data/sample_submission.csv')
y_preds = np.where(y_preds >= 0.5, 1, 0)
submission['prediction'] = y_preds

In [19]:
lightgmb_sub = pd.read_csv('./data/sample_submission.csv')
cat_sub = pd.read_csv('./data/sample_submission.csv')
submission = pd.read_csv('./data/sample_submission.csv')

lightgmb_sub['prediction'] = y_preds
cat_sub['prediction'] = y_preds_CAT
submission['prediction'] = np.mean(lightgmb_sub['prediction'], cat_sub['prediction']) # lightgmb_sub['prediction'] * 0.6 + cat_sub['prediction'] * 0.4
submission['prediction'] = np.where(submission['prediction'] >= 0.5, 1, 0)

array([1. , 1. , 1. , ..., 0.9, 1. , 1. ])

In [20]:
submission.to_csv('Kuda_lightGBM_CV25.csv', index=False)
submission.head(10)

Unnamed: 0,id,prediction
0,0,0
1,1,0
2,2,0
3,3,1
4,4,1
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0
