In [1]:
import os
import numpy as np
import pandas as pd
import warnings

from google.colab import drive

# Silence every Python warning to keep cell output readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used further down in the notebook.
warnings.filterwarnings('ignore')
drive.mount("/content/drive")

# Use an absolute path under the mount point: a relative chdir only resolves
# from the kernel's start-up directory, so re-running this cell after the
# working directory has already changed would raise FileNotFoundError.
os.chdir("/content/drive/MyDrive/AI competition/Round 1")
os.listdir()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['competition_data',
 '__pycache__',
 'html',
 'submission',
 'logs.log',
 'playground.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb',
 'evaluator.py',
 '2022-08-04 feature EDA.ipynb',
 "2022-08-05 LGB_train(hyeonbin's parameter tuning).ipynb",
 'model compare.ipynb',
 'Optuna Optimization.ipynb']

In [2]:
from evaluator import Evaluator, Model

# Read the three competition tables (paths are relative to the current working
# directory) in the order train -> test -> sample submission.
train_df, test_df, submission_df = map(
    pd.read_csv,
    (
        'competition_data/train.csv',
        "competition_data/test.csv",
        "competition_data/sample_submission.csv",
    ),
)
# Preview the first training rows as the cell output.
train_df.head()

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,engnat,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness
0,0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,...,1.0,20,2.0,12.0,4.0,2.0,1.0,4.0,2.0,1
1,1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,...,1.0,49,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1
2,2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,...,2.0,43,1.0,2.0,2.0,2.0,3.0,4.0,2.0,1
3,3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,...,1.0,17,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1
4,4,4.0,4.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,...,2.0,18,2.0,12.0,1.0,2.0,1.0,1.0,2.0,0


In [16]:
# LightGBM parameter set passed to lgb.train by the cross-validation cell.
params = dict(
    # task definition and logging
    objective='binary',
    learning_rate=0.01,
    random_state=42,
    metric='auc',
    verbose=-1,
    feature_pre_filter=False,
    # regularisation
    lambda_l1=1.001449223665896e-08,
    lambda_l2=4.805472362670594e-05,
    # tree shape and row/column sampling
    num_leaves=243,
    feature_fraction=0.4,
    bagging_fraction=0.8823839979334422,
    bagging_freq=5,
    min_child_samples=5,
)

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from tqdm import tqdm

# 5-fold cross-validation of a LightGBM binary classifier on train_df.
# Produces `result_df`: one row of validation metrics per fold plus a "mean"
# row that also carries the accuracy of a model refit on the full training set.

# `my_imputer`, `X` and `y` are not used by this cell; they are kept so that
# any other cell relying on these names keeps working.
my_imputer = SimpleImputer()
kf = KFold(n_splits=5, random_state=42, shuffle=True)
X = train_df.drop(["nerdiness", "country"], axis=1)
y = train_df["nerdiness"]

metrics = [
    "accuracy", "precision", "recall",
    "f1-score", "roc_auc", "mae"
]
metrics_functions_map = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1-score": f1_score,
    "roc_auc": roc_auc_score,
    "mae": mean_absolute_error,
}
# Metrics scored on hard 0/1 labels; the others are scored on raw probabilities.
label_metrics = ("accuracy", "precision", "recall", "f1-score")

# NOTE(review): train_df.head() shows an `index` column that stays in train_x;
# presumably a row id -- confirm whether it is meant to be a model feature.
train_x = train_df.drop(['nerdiness', 'country'], axis=1)
train_y = train_df['nerdiness']

# Collect one metrics dict per fold and build the frame once after the loop
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
fold_rows = []
for train_index, val_index in tqdm(kf.split(train_x), total=kf.get_n_splits()):
    # KFold.split yields positional indices, so rows must be taken with .iloc;
    # .loc only gives the same rows when the frame has a default RangeIndex.
    X_train, X_valid = train_x.iloc[train_index], train_x.iloc[val_index]
    y_train, y_valid = train_y.iloc[train_index], train_y.iloc[val_index]

    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are no longer
    # accepted by lgb.train in LightGBM >= 4.0 (replaced by callbacks); they are
    # kept because the installed version is not visible from this notebook.
    model = lgb.train(params, train_set, 1000, valid_set, verbose_eval=100, early_stopping_rounds=160)
    score_valid = model.predict(X_valid)
    predict_valid = np.round(score_valid)  # threshold probabilities at 0.5

    fold_rows.append({
        metric: metrics_functions_map[metric](
            y_valid,
            predict_valid if metric in label_metrics else score_valid,
        )
        for metric in metrics
    })

result_df = pd.DataFrame(fold_rows, columns=metrics + ["train_acc"])
# Label folds 1..k from the number of collected rows instead of relying on a
# loop variable leaking out of the for-loop.
result_df["fold"] = list(range(1, len(fold_rows) + 1))
result_df = result_df.set_index("fold")

## add training accuracy
mean = result_df.mean(axis=0)

# Refit on the whole training set and evaluate on that same data, so the value
# stored below is a training accuracy, not a held-out score.
train_set = lgb.Dataset(train_x, label=train_y)
valid_set = lgb.Dataset(train_x, label=train_y)

model = lgb.train(params, train_set, 1000, valid_set, verbose_eval=100, early_stopping_rounds=160)
score_valid = model.predict(train_x)
predict_valid = np.round(score_valid)

# scikit-learn metric signature is (y_true, y_pred).
mean["train_acc"] = accuracy_score(train_y, predict_valid)
result_df.loc["mean"] = mean

result_df

0it [00:00, ?it/s]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.827585
[200]	valid_0's auc: 0.836894
[300]	valid_0's auc: 0.843926
[400]	valid_0's auc: 0.850303
[500]	valid_0's auc: 0.855147
[600]	valid_0's auc: 0.858562
[700]	valid_0's auc: 0.861168
[800]	valid_0's auc: 0.863008
[900]	valid_0's auc: 0.864767
[1000]	valid_0's auc: 0.866542
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.866551


1it [00:15, 15.51s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.843447
[200]	valid_0's auc: 0.853561
[300]	valid_0's auc: 0.861
[400]	valid_0's auc: 0.866701
[500]	valid_0's auc: 0.871066
[600]	valid_0's auc: 0.873808
[700]	valid_0's auc: 0.87622
[800]	valid_0's auc: 0.877564
[900]	valid_0's auc: 0.879097
[1000]	valid_0's auc: 0.87998
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.87998


2it [00:30, 15.15s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.839601
[200]	valid_0's auc: 0.848853
[300]	valid_0's auc: 0.856247
[400]	valid_0's auc: 0.863048
[500]	valid_0's auc: 0.867491
[600]	valid_0's auc: 0.870344
[700]	valid_0's auc: 0.872452
[800]	valid_0's auc: 0.874522
[900]	valid_0's auc: 0.876228
[1000]	valid_0's auc: 0.877238
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.877238


3it [00:47, 15.89s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.841089
[200]	valid_0's auc: 0.850547
[300]	valid_0's auc: 0.857046
[400]	valid_0's auc: 0.862326
[500]	valid_0's auc: 0.866945
[600]	valid_0's auc: 0.869896
[700]	valid_0's auc: 0.872343
[800]	valid_0's auc: 0.874292
[900]	valid_0's auc: 0.875699
[1000]	valid_0's auc: 0.877052
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.877052


4it [01:02, 15.68s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.821164
[200]	valid_0's auc: 0.829127
[300]	valid_0's auc: 0.836451
[400]	valid_0's auc: 0.843016
[500]	valid_0's auc: 0.848237
[600]	valid_0's auc: 0.852439
[700]	valid_0's auc: 0.855275
[800]	valid_0's auc: 0.857281
[900]	valid_0's auc: 0.85897
[1000]	valid_0's auc: 0.860241
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.860241


5it [01:17, 15.58s/it]


Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.951745
[200]	valid_0's auc: 0.972939
[300]	valid_0's auc: 0.987266
[400]	valid_0's auc: 0.995191
[500]	valid_0's auc: 0.998629
[600]	valid_0's auc: 0.999725
[700]	valid_0's auc: 0.999961
[800]	valid_0's auc: 0.999997
[900]	valid_0's auc: 1
[1000]	valid_0's auc: 1
Did not meet early stopping. Best iteration is:
[960]	valid_0's auc: 1


Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.785333,0.785314,0.84296,0.813117,0.866551,0.280637,
2,0.807333,0.809173,0.855689,0.831781,0.87998,0.274068,
3,0.796667,0.794931,0.844553,0.818991,0.877238,0.27283,
4,0.791333,0.797382,0.83842,0.817386,0.877052,0.277332,
5,0.785667,0.78338,0.848739,0.814751,0.860241,0.282523,
mean,0.793267,0.794036,0.846072,0.819205,0.872212,0.277478,0.999733


In [18]:
# Second LightGBM parameter set; replaces the earlier `params` binding.
# NOTE(review): "n_estimators" is passed inside the params mapping; the log
# recorded for the following training cell ends at iteration 500 although that
# cell requests 1000 rounds, so this value appears to take precedence -- confirm
# which limit is intended.
params = dict(
    objective="binary",
    metric='auc',
    n_estimators=500,
    learning_rate=0.026332779906149555,
    num_leaves=955,
    reg_alpha=6.90331310095056e-08,
    reg_lambda=2.30837413695962e-06,
    feature_fraction=0.4,
    bagging_fraction=0.8823839979334422,
    bagging_freq=5,
    min_child_samples=5,
)

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from tqdm import tqdm

# 5-fold cross-validation of a LightGBM binary classifier on train_df, plus
# test-set inference with every fold model.
# Produces:
#   result_df         -- validation metrics per fold and a "mean" row that also
#                        carries the accuracy of a refit on the full train set
#   test_predictions  -- list with one array of test probabilities per fold
#   final_predictions -- element-wise mean of those per-fold arrays

# `my_imputer`, `X` and `y` are not used by this cell; they are kept so that
# any other cell relying on these names keeps working.
my_imputer = SimpleImputer()
kf = KFold(n_splits=5, random_state=42, shuffle=True)
X = train_df.drop(["nerdiness", "country"], axis=1)
y = train_df["nerdiness"]

metrics = [
    "accuracy", "precision", "recall",
    "f1-score", "roc_auc", "mae"
]
metrics_functions_map = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1-score": f1_score,
    "roc_auc": roc_auc_score,
    "mae": mean_absolute_error,
}
# Metrics scored on hard 0/1 labels; the others are scored on raw probabilities.
label_metrics = ("accuracy", "precision", "recall", "f1-score")

# NOTE(review): train_df.head() shows an `index` column that stays in train_x;
# presumably a row id -- confirm whether it is meant to be a model feature.
train_x = train_df.drop(['nerdiness', 'country'], axis=1)
train_y = train_df['nerdiness']

# Give the model exactly the training feature columns, in training order.
# "country" is dropped from the training features only, so passing test_df
# unchanged could hand the model a different column set than it was fit on.
test_features = test_df[train_x.columns]
test_predictions = []

# Collect one metrics dict per fold and build the frame once after the loop
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
fold_rows = []
for train_index, val_index in tqdm(kf.split(train_x), total=kf.get_n_splits()):
    # KFold.split yields positional indices, so rows must be taken with .iloc;
    # .loc only gives the same rows when the frame has a default RangeIndex.
    X_train, X_valid = train_x.iloc[train_index], train_x.iloc[val_index]
    y_train, y_valid = train_y.iloc[train_index], train_y.iloc[val_index]

    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are no longer
    # accepted by lgb.train in LightGBM >= 4.0 (replaced by callbacks); they are
    # kept because the installed version is not visible from this notebook.
    # NOTE(review): the recorded run stops at iteration 500 although 1000
    # rounds are requested here, presumably because `params` carries
    # "n_estimators" -- confirm which limit is intended.
    model = lgb.train(params, train_set, 1000, valid_set, verbose_eval=100, early_stopping_rounds=160)
    score_valid = model.predict(X_valid)
    predict_valid = np.round(score_valid)  # threshold probabilities at 0.5

    fold_rows.append({
        metric: metrics_functions_map[metric](
            y_valid,
            predict_valid if metric in label_metrics else score_valid,
        )
        for metric in metrics
    })

    # predict test
    predict_test = model.predict(test_features)
    test_predictions.append(predict_test)

# Average the per-fold probabilities into one prediction per test row.
final_predictions = np.array(test_predictions).mean(axis=0)

result_df = pd.DataFrame(fold_rows, columns=metrics + ["train_acc"])
# Label folds 1..k from the number of collected rows instead of relying on a
# loop variable leaking out of the for-loop.
result_df["fold"] = list(range(1, len(fold_rows) + 1))
result_df = result_df.set_index("fold")

## add training accuracy
mean = result_df.mean(axis=0)

# Refit on the whole training set and evaluate on that same data, so the value
# stored below is a training accuracy, not a held-out score. This rebinds
# `model`; the test predictions above come from the fold models only.
train_set = lgb.Dataset(train_x, label=train_y)
valid_set = lgb.Dataset(train_x, label=train_y)

model = lgb.train(params, train_set, 1000, valid_set, verbose_eval=100, early_stopping_rounds=160)
score_valid = model.predict(train_x)
predict_valid = np.round(score_valid)

# scikit-learn metric signature is (y_true, y_pred).
mean["train_acc"] = accuracy_score(train_y, predict_valid)
result_df.loc["mean"] = mean

result_df

0it [00:00, ?it/s]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.862783
[200]	valid_0's auc: 0.870216
[300]	valid_0's auc: 0.873817
[400]	valid_0's auc: 0.875656
[500]	valid_0's auc: 0.876605
Did not meet early stopping. Best iteration is:
[485]	valid_0's auc: 0.876661


1it [00:27, 27.86s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.877906
[200]	valid_0's auc: 0.883162
[300]	valid_0's auc: 0.885193
[400]	valid_0's auc: 0.886724
[500]	valid_0's auc: 0.887214
Did not meet early stopping. Best iteration is:
[482]	valid_0's auc: 0.887493


2it [01:09, 35.77s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.871839
[200]	valid_0's auc: 0.880514
[300]	valid_0's auc: 0.882567
[400]	valid_0's auc: 0.884112
[500]	valid_0's auc: 0.884968
Did not meet early stopping. Best iteration is:
[497]	valid_0's auc: 0.885009


3it [01:34, 31.19s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.873797
[200]	valid_0's auc: 0.880677
[300]	valid_0's auc: 0.883263
[400]	valid_0's auc: 0.884394
[500]	valid_0's auc: 0.885473
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.885473


4it [02:01, 29.20s/it]

Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.856733
[200]	valid_0's auc: 0.862356
[300]	valid_0's auc: 0.866348
[400]	valid_0's auc: 0.868122
[500]	valid_0's auc: 0.868712
Did not meet early stopping. Best iteration is:
[452]	valid_0's auc: 0.868925


5it [02:26, 29.30s/it]


Training until validation scores don't improve for 160 rounds.
[100]	valid_0's auc: 0.99986
[200]	valid_0's auc: 0.99999
[300]	valid_0's auc: 1
Early stopping, best iteration is:
[222]	valid_0's auc: 1


Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.782,0.77907,0.84657,0.811419,0.876661,0.229874,
2,0.808333,0.802319,0.87006,0.834818,0.887493,0.217431,
3,0.794,0.785073,0.856181,0.819087,0.885009,0.221087,
4,0.79,0.791597,0.845601,0.817708,0.885473,0.223541,
5,0.779667,0.77474,0.85054,0.810873,0.868925,0.236114,
mean,0.7908,0.78656,0.853791,0.818781,0.880712,0.22561,0.999867


In [21]:
## Submission
# Store the fold-averaged test probabilities (`final_predictions`, computed by
# the cross-validation cell) in the `nerdiness` column of the submission frame.
# This modifies `submission_df` in place.
submission_df['nerdiness'] = final_predictions
# Display the frame as a visual check of the values about to be written out.
submission_df

Unnamed: 0,index,nerdiness
0,0,0.194566
1,1,0.991259
2,2,0.997579
3,3,0.899751
4,4,0.925865
...,...,...
35447,35447,0.962007
35448,35448,0.980259
35449,35449,0.998927
35450,35450,0.179570


In [23]:
# Write the submission file without the DataFrame's row index.
submission_path = "submission/2022-08-06_LGBM_optim_300_trainer.csv"
submission_df.to_csv(submission_path, index=False)