In [4]:
import os
import numpy as np
import pandas as pd
import warnings
# NOTE(review): called without a category, this suppresses every warning for the
# rest of the session, deprecation notices from the libraries used below included.
warnings.filterwarnings('ignore')

In [2]:
# Colab only: mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Work from the competition folder on the mounted Drive.
# The path is absolute (anchored at the /content/drive mount point) so the cell
# is idempotent: the previous relative path only resolved from /content, which
# made a second run of this cell fail once the working directory had changed.
os.chdir("/content/drive/MyDrive/AI competition/Round 1")
os.listdir()

['competition_data',
 '__pycache__',
 'evaluator.py',
 'html',
 'submission',
 'logs.log',
 'playground.ipynb',
 'feature EDA.ipynb',
 'AutoML Baseline.ipynb',
 'Evaluator Module.ipynb']

In [6]:
# Read the train and test splits from the competition data folder.
train_df, test_df = (
    pd.read_csv(f'competition_data/{split}.csv') for split in ('train', 'test')
)

# Evaluator

In [54]:
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_absolute_error,
)
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm

class Evaluator():
  """Cross-validate a classifier on a training frame with a `nerdiness` target.

  The evaluator bundles an estimator (anything exposing `fit(X, y)` and
  `predict(X)`), a training DataFrame, and a shuffled `KFold` splitter.

  Typical use: `Evaluator(model, train_df)` -> `preprocess()` -> `evaluate()`.
  The DataFrame passed in by the caller is never modified; every preprocessing
  step rebinds `self.train_df` to a new frame.
  """

  def __init__(self, model, train_df, n_folds=4, random_state=42):
    """Store the model and data and build the shuffled K-fold splitter.

    Args:
      model: estimator exposing `fit(X, y)` and `predict(X)`.
      train_df: training DataFrame; it must contain a `nerdiness` column by
        the time `evaluate` is called.
      n_folds: number of cross-validation folds.
      random_state: seed used for shuffling rows into folds.
    """
    self.kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    self.model = model
    self.train_df = train_df

  def drop_col(self, col_list=["index", "country"]):
    """Rebind `self.train_df` to a copy without the columns in `col_list`.

    Raises:
      KeyError: if a listed column is not present (raised by pandas).
    """
    # The list default is only read, never mutated, so sharing it is safe.
    self.train_df = self.train_df.drop(columns=col_list)

  def index_col(self, col_list=["country"]):
    """Replace each column in `col_list` by an integer-coded `<col>_idx` column.

    Codes follow the sorted order of the distinct values, so the encoding is
    the same on every run (iteration order of a `set` of strings is not).
    Missing values share one extra code placed after all real categories.
    """
    encoded = {}
    for col in col_list:
      codes, uniques = pd.factorize(self.train_df[col], sort=True)
      codes = pd.Series(codes, index=self.train_df.index)
      # factorize marks missing values with -1; give them their own code.
      encoded[f"{col}_idx"] = codes.where(codes >= 0, len(uniques))

    # `assign` returns a new frame, so the caller's DataFrame is not touched.
    self.train_df = self.train_df.assign(**encoded)
    self.drop_col(col_list)

  def preprocess(self, mode="index"):
    """Prepare `self.train_df` for modelling.

    Args:
      mode: "drop" removes the `index` and `country` columns; "index" encodes
        `country` as `country_idx` and then removes `index`.
    """
    assert mode in ("index", "drop"), f"unknown preprocess mode: {mode!r}"

    if mode == "drop":
      self.drop_col()
    elif mode == "index":
      self.index_col()
      self.drop_col(col_list=["index"])

  def evaluate(self, metrics="all"):
    """Run K-fold cross-validation and return a per-fold metrics table.

    Args:
      metrics: "all", a single metric name, or an iterable of metric names
        among "accuracy", "precision", "recall", "f1-score", "roc_auc", "mae".

    Returns:
      DataFrame indexed by `fold` (1..k) plus a final "mean" row. The
      `train_acc` column is filled only in the "mean" row and holds the
      accuracy of the model refit on, and scored against, the full frame.

    Raises:
      KeyError: if an unknown metric name is requested.

    Side effects:
      `self.model` is refit several times and is left fitted on the full
      training frame.
    """
    metrics_functions_map = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1-score": f1_score,
        # NOTE(review): every metric, roc_auc included, is computed from the
        # hard labels returned by `predict`, not from predicted probabilities.
        "roc_auc": roc_auc_score,
        "mae": mean_absolute_error,
    }
    if isinstance(metrics, str):
      metrics = list(metrics_functions_map) if metrics == "all" else [metrics]
    else:
      metrics = list(metrics)
    unknown = [m for m in metrics if m not in metrics_functions_map]
    if unknown:
      raise KeyError(f"unknown metric(s): {unknown}")

    train_x = self.train_df.drop(columns=['nerdiness'])
    train_y = self.train_df['nerdiness']

    rows = []
    splits = self.kf.split(train_x)
    for train_index, val_index in tqdm(splits, total=self.kf.get_n_splits()):
      # The splitter yields positional indices, hence `.iloc` (label-based
      # `.loc` is only equivalent when the frame has a default RangeIndex).
      X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
      y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

      self.model.fit(X_train, y_train)
      predictions = self.model.predict(X_val)

      rows.append({
          metric: metrics_functions_map[metric](y_val, predictions)
          for metric in metrics
      })

    # Build the table once instead of appending row by row
    # (`DataFrame.append` no longer exists in pandas 2.x).
    result_df = pd.DataFrame(rows, columns=metrics)
    result_df["train_acc"] = float("nan")
    result_df.index = pd.Index(list(range(1, len(rows) + 1)), name="fold")

    ## add training accuracy
    mean = result_df.mean(axis=0)
    self.model.fit(train_x, train_y)
    predictions = self.model.predict(train_x)
    mean["train_acc"] = accuracy_score(train_y, predictions)
    result_df.loc["mean"] = mean

    return result_df


In [55]:
from lightgbm import LGBMClassifier

# Baseline: LightGBM with its default hyperparameters.
lgbm_clf = LGBMClassifier()

# Preprocess the training frame, then show the per-fold CV metrics table.
evaluator = Evaluator(lgbm_clf, train_df)
evaluator.preprocess()
evaluator.evaluate()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.748,0.753468,0.810395,0.780895,0.740425,0.252,
2,0.753333,0.758558,0.80758,0.782302,0.747466,0.246667,
3,0.746667,0.755545,0.80279,0.778451,0.739815,0.253333,
4,0.744267,0.751224,0.808429,0.778777,0.736044,0.255733,
mean,0.748067,0.754699,0.807298,0.780106,0.740937,0.251933,0.844267


In [60]:
# Same pipeline as the baseline, but with n_estimators raised to 1000.
lgbm_clf = LGBMClassifier(n_estimators=1000)

evaluator = Evaluator(lgbm_clf, train_df)
evaluator.preprocess()
evaluator.evaluate()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.777067,0.785124,0.822907,0.803571,0.771501,0.222933,
2,0.788267,0.79368,0.829932,0.811401,0.78376,0.211733,
3,0.7744,0.782669,0.821068,0.801408,0.768703,0.2256,
4,0.7728,0.779638,0.825192,0.801768,0.766086,0.2272,
mean,0.778133,0.785278,0.824775,0.804537,0.772512,0.221867,1.0


In [61]:
from xgboost import XGBClassifier

# XGBoost with its default hyperparameters, run through the same evaluator.
xgb_clf = XGBClassifier()

evaluator = Evaluator(xgb_clf, train_df)
evaluator.preprocess()
evaluator.evaluate()

0it [00:00, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1-score,roc_auc,mae,train_acc
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.734933,0.740889,0.802214,0.770333,0.726765,0.265067,
2,0.739733,0.745686,0.797862,0.770892,0.733446,0.260267,
3,0.7376,0.745406,0.799904,0.771694,0.729994,0.2624,
4,0.721867,0.731092,0.791667,0.760175,0.712921,0.278133,
mean,0.733533,0.740768,0.797912,0.768273,0.725782,0.266467,0.757267


# 제출 파일 생성

In [None]:
# Load the submission template; its `nerdiness` column is filled in below.
submission = pd.read_csv('competition_data/sample_submission.csv')

submission

Unnamed: 0,index,nerdiness
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
35447,35447,-1
35448,35448,-1
35449,35449,-1
35450,35450,-1


In [None]:
# NOTE(review): `lgbm_pred` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. Presumably it holds the
# LightGBM predictions for test_df, one per submission row - TODO confirm and
# add the cell that computes it.
submission["nerdiness"] = lgbm_pred

In [None]:
# Display the filled-in submission frame as a visual sanity check.
submission

Unnamed: 0,index,nerdiness
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
35447,35447,1
35448,35448,1
35449,35449,1
35450,35450,0


In [None]:
# Write the submission to baseline.csv in the current working directory,
# without the DataFrame's row index.
submission.to_csv("baseline.csv", index = False)