## 1. 데이터 불러오기

### 필수 라이브러리

In [14]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Shared configuration
RANDOM_STATE = 200  # seed passed to sampling, splitting and the model below
ROOT_DIR = "../data"  # directory the train/test CSV files are read from

# Load data
train_data = pd.read_csv(
    os.path.join(ROOT_DIR, "train.csv"),
)


In [15]:
# Under-sampling
# Number of Normal rows to keep per AbNormal row (1.0 means a 1:1 ratio).
normal_ratio = 1.0

# Partition the training rows by their class label.
df_normal = train_data.loc[train_data["target"].eq("Normal")]
df_abnormal = train_data.loc[train_data["target"].eq("AbNormal")]

# Class sizes before any re-sampling.
num_normal, num_abnormal = df_normal.shape[0], df_abnormal.shape[0]
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")


  Total: Normal: 38156, AbNormal: 2350


In [16]:
# Randomly keep `num_abnormal * normal_ratio` Normal rows (without
# replacement), then stack both classes into one frame with a fresh index.
df_normal = df_normal.sample(
    n=int(num_abnormal * normal_ratio),
    replace=False,
    random_state=RANDOM_STATE,
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0, ignore_index=True)
print(df_concat.value_counts("target"))

# Split data
# 70/30 train/validation split, stratified on the label so both parts keep
# the class proportions of df_concat.
df_train, df_val = train_test_split(
    df_concat,
    stratify=df_concat.loc[:, "target"],
    random_state=RANDOM_STATE,
    test_size=0.3,
)

def print_stats(df: pd.DataFrame) -> None:
    """Print the Normal/AbNormal row counts of ``df`` and their ratio.

    Args:
        df: Frame with a ``"target"`` column; rows labelled ``"Normal"`` and
            ``"AbNormal"`` are counted.

    The printed ratio is ``AbNormal / Normal``. When ``df`` contains no
    ``"Normal"`` rows the ratio is reported as ``inf`` (or ``nan`` when there
    are no ``"AbNormal"`` rows either) instead of raising
    ``ZeroDivisionError``.
    """
    labels = df["target"]
    # Count via boolean masks; no need to materialise filtered frames.
    num_normal = int((labels == "Normal").sum())
    num_abnormal = int((labels == "AbNormal").sum())

    if num_normal:
        ratio = num_abnormal / num_normal
    else:
        # Division by zero: mirror IEEE float semantics rather than crash.
        ratio = float("inf") if num_abnormal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")

# Print statistics
# Header line first, then one summary line per split (train, validation).
print("  \tAbnormal\tNormal")
for _split in (df_train, df_val):
    print_stats(_split)

target
AbNormal    2350
Normal      2350
Name: count, dtype: int64
  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


In [17]:
# Model definition
# CatBoost classifier trained with the Logloss objective; the seed is shared
# with the sampling/splitting steps so a re-run uses the same randomness.
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function="Logloss",
    random_state=RANDOM_STATE,
    verbose=200,
)

# Prepare training data
# Work on an explicit copy: df_train is produced by train_test_split, and
# assigning columns on such a frame can trigger pandas' SettingWithCopyWarning.
df_train = df_train.copy()

# A column becomes a feature only if every value can be cast to int.
features = []

for col in df_train.columns:
    if col == "target":
        # Never let the label leak into the feature set, even if it were
        # integer-castable.
        continue
    try:
        df_train[col] = df_train[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Cast failed (e.g. strings or missing values): skip this column.
        # Only conversion errors are caught so that KeyboardInterrupt and
        # unrelated bugs still surface.
        continue
    features.append(col)

train_x = df_train[features]
train_y = df_train["target"]

# Train model
model.fit(train_x, y=train_y)

# Load test data
test_data = pd.read_csv(
    os.path.join(ROOT_DIR, "test.csv"),
)

0:	learn: 0.6898669	total: 3.29ms	remaining: 3.28s
200:	learn: 0.5044532	total: 449ms	remaining: 1.78s
400:	learn: 0.3977177	total: 870ms	remaining: 1.3s
600:	learn: 0.3303590	total: 1.29s	remaining: 859ms
800:	learn: 0.2779120	total: 1.7s	remaining: 423ms
999:	learn: 0.2356648	total: 2.11s	remaining: 0us


In [20]:
# Prepare test data
# Select the columns used for training; .copy() so the casts below do not
# touch test_data itself.
df_test_x = test_data[features].copy()

for col in df_test_x.columns:
    try:
        # Plain column assignment (same form as used for the training frame)
        # so the column really ends up with an integer dtype; writing through
        # `.loc[:, col] = ...` may keep the column's previous dtype depending
        # on the pandas version.
        df_test_x[col] = df_test_x[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Cast failed (e.g. missing or non-numeric values): leave the column
        # unchanged. Only conversion errors are caught so that
        # KeyboardInterrupt and unrelated bugs still surface.
        continue

# Predict
test_pred = model.predict(df_test_x)

# Write the submission file
# Build the path once, from ROOT_DIR like the other data files, instead of
# repeating a hard-coded "../data/..." literal for the read and the write.
submission_path = os.path.join(ROOT_DIR, "catboost-1", "submission.csv")

# The existing submission.csv is read as a template, its "target" column is
# replaced with the predictions, and the file is overwritten in place.
df_sub = pd.read_csv(submission_path)
df_sub["target"] = test_pred
df_sub.to_csv(submission_path, index=False)