# Read All Dataset CSV

In [79]:
import os
import csv
import re
import pandas as pd
import numpy as np

In [80]:
# Parallel per-dataset containers: position k in each list belongs to the
# same dataset folder (dataset_names[k]).
X_trains = []
y_trains = []
X_tests = []

data_root = "./Competition_data"

# Collect the dataset folder names and sort them by the first number in the
# name. Entries that are not directories, or whose name contains no digits
# (for example ".DS_Store" or ".ipynb_checkpoints"), are skipped: for those
# re.search() returns None and the sort key would raise AttributeError.
dataset_names = sorted(
    (
        entry
        for entry in os.listdir(data_root)
        if os.path.isdir(os.path.join(data_root, entry))
        and re.search(r'\d+', entry)
    ),
    key=lambda name: int(re.search(r'\d+', name).group()),
)

# Load the three CSV files of every dataset, in the sorted folder order.
for folder_name in dataset_names:
    folder_path = f"{data_root}/{folder_name}"
    X_trains.append(pd.read_csv(f"{folder_path}/X_train.csv", header=0))
    y_trains.append(pd.read_csv(f"{folder_path}/y_train.csv", header=0))
    X_tests.append(pd.read_csv(f"{folder_path}/X_test.csv", header=0))
    
    
    
# print(len(dataset_names))
# print(len(X_trains))  # 49, 代表有 49 個 dataFrame (每個資料集各一個)
# print(len(y_trains))
# print(len(X_tests))
# print(X_trains[0].dtypes)
# print(y_trains[0].dtypes)

In [81]:
# print(X_trains[42])

## Data Preprocessing & Feature Engineering

In [82]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer

# Preprocess every dataset: standardize the float columns and pass the
# integer columns through unchanged.
for i in range(len(dataset_names)):
    train_df = X_trains[i]
    test_df = X_tests[i]

    # Float columns are treated as numerical features; integer columns are
    # treated as (possibly) categorical features.
    numerical_cols = list(train_df.select_dtypes(include=['float']).columns)
    categorical_cols = list(train_df.select_dtypes(include=['int']).columns)
    if len(numerical_cols) + len(categorical_cols) != len(train_df.columns):
        # Columns of any other dtype are not carried into the result.
        print('Splitting error')

    # The column split is decided on the training frame only and reused for
    # the test frame. Re-inferring dtypes on the test CSV can classify a
    # column differently (e.g. a float feature whose test values all happen
    # to be whole numbers), which would no longer match the fitted scaler.
    train_num = train_df[numerical_cols]
    test_num = test_df[numerical_cols]

    # Skip scaling when there is no float column: fitting StandardScaler on
    # a frame with zero columns raises.
    if numerical_cols:
        scaler = StandardScaler()
        scaler.fit(train_num)  # statistics come from the training data only
        # Keep the original index so the concat below aligns row by row even
        # if the frames do not carry a default RangeIndex.
        train_num = pd.DataFrame(
            scaler.transform(train_num),
            columns=numerical_cols,
            index=train_df.index,
        )
        test_num = pd.DataFrame(
            scaler.transform(test_num),
            columns=numerical_cols,
            index=test_df.index,
        )

    # Resulting column order: scaled numerical columns, then integer columns.
    X_trains[i] = pd.concat([train_num, train_df[categorical_cols]], axis=1)
    X_tests[i] = pd.concat([test_num, test_df[categorical_cols]], axis=1)

# Make sure the target (first column of every y frame) is numeric.
for i in range(len(dataset_names)):
    target_col = y_trains[i].columns[0]
    # Assigning through the column label replaces the column, so the numeric
    # dtype takes effect; an in-place .iloc assignment may keep the previous
    # (e.g. object) dtype on recent pandas versions.
    y_trains[i][target_col] = pd.to_numeric(y_trains[i].iloc[:, 0])

In [83]:
# from sklearn.preprocessing import MinMaxScaler, LabelBinarizer

# def preprocess_data(df, scaler=None, label_binarizers=None, columns=None):
#     # 將連續型資料和數值型資料標準化
#     numerical_df = df.select_dtypes(include=['float'])   # 數值型特徵
#     categorical_df = df.select_dtypes(include=['int'])   # 類別型特徵（可能為類別特徵）

#     # numerical_df --> normalization
#     if scaler is None:
#         scaler = MinMaxScaler()
#         scaler.fit(numerical_df)
#     numerical_s = scaler.transform(numerical_df)
#     numerical_df = pd.DataFrame(numerical_s, columns=numerical_df.columns)

#     # categorical_df --> label binarizer encoding
#     if label_binarizers is None:
#         label_binarizers = {}

#     encoded_cols = []
#     for col in categorical_df.columns:
#         unique_values = categorical_df[col].nunique()

#         if col not in label_binarizers:
#             c_scaler = LabelBinarizer()
#             c_scaler.fit(categorical_df[col])
#             label_binarizers[col] = c_scaler
#         encoded_df = label_binarizers[col].transform(categorical_df[col])  # 轉成 ndarray

#         # 如果是多類別，轉換成 DataFrame，並加上欄位名稱
#         if encoded_df.shape[1] > 1:
#             encoded_df = pd.DataFrame(encoded_df, columns=[f"{col}_{cls}" for cls in label_binarizers[col].classes_])
#         else:
#             encoded_df = pd.Series(encoded_df.flatten(), name=categorical_df[col].name)
#         encoded_cols.append(encoded_df)

#     encoded_df = pd.concat([categorical_df.drop(columns=categorical_df.columns)] + encoded_cols, axis=1)

#     # 如果是測試資料，補齊缺少的欄位並重新排序
#     if columns is not None:
#         missing_cols = set(columns) - set(encoded_df.columns)
#         for col in missing_cols:
#             encoded_df[col] = 0  # 缺失的欄位補 0
#         encoded_df = encoded_df[columns]  # 重新排序以匹配訓練資料的欄位順序

#     # 合併數值型和類別型資料框
#     processed_df = pd.concat([numerical_df, encoded_df], axis=1)
#     processed_df = processed_df[columns] if columns is not None else processed_df

#     return processed_df, scaler, label_binarizers

# # 對每組資料進行處理
# for i in range(len(dataset_names)):
#     X_trains[i], n_scaler, label_binarizers = preprocess_data(X_trains[i])
#     X_tests[i], _, _ = preprocess_data(X_tests[i], scaler=n_scaler, label_binarizers=label_binarizers, columns=X_trains[i].columns)


In [84]:
# for i in range(len(dataset_names)):
#     missing_cols = set(X_trains[i].columns) - set(X_tests[i].columns)
#     print(missing_cols)

## Train/Test Split & Build Model
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [85]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [86]:
# Train one classifier per dataset and report the mean hold-out AUC.
models = []
avg_auc = 0    # running sum of hold-out AUCs; divided by the dataset count when printed
avg_train = 0  # not updated anywhere in this cell

# Random Forest settings shared by every dataset
# (author's note: suits the small datasets used here).
rf_params = dict(n_estimators=400, n_jobs=1, random_state=1, max_depth=7)

for i, _ in enumerate(dataset_names):
    # Hold out 20% of the training data, stratified on the labels.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_trains[i],
        y_trains[i],
        test_size=0.2,
        random_state=42,
        stratify=y_trains[i],
    )

    model = RandomForestClassifier(**rf_params)

    # Alternative models tried by the author (kept for reference):
    #
    # # Neural Net
    # model = MLPClassifier(
    #     hidden_layer_sizes=(128,64),
    #     activation="relu",
    #     random_state=1,
    #     alpha=0.01
    # )
    #
    # # XGBoost (seems to overfit more easily; better suited to large datasets)
    # model = XGBClassifier(
    #     n_estimators=200, learning_rate=0.1, gamma=0.8,
    # )

    # squeeze() turns the single-column label frame into a Series for fit().
    model.fit(X_fit, y_fit.squeeze())

    # AUC on the hold-out part, using the second probability column.
    val_scores = model.predict_proba(X_val)
    auc = roc_auc_score(y_val, val_scores[:, 1])
    # print(f'auc of dataset {i:2}: \t{auc}')
    avg_auc = avg_auc + auc
    models.append(model)

print(avg_auc / len(dataset_names))



0.8891371436622877


## Inference Model

In [87]:
# For every dataset, predict on its test features with the matching model and
# keep the second probability column as a one-column DataFrame.
y_predicts = [
    pd.DataFrame(
        models[i].predict_proba(X_tests[i])[:, 1],
        columns=['y_predict_proba'],
    )
    for i in range(len(dataset_names))
]
    

## Save result

In [88]:
# Write each dataset's predictions into its own folder as y_predict.csv
# (header row included, no index column).
for dataset_name, predictions in zip(dataset_names, y_predicts):
    output_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    predictions.to_csv(output_path, index=False, header=True)

In [89]:
# from sklearn.metrics import roc_auc_score

# test_aucs = []
# for i in range(len(dataset_names)):
#     # 使用模型進行預測，獲得類別 1 的預測機率
#     y_predict_proba = models[i].predict_proba(X_tests[i])[:, 1]
    
#     # 計算 AUC 分數
#     auc = roc_auc_score(y_tests[i], y_predict_proba)
#     print(f'AUC of dataset {i:2}: \t{auc}')
    
#     test_aucs.append(auc)

# # 平均 AUC
# avg_test_auc = sum(test_aucs) / len(test_aucs)
# print("\nAverage Test AUC:", avg_test_auc)
