# Read All Dataset CSV

In [5]:
import os
import csv
import re
import pandas as pd
import numpy as np

In [6]:
# Per-dataset containers; position k in every list refers to dataset_names[k].
X_trains = []
y_trains = []
X_tests = []

data_root = "./Competition_data"

# Collect the dataset folder names and sort them by the number in the name.
# Only directories whose name contains a number are kept: stray entries such as
# ".ipynb_checkpoints" or ".DS_Store" have no digits and would otherwise make
# the numeric sort key fail on a None match.
dataset_names = [
    folder_name
    for folder_name in os.listdir(data_root)
    if os.path.isdir(os.path.join(data_root, folder_name))
    and re.search(r'\d+', folder_name)
]
dataset_names.sort(key=lambda x: int(re.search(r'\d+', x).group()))

# Load the three CSV files of every dataset, in the sorted folder order.
for folder_name in dataset_names:
    X_trains.append(pd.read_csv(f"{data_root}/{folder_name}/X_train.csv", header=0))
    y_trains.append(pd.read_csv(f"{data_root}/{folder_name}/y_train.csv", header=0))
    X_tests.append(pd.read_csv(f"{data_root}/{folder_name}/X_test.csv", header=0))
    
    
    
# print(len(dataset_names))
# print(len(X_trains))  # 49, 代表有 49 個 dataFrame (每個資料集各一個)
# print(len(y_trains))
# print(len(X_tests))
# print(X_trains[0].dtypes)
# print(y_trains[0].dtypes)

In [7]:
# print(X_trains[42])

## Data Preprocessing & Feature Engineering

In [8]:
from sklearn.preprocessing import StandardScaler
from pyod.models.iforest import IForest
import pandas as pd

# Per-dataset preprocessing: remove outlier rows from the training data, then
# standardize the float columns of train and test with the training statistics.
for i in range(len(dataset_names)):
    # Float columns are treated as numerical features, int columns as
    # (possibly) categorical features.
    numerical_df = X_trains[i].select_dtypes(include=['float'])
    categorical_df = X_trains[i].select_dtypes(include=['int'])

    # Columns of any other dtype are not carried over below, so report them.
    if len(numerical_df.columns) + len(categorical_df.columns) != len(X_trains[i].columns):
        print('Splitting error')

    # 1. Detect outliers on the numerical features with PyOD's Isolation Forest.
    #    contamination sets the expected outlier fraction; the fixed
    #    random_state makes the set of removed rows reproducible between runs.
    clf = IForest(contamination=0.1, random_state=42)
    clf.fit(numerical_df.values)
    outliers = clf.predict(numerical_df.values)  # 1 = outlier, 0 = normal
    # Keep only the normal rows, applying the same mask to features and labels.
    is_normal = (outliers == 0)
    numerical_df = numerical_df.iloc[is_normal].reset_index(drop=True)
    categorical_df = categorical_df.iloc[is_normal].reset_index(drop=True)
    y_trains[i] = y_trains[i].iloc[is_normal].reset_index(drop=True)

    # 2. Standardize the numerical features. The scaler is fitted and applied
    #    on plain arrays throughout, so scikit-learn does not warn about being
    #    fitted without feature names and later called with a DataFrame.
    scaler = StandardScaler()
    numerical_s = scaler.fit_transform(numerical_df.values)
    numerical_df = pd.DataFrame(numerical_s, columns=numerical_df.columns)

    # Re-assemble the training features (numerical columns first).
    X_trains[i] = pd.concat([numerical_df, categorical_df], axis=1)

    # Apply the same column split and the already-fitted scaler to the test data.
    numerical_df_test = X_tests[i].select_dtypes(include=['float'])
    categorical_df_test = X_tests[i].select_dtypes(include=['int'])
    numerical_s_test = scaler.transform(numerical_df_test.values)
    # Keep the original index so the concat below aligns rows one-to-one.
    numerical_df_test = pd.DataFrame(
        numerical_s_test,
        columns=numerical_df_test.columns,
        index=numerical_df_test.index,
    )
    X_tests[i] = pd.concat([numerical_df_test, categorical_df_test], axis=1)

# Convert the label column of every y_train to a numeric dtype. The column is
# replaced as a whole: assigning through .iloc[:, 0] sets values in place and
# can leave the previous (e.g. object) dtype untouched on recent pandas.
for i in range(len(dataset_names)):
    label_col = y_trains[i].columns[0]
    y_trains[i][label_col] = pd.to_numeric(y_trains[i][label_col])




In [9]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer

# # 對每組資料進行處理
# for i in range(len(dataset_names)):
#     # 將連續型資料和數值型資料標準化
#     numerical_df = X_trains[i].select_dtypes(include=['float'])   # 數值型特徵
#     categorical_df = X_trains[i].select_dtypes(include=['int'])   # 類別型特徵（可能為類別特徵）
#     if len(numerical_df.columns) + len(categorical_df.columns) != len(X_trains[i].columns):
#         print('Splitting error')
#     # numerical_df --> normalization
#     scaler = StandardScaler()
#     scaler.fit(numerical_df)
#     numerical_s = scaler.transform(numerical_df)
#     numerical_df = pd.DataFrame(numerical_s, columns=numerical_df.columns)
#     X_trains[i] = pd.concat([numerical_df, categorical_df], axis=1)
    
    
#     numerical_df = X_tests[i].select_dtypes(include=['float'])   # 數值型特徵
#     categorical_df = X_tests[i].select_dtypes(include=['int'])   # 類別型特徵（可能為類別特徵）
#     # 直接照前面用過的 scaler 來分
#     numerical_s = scaler.transform(numerical_df)
#     numerical_df = pd.DataFrame(numerical_s, columns=numerical_df.columns)
#     X_tests[i] = pd.concat([numerical_df, categorical_df], axis=1)

# for i in range(len(dataset_names)):
#     y_trains[i].iloc[:, 0] = pd.to_numeric(y_trains[i].iloc[:, 0])

In [10]:
# from sklearn.preprocessing import MinMaxScaler, LabelBinarizer

# def preprocess_data(df, scaler=None, label_binarizers=None, columns=None):
#     # 將連續型資料和數值型資料標準化
#     numerical_df = df.select_dtypes(include=['float'])   # 數值型特徵
#     categorical_df = df.select_dtypes(include=['int'])   # 類別型特徵（可能為類別特徵）

#     # numerical_df --> normalization
#     if scaler is None:
#         scaler = MinMaxScaler()
#         scaler.fit(numerical_df)
#     numerical_s = scaler.transform(numerical_df)
#     numerical_df = pd.DataFrame(numerical_s, columns=numerical_df.columns)

#     # categorial_df --> label binarizer encoding
#     if label_binarizers is None:
#         label_binarizers = {}

#     encoded_cols = []
#     for col in categorical_df.columns:
#         unique_values = categorical_df[col].nunique()

#         if col not in label_binarizers:
#             c_scaler = LabelBinarizer()
#             c_scaler.fit(categorical_df[col])
#             label_binarizers[col] = c_scaler
#         encoded_df = label_binarizers[col].transform(categorical_df[col])  # 轉成 ndarray

#         # 如果是多類別，轉換成 DataFrame，並加上欄位名稱
#         if encoded_df.shape[1] > 1:
#             encoded_df = pd.DataFrame(encoded_df, columns=[f"{col}_{cls}" for cls in label_binarizers[col].classes_])
#         else:
#             encoded_df = pd.Series(encoded_df.flatten(), name=categorical_df[col].name)
#         encoded_cols.append(encoded_df)

#     encoded_df = pd.concat([categorical_df.drop(columns=categorical_df.columns)] + encoded_cols, axis=1)

#     # 如果是測試資料，補齊缺少的欄位並重新排序
#     if columns is not None:
#         missing_cols = set(columns) - set(encoded_df.columns)
#         for col in missing_cols:
#             encoded_df[col] = 0  # 缺失的欄位補 0
#         encoded_df = encoded_df[columns]  # 重新排序以匹配訓練資料的欄位順序

#     # 合併數值型和類別型資料框
#     processed_df = pd.concat([numerical_df, encoded_df], axis=1)
#     processed_df = processed_df[columns] if columns is not None else processed_df

#     return processed_df, scaler, label_binarizers

# # 對每組資料進行處理
# for i in range(len(dataset_names)):
#     X_trains[i], n_scaler, label_binarizers = preprocess_data(X_trains[i])
#     X_tests[i], _, _ = preprocess_data(X_tests[i], scaler=n_scaler, label_binarizers=label_binarizers, columns=X_trains[i].columns)


In [11]:
# for i in range(len(dataset_names)):
#     missing_cols = set(X_trains[i].columns) - set(X_tests[i].columns)
#     print(missing_cols)

## train test split & build Model
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [12]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

## MODEL


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

class YourTorchModel(nn.Module):
    """Small fully connected binary classifier that outputs probabilities.

    The active path is ``input_size -> 64 -> 8 -> 1``. Each hidden layer is
    followed by batch normalization, ReLU and dropout, and the final linear
    output is passed through a sigmoid. Callers therefore receive values in
    [0, 1] and must not apply another sigmoid.

    ``layer2``/``layer3`` (with ``bn2``/``bn3``) form an optional deeper path
    ``8 -> 12 -> 8`` that ``forward`` does not use at the moment; they are kept
    as attributes so the deeper variant can be re-enabled.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # Layer definitions.
        self.layer0 = nn.Linear(input_size, 64)
        self.layer1 = nn.Linear(64, 8)
        # Optional deeper path. layer2 takes layer1's output width so that the
        # chain layer1 -> layer2 -> layer3 -> out is shape-consistent.
        self.layer2 = nn.Linear(self.layer1.out_features, 12)
        self.layer3 = nn.Linear(12, 8)
        self.out = nn.Linear(8, 1)

        self.act_fn = nn.ReLU()

        self.bn0 = nn.BatchNorm1d(64)
        self.bn1 = nn.BatchNorm1d(8)
        self.bn2 = nn.BatchNorm1d(12)
        self.bn3 = nn.BatchNorm1d(8)
        self.dropout = nn.Dropout(p=0.5)
        self.softmax = nn.Softmax(dim=1)  # not used by forward()

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``.

        Args:
            x: Tensor whose last dimension is ``input_size``.

        Returns:
            Tensor with one sigmoid-activated value per sample.
        """
        x = self.layer0(x)
        x = self.bn0(x)
        x = self.act_fn(x)
        x = self.dropout(x)

        x = self.layer1(x)
        x = self.bn1(x)
        x = self.act_fn(x)
        x = self.dropout(x)

        # layer2/bn2 and layer3/bn3 are intentionally skipped here.
        x = self.out(x)
        return torch.sigmoid(x)  # probabilities, not logits




In [14]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## Neural network

In [15]:
# Settings shared by every per-dataset model.
num_epochs = 300
l2_lambda = 0.1  # L2 regularization factor (passed to Adam as weight_decay)
l1_lambda = 0.001  # L1 regularization factor (added to the loss by hand)
criterion = nn.BCELoss()  # binary cross-entropy on sigmoid outputs

models = []
avg_auc = 0

# Train one model per dataset and score it on a stratified 20% hold-out split.
for i in range(len(dataset_names)):
    split = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42, stratify=y_trains[i]
    )
    # DataFrame -> numpy array -> float32 tensor on the selected device.
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = [
        torch.tensor(part.values, dtype=torch.float32).to(device) for part in split
    ]
    input_size = tmp_X_train.shape[1]

    # Fresh model and optimizer for this dataset.
    model = YourTorchModel(input_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=l2_lambda)

    # Full-batch training; train_acc sums the per-epoch training accuracy.
    train_acc = 0
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(tmp_X_train)

        # Data loss plus the L1 penalty over all model parameters.
        l1_penalty = sum(param.abs().sum() for param in model.parameters())
        loss = criterion(outputs, tmp_y_train) + l1_lambda * l1_penalty

        loss.backward()
        optimizer.step()

        # Threshold the probabilities at 0.5 to get class labels.
        preds = (outputs > 0.5).float()
        train_acc += (preds == tmp_y_train).float().mean()

    # Evaluation: predicted probabilities for the hold-out split.
    model.eval()
    with torch.no_grad():
        tmp_y_prob = model(tmp_X_test).squeeze().cpu().numpy()

    # roc_auc_score raises when y_test holds a single class (possible after
    # preprocessing), so a placeholder score is used in that case.
    y_true = tmp_y_test.cpu().numpy()
    if len(np.unique(y_true)) >= 2:
        auc = roc_auc_score(y_true, tmp_y_prob)
    else:
        print(f"Skipping AUC calculation for this dataset due to single class in y_test.")
        auc = 0.7  # placeholder; pick another default if more appropriate

    avg_auc += auc
    models.append(model)

    # Report the mean training accuracy over all epochs and the hold-out AUC.
    train_acc /= num_epochs
    print(f"Dataset {dataset_names[i]}: Train Acc: {train_acc.item():.8f}, Test AUC: {auc:.8f}")

print("平均 AUC:", avg_auc / len(dataset_names))

Dataset Dataset_1: Train Acc: 0.74337524, Test AUC: 0.77752976
Dataset Dataset_2: Train Acc: 0.91256452, Test AUC: 0.99805068
Dataset Dataset_3: Train Acc: 0.77476257, Test AUC: 0.47619048
Dataset Dataset_4: Train Acc: 0.87020797, Test AUC: 0.37359551
Dataset Dataset_5: Train Acc: 0.91553205, Test AUC: 0.95238095
Dataset Dataset_6: Train Acc: 0.81636906, Test AUC: 0.99744246
Dataset Dataset_7: Train Acc: 0.59630162, Test AUC: 0.98709677
Dataset Dataset_8: Train Acc: 0.75234377, Test AUC: 0.71304348
Dataset Dataset_9: Train Acc: 0.79582542, Test AUC: 0.86904762
Dataset Dataset_10: Train Acc: 0.65834361, Test AUC: 0.81580511
Dataset Dataset_11: Train Acc: 0.82926923, Test AUC: 0.16666667
Dataset Dataset_12: Train Acc: 0.71812117, Test AUC: 1.00000000
Dataset Dataset_13: Train Acc: 0.89474481, Test AUC: 0.89565217
Dataset Dataset_14: Train Acc: 0.83300120, Test AUC: 1.00000000
Dataset Dataset_15: Train Acc: 0.73016387, Test AUC: 0.68092105
Dataset Dataset_16: Train Acc: 0.80565852, Test A

### 把所有資料訓練集合後訓練(效果不是很好)

In [16]:
# models = []
# avg_auc = 0
# num_epochs = 100  # 訓練迴圈放到最外層
# # 把所有數據集放在一起訓練
# for epoch in range(num_epochs):
#     epoch_train_acc = 0
#     epoch_avg_auc = 0
#     print(f"Epoch {epoch + 1}/{num_epochs}")

#     for i in range(len(dataset_names)):
#         # 使用 stratify 將數據分為訓練集和測試集
#         tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
#             X_trains[i], y_trains[i], test_size=0.2, random_state=42, stratify=y_trains[i]
#         )
#         input_size = tmp_X_train.shape[1]

#         # 將數據轉換為 PyTorch 張量並移動到 CUDA (GPU)
#         tmp_X_train = torch.tensor(tmp_X_train.values, dtype=torch.float32).to(device)
#         tmp_y_train = torch.tensor(tmp_y_train.values, dtype=torch.float32).to(device)
#         tmp_X_test = torch.tensor(tmp_X_test.values, dtype=torch.float32).to(device)
#         tmp_y_test = torch.tensor(tmp_y_test.values, dtype=torch.float32).to(device)

#         # 初始化模型、損失函數和優化器
#         # if epoch == 0 and i == 0:
#         model = YourTorchModel(input_size).to(device)
#         criterion = nn.BCELoss()  # 二元交叉熵損失
#         l2_lambda = 0.1
#         optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=l2_lambda)

#         # 訓練
#         model.train()
#         optimizer.zero_grad()
#         outputs = model(tmp_X_train)
#         loss = criterion(outputs, tmp_y_train)
#         loss.backward()
#         optimizer.step()

#         # 計算訓練準確率
#         preds = (outputs > 0.5).float()
#         train_acc = (preds == tmp_y_train).float().mean()
#         epoch_train_acc += train_acc.item() / len(dataset_names)

#         # 評估 - 預測測試集的概率
#         model.eval()
#         with torch.no_grad():
#             tmp_y_prob = model(tmp_X_test).squeeze().cpu().numpy()

#         # 計算 AUC
#         auc = roc_auc_score(tmp_y_test.cpu().numpy(), tmp_y_prob)
#         epoch_avg_auc += auc / len(dataset_names)

#         # 保存模型（若在第一個 epoch 時，保存到模型列表中）
#         if epoch == 0:
#             models.append(model)

#     print(f"Epoch {epoch + 1}: Avg Train Acc: {epoch_train_acc:.8f}, Avg Test AUC: {epoch_avg_auc:.8f}")

# print("整體平均 AUC:", epoch_avg_auc)


## Cross validation (實驗中跑不動)


In [17]:
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.metrics import roc_auc_score

# # Define your parameter grid for XGBoost or any model you're tuning
# param_grid = {
#     'n_estimators': [100, 300, 750],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'gamma': [0, 0.5, 1],
#     'subsample': [0.7, 0.9],
#     'colsample_bytree': [0.7, 0.9],
#     'reg_lambda': [0, 1],
#     'alpha': [0, 0.01, 0.1]
# }

# avg_auc = 0
# models = []

# for i in range(len(dataset_names)):
#     tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
#         X_trains[i], y_trains[i], test_size=0.2, random_state=42, stratify=y_trains[i]
#     )
    
#     # Cross-validation with parameter tuning
#     model = XGBClassifier(objective='binary:logistic', eval_metric='auc',tree_method='gpu_hist',  # Enable GPU for XGBoost
#     use_label_encoder=False)
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
#     grid_search.fit(tmp_X_train, tmp_y_train.squeeze())
    
#     # Get the best model from cross-validation
#     best_model = grid_search.best_estimator_
#     models.append(best_model)

#     # Evaluate the best model
#     tmp_y_prob = best_model.predict_proba(tmp_X_test)[:, 1]
#     unique_classes = np.unique(tmp_y_test)
#     if len(unique_classes) < 2:
#         print(f"Skipping AUC calculation for this dataset due to single class in y_test.")
#         auc = 0.7
#     else:
#         auc = roc_auc_score(tmp_y_test, tmp_y_prob)
#     avg_auc += auc

#     print(f"AUC for dataset {i+1} with tuned parameters: {auc}")

# print(f"Average AUC across all datasets: {avg_auc / len(dataset_names)}")


## XGBoost

In [18]:
# # 這邊用迴圈跑所有的資料集，並且將每個資料集的資料分成訓練集和測試集
# # 並且用 Random Forest 來做分類
# # 得到每個資料集的 AUC
# max_deep  = 5
# # for max_deep in max_deeps:
# models=[]
# avg_auc = 0
# avg_train = 0
# for i in range(len(dataset_names)):
#     # 這邊做一下 stratify
#     tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
#         X_trains[i], y_trains[i], test_size=0.2, random_state=42, stratify=y_trains[i]
#     )

    
#     # XGBoost (好像比較容易過擬合，適合大資料集) 
#     # model = XGBClassifier(
#     #     n_estimators=200, learning_rate=0.1, gamma=0.7,
#     # )
#     model = XGBClassifier(
#         n_estimators=750, 
#         eta=0.005,
#         # learning_rate=0.05, 
#         gamma=1, 
#         max_depth=5, 
#         subsample=0.9, 
#         colsample_bytree=0.7,
#         objective='binary:logistic',
#         eval_metric='auc',
#         reg_lambda=1,     # L2 regularization term on weights
#         alpha=0.001          # L1 regularization term on weights
#     )
#     model.fit(tmp_X_train, tmp_y_train.squeeze())
    
#     tmp_y_prob = model.predict_proba(tmp_X_test)[:, 1]
#     unique_classes = np.unique(tmp_y_test)
#     if len(unique_classes) < 2:
#         print(f"Skipping AUC calculation for this dataset due to single class in y_test.")
#         auc = 0.7  # 或選擇其他合適的值，例如預設值
#     else:
#         auc = roc_auc_score(tmp_y_test, tmp_y_prob)
#     # print(f'auc of dataset {i:2}: \t{auc}')
#     avg_auc += auc
#     models.append(model)
        

# # print(f"avg auc of maxdeep of {max_deep}:   {avg_auc / len(dataset_names)}")
# print(f"avg auc :   {avg_auc / len(dataset_names)}")



## Inference Model

In [19]:
# Inference for the neural-network models: one probability per test row.
y_predicts = []
for i in range(len(dataset_names)):
    # Convert the test features to a tensor on the same device as the model.
    X_test = torch.tensor(X_tests[i].values, dtype=torch.float32).to(device)

    # Evaluation mode: dropout is disabled and batch norm uses running statistics.
    models[i].eval()

    # No gradients are needed for inference.
    with torch.no_grad():
        # The model's forward() already ends with a sigmoid, so its output is
        # the probability itself. Applying torch.sigmoid again would squash
        # every value into roughly [0.5, 0.73].
        y_predict_proba = models[i](X_test).cpu().numpy().flatten()

    # Store the predictions as a single-column DataFrame.
    df = pd.DataFrame(y_predict_proba, columns=['y_predict_proba'])
    y_predicts.append(df)



In [20]:
# ##給xgboost用
# y_predicts=[]
# for i in range(len(dataset_names)):
#     # print(X_tests[i])
#     y_predict_proba=models[i].predict_proba(X_tests[i])[:, 1]
#     df = pd.DataFrame(y_predict_proba, columns=['y_predict_proba'])
#     y_predicts.append(df)
    

## Save result

In [21]:
# Write each dataset's predictions into its own folder as y_predict.csv.
for idx, dataset_name in enumerate(dataset_names):
    output_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    y_predicts[idx].to_csv(output_path, header=True, index=False)

In [22]:
# from sklearn.metrics import roc_auc_score

# test_aucs = []
# for i in range(len(dataset_names)):
#     # 使用模型進行預測，獲得類別 1 的預測機率
#     y_predict_proba = models[i].predict_proba(X_tests[i])[:, 1]
    
#     # 計算 AUC 分數
#     auc = roc_auc_score(y_tests[i], y_predict_proba)
#     print(f'AUC of dataset {i:2}: \t{auc}')
    
#     test_aucs.append(auc)

# # 平均 AUC
# avg_test_auc = sum(test_aucs) / len(test_aucs)
# print("\nAverage Test AUC:", avg_test_auc)
