#### Score : ```0.86```

# Read All Dataset CSV

In [29]:
import os
import pandas as pd
import numpy as np


In [30]:
def remove_all_predict():
    for folder_name in os.listdir("../../Competition_data"):
        if os.path.exists(f"../../Competition_data/{folder_name}/y_predict.csv") is False: continue
        os.remove(f"../../Competition_data/{folder_name}/y_predict.csv")
# use this function to remove all the y_predict.csv
remove_all_predict()

In [31]:
dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]
for folder_name in os.listdir("../../Competition_data"):
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(f"../../Competition_data/{folder_name}/X_train.csv",header=0))
    y_trains.append(pd.read_csv(f"../../Competition_data/{folder_name}/y_train.csv",header=0))
    X_tests.append(pd.read_csv(f"../../Competition_data/{folder_name}/X_test.csv",header=0))

## Data Preprocessing & Feature Engineering

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [33]:
def get_number_of_datatype(X_data):
    # Split the features in X_data into numeric_data and categorical_data.
    numeric, categoric = [], []
    for feature in X_data:
        val = float(X_data.loc[i, feature])
        if isinstance(val, float) and val.is_integer():
                categoric.append(feature)
        else:   numeric.append(feature)
    return numeric, categoric

def preprocess_numeric_data(numeric_data):
    imputer = SimpleImputer(strategy = "mean")
    imputed_numeric_data = imputer.fit_transform(numeric_data)
    
    scaler = StandardScaler()
    scaled_numeric_data = scaler.fit_transform(imputed_numeric_data)
    return scaled_numeric_data

def preprocess_categoric_data(categoric_data):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_data = encoder.fit_transform(categoric_data)
    return pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categoric_data.columns))

def align_columns(X_train, X_test):

    train_cols = X_train.columns
    test_cols = X_test.columns
    all_cols = test_cols.union(train_cols)

    test_missing_cols = train_cols.difference(test_cols)
    train_missing_cols = test_cols.difference(train_cols)

    for col in train_missing_cols:
        X_train[col] = 0
    for col in test_missing_cols:
        X_test[col] = 0  
    X_train = X_train[all_cols]
    X_test = X_test[all_cols]
    
    return X_train, X_test

def preprocess(X_data, numeric_features, categoric_features):
    X_data.loc[:, numeric_features] = preprocess_numeric_data(X_data[numeric_features])
    new_columns = preprocess_categoric_data(X_data.loc[:, categoric_features])
    X_data = X_data.drop(columns = categoric_features).reset_index(drop = True)
    X_data = pd.concat([X_data, new_columns], axis = 1).reset_index(drop = True)
    return X_data

for i, _ in enumerate(dataset_names):
    X_train, X_test, Y_train = X_trains[i], X_tests[i], y_trains[i]

    numeric_features, categoric_features = get_number_of_datatype(X_train)

    X_train = preprocess(X_train, numeric_features, categoric_features)
    X_test = preprocess( X_test, numeric_features, categoric_features)
    X_trains[i], X_tests[i] = align_columns(X_train, X_test)
    y_trains[i] = Y_train

## train test split & build Model
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest
from tqdm import tqdm

In [35]:
TEST_MODEL = False

In [36]:

torch.manual_seed(42)

class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  
        self.fc2 = nn.Linear(64, 32)           
        self.fc3 = nn.Linear(32, 1)           

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # 使用ReLU激活函數
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # 輸出層使用sigmoid激活函數
        return x

models = []
aucs = []

for i in tqdm(range(len(dataset_names))):
    if TEST_MODEL:
        tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
            X_trains[i], y_trains[i], test_size = 0.2, random_state = 42)
    else:
        tmp_X_train, tmp_y_train = X_trains[i], y_trains[i]
        
    train_tensor = TensorDataset(torch.FloatTensor(tmp_X_train.values), torch.FloatTensor(tmp_y_train.values))
    train_loader = DataLoader(train_tensor, batch_size=32, shuffle=True)
    
    input_size = tmp_X_train.shape[1] 
    model = SimpleNN(input_size)
    
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr = 0.001)
    
    model.train()
    for epoch in range(100):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.squeeze())
            loss.backward()
            optimizer.step()

    if TEST_MODEL:
        model.eval()
        with torch.no_grad():
            tmp_y_prob = model(torch.FloatTensor(tmp_X_test.values)).numpy()
        auc = roc_auc_score(tmp_y_test, tmp_y_prob)
        aucs.append(auc)
        
    models.append(model)



100%|██████████| 49/49 [01:13<00:00,  1.50s/it]


## Inference Model

In [37]:
y_predicts = []
for i in range(len(dataset_names)):
    model = models[i]
    model.eval()
    with torch.no_grad():
        y_predict_proba = model(torch.FloatTensor(X_tests[i].values)).numpy()
    df = pd.DataFrame(y_predict_proba, columns=['y_predict_proba'])
    y_predicts.append(df)

## Save result

In [38]:
for idx,dataset_name in enumerate(dataset_names):
    df = y_predicts[idx]
    df.to_csv(f'../../Competition_data/{dataset_name}/y_predict.csv', index = False,header = True)