In [None]:
!pip install kaggle
from google.colab import files
files.upload()



Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"lengocminhchau","key":"9ce65932c7e874d7f6ce50ec555fdddc"}'}

In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c child-mind-institute-problematic-internet-use

Downloading child-mind-institute-problematic-internet-use.zip to /content
100% 6.20G/6.21G [01:17<00:00, 114MB/s]
100% 6.21G/6.21G [01:17<00:00, 86.4MB/s]


In [None]:
!unzip -q child-mind-institute-problematic-internet-use.zip

In [None]:
!pip install catboost
!pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.21-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [61]:
import os
import re
import numpy as np
import pandas as pd

from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm

from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from scipy.optimize import minimize
from scipy.stats import mode
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam

import torch
import torch.nn as nn
import torch.optim as optim

from IPython.display import clear_output
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    VotingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor
)

import warnings
warnings.filterwarnings('ignore')

# Cấu hình hiển thị của Pandas
pd.options.display.max_columns = None

# Đặt SEED để đảm bảo tính tái lập
SEED = 42
n_splits = 5

np.random.seed(SEED)
torch.manual_seed(SEED)

# Nếu sử dụng GPU với PyTorch
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

import tensorflow as tf

tf.random.set_seed(SEED)

In [None]:
def read_and_process_file(filepath):
    try:
        df = pd.read_parquet(filepath)
        df.drop(columns='step', inplace=True)
        return df.describe().values.reshape(-1)
    except Exception as e:
        print(f"Error processing file {filepath}: {e}")
        return None

def process_file(filename, dirname):
    filepath = os.path.join(dirname, filename, 'part-0.parquet')
    stats = read_and_process_file(filepath)
    return stats, filename.split('=')[1] if stats is not None else None

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)

    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    # Loại bỏ kết quả None do lỗi khi đọc file
    results = [result for result in results if result is not None]

    stats, indexes = zip(*results) if results else ([], [])

    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

def build_autoencoder(input_dim, encoding_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    encoder = Model(inputs=input_layer, outputs=encoded)
    autoencoder.compile(optimizer=Adam(), loss='mse')

    return autoencoder, encoder

def perform_autoencoder_tf(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    input_dim = df_scaled.shape[1]
    autoencoder, encoder = build_autoencoder(input_dim, encoding_dim)

    early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

    history = autoencoder.fit(
        df_scaled, df_scaled,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        verbose=1,
        callbacks=[early_stopping]
    )

    plt.plot(history.history['loss'])
    plt.title("Autoencoder Training Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

    encoded_data = encoder.predict(df_scaled)
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i+1}' for i in range(encoded_data.shape[1])])
    return df_encoded

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

def perform_autoencoder_torch(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Tạo tensor với requires_grad=True
    data_tensor = torch.FloatTensor(df_scaled).requires_grad_(True)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    autoencoder.train()  # Đặt mô hình ở chế độ training

    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        epoch_loss = 0
        for i in tqdm(range(0, len(data_tensor), batch_size), desc=f"Epoch {epoch+1}/{epochs}"):
            batch = data_tensor[i: i + batch_size]
            optimizer.zero_grad()  # Xóa gradient trước đó
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)  # Tính loss
            loss.backward()  # Tính gradient
            optimizer.step()  # Cập nhật trọng số
            epoch_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss / len(data_tensor):.4f}")

    # Lấy encoding từ encoder
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    return df_encoded

In [23]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

train_ts = load_time_series("series_train.parquet")
test_ts = load_time_series("series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder_torch(df_train, encoding_dim=96, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()

train_ts_encoded['id'] = train_ts['id']

train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, train_ts_encoded, how="left", on='id')
test_ = pd.merge(test, test_ts, how='left', on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season',
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

100%|██████████| 996/996 [02:58<00:00,  5.59it/s]
100%|██████████| 2/2 [00:00<00:00,  7.62it/s]
Epoch 1/100: 100%|██████████| 32/32 [00:00<00:00, 533.10it/s]


Epoch [1/100], Loss: 0.0356


Epoch 2/100: 100%|██████████| 32/32 [00:00<00:00, 473.64it/s]


Epoch [2/100], Loss: 0.0284


Epoch 3/100: 100%|██████████| 32/32 [00:00<00:00, 593.97it/s]


Epoch [3/100], Loss: 0.0259


Epoch 4/100: 100%|██████████| 32/32 [00:00<00:00, 628.70it/s]


Epoch [4/100], Loss: 0.0246


Epoch 5/100: 100%|██████████| 32/32 [00:00<00:00, 495.01it/s]


Epoch [5/100], Loss: 0.0237


Epoch 6/100: 100%|██████████| 32/32 [00:00<00:00, 584.95it/s]


Epoch [6/100], Loss: 0.0231


Epoch 7/100: 100%|██████████| 32/32 [00:00<00:00, 471.79it/s]


Epoch [7/100], Loss: 0.0227


Epoch 8/100: 100%|██████████| 32/32 [00:00<00:00, 428.44it/s]


Epoch [8/100], Loss: 0.0224


Epoch 9/100: 100%|██████████| 32/32 [00:00<00:00, 592.15it/s]


Epoch [9/100], Loss: 0.0222


Epoch 10/100: 100%|██████████| 32/32 [00:00<00:00, 543.14it/s]


Epoch [10/100], Loss: 0.0220


Epoch 11/100: 100%|██████████| 32/32 [00:00<00:00, 607.90it/s]


Epoch [11/100], Loss: 0.0218


Epoch 12/100: 100%|██████████| 32/32 [00:00<00:00, 519.95it/s]


Epoch [12/100], Loss: 0.0217


Epoch 13/100: 100%|██████████| 32/32 [00:00<00:00, 548.36it/s]


Epoch [13/100], Loss: 0.0216


Epoch 14/100: 100%|██████████| 32/32 [00:00<00:00, 391.44it/s]


Epoch [14/100], Loss: 0.0215


Epoch 15/100: 100%|██████████| 32/32 [00:00<00:00, 460.45it/s]


Epoch [15/100], Loss: 0.0214


Epoch 16/100: 100%|██████████| 32/32 [00:00<00:00, 413.68it/s]


Epoch [16/100], Loss: 0.0213


Epoch 17/100: 100%|██████████| 32/32 [00:00<00:00, 470.68it/s]


Epoch [17/100], Loss: 0.0212


Epoch 18/100: 100%|██████████| 32/32 [00:00<00:00, 419.49it/s]


Epoch [18/100], Loss: 0.0212


Epoch 19/100: 100%|██████████| 32/32 [00:00<00:00, 489.96it/s]


Epoch [19/100], Loss: 0.0211


Epoch 20/100: 100%|██████████| 32/32 [00:00<00:00, 563.58it/s]


Epoch [20/100], Loss: 0.0211


Epoch 21/100: 100%|██████████| 32/32 [00:00<00:00, 556.02it/s]


Epoch [21/100], Loss: 0.0210


Epoch 22/100: 100%|██████████| 32/32 [00:00<00:00, 455.85it/s]


Epoch [22/100], Loss: 0.0210


Epoch 23/100: 100%|██████████| 32/32 [00:00<00:00, 569.46it/s]


Epoch [23/100], Loss: 0.0210


Epoch 24/100: 100%|██████████| 32/32 [00:00<00:00, 587.73it/s]


Epoch [24/100], Loss: 0.0209


Epoch 25/100: 100%|██████████| 32/32 [00:00<00:00, 571.36it/s]


Epoch [25/100], Loss: 0.0209


Epoch 26/100: 100%|██████████| 32/32 [00:00<00:00, 408.00it/s]


Epoch [26/100], Loss: 0.0209


Epoch 27/100: 100%|██████████| 32/32 [00:00<00:00, 443.51it/s]


Epoch [27/100], Loss: 0.0208


Epoch 28/100: 100%|██████████| 32/32 [00:00<00:00, 465.81it/s]


Epoch [28/100], Loss: 0.0208


Epoch 29/100: 100%|██████████| 32/32 [00:00<00:00, 524.35it/s]


Epoch [29/100], Loss: 0.0208


Epoch 30/100: 100%|██████████| 32/32 [00:00<00:00, 608.05it/s]


Epoch [30/100], Loss: 0.0208


Epoch 31/100: 100%|██████████| 32/32 [00:00<00:00, 608.18it/s]


Epoch [31/100], Loss: 0.0207


Epoch 32/100: 100%|██████████| 32/32 [00:00<00:00, 632.97it/s]


Epoch [32/100], Loss: 0.0207


Epoch 33/100: 100%|██████████| 32/32 [00:00<00:00, 641.24it/s]


Epoch [33/100], Loss: 0.0207


Epoch 34/100: 100%|██████████| 32/32 [00:00<00:00, 553.35it/s]


Epoch [34/100], Loss: 0.0207


Epoch 35/100: 100%|██████████| 32/32 [00:00<00:00, 656.35it/s]


Epoch [35/100], Loss: 0.0207


Epoch 36/100: 100%|██████████| 32/32 [00:00<00:00, 592.99it/s]


Epoch [36/100], Loss: 0.0206


Epoch 37/100: 100%|██████████| 32/32 [00:00<00:00, 464.59it/s]


Epoch [37/100], Loss: 0.0206


Epoch 38/100: 100%|██████████| 32/32 [00:00<00:00, 599.07it/s]


Epoch [38/100], Loss: 0.0206


Epoch 39/100: 100%|██████████| 32/32 [00:00<00:00, 603.49it/s]


Epoch [39/100], Loss: 0.0206


Epoch 40/100: 100%|██████████| 32/32 [00:00<00:00, 599.75it/s]


Epoch [40/100], Loss: 0.0206


Epoch 41/100: 100%|██████████| 32/32 [00:00<00:00, 471.77it/s]


Epoch [41/100], Loss: 0.0206


Epoch 42/100: 100%|██████████| 32/32 [00:00<00:00, 421.75it/s]


Epoch [42/100], Loss: 0.0205


Epoch 43/100: 100%|██████████| 32/32 [00:00<00:00, 438.35it/s]


Epoch [43/100], Loss: 0.0205


Epoch 44/100: 100%|██████████| 32/32 [00:00<00:00, 432.20it/s]


Epoch [44/100], Loss: 0.0205


Epoch 45/100: 100%|██████████| 32/32 [00:00<00:00, 428.88it/s]


Epoch [45/100], Loss: 0.0205


Epoch 46/100: 100%|██████████| 32/32 [00:00<00:00, 444.51it/s]


Epoch [46/100], Loss: 0.0205


Epoch 47/100: 100%|██████████| 32/32 [00:00<00:00, 510.33it/s]


Epoch [47/100], Loss: 0.0205


Epoch 48/100: 100%|██████████| 32/32 [00:00<00:00, 534.15it/s]


Epoch [48/100], Loss: 0.0205


Epoch 49/100: 100%|██████████| 32/32 [00:00<00:00, 569.23it/s]


Epoch [49/100], Loss: 0.0205


Epoch 50/100: 100%|██████████| 32/32 [00:00<00:00, 539.87it/s]


Epoch [50/100], Loss: 0.0205


Epoch 51/100: 100%|██████████| 32/32 [00:00<00:00, 459.97it/s]


Epoch [51/100], Loss: 0.0205


Epoch 52/100: 100%|██████████| 32/32 [00:00<00:00, 530.06it/s]


Epoch [52/100], Loss: 0.0204


Epoch 53/100: 100%|██████████| 32/32 [00:00<00:00, 596.20it/s]


Epoch [53/100], Loss: 0.0204


Epoch 54/100: 100%|██████████| 32/32 [00:00<00:00, 526.02it/s]


Epoch [54/100], Loss: 0.0204


Epoch 55/100: 100%|██████████| 32/32 [00:00<00:00, 397.71it/s]


Epoch [55/100], Loss: 0.0204


Epoch 56/100: 100%|██████████| 32/32 [00:00<00:00, 464.12it/s]


Epoch [56/100], Loss: 0.0204


Epoch 57/100: 100%|██████████| 32/32 [00:00<00:00, 418.33it/s]


Epoch [57/100], Loss: 0.0204


Epoch 58/100: 100%|██████████| 32/32 [00:00<00:00, 614.37it/s]


Epoch [58/100], Loss: 0.0204


Epoch 59/100: 100%|██████████| 32/32 [00:00<00:00, 642.99it/s]


Epoch [59/100], Loss: 0.0204


Epoch 60/100: 100%|██████████| 32/32 [00:00<00:00, 601.51it/s]


Epoch [60/100], Loss: 0.0204


Epoch 61/100: 100%|██████████| 32/32 [00:00<00:00, 581.91it/s]


Epoch [61/100], Loss: 0.0204


Epoch 62/100: 100%|██████████| 32/32 [00:00<00:00, 551.59it/s]


Epoch [62/100], Loss: 0.0204


Epoch 63/100: 100%|██████████| 32/32 [00:00<00:00, 581.85it/s]


Epoch [63/100], Loss: 0.0204


Epoch 64/100: 100%|██████████| 32/32 [00:00<00:00, 561.79it/s]


Epoch [64/100], Loss: 0.0204


Epoch 65/100: 100%|██████████| 32/32 [00:00<00:00, 494.74it/s]


Epoch [65/100], Loss: 0.0203


Epoch 66/100: 100%|██████████| 32/32 [00:00<00:00, 540.71it/s]


Epoch [66/100], Loss: 0.0203


Epoch 67/100: 100%|██████████| 32/32 [00:00<00:00, 576.21it/s]


Epoch [67/100], Loss: 0.0203


Epoch 68/100: 100%|██████████| 32/32 [00:00<00:00, 559.11it/s]


Epoch [68/100], Loss: 0.0203


Epoch 69/100: 100%|██████████| 32/32 [00:00<00:00, 368.59it/s]


Epoch [69/100], Loss: 0.0203


Epoch 70/100: 100%|██████████| 32/32 [00:00<00:00, 431.12it/s]


Epoch [70/100], Loss: 0.0203


Epoch 71/100: 100%|██████████| 32/32 [00:00<00:00, 407.07it/s]


Epoch [71/100], Loss: 0.0203


Epoch 72/100: 100%|██████████| 32/32 [00:00<00:00, 436.95it/s]


Epoch [72/100], Loss: 0.0203


Epoch 73/100: 100%|██████████| 32/32 [00:00<00:00, 471.31it/s]


Epoch [73/100], Loss: 0.0203


Epoch 74/100: 100%|██████████| 32/32 [00:00<00:00, 580.70it/s]


Epoch [74/100], Loss: 0.0203


Epoch 75/100: 100%|██████████| 32/32 [00:00<00:00, 534.76it/s]


Epoch [75/100], Loss: 0.0203


Epoch 76/100: 100%|██████████| 32/32 [00:00<00:00, 572.08it/s]


Epoch [76/100], Loss: 0.0203


Epoch 77/100: 100%|██████████| 32/32 [00:00<00:00, 547.42it/s]


Epoch [77/100], Loss: 0.0203


Epoch 78/100: 100%|██████████| 32/32 [00:00<00:00, 650.30it/s]


Epoch [78/100], Loss: 0.0203


Epoch 79/100: 100%|██████████| 32/32 [00:00<00:00, 561.47it/s]


Epoch [79/100], Loss: 0.0203


Epoch 80/100: 100%|██████████| 32/32 [00:00<00:00, 486.14it/s]


Epoch [80/100], Loss: 0.0203


Epoch 81/100: 100%|██████████| 32/32 [00:00<00:00, 463.78it/s]


Epoch [81/100], Loss: 0.0203


Epoch 82/100: 100%|██████████| 32/32 [00:00<00:00, 330.53it/s]


Epoch [82/100], Loss: 0.0203


Epoch 83/100: 100%|██████████| 32/32 [00:00<00:00, 371.22it/s]


Epoch [83/100], Loss: 0.0203


Epoch 84/100: 100%|██████████| 32/32 [00:00<00:00, 368.48it/s]


Epoch [84/100], Loss: 0.0203


Epoch 85/100: 100%|██████████| 32/32 [00:00<00:00, 417.58it/s]


Epoch [85/100], Loss: 0.0203


Epoch 86/100: 100%|██████████| 32/32 [00:00<00:00, 381.44it/s]


Epoch [86/100], Loss: 0.0203


Epoch 87/100: 100%|██████████| 32/32 [00:00<00:00, 393.04it/s]


Epoch [87/100], Loss: 0.0203


Epoch 88/100: 100%|██████████| 32/32 [00:00<00:00, 389.99it/s]


Epoch [88/100], Loss: 0.0202


Epoch 89/100: 100%|██████████| 32/32 [00:00<00:00, 424.75it/s]


Epoch [89/100], Loss: 0.0202


Epoch 90/100: 100%|██████████| 32/32 [00:00<00:00, 397.96it/s]


Epoch [90/100], Loss: 0.0202


Epoch 91/100: 100%|██████████| 32/32 [00:00<00:00, 363.06it/s]


Epoch [91/100], Loss: 0.0202


Epoch 92/100: 100%|██████████| 32/32 [00:00<00:00, 437.08it/s]


Epoch [92/100], Loss: 0.0202


Epoch 93/100: 100%|██████████| 32/32 [00:00<00:00, 389.43it/s]


Epoch [93/100], Loss: 0.0202


Epoch 94/100: 100%|██████████| 32/32 [00:00<00:00, 416.27it/s]


Epoch [94/100], Loss: 0.0202


Epoch 95/100: 100%|██████████| 32/32 [00:00<00:00, 370.61it/s]


Epoch [95/100], Loss: 0.0202


Epoch 96/100: 100%|██████████| 32/32 [00:00<00:00, 448.28it/s]


Epoch [96/100], Loss: 0.0202


Epoch 97/100: 100%|██████████| 32/32 [00:00<00:00, 408.72it/s]


Epoch [97/100], Loss: 0.0202


Epoch 98/100: 100%|██████████| 32/32 [00:00<00:00, 479.31it/s]


Epoch [98/100], Loss: 0.0202


Epoch 99/100: 100%|██████████| 32/32 [00:00<00:00, 426.91it/s]


Epoch [99/100], Loss: 0.0202


Epoch 100/100: 100%|██████████| 32/32 [00:00<00:00, 387.82it/s]


Epoch [100/100], Loss: 0.0202


In [None]:
def update(df):
    global cat_c
    for c in cat_c:
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

datasets = [train, test]
train, test = [update(df) for df in datasets]

def create_mapping_and_apply(column, train, test):
    mapping = {value: idx for idx, value in enumerate(train[column].unique())}
    train[column] = train[column].replace(mapping).astype(int)
    test[column] = test[column].replace(mapping).astype(int)

for col in cat_c:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    thresholds = np.sort(thresholds)  # Đảm bảo các ngưỡng tăng dần
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    return -quadratic_weighted_kappa(y_true, threshold_Rounder(oof_non_rounded, thresholds))

def train_and_evaluate_fold(model, X_train, y_train, X_val, y_val):
    """Huấn luyện mô hình và đánh giá trên tập train và validation."""
    model.fit(X_train, y_train)

    # Dự đoán trên tập train và validation
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    train_kappa = quadratic_weighted_kappa(y_train, np.round(y_train_pred).astype(int))
    val_kappa = quadratic_weighted_kappa(y_val, np.round(y_val_pred).astype(int))

    return y_val_pred, train_kappa, val_kappa

In [19]:
def TrainML(model_class, test_data, n_splits=5, SEED=42):
    """Huấn luyện mô hình với Stratified K-Fold và tối ưu QWK."""
    # Kiểm tra dữ liệu đầu vào
    if 'sii' not in train.columns:
        raise ValueError("'sii' column is missing in training data!")

    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S, test_S = [], []
    oof_non_rounded = np.zeros(len(y), dtype=np.float32)
    oof_rounded = np.zeros(len(y), dtype=np.int32)
    test_preds = np.zeros((len(test_data), n_splits), dtype=np.float32)

    print("Starting K-Fold Training...")
    for fold, (train_idx, val_idx) in enumerate(SKF.split(X, y)):
        print(f"\nFold {fold+1}/{n_splits}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = clone(model_class)
        y_val_pred, train_kappa, val_kappa = train_and_evaluate_fold(model, X_train, y_train, X_val, y_val)

        # Lưu kết quả và đánh giá
        oof_non_rounded[val_idx] = y_val_pred
        oof_rounded[val_idx] = np.round(y_val_pred).astype(int)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # Dự đoán trên test
        test_preds[:, fold] = model.predict(test_data)

        print(f"Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

    print(f"\nMean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK --> {np.mean(test_S):.4f}")

    # Tối ưu hoá thresholds
    print("Optimizing QWK Thresholds...")
    KappaOptimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')

    if not KappaOptimizer.success:
        raise ValueError("Threshold optimization did not converge!")

    optimized_thresholds = np.sort(KappaOptimizer.x)
    oof_tuned = threshold_Rounder(oof_non_rounded, optimized_thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)
    print("Optimized Thresholds (sorted):", np.sort(KappaOptimizer.x))
    print(f"Optimized QWK SCORE :: {tKappa:.3f}")

    # Tạo kết quả submission
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, optimized_thresholds)

    return tpTuned

    print("Training Complete!")
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))]))
])

Submission1 = TrainML(ensemble, test)

Starting K-Fold Training...

Fold 1/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28570
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 154
[LightGBM] [Info] Start training from score 0.579982
Train QWK: 0.9201, Validation QWK: 0.3589

Fold 2/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28570
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 154
[LightGBM] [Info] Start training from score 0.581087
Train QWK: 0.9197, Validation QWK: 0.4326

Fold 3/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014917 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM]

In [20]:
Submission1 = pd.DataFrame({
    'id': sample['id'],
    'sii': Submission1
})
Submission1

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,1


In [42]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

train_ts = load_time_series("series_train.parquet")
test_ts = load_time_series("series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder_torch(df_train, encoding_dim=96, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()

train_ts_encoded['id'] = train_ts['id']

train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, train_ts_encoded, how="left", on='id')
test_ = pd.merge(test, test_ts, how='left', on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season',
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']


def update(df):
    global cat_c
    for c in cat_c:
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

datasets = [train, test]
train, test = [update(df) for df in datasets]

def create_mapping_and_apply(column, train, test):
    mapping = {value: idx for idx, value in enumerate(train[column].unique())}
    train[column] = train[column].replace(mapping).astype(int)
    test[column] = test[column].replace(mapping).astype(int)

for col in cat_c:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    thresholds = np.sort(thresholds)  # Đảm bảo các ngưỡng tăng dần
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    return -quadratic_weighted_kappa(y_true, threshold_Rounder(oof_non_rounded, thresholds))

def train_and_evaluate_fold(model, X_train, y_train, X_val, y_val):
    """Huấn luyện mô hình và đánh giá trên tập train và validation."""
    model.fit(X_train, y_train)

    # Dự đoán trên tập train và validation
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    train_kappa = quadratic_weighted_kappa(y_train, np.round(y_train_pred).astype(int))
    val_kappa = quadratic_weighted_kappa(y_val, np.round(y_val_pred).astype(int))

    return y_val_pred, train_kappa, val_kappa


100%|██████████| 996/996 [02:34<00:00,  6.46it/s]
100%|██████████| 2/2 [00:00<00:00,  8.22it/s]
Epoch 1/100: 100%|██████████| 32/32 [00:00<00:00, 522.70it/s]


Epoch [1/100], Loss: 0.0357


Epoch 2/100: 100%|██████████| 32/32 [00:00<00:00, 594.80it/s]


Epoch [2/100], Loss: 0.0284


Epoch 3/100: 100%|██████████| 32/32 [00:00<00:00, 615.24it/s]


Epoch [3/100], Loss: 0.0259


Epoch 4/100: 100%|██████████| 32/32 [00:00<00:00, 569.92it/s]


Epoch [4/100], Loss: 0.0246


Epoch 5/100: 100%|██████████| 32/32 [00:00<00:00, 628.39it/s]


Epoch [5/100], Loss: 0.0237


Epoch 6/100: 100%|██████████| 32/32 [00:00<00:00, 601.18it/s]


Epoch [6/100], Loss: 0.0232


Epoch 7/100: 100%|██████████| 32/32 [00:00<00:00, 563.39it/s]


Epoch [7/100], Loss: 0.0227


Epoch 8/100: 100%|██████████| 32/32 [00:00<00:00, 573.51it/s]


Epoch [8/100], Loss: 0.0224


Epoch 9/100: 100%|██████████| 32/32 [00:00<00:00, 569.49it/s]


Epoch [9/100], Loss: 0.0222


Epoch 10/100: 100%|██████████| 32/32 [00:00<00:00, 542.22it/s]


Epoch [10/100], Loss: 0.0220


Epoch 11/100: 100%|██████████| 32/32 [00:00<00:00, 597.31it/s]


Epoch [11/100], Loss: 0.0218


Epoch 12/100: 100%|██████████| 32/32 [00:00<00:00, 553.70it/s]


Epoch [12/100], Loss: 0.0217


Epoch 13/100: 100%|██████████| 32/32 [00:00<00:00, 607.87it/s]


Epoch [13/100], Loss: 0.0216


Epoch 14/100: 100%|██████████| 32/32 [00:00<00:00, 468.05it/s]


Epoch [14/100], Loss: 0.0215


Epoch 15/100: 100%|██████████| 32/32 [00:00<00:00, 426.70it/s]


Epoch [15/100], Loss: 0.0214


Epoch 16/100: 100%|██████████| 32/32 [00:00<00:00, 411.30it/s]


Epoch [16/100], Loss: 0.0213


Epoch 17/100: 100%|██████████| 32/32 [00:00<00:00, 461.92it/s]


Epoch [17/100], Loss: 0.0213


Epoch 18/100: 100%|██████████| 32/32 [00:00<00:00, 429.66it/s]


Epoch [18/100], Loss: 0.0212


Epoch 19/100: 100%|██████████| 32/32 [00:00<00:00, 442.55it/s]


Epoch [19/100], Loss: 0.0211


Epoch 20/100: 100%|██████████| 32/32 [00:00<00:00, 622.59it/s]


Epoch [20/100], Loss: 0.0211


Epoch 21/100: 100%|██████████| 32/32 [00:00<00:00, 553.83it/s]


Epoch [21/100], Loss: 0.0210


Epoch 22/100: 100%|██████████| 32/32 [00:00<00:00, 597.82it/s]


Epoch [22/100], Loss: 0.0210


Epoch 23/100: 100%|██████████| 32/32 [00:00<00:00, 603.77it/s]


Epoch [23/100], Loss: 0.0210


Epoch 24/100: 100%|██████████| 32/32 [00:00<00:00, 538.35it/s]


Epoch [24/100], Loss: 0.0209


Epoch 25/100: 100%|██████████| 32/32 [00:00<00:00, 551.34it/s]


Epoch [25/100], Loss: 0.0209


Epoch 26/100: 100%|██████████| 32/32 [00:00<00:00, 647.14it/s]


Epoch [26/100], Loss: 0.0209


Epoch 27/100: 100%|██████████| 32/32 [00:00<00:00, 568.81it/s]


Epoch [27/100], Loss: 0.0208


Epoch 28/100: 100%|██████████| 32/32 [00:00<00:00, 399.63it/s]


Epoch [28/100], Loss: 0.0208


Epoch 29/100: 100%|██████████| 32/32 [00:00<00:00, 455.48it/s]


Epoch [29/100], Loss: 0.0208


Epoch 30/100: 100%|██████████| 32/32 [00:00<00:00, 429.94it/s]


Epoch [30/100], Loss: 0.0208


Epoch 31/100: 100%|██████████| 32/32 [00:00<00:00, 551.67it/s]


Epoch [31/100], Loss: 0.0207


Epoch 32/100: 100%|██████████| 32/32 [00:00<00:00, 516.71it/s]


Epoch [32/100], Loss: 0.0207


Epoch 33/100: 100%|██████████| 32/32 [00:00<00:00, 338.28it/s]


Epoch [33/100], Loss: 0.0207


Epoch 34/100: 100%|██████████| 32/32 [00:00<00:00, 333.62it/s]


Epoch [34/100], Loss: 0.0207


Epoch 35/100: 100%|██████████| 32/32 [00:00<00:00, 353.19it/s]


Epoch [35/100], Loss: 0.0207


Epoch 36/100: 100%|██████████| 32/32 [00:00<00:00, 362.23it/s]


Epoch [36/100], Loss: 0.0207


Epoch 37/100: 100%|██████████| 32/32 [00:00<00:00, 363.81it/s]


Epoch [37/100], Loss: 0.0206


Epoch 38/100: 100%|██████████| 32/32 [00:00<00:00, 298.22it/s]


Epoch [38/100], Loss: 0.0206


Epoch 39/100: 100%|██████████| 32/32 [00:00<00:00, 358.93it/s]


Epoch [39/100], Loss: 0.0206


Epoch 40/100: 100%|██████████| 32/32 [00:00<00:00, 270.37it/s]


Epoch [40/100], Loss: 0.0206


Epoch 41/100: 100%|██████████| 32/32 [00:00<00:00, 339.37it/s]


Epoch [41/100], Loss: 0.0206


Epoch 42/100: 100%|██████████| 32/32 [00:00<00:00, 397.60it/s]


Epoch [42/100], Loss: 0.0206


Epoch 43/100: 100%|██████████| 32/32 [00:00<00:00, 382.83it/s]


Epoch [43/100], Loss: 0.0206


Epoch 44/100: 100%|██████████| 32/32 [00:00<00:00, 460.59it/s]


Epoch [44/100], Loss: 0.0205


Epoch 45/100: 100%|██████████| 32/32 [00:00<00:00, 465.28it/s]


Epoch [45/100], Loss: 0.0205


Epoch 46/100: 100%|██████████| 32/32 [00:00<00:00, 434.91it/s]


Epoch [46/100], Loss: 0.0205


Epoch 47/100: 100%|██████████| 32/32 [00:00<00:00, 466.76it/s]


Epoch [47/100], Loss: 0.0205


Epoch 48/100: 100%|██████████| 32/32 [00:00<00:00, 483.57it/s]


Epoch [48/100], Loss: 0.0205


Epoch 49/100: 100%|██████████| 32/32 [00:00<00:00, 251.76it/s]


Epoch [49/100], Loss: 0.0205


Epoch 50/100: 100%|██████████| 32/32 [00:00<00:00, 271.12it/s]


Epoch [50/100], Loss: 0.0205


Epoch 51/100: 100%|██████████| 32/32 [00:00<00:00, 338.64it/s]


Epoch [51/100], Loss: 0.0204


Epoch 52/100: 100%|██████████| 32/32 [00:00<00:00, 358.01it/s]


Epoch [52/100], Loss: 0.0204


Epoch 53/100: 100%|██████████| 32/32 [00:00<00:00, 389.03it/s]


Epoch [53/100], Loss: 0.0204


Epoch 54/100: 100%|██████████| 32/32 [00:00<00:00, 275.80it/s]


Epoch [54/100], Loss: 0.0204


Epoch 55/100: 100%|██████████| 32/32 [00:00<00:00, 373.41it/s]


Epoch [55/100], Loss: 0.0204


Epoch 56/100: 100%|██████████| 32/32 [00:00<00:00, 399.93it/s]


Epoch [56/100], Loss: 0.0204


Epoch 57/100: 100%|██████████| 32/32 [00:00<00:00, 386.19it/s]


Epoch [57/100], Loss: 0.0204


Epoch 58/100: 100%|██████████| 32/32 [00:00<00:00, 285.26it/s]


Epoch [58/100], Loss: 0.0204


Epoch 59/100: 100%|██████████| 32/32 [00:00<00:00, 303.60it/s]


Epoch [59/100], Loss: 0.0204


Epoch 60/100: 100%|██████████| 32/32 [00:00<00:00, 263.17it/s]


Epoch [60/100], Loss: 0.0204


Epoch 61/100: 100%|██████████| 32/32 [00:00<00:00, 388.03it/s]


Epoch [61/100], Loss: 0.0204


Epoch 62/100: 100%|██████████| 32/32 [00:00<00:00, 375.89it/s]


Epoch [62/100], Loss: 0.0204


Epoch 63/100: 100%|██████████| 32/32 [00:00<00:00, 373.84it/s]


Epoch [63/100], Loss: 0.0204


Epoch 64/100: 100%|██████████| 32/32 [00:00<00:00, 352.75it/s]


Epoch [64/100], Loss: 0.0203


Epoch 65/100: 100%|██████████| 32/32 [00:00<00:00, 321.87it/s]


Epoch [65/100], Loss: 0.0203


Epoch 66/100: 100%|██████████| 32/32 [00:00<00:00, 311.55it/s]


Epoch [66/100], Loss: 0.0203


Epoch 67/100: 100%|██████████| 32/32 [00:00<00:00, 269.53it/s]


Epoch [67/100], Loss: 0.0203


Epoch 68/100: 100%|██████████| 32/32 [00:00<00:00, 287.86it/s]


Epoch [68/100], Loss: 0.0203


Epoch 69/100: 100%|██████████| 32/32 [00:00<00:00, 290.16it/s]


Epoch [69/100], Loss: 0.0203


Epoch 70/100: 100%|██████████| 32/32 [00:00<00:00, 460.83it/s]


Epoch [70/100], Loss: 0.0203


Epoch 71/100: 100%|██████████| 32/32 [00:00<00:00, 588.03it/s]


Epoch [71/100], Loss: 0.0203


Epoch 72/100: 100%|██████████| 32/32 [00:00<00:00, 482.33it/s]


Epoch [72/100], Loss: 0.0203


Epoch 73/100: 100%|██████████| 32/32 [00:00<00:00, 546.30it/s]


Epoch [73/100], Loss: 0.0203


Epoch 74/100: 100%|██████████| 32/32 [00:00<00:00, 579.42it/s]


Epoch [74/100], Loss: 0.0203


Epoch 75/100: 100%|██████████| 32/32 [00:00<00:00, 465.33it/s]


Epoch [75/100], Loss: 0.0203


Epoch 76/100: 100%|██████████| 32/32 [00:00<00:00, 599.67it/s]


Epoch [76/100], Loss: 0.0203


Epoch 77/100: 100%|██████████| 32/32 [00:00<00:00, 597.48it/s]


Epoch [77/100], Loss: 0.0203


Epoch 78/100: 100%|██████████| 32/32 [00:00<00:00, 637.51it/s]


Epoch [78/100], Loss: 0.0203


Epoch 79/100: 100%|██████████| 32/32 [00:00<00:00, 485.07it/s]


Epoch [79/100], Loss: 0.0203


Epoch 80/100: 100%|██████████| 32/32 [00:00<00:00, 467.55it/s]


Epoch [80/100], Loss: 0.0203


Epoch 81/100: 100%|██████████| 32/32 [00:00<00:00, 443.03it/s]


Epoch [81/100], Loss: 0.0203


Epoch 82/100: 100%|██████████| 32/32 [00:00<00:00, 431.08it/s]


Epoch [82/100], Loss: 0.0202


Epoch 83/100: 100%|██████████| 32/32 [00:00<00:00, 433.18it/s]


Epoch [83/100], Loss: 0.0202


Epoch 84/100: 100%|██████████| 32/32 [00:00<00:00, 529.69it/s]


Epoch [84/100], Loss: 0.0202


Epoch 85/100: 100%|██████████| 32/32 [00:00<00:00, 619.65it/s]


Epoch [85/100], Loss: 0.0202


Epoch 86/100: 100%|██████████| 32/32 [00:00<00:00, 593.40it/s]


Epoch [86/100], Loss: 0.0202


Epoch 87/100: 100%|██████████| 32/32 [00:00<00:00, 600.89it/s]


Epoch [87/100], Loss: 0.0202


Epoch 88/100: 100%|██████████| 32/32 [00:00<00:00, 563.61it/s]


Epoch [88/100], Loss: 0.0202


Epoch 89/100: 100%|██████████| 32/32 [00:00<00:00, 464.95it/s]


Epoch [89/100], Loss: 0.0202


Epoch 90/100: 100%|██████████| 32/32 [00:00<00:00, 558.23it/s]


Epoch [90/100], Loss: 0.0202


Epoch 91/100: 100%|██████████| 32/32 [00:00<00:00, 561.48it/s]


Epoch [91/100], Loss: 0.0202


Epoch 92/100: 100%|██████████| 32/32 [00:00<00:00, 602.84it/s]


Epoch [92/100], Loss: 0.0202


Epoch 93/100: 100%|██████████| 32/32 [00:00<00:00, 532.81it/s]


Epoch [93/100], Loss: 0.0202


Epoch 94/100: 100%|██████████| 32/32 [00:00<00:00, 390.38it/s]


Epoch [94/100], Loss: 0.0202


Epoch 95/100: 100%|██████████| 32/32 [00:00<00:00, 446.87it/s]


Epoch [95/100], Loss: 0.0202


Epoch 96/100: 100%|██████████| 32/32 [00:00<00:00, 428.60it/s]


Epoch [96/100], Loss: 0.0202


Epoch 97/100: 100%|██████████| 32/32 [00:00<00:00, 430.31it/s]


Epoch [97/100], Loss: 0.0202


Epoch 98/100: 100%|██████████| 32/32 [00:00<00:00, 507.67it/s]


Epoch [98/100], Loss: 0.0202


Epoch 99/100: 100%|██████████| 32/32 [00:00<00:00, 536.77it/s]


Epoch [99/100], Loss: 0.0202


Epoch 100/100: 100%|██████████| 32/32 [00:00<00:00, 558.04it/s]


Epoch [100/100], Loss: 0.0202


In [43]:
def TrainML(model_class, test_data, n_splits=5, SEED=42):
    """Huấn luyện mô hình với Stratified K-Fold và tối ưu QWK."""
    # Kiểm tra dữ liệu đầu vào
    if 'sii' not in train.columns:
        raise ValueError("'sii' column is missing in training data!")

    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S, test_S = [], []
    oof_non_rounded = np.zeros(len(y), dtype=np.float32)
    oof_rounded = np.zeros(len(y), dtype=np.int32)
    test_preds = np.zeros((len(test_data), n_splits), dtype=np.float32)

    print("Starting K-Fold Training...")
    for fold, (train_idx, val_idx) in enumerate(SKF.split(X, y)):
        print(f"\nFold {fold+1}/{n_splits}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = clone(model_class)
        y_val_pred, train_kappa, val_kappa = train_and_evaluate_fold(model, X_train, y_train, X_val, y_val)

        # Lưu kết quả và đánh giá
        oof_non_rounded[val_idx] = y_val_pred
        oof_rounded[val_idx] = np.round(y_val_pred).astype(int)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # Dự đoán trên test
        test_preds[:, fold] = model.predict(test_data)

        print(f"Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

    print(f"\nMean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK --> {np.mean(test_S):.4f}")

    # Tối ưu hoá thresholds
    print("Optimizing QWK Thresholds...")
    KappaOptimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')

    if not KappaOptimizer.success:
        raise ValueError("Threshold optimization did not converge!")

    optimized_thresholds = np.sort(KappaOptimizer.x)
    oof_tuned = threshold_Rounder(oof_non_rounded, optimized_thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)
    print("Optimized Thresholds (sorted):", np.sort(KappaOptimizer.x))
    print(f"Optimized QWK SCORE :: {tKappa:.3f}")

    # Tạo kết quả submission
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, optimized_thresholds)

    return tpTuned

    print("Training Complete!")

In [58]:
Params = {
    'learning_rate': 0.04,
    'max_depth': 12,
    'num_leaves': 480,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}


XGB_Params = {
    'learning_rate': 0.04,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.82,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'exact'
}


CatBoost_Params = {
    'learning_rate': 0.04,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}

In [51]:
# Kiểm tra nếu 'random_state' có trong Params, loại bỏ nó
if 'random_state' in Params:
    del Params['random_state']

Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])


In [59]:
Submission2 = TrainML(voting_model, test)

Starting K-Fold Training...

Fold 1/5
Train QWK: 0.7479, Validation QWK: 0.3749

Fold 2/5
Train QWK: 0.7597, Validation QWK: 0.4338

Fold 3/5
Train QWK: 0.7577, Validation QWK: 0.4040

Fold 4/5
Train QWK: 0.7670, Validation QWK: 0.3270

Fold 5/5
Train QWK: 0.7641, Validation QWK: 0.3868

Mean Train QWK --> 0.7593
Mean Validation QWK --> 0.3853
Optimizing QWK Thresholds...
Optimized Thresholds (sorted): [0.52681053 0.88683238 2.95284778]
Optimized QWK SCORE :: 0.453


In [60]:
Submission2 = pd.DataFrame({
    'id': sample['id'],
    'sii': Submission2
})
Submission2

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1


In [73]:
sub1 = Submission1
sub2 = Submission2

# Chuẩn hóa dữ liệu
sub1 = sub1.sort_values(by='id').reset_index(drop=True).rename(columns={'sii': 'sii_1'})
sub2 = sub2.sort_values(by='id').reset_index(drop=True).rename(columns={'sii': 'sii_2'})

# Kết hợp hai tệp dựa trên cột 'id'
subs = pd.merge(sub1, sub2, on='id')

# Tạo cột 'sii_s' với trọng số động
weight_1 = 0.80  # Có thể điều chỉnh trọng số
weight_2 = 1 - weight_1
subs['sii_s'] = np.round(subs['sii_1'] * weight_1 + subs['sii_2'] * weight_2).astype(int)

# Tạo DataFrame tổng hợp
combined = pd.DataFrame({
    'id': subs['id'],
    'sii_1': subs['sii_1'],
    'sii_2': subs['sii_2'],
    'sii_s': subs['sii_s']
})

# Hàm majority vote với xử lý tie cases
def majority_vote(row):
    # Tính mode của các giá trị trong hàng
    m = mode(row, keepdims=True)  # `keepdims` để đảm bảo đầu ra có định dạng nhất quán
    mode_value = m.mode[0]  # Lấy giá trị phổ biến nhất
    count = m.count[0]  # Lấy số lần xuất hiện của mode_value

    # Xử lý tie cases
    if count == 1:  # Trường hợp tất cả giá trị chỉ xuất hiện 1 lần
        return row.mean()  # Hoặc có thể chọn chiến lược khác
    return mode_value

# Áp dụng majority vote
combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_s']].apply(majority_vote, axis=1)

# Xuất kết quả cuối cùng
final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})
if not final_submission.empty:
    final_submission.to_csv('submission.csv', index=False)
    print(f"Majority voting completed. {len(final_submission)} rows saved to 'submission.csv'.")
else:
    print("Error: Final submission is empty!")

Majority voting completed. 20 rows saved to 'submission.csv'.


In [75]:
final_submission

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,1
