In [1]:
import numpy as np
import torch
import torch.nn as nn
import numpy as np

import pandas as pd

import os

import re

from sklearn.base import clone

from sklearn.metrics import cohen_kappa_score

from sklearn.model_selection import StratifiedKFold

from scipy.optimize import minimize

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

import polars as pl

import polars.selectors as cs


In [None]:
import logging
import os

# Log file that receives the hyper-parameters and the per-fold scores of a run.
LOG_PATH = './XGBoost1.log'

# Configure the root logger, so records emitted through `logger.info(...)` and
# through the module-level `logging.info(...)` both reach the file handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Re-running this cell must not stack a second handler on the root logger:
# every message would then be written once per execution of the cell and the
# previous file handle would stay open. Remove and close any handler that
# already targets the same file before attaching a fresh one.
_log_target = os.path.abspath(LOG_PATH)
for _stale_handler in list(logger.handlers):
    if (isinstance(_stale_handler, logging.FileHandler)
            and _stale_handler.baseFilename == _log_target):
        logger.removeHandler(_stale_handler)
        _stale_handler.close()

# mode='w' truncates the log file each time the handler is created.
file_handler = logging.FileHandler(LOG_PATH, mode='w')

# Timestamp - level - message
formatter = logging.Formatter(
    fmt='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

logger.info('********** Logging initialized **********')

# 读取数据

In [2]:
# Load the tabular train and test sets of the competition.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv',
        '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv',
    )
)

# 处理数据，处理缺失值

In [3]:
# Filter rows: keep only the rows that have fewer than MAX_MISSING_PER_ROW
# missing values.
MAX_MISSING_PER_ROW = 61  # exclusive upper bound on the NaN count of a kept row

# Number of missing values per row.
sortval = train.isnull().sum(axis=1)

# Vectorised boolean mask, aligned on the index rather than on positions. The
# previous `sortval[i] for i in range(len(sortval))` looked labels up by
# position and therefore raised KeyError as soon as the index had gaps, which
# is exactly the state `train` is in after this cell has run once.
matches = sortval < MAX_MISSING_PER_ROW
train = train.loc[matches, :]


In [4]:
# Impute missing values with k-nearest-neighbours imputation.

from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.pipeline import Pipeline

N_NEIGHBORS = 5

# --- training set -----------------------------------------------------------
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)

numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train[numeric_cols])

# Rebuild a DataFrame; note that this resets the index to 0..n-1 and keeps the
# numeric columns only.
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# `sii` is one of the numeric columns, so its gaps are imputed as well and may
# come out fractional; round them back to integer class labels.
train_imputed['sii'] = train_imputed['sii'].round().astype(int)

# --- test set ---------------------------------------------------------------
test_cols = test.select_dtypes(include=['float64', 'int64']).columns

# Impute the test set with an imputer fitted on the TRAINING rows (restricted
# to the columns the test set has) instead of re-fitting on the test set
# itself: both sets are then filled from the same donors, and a test column
# that is entirely missing no longer gets dropped by the imputer (which would
# break the DataFrame reconstruction below).
if test_cols.isin(numeric_cols).all():
    test_imputer = KNNImputer(n_neighbors=N_NEIGHBORS).fit(train[test_cols])
else:
    # Some numeric test column has no numeric counterpart in train; fall back
    # to fitting on the test set so that the cell still runs.
    test_imputer = KNNImputer(n_neighbors=N_NEIGHBORS).fit(test[test_cols])
test_imputed_data = test_imputer.transform(test[test_cols])

test_imputed = pd.DataFrame(test_imputed_data, columns=test_cols)


In [5]:
# Disabled experiment: SMOTE oversampling of the imputed training set.
# from imblearn.over_sampling import SMOTE

# # Separate the features from the target
# X = train_imputed.drop(columns=['sii'])
# y = train_imputed['sii']

# # Oversample with SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Convert the oversampled data back into a DataFrame
# train_data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
# train_data_resampled['sii'] = y_resampled

In [6]:
# train_data_resampled['sii'].value_counts()
# train_imputed = train_data_resampled

In [7]:
# Spot-check the imputed PCIAT total next to the rounded `sii` label.
train_imputed.loc[:, ['PCIAT-PCIAT_Total', 'sii']]


Unnamed: 0,PCIAT-PCIAT_Total,sii
0,55.0,2
1,0.0,0
2,28.0,0
3,44.0,1
4,34.0,1
...,...,...
3259,33.0,1
3260,32.0,1
3261,22.4,0
3262,31.0,1


# 处理特征

In [8]:
def feature_engineering(df):
    """Return a copy of ``df`` extended with hand-crafted interaction features.

    The input frame is left untouched; the features are added to a copy so the
    function can be called repeatedly on the same frame without side effects.

    Original author's glossary of the BIA columns (translated):
    BMR basal metabolic rate, DEE daily energy expenditure, ECW extracellular
    water (author unsure), FFM fat-free mass, FFMI fat-free mass index,
    FMI fat mass index, Fat body-fat percentage, ICW intracellular water,
    LDM lean dry mass, LST lean soft tissue, SMM skeletal muscle mass,
    TBW total body water, Activity_Level_num activity level.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below (age, height, weight, BMI, the
        BIA measurements, the two FGC grip-strength columns and the daily
        internet hours).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 13 additional columns appended in this order:
        Internet_Hours_Age, Height-Age, BMI-Age, BMR-Fat, DEE_Weight, DEE_FMI,
        ICW_TBW, SMM_TBW, GSD-Age, Time-Age, FMI-Time, SMM-Height, DEE-ACT.

    Notes
    -----
    The three ratio features divide by measured columns, so a zero denominator
    yields ``inf`` (or ``NaN`` for 0/0); this function does not clean them up.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    out = df.copy()

    age = out['Basic_Demos-Age']
    internet_hours = out['PreInt_EduHx-computerinternet_hoursday']

    # Age interactions (the author's note: height and waist relate to age).
    out['Internet_Hours_Age'] = internet_hours * age
    out['Height-Age'] = age * out['Physical-Height']
    out['BMI-Age'] = out['Physical-BMI'] * age

    # Higher body fat may go along with a lower basal metabolic rate.
    # Despite the column name, the value is Fat / BMR.
    out['BMR-Fat'] = out['BIA-BIA_Fat'] / out['BIA-BIA_BMR']

    # Daily energy expenditure relative to weight and to the fat mass index.
    out['DEE_Weight'] = out['BIA-BIA_DEE'] / out['Physical-Weight']
    out['DEE_FMI'] = out['BIA-BIA_DEE'] / out['BIA-BIA_FMI']

    # Total body water interacts with intracellular water and with muscle mass.
    out['ICW_TBW'] = out['BIA-BIA_ICW'] * out['BIA-BIA_TBW']
    out['SMM_TBW'] = out['BIA-BIA_SMM'] * out['BIA-BIA_TBW']

    # Grip strength (the two FGC grip columns) scaled by age.
    out['GSD-Age'] = out['FGC-FGC_GSND'] * out['FGC-FGC_GSD'] * age

    # Daily computer/internet time combined with age and with fat mass.
    # 'Time-Age' holds the same product as 'Internet_Hours_Age'; both are kept
    # because both names are referenced by the feature list.
    out['Time-Age'] = age * internet_hours
    out['FMI-Time'] = out['BIA-BIA_FMI'] * internet_hours

    # The FGC *_Zone / SRL / SRR / PU / CU / TL columns each have roughly
    # 1,640-1,690 missing values (per the author's notes), so the remaining
    # features are built from columns with fewer gaps.
    out['SMM-Height'] = out['BIA-BIA_SMM'] * out['Physical-Height']
    out['DEE-ACT'] = out['BIA-BIA_DEE'] * out['BIA-BIA_Activity_Level_num']
    return out

In [9]:
# Build the model inputs for train and test.
# The parquet data is not considered here; `sii` is predicted from the tabular
# columns only.
train_df = feature_engineering(train_imputed)
test_df = feature_engineering(test_imputed)

# Columns fed to the models: raw measurements first, engineered features last.
featuresCols = [
    'Basic_Demos-Age',
    'Basic_Demos-Sex',
    'CGAS-CGAS_Score',
    'Physical-BMI',
    'Physical-Height',
    'Physical-Weight',
    'Physical-Waist_Circumference',
    'Physical-Diastolic_BP',
    'Physical-HeartRate',
    'Physical-Systolic_BP',
    'FGC-FGC_CU',
    'FGC-FGC_CU_Zone',
    'FGC-FGC_PU',
    'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL',
    'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR',
    'FGC-FGC_SRR_Zone',
    'FGC-FGC_TL',
    'FGC-FGC_TL_Zone',
    'BIA-BIA_Activity_Level_num',
    'BIA-BIA_BMC',
    'BIA-BIA_BMI',
    'BIA-BIA_BMR',
    'BIA-BIA_DEE',
    'BIA-BIA_ECW',
    'BIA-BIA_FFM',
    'BIA-BIA_FFMI',
    'BIA-BIA_FMI',
    'BIA-BIA_Fat',
    'BIA-BIA_Frame_num',
    'BIA-BIA_ICW',
    'BIA-BIA_LDM',
    'BIA-BIA_LST',
    'BIA-BIA_SMM',
    'BIA-BIA_TBW',
    'PAQ_A-PAQ_A_Total',
    'PAQ_C-PAQ_C_Total',
    'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T',
    'PreInt_EduHx-computerinternet_hoursday',
    'Internet_Hours_Age',
    'Height-Age',
    'BMI-Age',
    'BMR-Fat',
    'DEE_Weight',
    'DEE_FMI',
    'ICW_TBW',
    'SMM_TBW',
    'GSD-Age',
    'Time-Age',
    'FMI-Time',
    'SMM-Height',
    'DEE-ACT',
]

train_data = train_df[featuresCols]
test_data = test_df[featuresCols]

# Target column.
sii = train_df['sii']

# The ratio features can contain +/-inf; turn those into NaN.
train_has_inf = np.isinf(train_data).to_numpy().any()
if train_has_inf:
    train_data = train_data.replace([np.inf, -np.inf], np.nan)

def quadratic_weighted_kappa(y_true, y_pred):
    """Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0-3 using three cut points.

    A value gets the class of the first cut point it is strictly below, testing
    ``thresholds[0]``, ``thresholds[1]``, ``thresholds[2]`` in that order; a
    value below none of them (NaN included) gets class 3. The cut points are
    not required to be sorted.
    """
    values = np.asarray(oof_non_rounded)
    labels = np.full(values.shape, 3)
    # Assign from the lowest-priority cut point to the highest-priority one, so
    # that the earlier threshold wins wherever several conditions hold.
    for label in (2, 1, 0):
        labels[values < thresholds[label]] = label
    return labels

# Default cut points between the four classes (not referenced by the code
# visible in this notebook, which passes the same values literally).
optimal_thresholds = [0.5, 1.5, 2.5]


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search: the negated quadratic weighted kappa.

    The continuous predictions are discretised with ``thresholds`` and scored
    against ``y_true``; the sign is flipped so that a minimiser maximises QWK.
    """
    discretised = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discretised)
    return -kappa


# Cross-validation configuration.
n_splits = 5
SEED = 42



In [10]:
# The ratio features can contain +/-inf; count them per column and turn them
# into NaN. The count used to be computed and thrown away (it was not the last
# expression of the cell, so it was never shown) and the frame was then scanned
# a second time; it is now computed once, reused, and displayed.
inf_counts_test = test_data.isin([np.inf, -np.inf]).sum(axis=0)
if inf_counts_test.to_numpy().any():
    test_data = test_data.replace([np.inf, -np.inf], np.nan)

# Columns in which infinities were found (empty when there were none).
inf_counts_test[inf_counts_test > 0]

In [11]:
# XGBoost model: `sii` is fitted as a regression target.

from xgboost import XGBRegressor

XGB_Params = dict(
    learning_rate=0.01,
    max_depth=7,
    n_estimators=300,
    subsample=0.6,
    colsample_bytree=0.6,
    reg_alpha=0.5,
    reg_lambda=2,
    random_state=SEED,
    tree_method='gpu_hist',
)

XGB_Model = XGBRegressor(**XGB_Params)

# Record the hyper-parameters of this run in the log file.
logger.info(XGB_Params)

In [12]:
def precision(y_pred, y_true):
    """Return the fraction of positions at which ``y_pred`` equals ``y_true``.

    Despite its name this is plain accuracy (share of exact matches), and it is
    symmetric in its two arguments. Values are compared position by position.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and reference labels of identical shape (lists, NumPy arrays
        and pandas Series are accepted).

    Returns
    -------
    float
        Share of matching positions in [0, 1]; ``nan`` for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    predicted = np.asarray(y_pred)
    expected = np.asarray(y_true)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"Shape mismatch: y_pred {predicted.shape} vs y_true {expected.shape}"
        )
    if predicted.size == 0:
        # Nothing to compare; avoid the 0/0 division.
        return float('nan')
    hits = predicted == expected
    # No debug print of the full boolean vector any more: it flooded the cell
    # output once per fold.
    return hits.sum() / hits.size

In [13]:
# Sample submission; train_model() takes the `id` column of its output from it.
SAMPLE_SUBMISSION_CSV = '/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv'
submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)
def train_model(models, test_data, train, sii_res):
    """Cross-validate a regressor on ``sii`` and build a tuned submission.

    For each stratified fold a fresh clone of ``models`` is fitted, scored with
    the quadratic weighted kappa (QWK) on its train and validation part, and
    used to predict ``test_data``. The out-of-fold predictions are then used to
    tune the three class cut points with Nelder-Mead, and the tuned cut points
    are applied to the fold-averaged test predictions.

    Relies on the module-level names ``n_splits``, ``SEED``, ``submission``,
    ``precision``, ``threshold_Rounder``, ``quadratic_weighted_kappa`` and
    ``evaluate_predictions``.

    Parameters
    ----------
    models : estimator
        Unfitted scikit-learn compatible regressor; it is cloned per fold.
    test_data : pandas.DataFrame
        Test features, with the same columns as ``train``.
    train : pandas.DataFrame
        Training features (indexed positionally through ``.iloc``).
    sii_res : pandas.Series
        Integer target, aligned row by row with ``train``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (taken from the global ``submission`` frame) and ``sii``
        (tuned class predictions for ``test_data``).

    Raises
    ------
    AssertionError
        If the threshold optimisation does not report success.
    """
    # Imported locally so that the function does not depend on some other cell
    # having been executed first (`clear_output` used to be imported only in
    # the cell that calls this function).
    import logging
    from IPython.display import clear_output

    X = train
    y = sii_res

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    # Out-of-fold predictions (continuous) and one test prediction per fold.
    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(models)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        # Per-fold scores use plain rounding; the tuned cut points are only
        # available after all folds have been predicted.
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        # Share of exactly matching labels (arguments in signature order).
        y_val_pre = precision(y_val_pred_rounded, y_val)
        test_preds[:, fold] = model.predict(test_data)
        logging.info(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}, Validation Precision: {y_val_pre:.4f}")
        clear_output(wait=True)
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Tune the class cut points on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE ::  {tKappa:.3f}")
    # Average the per-fold test predictions, then discretise with the tuned
    # cut points.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    submission_df = pd.DataFrame({
        'id': submission['id'],
        'sii': tpTuned
    })

    return submission_df

In [14]:
from IPython.display import clear_output

# Run 1: cross-validation and threshold tuning with the XGBoost regressor alone.
Submission1 = train_model(
    models=XGB_Model,
    test_data=test_data,
    train=train_data,
    sii_res=sii,
)

Training Folds: 100%|██████████| 5/5 [00:05<00:00,  1.19s/it]

Mean Train QWK --> 0.8030
Mean Validation QWK ---> 0.4814
----> || Optimized QWK SCORE ::  0.546





In [15]:
# Predictions of the XGBoost-only run.
Submission1

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,2
9,0083e397,1


In [16]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor

# LightGBM hyper-parameters.
Params = dict(
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,     # increased from 6.59
    lambda_l2=0.01,   # increased from 2.68e-06
    device='cpu',
)

# CatBoost hyper-parameters.
CatBoost_Params = dict(
    learning_rate=0.05,
    depth=6,
    iterations=200,
    random_seed=SEED,
    verbose=0,
    l2_leaf_reg=10,   # regularisation knob the author suggests increasing
    task_type='GPU',
)

Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
# XGB_Model is reused from the XGBoost cell rather than re-created here.
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Weighted average of the three boosters; the weights were tuned by hand.
# A TabNet member is currently disabled.
ensemble_members = [
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
]
ensemble_weights = [3.0, 6.0, 6.0]
voting_model = VotingRegressor(estimators=ensemble_members, weights=ensemble_weights)

# Run 2: the voting ensemble.
Submission2 = train_model(voting_model, test_data, train_data, sii)

Training Folds: 100%|██████████| 5/5 [00:17<00:00,  3.58s/it]


Mean Train QWK --> 0.7408
Mean Validation QWK ---> 0.4890
----> || Optimized QWK SCORE ::  0.557


In [17]:
# Submission1.to_csv('submission.csv', index=False)
# Show the predictions of the XGBoost-only run again (the export is disabled here).
Submission1

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,2
9,0083e397,1


In [18]:
# Run 3: LightGBM on its own, for comparison with the other runs.
Submission3 = train_model(
    models=Light,
    test_data=test_data,
    train=train_data,
    sii_res=sii,
)


Training Folds: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s]

Mean Train QWK --> 0.8069
Mean Validation QWK ---> 0.4885
----> || Optimized QWK SCORE ::  0.551





In [19]:
# Write the predictions of the XGBoost-only run as the submission file.
# NOTE(review): the recorded outputs show a higher tuned out-of-fold QWK for the
# voting ensemble (0.557) and for LightGBM (0.551) than for this run (0.546) —
# confirm that Submission1 is the intended export.
Submission1.to_csv('submission.csv', index=False)

In [20]:
# Predictions of the LightGBM-only run.
Submission3

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,2
9,0083e397,1


In [21]:
# Predictions of the XGBoost-only run, shown for side-by-side comparison.
Submission1

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,2
9,0083e397,1


In [22]:
# Predictions of the voting-ensemble run.
Submission2

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,2
9,0083e397,1
