In [1]:
import sys
import pandas as pd
import numpy as np

# Dodaj src do sys.path, by importować własne moduły
sys.path.append('../src')

from experiment_logger import log_experiment

In [2]:
TRAIN_PATH = '../../playground-series-s5e7/train.csv'
TEST_PATH = '../../playground-series-s5e7/test.csv'

train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

print('Train shape:', train_data.shape)
print('Test shape:', test_data.shape)
train_data.head()

Train shape: (18524, 9)
Test shape: (6175, 8)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [3]:
train_data.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [4]:
test_data.isna().sum()

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

In [5]:
train_data.info()
train_data.describe()
train_data['Personality'].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


Personality
Extrovert    0.739527
Introvert    0.260473
Name: proportion, dtype: float64

In [6]:
train_data.drop(columns=['id'], inplace=True)
test_data.drop(columns=['id'], inplace=True)

In [7]:
# Załóżmy, że usunąłeś już kolumnę 'id' z train i test
from data_utils import split_numerical_categorical
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Zakładam, że X_train to Twój DataFrame z cechami (bez targetu i id)
target = train_data['Personality']
train_data.drop(columns=['Personality'], inplace=True)
numerical_cols, categorical_cols = split_numerical_categorical(train_data)

print("Zmienne numeryczne:", numerical_cols)
print("Zmienne kategoryczne:", categorical_cols)

# Łączymy train i test, by mieć spójny encoding i imputację
full = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Imputacja numeryczna
imputer = IterativeImputer(random_state=42)
full[numerical_cols] = imputer.fit_transform(full[numerical_cols])

# Imputacja kategoryczna - wypełnienie braków 'Missing'
for col in categorical_cols:
    full[col] = full[col].fillna('Missing')

print("Kolumny w full:", full.columns.tolist())
print("Kolumny kategorialne:", categorical_cols)

# One-hot encoding
full_encoded = pd.get_dummies(full, columns=categorical_cols)

# Rozdzielamy z powrotem
X_train = full_encoded.iloc[:len(train_data)]
X_test = full_encoded.iloc[len(train_data):]
y_train = target

Zmienne numeryczne: ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
Zmienne kategoryczne: ['Stage_fear', 'Drained_after_socializing']
Kolumny w full: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']
Kolumny kategorialne: ['Stage_fear', 'Drained_after_socializing']


In [8]:
X_train.head()

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_Missing,Stage_fear_No,Stage_fear_Yes,Drained_after_socializing_Missing,Drained_after_socializing_No,Drained_after_socializing_Yes
0,0.0,6.0,4.0,15.0,5.0,False,True,False,False,True,False
1,1.0,7.0,3.0,10.0,8.0,False,True,False,False,True,False
2,6.0,1.0,0.0,3.0,0.0,False,False,True,True,False,False
3,3.0,7.0,3.0,11.0,5.0,False,True,False,False,True,False
4,1.0,4.0,4.0,13.0,5.708436,False,True,False,False,True,False


In [9]:
print(X_train.shape)
print(X_test.shape)

(18524, 11)
(6175, 11)


In [10]:
print(X_test.dtypes)
print(X_train.dtypes)
print(X_test.head())

Time_spent_Alone                     float64
Social_event_attendance              float64
Going_outside                        float64
Friends_circle_size                  float64
Post_frequency                       float64
Stage_fear_Missing                      bool
Stage_fear_No                           bool
Stage_fear_Yes                          bool
Drained_after_socializing_Missing       bool
Drained_after_socializing_No            bool
Drained_after_socializing_Yes           bool
dtype: object
Time_spent_Alone                     float64
Social_event_attendance              float64
Going_outside                        float64
Friends_circle_size                  float64
Post_frequency                       float64
Stage_fear_Missing                      bool
Stage_fear_No                           bool
Stage_fear_Yes                          bool
Drained_after_socializing_Missing       bool
Drained_after_socializing_No            bool
Drained_after_socializing_Yes           b

In [11]:
# Zakoduj target na liczby
y_train = y_train.map({'Extrovert': 0, 'Introvert': 1})

# Sprawdź, czy wszystko jest OK
print("Unikalne wartości y_train:", y_train.unique())
print("Typ y_train:", y_train.dtype)

Unikalne wartości y_train: [0 1]
Typ y_train: int64


In [12]:
# %pip install optuna
# 1. Importy
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
import lightgbm as lgb

# 2. Optuna - optymalizacja hiperparametrów
def objective_lgbm(trial):
    param_lgbm = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 15, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'n_jobs': -1
    }
    skf_lgbm = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores_lgbm = []
    for train_idx, val_idx in skf_lgbm.split(X_train, y_train):
        X_tr_lgbm, X_val_lgbm = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr_lgbm, y_val_lgbm = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model_lgbm = lgb.LGBMClassifier(**param_lgbm)
        model_lgbm.fit(
            X_tr_lgbm, y_tr_lgbm,
            eval_set=[(X_val_lgbm, y_val_lgbm)],
            callbacks=[lgb.early_stopping(50, verbose=False)]
        )
        val_pred_lgbm = model_lgbm.predict(X_val_lgbm)
        score_lgbm = accuracy_score(y_val_lgbm, val_pred_lgbm)
        scores_lgbm.append(score_lgbm)
    return np.mean(scores_lgbm)

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=30)  # Możesz zwiększyć n_trials dla lepszych wyników

print("Najlepsze parametry:", study_lgbm.best_params)

# 3. Finalny model z OOF predictions i predykcjami na testach
best_params_lgbm = study_lgbm.best_params
best_params_lgbm.update({'random_state': 42, 'n_jobs': -1})

skf_lgbm = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_lgbm = []
test_preds_lgbm = np.zeros((len(X_test), skf_lgbm.n_splits))
oof_preds_lgbm = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(skf_lgbm.split(X_train, y_train)):
    X_tr_lgbm, X_val_lgbm = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_lgbm, y_val_lgbm = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model_lgbm = lgb.LGBMClassifier(**best_params_lgbm)
    model_lgbm.fit(
        X_tr_lgbm, y_tr_lgbm,
        eval_set=[(X_val_lgbm, y_val_lgbm)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    
    val_pred_lgbm = model_lgbm.predict(X_val_lgbm)
    score_lgbm = accuracy_score(y_val_lgbm, val_pred_lgbm)
    scores_lgbm.append(score_lgbm)
    
    # OOF predictions
    oof_preds_lgbm[val_idx] = model_lgbm.predict_proba(X_val_lgbm)[:, 1]
    # Test predictions
    test_preds_lgbm[:, fold] = model_lgbm.predict_proba(X_test)[:, 1]

# OOF accuracy
oof_binary_lgbm = (oof_preds_lgbm > 0.5).astype(int)
print("OOF accuracy:", accuracy_score(y_train, oof_binary_lgbm))

# Uśrednienie predykcji po foldach
mean_preds_lgbm = test_preds_lgbm.mean(axis=1)
final_test_pred_lgbm = (mean_preds_lgbm > 0.5).astype(int)
print(f'Fold accuracy: {scores_lgbm}')
print(f'Mean CV accuracy: {np.mean(scores_lgbm):.4f}')

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-02 13:43:04,712] A new study created in memory with name: no-name-e2bc083a-612e-4c3b-b70a-4c3d3dc02efe


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:05,532] Trial 0 finished with value: 0.9690672519477795 and parameters: {'n_estimators': 703, 'learning_rate': 0.10183482584463473, 'max_depth': 6, 'num_leaves': 80, 'subsample': 0.6286135418109621, 'colsample_bytree': 0.6540010077841952, 'reg_alpha': 2.2838689396091274, 'reg_lambda': 2.1432016772912483}. Best is trial 0 with value: 0.9690672519477795.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:06,911] Trial 1 finished with value: 0.9691751995872719 and parameters: {'n_estimators': 270, 'learning_rate': 0.0444248346153754, 'max_depth': 6, 'num_leaves': 70, 'subsample': 0.6010849932114016, 'colsample_bytree': 0.6154368434073646, 'reg_alpha': 2.7665313998173735, 'reg_lambda': 3.302995817934411}. Best is trial 1 with value: 0.9691751995872719.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000800 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:10,298] Trial 2 finished with value: 0.9689053086279416 and parameters: {'n_estimators': 824, 'learning_rate': 0.01585623516712674, 'max_depth': 9, 'num_leaves': 82, 'subsample': 0.6962850554732334, 'colsample_bytree': 0.6823710120868378, 'reg_alpha': 0.15976127117372396, 'reg_lambda': 3.041450764458562}. Best is trial 1 with value: 0.9691751995872719.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:10,615] Trial 3 finished with value: 0.9691752141610049 and parameters: {'n_estimators': 512, 'learning_rate': 0.24038811738618754, 'max_depth': 4, 'num_leaves': 67, 'subsample': 0.7943480910695567, 'colsample_bytree': 0.674179057090984, 'reg_alpha': 4.768122425434357, 'reg_lambda': 2.9657361008415553}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:11,112] Trial 4 finished with value: 0.9690672956689779 and parameters: {'n_estimators': 925, 'learning_rate': 0.2661365920914841, 'max_depth': 6, 'num_leaves': 34, 'subsample': 0.9148666324967707, 'colsample_bytree': 0.9908401569560834, 'reg_alpha': 1.9748732969011995, 'reg_lambda': 1.369169692657749}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000596 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000338 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:11,480] Trial 5 finished with value: 0.9689053232016743 and parameters: {'n_estimators': 1098, 'learning_rate': 0.28740922983555206, 'max_depth': 7, 'num_leaves': 96, 'subsample': 0.6829968549977972, 'colsample_bytree': 0.9554456402757194, 'reg_alpha': 3.806853316784008, 'reg_lambda': 2.259911340893756}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000352 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:11,786] Trial 6 finished with value: 0.9691752141610047 and parameters: {'n_estimators': 502, 'learning_rate': 0.251513553449973, 'max_depth': 5, 'num_leaves': 29, 'subsample': 0.9819536116079653, 'colsample_bytree': 0.7890807440960969, 'reg_alpha': 4.944652916219237, 'reg_lambda': 1.1680804604962236}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:12,818] Trial 7 finished with value: 0.9690672519477793 and parameters: {'n_estimators': 255, 'learning_rate': 0.1702423513563405, 'max_depth': 11, 'num_leaves': 96, 'subsample': 0.7451245700118247, 'colsample_bytree': 0.7817260026076821, 'reg_alpha': 2.042482738161361, 'reg_lambda': 3.4174300312860932}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:13,137] Trial 8 finished with value: 0.9688513275213287 and parameters: {'n_estimators': 987, 'learning_rate': 0.20880179420208342, 'max_depth': 3, 'num_leaves': 96, 'subsample': 0.7507506048908374, 'colsample_bytree': 0.7433780783041757, 'reg_alpha': 1.1529608632598098, 'reg_lambda': 4.657825840054342}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:14,380] Trial 9 finished with value: 0.9691212184806591 and parameters: {'n_estimators': 1049, 'learning_rate': 0.023726080847212807, 'max_depth': 3, 'num_leaves': 24, 'subsample': 0.9909817298969289, 'colsample_bytree': 0.624534346414655, 'reg_alpha': 2.3995846937225975, 'reg_lambda': 4.464272570024497}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:14,916] Trial 10 finished with value: 0.9690672373740465 and parameters: {'n_estimators': 521, 'learning_rate': 0.13342582242102444, 'max_depth': 9, 'num_leaves': 50, 'subsample': 0.8532917598478825, 'colsample_bytree': 0.8985208781912523, 'reg_alpha': 4.56622197159013, 'reg_lambda': 0.6689296007531487}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:15,251] Trial 11 finished with value: 0.9691212476281251 and parameters: {'n_estimators': 523, 'learning_rate': 0.23061454444084475, 'max_depth': 4, 'num_leaves': 49, 'subsample': 0.8432395720300718, 'colsample_bytree': 0.8494246700112871, 'reg_alpha': 4.824472783012852, 'reg_lambda': 0.20738258101957896}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:15,619] Trial 12 finished with value: 0.9689592897345541 and parameters: {'n_estimators': 497, 'learning_rate': 0.2248994295147744, 'max_depth': 5, 'num_leaves': 15, 'subsample': 0.9847588132565789, 'colsample_bytree': 0.7281704797565374, 'reg_alpha': 3.8478146017127286, 'reg_lambda': 1.3442359212486013}. Best is trial 3 with value: 0.9691752141610049.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:15,980] Trial 13 finished with value: 0.9691752433084705 and parameters: {'n_estimators': 411, 'learning_rate': 0.2945387040221823, 'max_depth': 4, 'num_leaves': 65, 'subsample': 0.879790569661925, 'colsample_bytree': 0.8177208897091142, 'reg_alpha': 3.902866506356431, 'reg_lambda': 1.401640155531563}. Best is trial 13 with value: 0.9691752433084705.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000331 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:16,334] Trial 14 finished with value: 0.9691752433084705 and parameters: {'n_estimators': 334, 'learning_rate': 0.299169982272435, 'max_depth': 4, 'num_leaves': 64, 'subsample': 0.8859149501970218, 'colsample_bytree': 0.8605192876728204, 'reg_alpha': 3.8444242326100873, 'reg_lambda': 3.902412096766657}. Best is trial 13 with value: 0.9691752433084705.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000891 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:16,950] Trial 15 finished with value: 0.9690672519477793 and parameters: {'n_estimators': 350, 'learning_rate': 0.29631767006193943, 'max_depth': 8, 'num_leaves': 59, 'subsample': 0.9088711516652039, 'colsample_bytree': 0.8579057793476967, 'reg_alpha': 3.3978156184026926, 'reg_lambda': 4.09159984373496}. Best is trial 13 with value: 0.9691752433084705.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:17,573] Trial 16 finished with value: 0.9690672519477793 and parameters: {'n_estimators': 388, 'learning_rate': 0.18013890424920612, 'max_depth': 12, 'num_leaves': 44, 'subsample': 0.9071108996373825, 'colsample_bytree': 0.8580335519934691, 'reg_alpha': 3.5018734224744534, 'reg_lambda': 1.9039206928762678}. Best is trial 13 with value: 0.9691752433084705.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:17,965] Trial 17 finished with value: 0.9690132999886323 and parameters: {'n_estimators': 667, 'learning_rate': 0.299256109878422, 'max_depth': 4, 'num_leaves': 64, 'subsample': 0.8513787075369741, 'colsample_bytree': 0.9106207594925727, 'reg_alpha': 3.0778533400359427, 'reg_lambda': 3.9004092071467804}. Best is trial 13 with value: 0.9691752433084705.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:18,450] Trial 18 finished with value: 0.9690672665215121 and parameters: {'n_estimators': 210, 'learning_rate': 0.19917127610240973, 'max_depth': 3, 'num_leaves': 81, 'subsample': 0.9314519843397167, 'colsample_bytree': 0.821278749826788, 'reg_alpha': 4.166845952211542, 'reg_lambda': 2.645358043656425}. Best is trial 13 with value: 0.9691752433084705.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:19,008] Trial 19 finished with value: 0.9692831909479629 and parameters: {'n_estimators': 364, 'learning_rate': 0.1287779922221426, 'max_depth': 5, 'num_leaves': 74, 'subsample': 0.8098541209846801, 'colsample_bytree': 0.8996989933193626, 'reg_alpha': 4.253307382871711, 'reg_lambda': 3.8031838213061064}. Best is trial 19 with value: 0.9692831909479629.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:19,898] Trial 20 finished with value: 0.9690672810952451 and parameters: {'n_estimators': 649, 'learning_rate': 0.08956206666736946, 'max_depth': 7, 'num_leaves': 75, 'subsample': 0.8119291198528876, 'colsample_bytree': 0.9198184438997709, 'reg_alpha': 1.447174691952386, 'reg_lambda': 4.901238824347949}. Best is trial 19 with value: 0.9692831909479629.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:20,475] Trial 21 finished with value: 0.9690132562674337 and parameters: {'n_estimators': 366, 'learning_rate': 0.15093549209613488, 'max_depth': 5, 'num_leaves': 58, 'subsample': 0.8788579825151599, 'colsample_bytree': 0.8886571213143784, 'reg_alpha': 4.3198922291561725, 'reg_lambda': 3.9540851998019653}. Best is trial 19 with value: 0.9692831909479629.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:21,449] Trial 22 finished with value: 0.9693371720545757 and parameters: {'n_estimators': 415, 'learning_rate': 0.1216822318659058, 'max_depth': 4, 'num_leaves': 88, 'subsample': 0.8023280459810286, 'colsample_bytree': 0.8238537743163122, 'reg_alpha': 3.8134414593044705, 'reg_lambda': 3.5075763900228996}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:22,078] Trial 23 finished with value: 0.9691752141610047 and parameters: {'n_estimators': 441, 'learning_rate': 0.11477607524666528, 'max_depth': 5, 'num_leaves': 89, 'subsample': 0.7835368688918118, 'colsample_bytree': 0.8177557600886147, 'reg_alpha': 3.086138189720479, 'reg_lambda': 3.3644792113318447}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10960
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:22,716] Trial 24 finished with value: 0.9692292098413503 and parameters: {'n_estimators': 605, 'learning_rate': 0.0705904207593976, 'max_depth': 3, 'num_leaves': 87, 'subsample': 0.814146059351557, 'colsample_bytree': 0.7625624840763041, 'reg_alpha': 4.261712274903316, 'reg_lambda': 1.6993909030586372}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:23,359] Trial 25 finished with value: 0.9691752287347377 and parameters: {'n_estimators': 599, 'learning_rate': 0.07613870765539071, 'max_depth': 3, 'num_leaves': 89, 'subsample': 0.7532361921170425, 'colsample_bytree': 0.7419816842543856, 'reg_alpha': 4.392190022641118, 'reg_lambda': 2.5967440252142806}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of data points in the train set: 14820, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260459 -> initscore=-1.043585
[LightGBM] [Info] Start training from score -1.043585
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [In

[I 2025-07-02 13:43:24,045] Trial 26 finished with value: 0.9691752141610047 and parameters: {'n_estimators': 789, 'learning_rate': 0.06262676653560675, 'max_depth': 3, 'num_leaves': 87, 'subsample': 0.8154826195686904, 'colsample_bytree': 0.7601867076453421, 'reg_alpha': 3.4948279186545754, 'reg_lambda': 1.7332525466188664}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [b

[I 2025-07-02 13:43:24,728] Trial 27 finished with value: 0.969013241693701 and parameters: {'n_estimators': 589, 'learning_rate': 0.12740807393296286, 'max_depth': 6, 'num_leaves': 75, 'subsample': 0.7139133625489544, 'colsample_bytree': 0.7057797748512534, 'reg_alpha': 4.181140564699154, 'reg_lambda': 3.599367349023177}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[Lig

[I 2025-07-02 13:43:25,619] Trial 28 finished with value: 0.9690672519477793 and parameters: {'n_estimators': 303, 'learning_rate': 0.05864977071625574, 'max_depth': 5, 'num_leaves': 99, 'subsample': 0.7727082538670845, 'colsample_bytree': 0.9458198052170034, 'reg_alpha': 2.9403918080435125, 'reg_lambda': 4.282483854168188}. Best is trial 22 with value: 0.9693371720545757.


[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [In

[I 2025-07-02 13:43:26,381] Trial 29 finished with value: 0.9691752141610047 and parameters: {'n_estimators': 761, 'learning_rate': 0.10103242828882125, 'max_depth': 7, 'num_leaves': 76, 'subsample': 0.6648155003117376, 'colsample_bytree': 0.772007691876227, 'reg_alpha': 3.498504560079062, 'reg_lambda': 2.2363099303744374}. Best is trial 22 with value: 0.9693371720545757.


Najlepsze parametry: {'n_estimators': 415, 'learning_rate': 0.1216822318659058, 'max_depth': 4, 'num_leaves': 88, 'subsample': 0.8023280459810286, 'colsample_bytree': 0.8238537743163122, 'reg_alpha': 3.8134414593044705, 'reg_lambda': 3.5075763900228996}
[LightGBM] [Info] Number of positive: 4342, number of negative: 12329
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1287
[LightGBM] [Info] Number of data points in the train set: 16671, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260452 -> initscore=-1.043619
[LightGBM] [Info] Start training from score -1.043619
[LightGBM] [Info] Number of positive: 4342, number of negative: 12329
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `fo

In [13]:
# Jeśli nie masz catboost lub optuna, odkomentuj poniższą linię:
# %pip install catboost optuna

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
from catboost import CatBoostClassifier

# Jeśli masz zmienne kategoryczne, podaj ich indeksy lub nazwy:
# cat_features_cat = [lista_indeksów_lub_nazw_kolumn]
cat_features_cat = []  # <- jeśli nie masz, zostaw pustą listę

# 1. Optuna - optymalizacja hiperparametrów
def objective_cat(trial):
    param_cat = {
        'iterations': trial.suggest_int('iterations', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_seed': 42,
        'verbose': 0,
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'cat_features': cat_features_cat
    }
    skf_cat = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores_cat = []
    for train_idx, val_idx in skf_cat.split(X_train, y_train):
        X_tr_cat, X_val_cat = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr_cat, y_val_cat = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model_cat = CatBoostClassifier(**param_cat)
        model_cat.fit(X_tr_cat, y_tr_cat, eval_set=(X_val_cat, y_val_cat), early_stopping_rounds=50, use_best_model=True)
        val_pred_cat = model_cat.predict(X_val_cat)
        score_cat = accuracy_score(y_val_cat, val_pred_cat)
        scores_cat.append(score_cat)
    return np.mean(scores_cat)

study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=30)  # Możesz zwiększyć n_trials dla lepszych wyników

print("Najlepsze parametry:", study_cat.best_params)

# 2. Finalny model z OOF predictions i predykcjami na testach
best_params_cat = study_cat.best_params
best_params_cat.update({'random_seed': 42, 'verbose': 0, 'loss_function': 'Logloss', 'eval_metric': 'Accuracy', 'cat_features': cat_features_cat})

skf_cat = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_cat = []
test_preds_cat = np.zeros((len(X_test), skf_cat.n_splits))
oof_preds_cat = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(skf_cat.split(X_train, y_train)):
    X_tr_cat, X_val_cat = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_cat, y_val_cat = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model_cat = CatBoostClassifier(**best_params_cat)
    model_cat.fit(X_tr_cat, y_tr_cat, eval_set=(X_val_cat, y_val_cat), early_stopping_rounds=50, use_best_model=True)
    
    val_pred_cat = model_cat.predict(X_val_cat)
    score_cat = accuracy_score(y_val_cat, val_pred_cat)
    scores_cat.append(score_cat)
    
    # OOF predictions
    oof_preds_cat[val_idx] = model_cat.predict_proba(X_val_cat)[:, 1]
    # Test predictions
    test_preds_cat[:, fold] = model_cat.predict_proba(X_test)[:, 1]

# OOF accuracy
oof_binary_cat = (oof_preds_cat > 0.5).astype(int)
print("OOF accuracy:", accuracy_score(y_train, oof_binary_cat))

# Uśrednienie predykcji po foldach
mean_preds_cat = test_preds_cat.mean(axis=1)
final_test_pred_cat = (mean_preds_cat > 0.5).astype(int)
print(f'Fold accuracy: {scores_cat}')
print(f'Mean CV accuracy: {np.mean(scores_cat):.4f}')

[I 2025-07-02 13:43:41,258] A new study created in memory with name: no-name-971f4513-9f87-4803-b3f5-4a0a850b1851
[I 2025-07-02 13:43:49,158] Trial 0 finished with value: 0.9694451634152669 and parameters: {'iterations': 247, 'learning_rate': 0.1605756451064026, 'depth': 10, 'l2_leaf_reg': 3.8481888766505934, 'random_strength': 0.9606699731184738, 'bagging_temperature': 0.25777813001248684, 'border_count': 75}. Best is trial 0 with value: 0.9694451634152669.
[I 2025-07-02 13:43:51,463] Trial 1 finished with value: 0.9693911677349213 and parameters: {'iterations': 1139, 'learning_rate': 0.12670445411595052, 'depth': 5, 'l2_leaf_reg': 4.615349805214294, 'random_strength': 0.3311465085568036, 'bagging_temperature': 0.8605488644931247, 'border_count': 32}. Best is trial 0 with value: 0.9694451634152669.
[I 2025-07-02 13:43:53,583] Trial 2 finished with value: 0.9691212330543921 and parameters: {'iterations': 356, 'learning_rate': 0.02233205785944426, 'depth': 5, 'l2_leaf_reg': 5.4779936855

Najlepsze parametry: {'iterations': 804, 'learning_rate': 0.24450699913516766, 'depth': 10, 'l2_leaf_reg': 9.411552757973391, 'random_strength': 0.34101041449786185, 'bagging_temperature': 0.23371379172763707, 'border_count': 53}
OOF accuracy: 0.9694990282876269
Fold accuracy: [0.967620075553157, 0.9719373988127361, 0.9730167296276309, 0.9638424177010254, 0.9654427645788337, 0.9676025917926566, 0.9719222462203023, 0.9703023758099352, 0.9713822894168467, 0.9719222462203023]
Mean CV accuracy: 0.9695


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
import xgboost as xgb

# 1. Optuna - optymalizacja hiperparametrów
def objective_xgb(trial):
    param_xgb = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'early_stopping_rounds': 50
    }
    skf_xgb = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores_xgb = []
    for train_idx, val_idx in skf_xgb.split(X_train, y_train):
        X_tr_xgb, X_val_xgb = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr_xgb, y_val_xgb = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model_xgb = xgb.XGBClassifier(**param_xgb)
        model_xgb.fit(X_tr_xgb, y_tr_xgb,
                      eval_set=[(X_val_xgb, y_val_xgb)],
                      verbose=False)
        scores_xgb.append(accuracy_score(y_val_xgb, model_xgb.predict(X_val_xgb)))
    return np.mean(scores_xgb)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)
print("Najlepsze parametry:", study_xgb.best_params)

best_params_xgb = study_xgb.best_params
best_params_xgb.update({
    'random_state': 42,
    'n_jobs': -1,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'early_stopping_rounds': 50
})

skf_xgb = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_xgb = []
test_preds_xgb = np.zeros((len(X_test), skf_xgb.n_splits))
oof_preds_xgb = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(skf_xgb.split(X_train, y_train)):
    X_tr_xgb, X_val_xgb = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_xgb, y_val_xgb = y_train.iloc[train_idx], y_train.iloc[val_idx]
    model_xgb = xgb.XGBClassifier(**best_params_xgb)
    model_xgb.fit(X_tr_xgb, y_tr_xgb,
                  eval_set=[(X_val_xgb, y_val_xgb)],
                  verbose=False)
    scores_xgb.append(accuracy_score(y_val_xgb, model_xgb.predict(X_val_xgb)))
    oof_preds_xgb[val_idx] = model_xgb.predict_proba(X_val_xgb)[:, 1]
    test_preds_xgb[:, fold] = model_xgb.predict_proba(X_test)[:, 1]

oof_binary_xgb = (oof_preds_xgb > 0.5).astype(int)
print("OOF accuracy:", accuracy_score(y_train, oof_binary_xgb))
mean_score_xgb = np.mean(scores_xgb)
print("Fold accuracy:", scores_xgb)
print(f"Mean CV accuracy: {mean_score_xgb:.4f}")

mean_preds_xgb = test_preds_xgb.mean(axis=1)
final_test_pred_xgb = (mean_preds_xgb > 0.5).astype(int)

[I 2025-07-02 13:45:56,738] A new study created in memory with name: no-name-0c2fc379-1c79-4e44-b924-d4058774229a


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
[I 2025-07-02 13:45:58,781] Trial 0 finished with value: 0.9689592897345541 and parameters: {'n_estimators': 1074, 'learning_rate': 0.04984689273142407, 'max_depth': 5, 'subsample': 0.7622939423814932, 'colsample_bytree': 0.7356948225983225, 'gamma': 2.0468399478144113, 'reg_alpha': 1.860509847278693, 'reg_lambda': 0.5902913950769562}. Best is trial 0 with value: 0.9689592897345541.
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Pa

Najlepsze parametry: {'n_estimators': 839, 'learning_rate': 0.24816900099301387, 'max_depth': 8, 'subsample': 0.6838681298252383, 'colsample_bytree': 0.6502983264483474, 'gamma': 4.992770692624027, 'reg_alpha': 1.168069559577995, 'reg_lambda': 2.7862920179654886}


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


OOF accuracy: 0.969067156121788
Fold accuracy: [0.9681597409606044, 0.9719373988127361, 0.9724770642201835, 0.963302752293578, 0.964902807775378, 0.9659827213822895, 0.9713822894168467, 0.9708423326133909, 0.9703023758099352, 0.9713822894168467]
Mean CV accuracy: 0.9691


In [15]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# OOF predictions ensemble (średnia po modelach)
oof_preds_ensemble = (oof_preds_lgbm + oof_preds_cat + oof_preds_xgb) / 3
oof_binary_ensemble = (oof_preds_ensemble > 0.5).astype(int)
oof_accuracy_ensemble = accuracy_score(y_train, oof_binary_ensemble)
print("OOF accuracy (ensemble):", oof_accuracy_ensemble)

# CV accuracy ensemble (accuracy na każdym foldzie)
skf_ensemble = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores_ensemble = []

for fold, (train_idx, val_idx) in enumerate(skf_ensemble.split(X_train, y_train)):
    oof_fold = (oof_preds_lgbm[val_idx] + oof_preds_cat[val_idx] + oof_preds_xgb[val_idx]) / 3
    oof_fold_binary = (oof_fold > 0.5).astype(int)
    acc = accuracy_score(np.array(y_train)[val_idx], oof_fold_binary)
    cv_scores_ensemble.append(acc)

mean_cv_accuracy_ensemble = np.mean(cv_scores_ensemble)
print(f"Fold accuracy (ensemble): {cv_scores_ensemble}")
print(f"Mean CV accuracy (ensemble): {mean_cv_accuracy_ensemble:.4f}")

# Test predictions ensemble (średnia po modelach i foldach)
mean_preds_lgbm = test_preds_lgbm.mean(axis=1)
mean_preds_cat = test_preds_cat.mean(axis=1)
mean_preds_xgb = test_preds_xgb.mean(axis=1)
ensemble_test = (mean_preds_lgbm + mean_preds_cat + mean_preds_xgb) / 3
final_ensemble_pred = (ensemble_test > 0.5).astype(int)

OOF accuracy (ensemble): 0.9691751241632477
Fold accuracy (ensemble): [0.967620075553157, 0.9719373988127361, 0.9724770642201835, 0.963302752293578, 0.9654427645788337, 0.9670626349892009, 0.9713822894168467, 0.9708423326133909, 0.9703023758099352, 0.9713822894168467]
Mean CV accuracy (ensemble): 0.9692


In [16]:
import sys
sys.path.append('../src')

from experiment_logger import log_experiment

params = {
    'models': ['LGBMClassifier', 'CatBoostClassifier', 'XGBClassifier'],
    'ensemble_type': 'mean_probability',
    'n_splits': 10
}

log_experiment(
    experiment_name='ensemble_lgbm_cat_xgb_mean',
    model_name='EnsembleMean',
    params=params,
    cv_score=mean_cv_accuracy_ensemble,  # mean CV accuracy ensemble
    comment='Ensemble średnia prawdopodobieństw z LGBM, CatBoost, XGBoost; 10-fold CV; OOF accuracy logowane; imputacja numeryczna i kategoryczna (jeśli była)',
    oof_accuracy=oof_accuracy_ensemble   # OOF accuracy ensemble
)
print('Eksperyment ensemble został zalogowany!')

Eksperyment ensemble został zalogowany!


In [None]:
import os
# Wczytaj sample_submission, aby pobrać wymagane kolumny i kolejność
sample_submission = pd.read_csv('../../playground-series-s5e7/sample_submission.csv')

# Zakładam, że predykcje ensemble są w zmiennej final_ensemble_pred (np. jako liczby lub kategorie)
if set(np.unique(final_ensemble_pred)) == {0, 1}:
    label_map = {0: 'Extrovert', 1: 'Introvert'}
    final_ensemble_pred_labels = pd.Series(final_ensemble_pred).map(label_map).values
else:
    final_ensemble_pred_labels = final_ensemble_pred

submission = sample_submission.copy()
target_col = submission.columns[1]
submission[target_col] = final_ensemble_pred_labels

# Automatyczne nadawanie nazwy pliku
output_dir = '../outputs'
existing = [f for f in os.listdir(output_dir) if f.startswith('submission') and f.endswith('.csv')]
if 'submission.csv' in existing:
    # Szukamy submissionN.csv
    nums = [int(f.replace('submission', '').replace('.csv', '')) for f in existing if f != 'submission.csv' and f.replace('submission', '').replace('.csv', '').isdigit()]
    n = max(nums) if nums else 1
    new_name = f'submission{n+1}.csv'
else:
    new_name = 'submission.csv'

output_path = os.path.join(output_dir, new_name)
submission.to_csv(output_path, index=False)
print(f'Plik submission zapisany do {output_path}')
submission.head()

Plik submission zapisany do ../outputs\submission8.csv


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
