# Baseline — Playground Series S5E7
Pierwsze podejście: Baseline - LightGBM

In [2]:
import sys
import pandas as pd
import numpy as np

# Dodaj src do sys.path, by importować własne moduły
sys.path.append('../src')

from experiment_logger import log_experiment

In [3]:
TRAIN_PATH = '../../playground-series-s5e7/train.csv'
TEST_PATH = '../../playground-series-s5e7/test.csv'

train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

print('Train shape:', train_data.shape)
print('Test shape:', test_data.shape)
train_data.head()

Train shape: (18524, 9)
Test shape: (6175, 8)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [4]:
train_data.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [5]:
test_data.isna().sum()

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

In [6]:
train_data.info()
train_data.describe()
train_data['Personality'].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


Personality
Extrovert    0.739527
Introvert    0.260473
Name: proportion, dtype: float64

In [7]:
train_data.drop(columns=['id'], inplace=True)
test_data.drop(columns=['id'], inplace=True)

In [8]:
# Załóżmy, że usunąłeś już kolumnę 'id' z train i test
from data_utils import split_numerical_categorical
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Zakładam, że X_train to Twój DataFrame z cechami (bez targetu i id)
target = train_data['Personality']
train_data.drop(columns=['Personality'], inplace=True)
numerical_cols, categorical_cols = split_numerical_categorical(train_data)

print("Zmienne numeryczne:", numerical_cols)
print("Zmienne kategoryczne:", categorical_cols)

# Łączymy train i test, by mieć spójny encoding i imputację
full = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Imputacja numeryczna
imputer = IterativeImputer(random_state=42)
full[numerical_cols] = imputer.fit_transform(full[numerical_cols])

# Imputacja kategoryczna - wypełnienie braków 'Missing'
for col in categorical_cols:
    full[col] = full[col].fillna('Missing')

print("Kolumny w full:", full.columns.tolist())
print("Kolumny kategorialne:", categorical_cols)

# One-hot encoding
full_encoded = pd.get_dummies(full, columns=categorical_cols)

# Rozdzielamy z powrotem
X_train = full_encoded.iloc[:len(train_data)]
X_test = full_encoded.iloc[len(train_data):]
y_train = target

Zmienne numeryczne: ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
Zmienne kategoryczne: ['Stage_fear', 'Drained_after_socializing']
Kolumny w full: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']
Kolumny kategorialne: ['Stage_fear', 'Drained_after_socializing']


In [9]:
X_train.head()

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_Missing,Stage_fear_No,Stage_fear_Yes,Drained_after_socializing_Missing,Drained_after_socializing_No,Drained_after_socializing_Yes
0,0.0,6.0,4.0,15.0,5.0,False,True,False,False,True,False
1,1.0,7.0,3.0,10.0,8.0,False,True,False,False,True,False
2,6.0,1.0,0.0,3.0,0.0,False,False,True,True,False,False
3,3.0,7.0,3.0,11.0,5.0,False,True,False,False,True,False
4,1.0,4.0,4.0,13.0,5.708436,False,True,False,False,True,False


In [10]:
print(X_train.shape)
print(X_test.shape)

(18524, 11)
(6175, 11)


In [11]:
print(X_test.dtypes)
print(X_train.dtypes)
print(X_test.head())

Time_spent_Alone                     float64
Social_event_attendance              float64
Going_outside                        float64
Friends_circle_size                  float64
Post_frequency                       float64
Stage_fear_Missing                      bool
Stage_fear_No                           bool
Stage_fear_Yes                          bool
Drained_after_socializing_Missing       bool
Drained_after_socializing_No            bool
Drained_after_socializing_Yes           bool
dtype: object
Time_spent_Alone                     float64
Social_event_attendance              float64
Going_outside                        float64
Friends_circle_size                  float64
Post_frequency                       float64
Stage_fear_Missing                      bool
Stage_fear_No                           bool
Stage_fear_Yes                          bool
Drained_after_socializing_Missing       bool
Drained_after_socializing_No            bool
Drained_after_socializing_Yes           b

In [12]:
# Zakoduj target na liczby
y_train = y_train.map({'Extrovert': 0, 'Introvert': 1})

# Sprawdź, czy wszystko jest OK
print("Unikalne wartości y_train:", y_train.unique())
print("Typ y_train:", y_train.dtype)

Unikalne wartości y_train: [0 1]
Typ y_train: int64


In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import numpy as np
from scipy.stats import mode
from xgboost.callback import EarlyStopping

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []
test_preds = np.zeros((len(X_test), 10))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
    )
    
    val_pred = model.predict(X_val)
    score = accuracy_score(y_val, val_pred)
    scores.append(score)
    
    # Predykcje na test dla każdego folda (do uśrednienia)
    test_preds[:, fold] = model.predict_proba(X_test)[:, 1]

# Uśrednienie predykcji po foldach
mean_preds = test_preds.mean(axis=1)
final_test_pred = (mean_preds > 0.5).astype(int)
print(f'Fold accuracy: {scores}')
print(f'Mean CV accuracy: {np.mean(scores):.4f}')

[0]	validation_0-logloss:0.53235
[1]	validation_0-logloss:0.49692
[2]	validation_0-logloss:0.46600
[3]	validation_0-logloss:0.43875
[4]	validation_0-logloss:0.41443
[5]	validation_0-logloss:0.39250
[6]	validation_0-logloss:0.37266
[7]	validation_0-logloss:0.35465
[8]	validation_0-logloss:0.33822
[9]	validation_0-logloss:0.32317
[10]	validation_0-logloss:0.30936
[11]	validation_0-logloss:0.29667
[12]	validation_0-logloss:0.28492
[13]	validation_0-logloss:0.27407
[14]	validation_0-logloss:0.26407
[15]	validation_0-logloss:0.25484
[16]	validation_0-logloss:0.24614
[17]	validation_0-logloss:0.23814
[18]	validation_0-logloss:0.23071
[19]	validation_0-logloss:0.22373
[20]	validation_0-logloss:0.21728
[21]	validation_0-logloss:0.21133
[22]	validation_0-logloss:0.20571
[23]	validation_0-logloss:0.20054
[24]	validation_0-logloss:0.19565
[25]	validation_0-logloss:0.19115
[26]	validation_0-logloss:0.18683
[27]	validation_0-logloss:0.18289
[28]	validation_0-logloss:0.17911
[29]	validation_0-loglos

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[67]	validation_0-logloss:0.13199
[68]	validation_0-logloss:0.13179
[69]	validation_0-logloss:0.13166
[70]	validation_0-logloss:0.13155
[71]	validation_0-logloss:0.13141
[72]	validation_0-logloss:0.13133
[73]	validation_0-logloss:0.13121
[74]	validation_0-logloss:0.13113
[75]	validation_0-logloss:0.13104
[76]	validation_0-logloss:0.13095
[77]	validation_0-logloss:0.13075
[78]	validation_0-logloss:0.13062
[79]	validation_0-logloss:0.13045
[80]	validation_0-logloss:0.13041
[81]	validation_0-logloss:0.13035
[82]	validation_0-logloss:0.13028
[83]	validation_0-logloss:0.13019
[84]	validation_0-logloss:0.13019
[85]	validation_0-logloss:0.13006
[86]	validation_0-logloss:0.12995
[87]	validation_0-logloss:0.12989
[88]	validation_0-logloss:0.12978
[89]	validation_0-logloss:0.12976
[90]	validation_0-logloss:0.12974
[91]	validation_0-logloss:0.12966
[92]	validation_0-logloss:0.12961
[93]	validation_0-logloss:0.12954
[94]	validation_0-logloss:0.12955
[95]	validation_0-logloss:0.12951
[96]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[54]	validation_0-logloss:0.13041
[55]	validation_0-logloss:0.12987
[56]	validation_0-logloss:0.12935
[57]	validation_0-logloss:0.12885
[58]	validation_0-logloss:0.12843
[59]	validation_0-logloss:0.12803
[60]	validation_0-logloss:0.12769
[61]	validation_0-logloss:0.12737
[62]	validation_0-logloss:0.12710
[63]	validation_0-logloss:0.12681
[64]	validation_0-logloss:0.12658
[65]	validation_0-logloss:0.12644
[66]	validation_0-logloss:0.12620
[67]	validation_0-logloss:0.12611
[68]	validation_0-logloss:0.12594
[69]	validation_0-logloss:0.12584
[70]	validation_0-logloss:0.12571
[71]	validation_0-logloss:0.12558
[72]	validation_0-logloss:0.12553
[73]	validation_0-logloss:0.12544
[74]	validation_0-logloss:0.12539
[75]	validation_0-logloss:0.12532
[76]	validation_0-logloss:0.12526
[77]	validation_0-logloss:0.12528
[78]	validation_0-logloss:0.12519
[79]	validation_0-logloss:0.12524
[80]	validation_0-logloss:0.12520
[81]	validation_0-logloss:0.12518
[82]	validation_0-logloss:0.12517
[83]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[51]	validation_0-logloss:0.13181
[52]	validation_0-logloss:0.13106
[53]	validation_0-logloss:0.13033
[54]	validation_0-logloss:0.12964
[55]	validation_0-logloss:0.12897
[56]	validation_0-logloss:0.12835
[57]	validation_0-logloss:0.12783
[58]	validation_0-logloss:0.12727
[59]	validation_0-logloss:0.12679
[60]	validation_0-logloss:0.12628
[61]	validation_0-logloss:0.12590
[62]	validation_0-logloss:0.12560
[63]	validation_0-logloss:0.12519
[64]	validation_0-logloss:0.12488
[65]	validation_0-logloss:0.12453
[66]	validation_0-logloss:0.12429
[67]	validation_0-logloss:0.12397
[68]	validation_0-logloss:0.12378
[69]	validation_0-logloss:0.12355
[70]	validation_0-logloss:0.12332
[71]	validation_0-logloss:0.12310
[72]	validation_0-logloss:0.12285
[73]	validation_0-logloss:0.12267
[74]	validation_0-logloss:0.12247
[75]	validation_0-logloss:0.12233
[76]	validation_0-logloss:0.12211
[77]	validation_0-logloss:0.12201
[78]	validation_0-logloss:0.12193
[79]	validation_0-logloss:0.12177
[80]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[48]	validation_0-logloss:0.15429
[49]	validation_0-logloss:0.15358
[50]	validation_0-logloss:0.15295
[51]	validation_0-logloss:0.15229
[52]	validation_0-logloss:0.15178
[53]	validation_0-logloss:0.15131
[54]	validation_0-logloss:0.15083
[55]	validation_0-logloss:0.15034
[56]	validation_0-logloss:0.15004
[57]	validation_0-logloss:0.14974
[58]	validation_0-logloss:0.14944
[59]	validation_0-logloss:0.14908
[60]	validation_0-logloss:0.14885
[61]	validation_0-logloss:0.14856
[62]	validation_0-logloss:0.14838
[63]	validation_0-logloss:0.14815
[64]	validation_0-logloss:0.14797
[65]	validation_0-logloss:0.14783
[66]	validation_0-logloss:0.14767
[67]	validation_0-logloss:0.14760
[68]	validation_0-logloss:0.14751
[69]	validation_0-logloss:0.14745
[70]	validation_0-logloss:0.14747
[71]	validation_0-logloss:0.14742
[72]	validation_0-logloss:0.14735
[73]	validation_0-logloss:0.14736
[74]	validation_0-logloss:0.14734
[75]	validation_0-logloss:0.14737
[76]	validation_0-logloss:0.14739
[77]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[58]	validation_0-logloss:0.14360
[59]	validation_0-logloss:0.14329
[60]	validation_0-logloss:0.14300
[61]	validation_0-logloss:0.14281
[62]	validation_0-logloss:0.14265
[63]	validation_0-logloss:0.14244
[64]	validation_0-logloss:0.14229
[65]	validation_0-logloss:0.14213
[66]	validation_0-logloss:0.14194
[67]	validation_0-logloss:0.14182
[68]	validation_0-logloss:0.14173
[69]	validation_0-logloss:0.14165
[70]	validation_0-logloss:0.14161
[71]	validation_0-logloss:0.14153
[72]	validation_0-logloss:0.14149
[73]	validation_0-logloss:0.14137
[74]	validation_0-logloss:0.14134
[75]	validation_0-logloss:0.14132
[76]	validation_0-logloss:0.14130
[77]	validation_0-logloss:0.14125
[78]	validation_0-logloss:0.14121
[79]	validation_0-logloss:0.14118
[80]	validation_0-logloss:0.14112
[81]	validation_0-logloss:0.14108
[82]	validation_0-logloss:0.14106
[83]	validation_0-logloss:0.14102
[84]	validation_0-logloss:0.14101
[85]	validation_0-logloss:0.14104
[86]	validation_0-logloss:0.14099
[87]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[63]	validation_0-logloss:0.14087
[64]	validation_0-logloss:0.14073
[65]	validation_0-logloss:0.14054
[66]	validation_0-logloss:0.14048
[67]	validation_0-logloss:0.14041
[68]	validation_0-logloss:0.14024
[69]	validation_0-logloss:0.14011
[70]	validation_0-logloss:0.14005
[71]	validation_0-logloss:0.13996
[72]	validation_0-logloss:0.13999
[73]	validation_0-logloss:0.13998
[74]	validation_0-logloss:0.13999
[75]	validation_0-logloss:0.13995
[76]	validation_0-logloss:0.13995
[77]	validation_0-logloss:0.13989
[78]	validation_0-logloss:0.13995
[79]	validation_0-logloss:0.14000
[80]	validation_0-logloss:0.14002
[81]	validation_0-logloss:0.13998
[82]	validation_0-logloss:0.14007
[83]	validation_0-logloss:0.14010
[84]	validation_0-logloss:0.14023
[85]	validation_0-logloss:0.14026
[86]	validation_0-logloss:0.14030
[87]	validation_0-logloss:0.14035
[88]	validation_0-logloss:0.14045
[89]	validation_0-logloss:0.14054
[90]	validation_0-logloss:0.14066
[91]	validation_0-logloss:0.14070
[92]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[65]	validation_0-logloss:0.12549
[66]	validation_0-logloss:0.12525
[67]	validation_0-logloss:0.12500
[68]	validation_0-logloss:0.12481
[69]	validation_0-logloss:0.12471
[70]	validation_0-logloss:0.12451
[71]	validation_0-logloss:0.12431
[72]	validation_0-logloss:0.12427
[73]	validation_0-logloss:0.12418
[74]	validation_0-logloss:0.12416
[75]	validation_0-logloss:0.12406
[76]	validation_0-logloss:0.12395
[77]	validation_0-logloss:0.12388
[78]	validation_0-logloss:0.12391
[79]	validation_0-logloss:0.12385
[80]	validation_0-logloss:0.12381
[81]	validation_0-logloss:0.12379
[82]	validation_0-logloss:0.12369
[83]	validation_0-logloss:0.12364
[84]	validation_0-logloss:0.12355
[85]	validation_0-logloss:0.12350
[86]	validation_0-logloss:0.12348
[87]	validation_0-logloss:0.12346
[88]	validation_0-logloss:0.12352
[89]	validation_0-logloss:0.12358
[90]	validation_0-logloss:0.12356
[91]	validation_0-logloss:0.12361
[92]	validation_0-logloss:0.12363
[93]	validation_0-logloss:0.12362
[94]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[56]	validation_0-logloss:0.13958
[57]	validation_0-logloss:0.13912
[58]	validation_0-logloss:0.13875
[59]	validation_0-logloss:0.13833
[60]	validation_0-logloss:0.13803
[61]	validation_0-logloss:0.13780
[62]	validation_0-logloss:0.13748
[63]	validation_0-logloss:0.13724
[64]	validation_0-logloss:0.13703
[65]	validation_0-logloss:0.13678
[66]	validation_0-logloss:0.13658
[67]	validation_0-logloss:0.13624
[68]	validation_0-logloss:0.13613
[69]	validation_0-logloss:0.13583
[70]	validation_0-logloss:0.13556
[71]	validation_0-logloss:0.13550
[72]	validation_0-logloss:0.13528
[73]	validation_0-logloss:0.13517
[74]	validation_0-logloss:0.13507
[75]	validation_0-logloss:0.13498
[76]	validation_0-logloss:0.13486
[77]	validation_0-logloss:0.13479
[78]	validation_0-logloss:0.13476
[79]	validation_0-logloss:0.13468
[80]	validation_0-logloss:0.13467
[81]	validation_0-logloss:0.13462
[82]	validation_0-logloss:0.13463
[83]	validation_0-logloss:0.13461
[84]	validation_0-logloss:0.13459
[85]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[44]	validation_0-logloss:0.13660
[45]	validation_0-logloss:0.13543
[46]	validation_0-logloss:0.13434
[47]	validation_0-logloss:0.13330
[48]	validation_0-logloss:0.13233
[49]	validation_0-logloss:0.13145
[50]	validation_0-logloss:0.13062
[51]	validation_0-logloss:0.12984
[52]	validation_0-logloss:0.12913
[53]	validation_0-logloss:0.12844
[54]	validation_0-logloss:0.12780
[55]	validation_0-logloss:0.12727
[56]	validation_0-logloss:0.12674
[57]	validation_0-logloss:0.12617
[58]	validation_0-logloss:0.12570
[59]	validation_0-logloss:0.12532
[60]	validation_0-logloss:0.12498
[61]	validation_0-logloss:0.12460
[62]	validation_0-logloss:0.12428
[63]	validation_0-logloss:0.12397
[64]	validation_0-logloss:0.12372
[65]	validation_0-logloss:0.12350
[66]	validation_0-logloss:0.12324
[67]	validation_0-logloss:0.12298
[68]	validation_0-logloss:0.12282
[69]	validation_0-logloss:0.12261
[70]	validation_0-logloss:0.12246
[71]	validation_0-logloss:0.12232
[72]	validation_0-logloss:0.12219
[73]	validatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[62]	validation_0-logloss:0.12442
[63]	validation_0-logloss:0.12418
[64]	validation_0-logloss:0.12389
[65]	validation_0-logloss:0.12359
[66]	validation_0-logloss:0.12332
[67]	validation_0-logloss:0.12305
[68]	validation_0-logloss:0.12287
[69]	validation_0-logloss:0.12266
[70]	validation_0-logloss:0.12247
[71]	validation_0-logloss:0.12232
[72]	validation_0-logloss:0.12224
[73]	validation_0-logloss:0.12204
[74]	validation_0-logloss:0.12197
[75]	validation_0-logloss:0.12184
[76]	validation_0-logloss:0.12166
[77]	validation_0-logloss:0.12158
[78]	validation_0-logloss:0.12141
[79]	validation_0-logloss:0.12130
[80]	validation_0-logloss:0.12124
[81]	validation_0-logloss:0.12113
[82]	validation_0-logloss:0.12107
[83]	validation_0-logloss:0.12109
[84]	validation_0-logloss:0.12103
[85]	validation_0-logloss:0.12099
[86]	validation_0-logloss:0.12101
[87]	validation_0-logloss:0.12092
[88]	validation_0-logloss:0.12095
[89]	validation_0-logloss:0.12088
[90]	validation_0-logloss:0.12083
[91]	validatio

In [16]:
import sys
sys.path.append('../src')

from experiment_logger import log_experiment

params = {
    'model': 'XGBClassifier',
    'encoding': 'one-hot',
    'n_splits': 10,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'random_state': 42
}

log_experiment(
    experiment_name='xgb_ohe_10fold',
    model_name='XGBClassifier',
    params=params,
    cv_score=np.mean(scores),
    comment='XGBoost, one-hot encoding, 10-fold CV - mean probability (uśredniona predykcja po foldach), Personality zakodowany na liczby, numeryczna imputacja (IterativeImputer), i kategoryczna imputacja (SimpleImputer) przed one-hot encodingiem'
)
print('Eksperyment został zalogowany!')

Eksperyment został zalogowany!


In [14]:
import os
# Wczytaj sample_submission, aby pobrać wymagane kolumny i kolejność
sample_submission = pd.read_csv('../../playground-series-s5e7/sample_submission.csv')

# Zakładam, że predykcje są w zmiennej final_test_pred (np. jako liczby lub kategorie)
if set(np.unique(final_test_pred)) == {0, 1}:
    label_map = {0: 'Extrovert', 1: 'Introvert'}
    final_test_pred = pd.Series(final_test_pred).map(label_map).values

submission = sample_submission.copy()
target_col = submission.columns[1]
submission[target_col] = final_test_pred

# Automatyczne nadawanie nazwy pliku
output_dir = '../outputs'
existing = [f for f in os.listdir(output_dir) if f.startswith('submission') and f.endswith('.csv')]
if 'submission.csv' in existing:
    # Szukamy submissionN.csv
    nums = [int(f.replace('submission', '').replace('.csv', '')) for f in existing if f != 'submission.csv' and f.replace('submission', '').replace('.csv', '').isdigit()]
    n = max(nums) if nums else 1
    new_name = f'submission{n+1}.csv'
else:
    new_name = 'submission.csv'

output_path = os.path.join(output_dir, new_name)
submission.to_csv(output_path, index=False)
print(f'Plik submission zapisany do {output_path}')
submission.head()

Plik submission zapisany do ../outputs\submission5.csv


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
