In [2]:
import sys
import pandas as pd
import numpy as np

# Dodaj src do sys.path, by importować własne moduły
sys.path.append('../src')

from experiment_logger import log_experiment

In [4]:
TRAIN_PATH = '../../playground-series-s5e7/train.csv'
TEST_PATH = '../../playground-series-s5e7/test.csv'

train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

print('Train shape:', train_data.shape)
print('Test shape:', test_data.shape)
train_data.head()

Train shape: (18524, 9)
Test shape: (6175, 8)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [5]:
train_data.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [6]:
test_data.isna().sum()

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

In [7]:
train_data.info()
train_data.describe()
train_data['Personality'].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


Personality
Extrovert    0.739527
Introvert    0.260473
Name: proportion, dtype: float64

In [8]:
train_data.drop(columns=['id'], inplace=True)
test_data.drop(columns=['id'], inplace=True)

In [9]:
# Załóżmy, że usunąłeś już kolumnę 'id' z train i test
from data_utils import split_numerical_categorical
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Zakładam, że X_train to Twój DataFrame z cechami (bez targetu i id)
target = train_data['Personality']
train_data.drop(columns=['Personality'], inplace=True)
numerical_cols, categorical_cols = split_numerical_categorical(train_data)

print("Zmienne numeryczne:", numerical_cols)
print("Zmienne kategoryczne:", categorical_cols)

# Łączymy train i test, by mieć spójny encoding i imputację
full = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Imputacja numeryczna
imputer = IterativeImputer(random_state=42)
full[numerical_cols] = imputer.fit_transform(full[numerical_cols])

# Imputacja kategoryczna - wypełnienie braków 'Missing'
for col in categorical_cols:
    full[col] = full[col].fillna('Missing')

print("Kolumny w full:", full.columns.tolist())
print("Kolumny kategorialne:", categorical_cols)

# One-hot encoding
full_encoded = pd.get_dummies(full, columns=categorical_cols)

# Rozdzielamy z powrotem
X_train = full_encoded.iloc[:len(train_data)]
X_test = full_encoded.iloc[len(train_data):]
y_train = target

Zmienne numeryczne: ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
Zmienne kategoryczne: ['Stage_fear', 'Drained_after_socializing']
Kolumny w full: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']
Kolumny kategorialne: ['Stage_fear', 'Drained_after_socializing']


In [10]:
X_train.head()

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_Missing,Stage_fear_No,Stage_fear_Yes,Drained_after_socializing_Missing,Drained_after_socializing_No,Drained_after_socializing_Yes
0,0.0,6.0,4.0,15.0,5.0,False,True,False,False,True,False
1,1.0,7.0,3.0,10.0,8.0,False,True,False,False,True,False
2,6.0,1.0,0.0,3.0,0.0,False,False,True,True,False,False
3,3.0,7.0,3.0,11.0,5.0,False,True,False,False,True,False
4,1.0,4.0,4.0,13.0,5.708436,False,True,False,False,True,False


In [11]:
print(X_train.shape)
print(X_test.shape)

(18524, 11)
(6175, 11)


In [12]:
print(X_test.dtypes)
print(X_train.dtypes)
print(X_test.head())

Time_spent_Alone                     float64
Social_event_attendance              float64
Going_outside                        float64
Friends_circle_size                  float64
Post_frequency                       float64
Stage_fear_Missing                      bool
Stage_fear_No                           bool
Stage_fear_Yes                          bool
Drained_after_socializing_Missing       bool
Drained_after_socializing_No            bool
Drained_after_socializing_Yes           bool
dtype: object
Time_spent_Alone                     float64
Social_event_attendance              float64
Going_outside                        float64
Friends_circle_size                  float64
Post_frequency                       float64
Stage_fear_Missing                      bool
Stage_fear_No                           bool
Stage_fear_Yes                          bool
Drained_after_socializing_Missing       bool
Drained_after_socializing_No            bool
Drained_after_socializing_Yes           b

In [13]:
# Zakoduj target na liczby
y_train = y_train.map({'Extrovert': 0, 'Introvert': 1})

# Sprawdź, czy wszystko jest OK
print("Unikalne wartości y_train:", y_train.unique())
print("Typ y_train:", y_train.dtype)

Unikalne wartości y_train: [0 1]
Typ y_train: int64


In [14]:
# Jeśli nie masz catboost lub optuna, odkomentuj poniższą linię:
# %pip install catboost optuna

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
from catboost import CatBoostClassifier

# Jeśli masz zmienne kategoryczne, podaj ich indeksy lub nazwy:
# cat_features = [lista_indeksów_lub_nazw_kolumn]
cat_features = []  # <- jeśli nie masz, zostaw pustą listę

# 1. Optuna - optymalizacja hiperparametrów
def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_seed': 42,
        'verbose': 0,
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'cat_features': cat_features
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model = CatBoostClassifier(**param)
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50, use_best_model=True)
        val_pred = model.predict(X_val)
        score = accuracy_score(y_val, val_pred)
        scores.append(score)
    return np.mean(scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # Możesz zwiększyć n_trials dla lepszych wyników

print("Najlepsze parametry:", study.best_params)

# 2. Finalny model z OOF predictions i predykcjami na testach
best_params = study.best_params
best_params.update({'random_seed': 42, 'verbose': 0, 'loss_function': 'Logloss', 'eval_metric': 'Accuracy', 'cat_features': cat_features})

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []
test_preds = np.zeros((len(X_test), skf.n_splits))
oof_preds = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model = CatBoostClassifier(**best_params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50, use_best_model=True)
    
    val_pred = model.predict(X_val)
    score = accuracy_score(y_val, val_pred)
    scores.append(score)
    
    # OOF predictions
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    # Test predictions
    test_preds[:, fold] = model.predict_proba(X_test)[:, 1]

# OOF accuracy
oof_binary = (oof_preds > 0.5).astype(int)
print("OOF accuracy:", accuracy_score(y_train, oof_binary))

# Uśrednienie predykcji po foldach
mean_preds = test_preds.mean(axis=1)
final_test_pred = (mean_preds > 0.5).astype(int)
print(f'Fold accuracy: {scores}')
print(f'Mean CV accuracy: {np.mean(scores):.4f}')

[I 2025-07-02 12:43:09,531] A new study created in memory with name: no-name-1b05a7d4-03cd-43f9-ad79-85b8c7d3ce6a
[I 2025-07-02 12:43:11,825] Trial 0 finished with value: 0.9694991299481466 and parameters: {'iterations': 815, 'learning_rate': 0.26557968779967306, 'depth': 4, 'l2_leaf_reg': 3.7902769083871277, 'random_strength': 0.5355082391724041, 'bagging_temperature': 0.6720076016943745, 'border_count': 147}. Best is trial 0 with value: 0.9694991299481466.
[I 2025-07-02 12:43:15,132] Trial 1 finished with value: 0.9695531256284923 and parameters: {'iterations': 1137, 'learning_rate': 0.19377360568839613, 'depth': 6, 'l2_leaf_reg': 2.7742646388631633, 'random_strength': 0.6123673146692254, 'bagging_temperature': 0.6613522959676076, 'border_count': 223}. Best is trial 1 with value: 0.9695531256284923.
[I 2025-07-02 12:43:17,211] Trial 2 finished with value: 0.9693911823086541 and parameters: {'iterations': 346, 'learning_rate': 0.2679298356061002, 'depth': 5, 'l2_leaf_reg': 2.600813071

Najlepsze parametry: {'iterations': 1056, 'learning_rate': 0.2216909462809513, 'depth': 8, 'l2_leaf_reg': 2.156237233715828, 'random_strength': 0.8427001340269554, 'bagging_temperature': 0.9247675257637368, 'border_count': 175}
OOF accuracy: 0.9696069963290866
Fold accuracy: [0.9681597409606044, 0.9724770642201835, 0.9724770642201835, 0.963302752293578, 0.9654427645788337, 0.9681425485961123, 0.9719222462203023, 0.9703023758099352, 0.9719222462203023, 0.9719222462203023]
Mean CV accuracy: 0.9696


In [15]:
import sys
sys.path.append('../src')

from experiment_logger import log_experiment

params = {
    'model': 'CatBoostClassifier',
    'encoding': 'one-hot',
    'n_splits': 10,
    'learning_rate': best_params.get('learning_rate', None),
    'iterations': best_params.get('iterations', None),
    'depth': best_params.get('depth', None),
    'l2_leaf_reg': best_params.get('l2_leaf_reg', None),
    'random_strength': best_params.get('random_strength', None),
    'bagging_temperature': best_params.get('bagging_temperature', None),
    'border_count': best_params.get('border_count', None),
    'random_seed': 42
}

log_experiment(
    experiment_name='catboost_optuna_oof',
    model_name='CatBoostClassifier',
    params=params,
    cv_score=np.mean(scores),
    comment='CatBoost, Optuna, 10-fold CV, OOF accuracy logowane, imputacja numeryczna i kategoryczna (jeśli była), mean probability na testach',
    oof_accuracy=accuracy_score(y_train, oof_binary)
)
print('Eksperyment został zalogowany!')

Eksperyment został zalogowany!


In [16]:
import os
# Wczytaj sample_submission, aby pobrać wymagane kolumny i kolejność
sample_submission = pd.read_csv('../../playground-series-s5e7/sample_submission.csv')

# Zakładam, że predykcje są w zmiennej final_test_pred (np. jako liczby lub kategorie)
if set(np.unique(final_test_pred)) == {0, 1}:
    label_map = {0: 'Extrovert', 1: 'Introvert'}
    final_test_pred = pd.Series(final_test_pred).map(label_map).values

submission = sample_submission.copy()
target_col = submission.columns[1]
submission[target_col] = final_test_pred
# Automatyczne nadawanie nazwy pliku
output_dir = '../outputs'
existing = [f for f in os.listdir(output_dir) if f.startswith('submission') and f.endswith('.csv')]
if 'submission.csv' in existing:
    # Szukamy submissionN.csv
    nums = [int(f.replace('submission', '').replace('.csv', '')) for f in existing if f != 'submission.csv' and f.replace('submission', '').replace('.csv', '').isdigit()]
    n = max(nums) if nums else 1
    new_name = f'submission{n+1}.csv'
else:
    new_name = 'submission.csv'

output_path = os.path.join(output_dir, new_name)
submission.to_csv(output_path, index=False)
print(f'Plik submission zapisany do {output_path}')
submission.head()

Plik submission zapisany do ../outputs\submission7.csv


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
