# Baseline — Playground Series S5E7
Pierwsze podejście: Baseline - LightGBM

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Add src to sys.path so that project-local modules can be imported
sys.path.append('../src')

from experiment_logger import log_experiment

In [2]:
# Locations of the competition CSV files, relative to the notebook directory.
TRAIN_PATH = '../../playground-series-s5e7/train.csv'
TEST_PATH = '../../playground-series-s5e7/test.csv'

# Read both splits in one pass over the two paths.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

# Report (rows, columns) for each split, then preview the training frame.
print('Train shape:', train_data.shape)
print('Test shape:', test_data.shape)
train_data.head()

Train shape: (18524, 9)
Test shape: (6175, 8)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [3]:
# Number of missing values in each column of the training frame.
train_missing_counts = train_data.isna().sum()
train_missing_counts

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [5]:
# Number of missing values in each column of the test frame.
test_missing_counts = test_data.isna().sum()
test_missing_counts

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

In [6]:
# Column dtypes and non-null counts (info() prints directly to stdout).
train_data.info()

# Summary statistics of the numeric columns. A bare expression that is not the
# last line of a cell is silently discarded, so it has to be displayed
# explicitly (display() is provided by the IPython/Jupyter kernel).
display(train_data.describe())

# Class balance of the target; as the last expression it is rendered as the
# cell output.
train_data['Personality'].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


Personality
Extrovert    0.739527
Introvert    0.260473
Name: proportion, dtype: float64

In [7]:
# Drop the row identifier so it is not used as a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
train_data = train_data.drop(columns=['id'], errors='ignore')
test_data = test_data.drop(columns=['id'], errors='ignore')

In [11]:
# Assumes the 'id' column has already been removed from train and test
from data_utils import split_numerical_categorical

# Separate the target from the features. The guard makes the cell safe to
# re-run: once 'Personality' has been split off, the previously extracted
# `target` is reused instead of raising KeyError on a second drop.
if 'Personality' in train_data.columns:
    target = train_data['Personality']
    train_data = train_data.drop(columns=['Personality'])
numerical_cols, categorical_cols = split_numerical_categorical(train_data)

print("Zmienne numeryczne:", numerical_cols)
print("Zmienne kategoryczne:", categorical_cols)

# Concatenate train and test so both get the same set of encoded columns
full = pd.concat([train_data, test_data], axis=0, ignore_index=True)

print("Kolumny w full:", full.columns.tolist())
print("Kolumny kategorialne:", categorical_cols)

# One-hot encoding. Missing categorical values get no dummy column of their
# own, so such rows have False in every dummy of that feature.
full_encoded = pd.get_dummies(full, columns=categorical_cols)

# Split back into train and test by position (train rows come first)
n_train = len(train_data)
X_train = full_encoded.iloc[:n_train]
X_test = full_encoded.iloc[n_train:]
y_train = target

# Sanity check: features and target must describe the same rows.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"

Zmienne numeryczne: ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
Zmienne kategoryczne: ['Stage_fear', 'Drained_after_socializing']
Kolumny w full: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']
Kolumny kategorialne: ['Stage_fear', 'Drained_after_socializing']


In [12]:
# Preview the first rows of the encoded training features.
X_train_preview = X_train.head()
X_train_preview

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_No,Stage_fear_Yes,Drained_after_socializing_No,Drained_after_socializing_Yes
0,0.0,6.0,4.0,15.0,5.0,True,False,True,False
1,1.0,7.0,3.0,10.0,8.0,True,False,True,False
2,6.0,1.0,0.0,3.0,0.0,False,True,False,False
3,3.0,7.0,3.0,11.0,5.0,True,False,True,False
4,1.0,4.0,4.0,13.0,,True,False,True,False


In [14]:
# Shapes of the encoded train and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(18524, 7)
(6175, 7)


In [15]:
# Column dtypes of test and train (they should match after the joint encoding),
# followed by a preview of the test features; each item starts on a new line.
print(X_test.dtypes, X_train.dtypes, X_test.head(), sep='\n')

Time_spent_Alone                 float64
Social_event_attendance          float64
Going_outside                    float64
Friends_circle_size              float64
Post_frequency                   float64
Stage_fear_No                       bool
Stage_fear_Yes                      bool
Drained_after_socializing_No        bool
Drained_after_socializing_Yes       bool
dtype: object
Time_spent_Alone                 float64
Social_event_attendance          float64
Going_outside                    float64
Friends_circle_size              float64
Post_frequency                   float64
Stage_fear_No                       bool
Stage_fear_Yes                      bool
Drained_after_socializing_No        bool
Drained_after_socializing_Yes       bool
dtype: object
       Time_spent_Alone  Social_event_attendance  Going_outside  \
18524               3.0                      7.0            4.0   
18525               NaN                      0.0            0.0   
18526               3.0          

In [17]:
# Encode the target as integers.
# Map from the untouched `target` series rather than from `y_train` itself:
# a self-referential `y_train = y_train.map(...)` turns every value into NaN
# when the cell is executed a second time.
y_train = target.map({'Extrovert': 0, 'Introvert': 1})

# Any label outside the mapping becomes NaN; fail loudly instead of training
# on a corrupted target.
if y_train.isna().any():
    raise ValueError("Unexpected labels in target: "
                     f"{sorted(set(target[y_train.isna()].astype(str)))}")

# Check that everything is OK
print("Unikalne wartości y_train:", y_train.unique())
print("Typ y_train:", y_train.dtype)

Unikalne wartości y_train: [0 1]
Typ y_train: int64


In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import numpy as np
from scipy.stats import mode

# Single source of truth for the fold count: it sizes both the splitter and
# the per-fold prediction matrix, so the two can never drift apart.
N_SPLITS = 10

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
scores = []
# One column of hard (0/1) test predictions per fold.
test_preds = np.zeros((len(X_test), N_SPLITS))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1
    )

    # Early stopping monitors the validation fold (50 rounds of patience);
    # log_evaluation(0) is passed to silence per-iteration metric logging.
    # NOTE(review): the same fold is used for early stopping and for scoring,
    # so the reported accuracy may be slightly optimistic.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(50), log_evaluation(0)]
    )

    val_pred = model.predict(X_val)
    score = accuracy_score(y_val, val_pred)
    scores.append(score)

    # Test-set predictions of this fold (combined across folds below)
    test_preds[:, fold] = model.predict(X_test)

# Majority vote per row (axis=1), computed once after every fold has written
# its column. Doing this inside the loop would vote over a matrix that is
# still partly filled with zeros.
# NOTE(review): with an even number of folds a 5-5 tie is possible; scipy's
# mode is expected to return the smallest value (class 0) then - confirm.
final_test_pred = mode(test_preds, axis=1)[0].flatten().astype(int)
print(f'Fold accuracy: {scores}')
print(f'Mean CV accuracy: {np.mean(scores):.4f}')

[LightGBM] [Info] Number of positive: 4342, number of negative: 12329
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 16671, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260452 -> initscore=-1.043619
[LightGBM] [Info] Start training from score -1.043619
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[94]	valid_0's binary_logloss: 0.129002
[LightGBM] [Info] Number of positive: 4342, number of negative: 12329
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 16671, number of used features: 9
[LightGBM] [Info] 

In [22]:
import sys

# Make project-local modules importable. Guarded so that re-running the cell
# does not keep appending duplicate entries to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

from experiment_logger import log_experiment

# Settings recorded with the experiment; these mirror the values used in the
# cross-validation cell and must be kept in sync with it manually.
params = {
    'model': 'LGBMClassifier',
    'encoding': 'one-hot',
    'n_splits': 10,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'random_state': 42
}

# Record the run; the mean fold accuracy is logged as the CV score.
log_experiment(
    experiment_name='lgbm_ohe_10fold',
    model_name='LGBMClassifier',
    params=params,
    cv_score=np.mean(scores),
    comment='LightGBM, one-hot encoding, 10-fold CV - majority voting, Personality zakodowany na liczby'
)

print('Eksperyment został zalogowany!')

Eksperyment został zalogowany!


In [23]:
# Load sample_submission to get the required columns and row order
sample_submission = pd.read_csv('../../playground-series-s5e7/sample_submission.csv')

# Convert the predictions back to the original string labels.
# Numeric predictions are always mapped - also when only one class was
# predicted, a case that an "is the value set exactly {0, 1}?" check would
# skip and thereby write raw numbers to the file. Predictions that already
# hold string labels are passed through unchanged. The result goes into a new
# variable so that `final_test_pred` stays intact and the cell can be re-run.
label_map = {0: 'Extrovert', 1: 'Introvert'}
predicted_labels = pd.Series(final_test_pred)
if pd.api.types.is_numeric_dtype(predicted_labels):
    predicted_labels = predicted_labels.map(label_map)

# Unmapped values show up as NaN here and are rejected together with any
# other unexpected label.
unexpected_labels = set(predicted_labels.unique()) - set(label_map.values())
if unexpected_labels:
    raise ValueError(f"Unexpected prediction values: {unexpected_labels}")
if len(predicted_labels) != len(sample_submission):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for "
        f"{len(sample_submission)} submission rows"
    )

# Build the submission on top of sample_submission
submission = sample_submission.copy()
# Assumes the target is the second column of sample_submission
target_col = submission.columns[1]
# Assign by position (plain array) to avoid any index alignment.
submission[target_col] = predicted_labels.to_numpy()

# Save to file, creating the output directory first if it does not exist
output_path = Path('../outputs/submission.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)
print('Plik submission zapisany do ../outputs/submission.csv')
submission.head()

Plik submission zapisany do ../outputs/submission.csv


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
