In [22]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb

In [15]:
train = pd.read_csv(r'../data/pgseries-s05e07/train.csv')
test = pd.read_csv(r'../data/pgseries-s05e07/test.csv')

submissions = pd.read_csv(r'../data/pgseries-s05e07/sample_submission.csv')

#train.head()

In [20]:
# 3. Encode Target.
# Categorical variable `personality` -> numerical using LabelEncoder
# Encode target labels with value between 0 and n_classes-1.
# This transformer should be used to encode target values, *i.e.* y, and not the input X

le = LabelEncoder()
train["Personality_encoded"] = le.fit_transform(train["Personality"])

# 4. Prepare Features
X = train.drop(columns=["id", "Personality", "Personality_encoded"])
y = train["Personality_encoded"]
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

X_test = test.drop(columns=["id"])
print(f"Shape of X Test: {X_test.shape}")

# 5. Encode Categorical Columns Using Ordinal Encoder
combined = pd.concat([X, X_test], axis=0)
cat_cols = combined.select_dtypes(include='object').columns.tolist()
# ['Stage_fear', 'Drained_after_socializing']

encoder = OrdinalEncoder()
combined[cat_cols] = encoder.fit_transform(combined[cat_cols])

X = combined.iloc[:len(X)].reset_index(drop=True)
X_test = combined.iloc[len(X):].reset_index(drop=True)
print(f"Shape of X: {X.shape}")
print(f"Shape of X Test: {X_test.shape}")


Shape of X: (18524, 7)
Shape of y: (18524,)
Shape of X Test: (6175, 7)
Shape of X: (18524, 7)
Shape of X Test: (6175, 7)


In [21]:
# 6. Setup Hyperparameters to XG Boost
params = {
    "objective": "binary:logistic", # as binary classification task, (extro vs intro)
    "eval_metric": "logloss", # metric for binary classifications
    "max_depth": 4, # depth of each decision tree
    "eta": 0.1, # learning rate set to 0.1
    "subsample": 0.8, 
    "colsample_bytree": 0.8,
    "random_state": 42 # for reproducibility
}

In [25]:
# 7. Stratified K-Fold Cross Validation
# To Train & Validate XG Boost Model, 
# Stratified -> Ensuring Class Dist in each fold remain consistent

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X,y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)

    model = xgb.train(params, dtrain, num_boost_round=100,
                       evals=[(dval, "valid")],
                       early_stopping_rounds=10, verbose_eval=False)
    
    oof_preds[val_idx] = model.predict(dval) > 0.5
    test_preds += model.predict(dtest) / skf.n_splits



In [26]:
# 8. Evaluate
cv_acc = accuracy_score(y, oof_preds)
print(f"Cross-Validation Accuracy: {cv_acc:.4f}")

# 9. Create Submission
final_preds = (test_preds > 0.5).astype(int)
submissions["Personality"] = le.inverse_transform(final_preds)
submissions.head()


Cross-Validation Accuracy: 0.9692


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
