In [None]:
# Import Libraries

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer


In [None]:
# Load Dataset

# Input locations (Kaggle default path structure)
TRAIN_PATH = '/kaggle/input/playground-series-s5e7/train.csv'
TEST_PATH = '/kaggle/input/playground-series-s5e7/test.csv'

# Read both CSV files; the order of the paths decides which frame is which
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))



In [None]:
# Basic Preprocessing

# Drop the 'id' column as it's not predictive.
# The frames are re-bound (no inplace=True) and errors='ignore' is passed so that
# this cell is idempotent: running it a second time on frames that have already
# lost 'id' is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=['id'], errors='ignore')
test_df = test_df.drop(columns=['id'], errors='ignore')

# Helper that splits a DataFrame's columns into numerical and categorical features.
def split_numerical_categorical(df):
    """
    Partition the column names of a DataFrame by dtype.

    Parameters:
    df (pandas.DataFrame): The DataFrame whose columns are classified.

    Returns:
    tuple: (numerical_cols, categorical_cols) - two lists of column names. Columns
    selected by the 'number' dtype filter go into the first list, every other
    column goes into the second.
    """
    numeric_part = df.select_dtypes(include=['number'])
    non_numeric_part = df.select_dtypes(exclude=['number'])
    return numeric_part.columns.tolist(), non_numeric_part.columns.tolist()

# Keep the label column aside, then remove it from the feature frame in place
target = train_df['Personality']
del train_df['Personality']

# Derive the numerical / categorical feature lists from the remaining columns
numerical_cols, categorical_cols = split_numerical_categorical(train_df)

print("Numerical features:", numerical_cols)
print("Categorical features:", categorical_cols)

In [None]:
# Stage 4: Handle Missing Values
# ==========================================
# Combine train and test for consistent preprocessing
# (train rows come first; ignore_index gives the combined frame a fresh 0..n-1 index)
full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Impute missing numerical values using Iterative Imputer (e.g., Bayesian Ridge)
imputer = IterativeImputer(random_state=42)
# The imputed values must be read from and written back to `full_df`, the combined
# frame built above; referring to any other name here raises NameError and stops
# the notebook before encoding and modelling.
full_df[numerical_cols] = imputer.fit_transform(full_df[numerical_cols])

# Imputer for categorical columns with constant value 'Missing'
imputer_const = SimpleImputer(strategy='constant', fill_value='Missing')

# Apply imputation and preserve column names
# (fit_transform returns a bare array, so it is wrapped with the original
# column labels and row index before being assigned back)
full_df[categorical_cols] = pd.DataFrame(
    imputer_const.fit_transform(full_df[categorical_cols]),
    columns=categorical_cols,
    index=full_df.index
)


In [None]:
# Encode Categorical Features

# One-hot encode every categorical column of the combined frame
full_encoded = pd.get_dummies(full_df, columns=categorical_cols)

# The first len(train_df) rows of the combined frame are the training rows,
# everything after them belongs to the test set
n_train_rows = len(train_df)
X_train = full_encoded.iloc[:n_train_rows]
X_test = full_encoded.iloc[n_train_rows:]

# Binary target encoding: Extrovert -> 0, Introvert -> 1
label_to_int = {'Extrovert': 0, 'Introvert': 1}
y_train = target.map(label_to_int)


In [None]:
#LightGBM

%pip install optuna  


# 1. Imports
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
import lightgbm as lgb


# 2. Optuna - Hyperparameter Optimization for LightGBM
def objective_lgbm(trial):
    """
    Optuna objective for LightGBM: mean accuracy over 5 stratified CV folds.

    Reads the notebook-level X_train / y_train.

    Parameters:
    trial (optuna.trial.Trial): Trial object used to sample the hyperparameters.

    Returns:
    float: Mean validation accuracy across the folds (the study maximizes it).
    """
    # Define the hyperparameter search space for LightGBM
    param_lgbm = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 15, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM applies row subsampling only when the bagging frequency is > 0;
        # with the default of 0 the tuned `subsample` value would have no effect.
        # Sampling it through the trial also places it in study.best_params, so any
        # model built from the best parameters uses the same bagging setup.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'n_jobs': -1
    }

    # Stratified K-Fold to preserve label distribution across folds
    skf_lgbm = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores_lgbm = []

    for train_idx, val_idx in skf_lgbm.split(X_train, y_train):
        # Split training and validation data for each fold
        X_tr_lgbm, X_val_lgbm = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr_lgbm, y_val_lgbm = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Initialize and train LightGBM model with current parameters;
        # early stopping monitors the validation fold passed as eval_set
        model_lgbm = lgb.LGBMClassifier(**param_lgbm, verbose=-1)
        model_lgbm.fit(
            X_tr_lgbm, y_tr_lgbm,
            eval_set=[(X_val_lgbm, y_val_lgbm)],
            callbacks=[lgb.early_stopping(50, verbose=False)]
        )

        # Predict on validation set and compute accuracy
        val_pred_lgbm = model_lgbm.predict(X_val_lgbm)
        scores_lgbm.append(accuracy_score(y_val_lgbm, val_pred_lgbm))

    # Return mean cross-validation score
    return np.mean(scores_lgbm)

# Run Optuna study with the defined objective
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=30)  # Increase n_trials for better optimization

# Display best hyperparameters found by Optuna
print("Best hyperparameters found by Optuna:", study_lgbm.best_params)

# 3. Train Final Model with Out-of-Fold (OOF) and Test Predictions
best_params_lgbm = study_lgbm.best_params
# Add the fixed settings that are not part of the search. verbose=-1 matches the
# setting used inside objective_lgbm, so the ten final fits stay as quiet as the
# tuning runs instead of writing LightGBM log lines into the cell output.
best_params_lgbm.update({'random_state': 42, 'n_jobs': -1, 'verbose': -1})

# Use 10-fold Stratified CV for final training and predictions
skf_lgbm = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_lgbm = []

# Initialize arrays to store predictions
test_preds_lgbm = np.zeros((len(X_test), skf_lgbm.n_splits))  # Test predictions per fold (one column each)
oof_preds_lgbm = np.zeros(len(X_train))                       # Out-of-fold predictions

for fold, (train_idx, val_idx) in enumerate(skf_lgbm.split(X_train, y_train)):
    # Split data into train and validation sets
    X_tr_lgbm, X_val_lgbm = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_lgbm, y_val_lgbm = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Train model with best parameters from Optuna;
    # early stopping monitors the validation fold passed as eval_set
    model_lgbm = lgb.LGBMClassifier(**best_params_lgbm)
    model_lgbm.fit(
        X_tr_lgbm, y_tr_lgbm,
        eval_set=[(X_val_lgbm, y_val_lgbm)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )

    # Predict and evaluate on validation set
    val_pred_lgbm = model_lgbm.predict(X_val_lgbm)
    score_lgbm = accuracy_score(y_val_lgbm, val_pred_lgbm)
    scores_lgbm.append(score_lgbm)

    # Store OOF predicted probabilities (class 1) for ensemble/blending
    oof_preds_lgbm[val_idx] = model_lgbm.predict_proba(X_val_lgbm)[:, 1]

    # Store class-1 probabilities on the test set for this fold
    test_preds_lgbm[:, fold] = model_lgbm.predict_proba(X_test)[:, 1]

# Convert OOF probabilities to binary predictions using 0.5 threshold
oof_binary_lgbm = (oof_preds_lgbm > 0.5).astype(int)
print("OOF accuracy:", accuracy_score(y_train, oof_binary_lgbm))

# Average test predictions across all folds
mean_preds_lgbm = test_preds_lgbm.mean(axis=1)
final_test_pred_lgbm = (mean_preds_lgbm > 0.5).astype(int)

# Print individual fold accuracies and overall mean CV score
print(f'Fold accuracies: {scores_lgbm}')
print(f'Mean CV accuracy: {np.mean(scores_lgbm):.4f}')

In [None]:
#catboost

%pip install catboost 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

cat_features_cat = []  # Empty by default; list column indices/names here if you have categorical features

# 1. Optuna - Hyperparameter Optimization for CatBoost
def objective_cat(trial):
    """
    Optuna objective for CatBoost: mean accuracy over 5 stratified CV folds.

    Reads the notebook-level X_train / y_train and cat_features_cat.

    Parameters:
    trial (optuna.trial.Trial): Trial object used to sample the hyperparameters.

    Returns:
    float: Mean validation accuracy across the folds.
    """
    # Hyperparameters sampled by the trial (search space)
    tuned = {
        'iterations': trial.suggest_int('iterations', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 2.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Settings that stay the same for every trial
    fixed = {
        'random_seed': 42,
        'verbose': 0,  # Suppress CatBoost training output
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'cat_features': cat_features_cat,
    }
    model_kwargs = {**tuned, **fixed}

    # 5-fold stratified cross-validation
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracies = []

    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

        # Fit on the training part; early stopping monitors the held-out fold
        classifier = CatBoostClassifier(**model_kwargs)
        classifier.fit(
            X_fit, y_fit,
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=50,
            use_best_model=True
        )

        # Accuracy on the held-out fold
        holdout_pred = classifier.predict(X_holdout)
        fold_accuracies.append(accuracy_score(y_holdout, holdout_pred))

    # Average validation score across folds
    return np.mean(fold_accuracies)

# Launch the Optuna search; the study maximizes the mean validation accuracy
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=30)  # Increase n_trials for more thorough optimization

# Report the best configuration found
print("Best hyperparameters found by Optuna:", study_cat.best_params)

# 2. Train Final Model Using Out-of-Fold (OOF) and Test Predictions

# Best tuned values plus the fixed settings that are not part of the search
best_params_cat = study_cat.best_params
best_params_cat.update(
    random_seed=42,
    verbose=0,
    loss_function='Logloss',
    eval_metric='Accuracy',
    cat_features=cat_features_cat,
)

# 10-fold stratified cross-validation for the final models
skf_cat = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_cat = []

# Prediction buffers: one test column per fold, one OOF slot per training row
test_preds_cat = np.zeros((X_test.shape[0], skf_cat.n_splits))
oof_preds_cat = np.zeros(X_train.shape[0])

for fold, (train_idx, val_idx) in enumerate(skf_cat.split(X_train, y_train)):
    # Features and labels for this fold
    X_tr_cat, y_tr_cat = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val_cat, y_val_cat = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Fit with the best parameters; early stopping monitors the validation fold
    model_cat = CatBoostClassifier(**best_params_cat)
    model_cat.fit(
        X_tr_cat, y_tr_cat,
        eval_set=(X_val_cat, y_val_cat),
        early_stopping_rounds=50,
        use_best_model=True
    )

    # Fold accuracy from hard predictions
    val_pred_cat = model_cat.predict(X_val_cat)
    score_cat = accuracy_score(y_val_cat, val_pred_cat)
    scores_cat.append(score_cat)

    # Class-1 probabilities: OOF slots for this fold and the fold's test column
    oof_preds_cat[val_idx] = model_cat.predict_proba(X_val_cat)[:, 1]
    test_preds_cat[:, fold] = model_cat.predict_proba(X_test)[:, 1]

# Threshold the OOF probabilities at 0.5 and score them against the labels
oof_binary_cat = (oof_preds_cat > 0.5).astype(int)
print("OOF accuracy:", accuracy_score(y_train, oof_binary_cat))

# Fold-averaged test probabilities and their thresholded labels
mean_preds_cat = np.mean(test_preds_cat, axis=1)
final_test_pred_cat = (mean_preds_cat > 0.5).astype(int)

# Fold-wise and mean cross-validation accuracy
print(f'Fold accuracy: {scores_cat}')
print(f'Mean CV accuracy: {np.mean(scores_cat):.4f}')


In [None]:
import xgboost as xgb

# 1. Optuna - hyperparameter optimization
def objective_xgb(trial):
    """
    Optuna objective for XGBoost: mean accuracy over 5 stratified CV folds.

    Reads the notebook-level X_train / y_train.

    Parameters:
    trial (optuna.trial.Trial): Trial object used to sample the hyperparameters.

    Returns:
    float: Mean validation accuracy across the folds.
    """
    # Hyperparameters sampled by the trial (search space)
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
    }
    # Settings that stay the same for every trial
    fixed = {
        'random_state': 42,
        'n_jobs': -1,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'early_stopping_rounds': 50,
    }
    model_kwargs = {**tuned, **fixed}

    # 5-fold Stratified Cross-Validation
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracies = []

    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

        # Fit quietly; the held-out fold is the evaluation set for early stopping
        booster = xgb.XGBClassifier(**model_kwargs)
        booster.fit(
            X_fit, y_fit,
            eval_set=[(X_holdout, y_holdout)],
            verbose=False
        )

        # Accuracy on the held-out fold
        holdout_pred = booster.predict(X_holdout)
        fold_accuracies.append(accuracy_score(y_holdout, holdout_pred))

    # Mean cross-validation score
    return np.mean(fold_accuracies)

# Launch the Optuna search; the study maximizes the mean validation accuracy
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)

# Best tuned values plus the fixed settings that are not part of the search
best_params_xgb = study_xgb.best_params
best_params_xgb.update(
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss',
    early_stopping_rounds=50,
)

# 10-fold Stratified Cross-Validation for final training and evaluation
skf_xgb = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_xgb = []

# Prediction buffers: one test column per fold, one OOF slot per training row
test_preds_xgb = np.zeros((X_test.shape[0], skf_xgb.n_splits))
oof_preds_xgb = np.zeros(X_train.shape[0])

for fold, (train_idx, val_idx) in enumerate(skf_xgb.split(X_train, y_train)):
    # Features and labels for this fold
    X_tr_xgb, y_tr_xgb = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val_xgb, y_val_xgb = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Fit with the best parameters; the validation fold is the evaluation set
    model_xgb = xgb.XGBClassifier(**best_params_xgb)
    model_xgb.fit(
        X_tr_xgb, y_tr_xgb,
        eval_set=[(X_val_xgb, y_val_xgb)],
        verbose=False
    )

    # Fold accuracy from hard predictions
    fold_pred_xgb = model_xgb.predict(X_val_xgb)
    scores_xgb.append(accuracy_score(y_val_xgb, fold_pred_xgb))

    # Class-1 probabilities: OOF slots for this fold and the fold's test column
    oof_preds_xgb[val_idx] = model_xgb.predict_proba(X_val_xgb)[:, 1]
    test_preds_xgb[:, fold] = model_xgb.predict_proba(X_test)[:, 1]

# Threshold the OOF probabilities at 0.5
oof_binary_xgb = (oof_preds_xgb > 0.5).astype(int)

# Overall OOF accuracy
print("OOF accuracy:", accuracy_score(y_train, oof_binary_xgb))

# Per-fold accuracies and their mean
mean_score_xgb = np.mean(scores_xgb)
print("Fold accuracy:", scores_xgb)
print(f"Mean CV accuracy: {mean_score_xgb:.4f}")

# Fold-averaged test probabilities and their thresholded labels
mean_preds_xgb = np.mean(test_preds_xgb, axis=1)
final_test_pred_xgb = (mean_preds_xgb > 0.5).astype(int)


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# ===== OOF predictions ensemble (simple average of the three models) =====
# Equal-weight blend of the LightGBM, CatBoost and XGBoost out-of-fold probabilities
oof_preds_ensemble = (oof_preds_lgbm + oof_preds_cat + oof_preds_xgb) / 3

# Threshold the blended probabilities at 0.5
oof_binary_ensemble = (oof_preds_ensemble > 0.5).astype(int)

# Overall OOF accuracy of the blend
oof_accuracy_ensemble = accuracy_score(y_train, oof_binary_ensemble)
print("OOF accuracy (ensemble):", oof_accuracy_ensemble)

# ===== CV accuracy per fold (based on ensemble OOF predictions) =====
# Same splitter settings (10 folds, shuffle, seed 42) as the per-model final loops
skf_ensemble = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores_ensemble = []

for fold, (train_idx, val_idx) in enumerate(skf_ensemble.split(X_train, y_train)):
    # Blended probabilities of the rows held out in this fold
    # (element-wise identical to averaging the three per-model slices)
    oof_fold = oof_preds_ensemble[val_idx]

    # Hard labels for the fold
    oof_fold_binary = (oof_fold > 0.5).astype(int)

    # Accuracy of the blend on this fold
    acc = accuracy_score(y_train.iloc[val_idx], oof_fold_binary)
    cv_scores_ensemble.append(acc)

# Mean cross-validation accuracy across all folds
mean_cv_accuracy_ensemble = np.mean(cv_scores_ensemble)
print(f"Fold accuracy (ensemble): {cv_scores_ensemble}")
print(f"Mean CV accuracy (ensemble): {mean_cv_accuracy_ensemble:.4f}")

# ===== Test predictions ensemble (average over folds, then over models) =====
# Per-model test probabilities averaged over the fold columns
mean_preds_lgbm = np.mean(test_preds_lgbm, axis=1)
mean_preds_cat = np.mean(test_preds_cat, axis=1)
mean_preds_xgb = np.mean(test_preds_xgb, axis=1)

# Equal-weight blend of the three models
ensemble_test = (mean_preds_lgbm + mean_preds_cat + mean_preds_xgb) / 3

# Final binary predictions on the test set using a 0.5 threshold
final_ensemble_pred = (ensemble_test > 0.5).astype(int)


In [None]:
# Translate the 0/1 predictions back into the string labels ('Extrovert', 'Introvert')
int_to_label = {0: 'Extrovert', 1: 'Introvert'}
final_ensemble_pred_labels = pd.Series(final_ensemble_pred).map(int_to_label).values

# The sample submission file provides the required format and IDs
ssub = pd.read_csv("/kaggle/input/playground-series-s5e7/sample_submission.csv")

# Overwrite its 'Personality' column with our predicted labels (assigned by position)
ssub['Personality'] = final_ensemble_pred_labels

# Write the submission without the DataFrame index
ssub.to_csv("submission.csv", index=False)
