## Data loading

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.ensemble import VotingClassifier
import joblib
import os
from IPython.display import display

os.makedirs('eda_plots', exist_ok=True)

## Load datasets with CORRECTED paths

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

def clean_column_names(df):
    df.columns = [re.sub(r'\W+', '_', col.strip()) for col in df.columns]
    return df

train = clean_column_names(train)
test = clean_column_names(test)

print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist())

Train columns: ['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']
Test columns: ['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']


## Fixed target column identification

In [3]:
def find_target_column(train_df, test_df):
    diff = set(train_df.columns) - set(test_df.columns)
    if len(diff) == 1:
        return list(diff)[0]
    
    target_candidates = ['personality', 'target', 'class', 'label']
    for col in train_df.columns:
        if any(tc in col.lower() for tc in target_candidates):
            return col
    
    for col in train_df.columns:
        if train_df[col].nunique() == 2:
            return col
    
    raise ValueError("Target column not found")

target_col = find_target_column(train, test)
print(f"\nIdentified target column: {target_col}")


Identified target column: Personality


## Handle missing target values

In [4]:
train = train.dropna(subset=[target_col])
train[target_col] = train[target_col].astype(str).str.strip().str.lower()

if 'intro' in train[target_col].iloc[0].lower() or 'extro' in train[target_col].iloc[0].lower():
    train[target_col] = train[target_col].apply(
        lambda x: 'introvert' if 'intro' in x.lower() else 'extrovert'
    )
else:
    unique_vals = train[target_col].unique()
    if len(unique_vals) == 2:
        print(f"Mapping values: {unique_vals[0]} -> introvert, {unique_vals[1]} -> extrovert")
        train[target_col] = train[target_col].map({
            unique_vals[0]: 'introvert',
            unique_vals[1]: 'extrovert'
        })
    else:
        raise ValueError(f"Unexpected target values: {unique_vals}")


## Personality distribution

In [5]:
print("\nStarting EDA...")
plt.figure(figsize=(10, 6))
sns.countplot(x=target_col, data=train, palette='viridis')
plt.title('Distribution of Personality Types')
plt.savefig('eda_plots/personality_distribution.png', bbox_inches='tight')
plt.close()
print("- Personality distribution plot saved")


Starting EDA...
- Personality distribution plot saved


## Numerical features

In [6]:
num_features = [col for col in train.columns if train[col].dtype in ['float64', 'int64'] and col != 'id' and col != target_col]
if num_features:
    print(f"\nNumerical features found: {len(num_features)}")
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(num_features[:min(9, len(num_features))], 1):
        plt.subplot(3, 3, i)
        sns.histplot(train[feature], kde=True, bins=30)
        plt.title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.savefig('eda_plots/numerical_distributions.png', bbox_inches='tight')
    plt.close()
    print("- Numerical distributions plot saved")
    
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(num_features[:min(6, len(num_features))], 1):
        plt.subplot(2, 3, i)
        sns.boxplot(x=target_col, y=feature, data=train, palette='coolwarm')
        plt.title(f'{feature} by Personality')
    plt.tight_layout()
    plt.savefig('eda_plots/numerical_boxplots.png', bbox_inches='tight')
    plt.close()
    print("- Numerical boxplots saved")


Numerical features found: 5


  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


- Numerical distributions plot saved
- Numerical boxplots saved


## Pair plot of numerical features

In [7]:
print("\nGenerating Pair Plots for Top Numerical Features (Train and Test)")


top_numerical_cols = num_features[:5] if len(num_features) > 0 else []
if top_numerical_cols:
    # Pair plots for training set
    pair_plot_df_train = train[top_numerical_cols + [target_col]].copy()
    plt.figure(figsize=(15, 15))
    sns.pairplot(pair_plot_df_train, hue=target_col, diag_kind='kde', palette='viridis')
    plt.suptitle('Pair Plots of Top Numerical Features by Personality (Train)', y=1.02)
    plt.savefig('eda_plots/numerical_pair_plots_train.png', bbox_inches='tight')
    plt.close()
    print("- Pair plots for train saved")

    # Pair plots for test set
    pair_plot_df_test = test[top_numerical_cols].copy()
    plt.figure(figsize=(15, 15))
    sns.pairplot(pair_plot_df_test, diag_kind='kde', palette='viridis')
    plt.suptitle('Pair Plots of Top Numerical Features (Test)', y=1.02)
    plt.savefig('eda_plots/numerical_pair_plots_test.png', bbox_inches='tight')
    plt.close()
    print("- Pair plots for test saved")
else:
    print("No numerical features available for pair plots.")


Generating Pair Plots for Top Numerical Features (Train and Test)


  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  self._figure.tight_layo

- Pair plots for train saved


  func(x=vector, **plot_kwargs)
  with pd.option_context('mode.use_inf_as_na', True):
  func(x=vector, **plot_kwargs)
  with pd.option_context('mode.use_inf_as_na', True):
  func(x=vector, **plot_kwargs)
  with pd.option_context('mode.use_inf_as_na', True):
  func(x=vector, **plot_kwargs)
  with pd.option_context('mode.use_inf_as_na', True):
  func(x=vector, **plot_kwargs)
  with pd.option_context('mode.use_inf_as_na', True):
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  func(x=x, y=y, **kwargs)
  self._figure.tight_layout(*a

- Pair plots for test saved


<Figure size 1500x1500 with 0 Axes>

<Figure size 1500x1500 with 0 Axes>

## Categorical features

In [8]:
cat_features = [col for col in train.columns if train[col].dtype == 'object' and col != target_col and col != 'id']
if cat_features:
    print(f"\nCategorical features found: {len(cat_features)}")
    plt.figure(figsize=(20, 15))
    for i, feature in enumerate(cat_features[:min(9, len(cat_features))], 1):
        plt.subplot(3, 3, i)
        sns.countplot(x=feature, hue=target_col, data=train, palette='viridis')
        plt.title(f'{feature} Distribution')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/categorical_distributions.png', bbox_inches='tight')
    plt.close()
    print("- Categorical distributions plot saved")
    
    plt.figure(figsize=(20, 15))
    for i, feature in enumerate(cat_features[:min(6, len(cat_features))], 1):
        plt.subplot(2, 3, i)
        prop_df = (train.groupby(feature)[target_col]
                   .value_counts(normalize=True)
                   .rename('percentage')
                   .reset_index())
        sns.barplot(x=feature, y='percentage', hue=target_col, data=prop_df, palette='viridis')
        plt.title(f'Personality % by {feature}')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/categorical_percentages.png', bbox_inches='tight')
    plt.close()
    print("- Categorical percentages plot saved")


Categorical features found: 2
- Categorical distributions plot saved
- Categorical percentages plot saved


## Correlation analysis

In [9]:
if num_features:
    corr_matrix = train[num_features + [target_col]].copy()
    corr_matrix[target_col] = corr_matrix[target_col].map({'introvert': 0, 'extrovert': 1})
    corr = corr_matrix.corr()
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.savefig('eda_plots/correlation_matrix.png', bbox_inches='tight')
    plt.close()
    print("- Correlation matrix plot saved")
    
    target_corr = corr[target_col].drop(target_col).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=target_corr.values, y=target_corr.index, palette='viridis')
    plt.title('Feature Correlation with Personality')
    plt.xlabel('Correlation Coefficient')
    plt.savefig('eda_plots/target_correlations.png', bbox_inches='tight')
    plt.close()
    print("- Target correlations plot saved")

  xa[xa < 0] = -1


- Correlation matrix plot saved
- Target correlations plot saved


## Feature engineering with SAFE column checks

In [10]:
print("\nStarting Feature Engineering...")

def create_features(df):
    df = df.copy()
    new_features = []
    
    social_cols = ['Social_event_attendance', 'Going_outside', 
                   'Friends_circle_size', 'Post_frequency']
    available_social = [col for col in social_cols if col in df.columns]
    
    if 'Social_event_attendance' in available_social and 'Going_outside' in available_social:
        df['Social_Going_ratio'] = df['Social_event_attendance'] / (df['Going_outside'].replace(0, 0.1))
        df['Social_Going_diff'] = df['Social_event_attendance'] - df['Going_outside']
        new_features.extend(['Social_Going_ratio', 'Social_Going_diff'])
    
    if 'Friends_circle_size' in available_social and 'Post_frequency' in available_social:
        df['Friend_Post_ratio'] = df['Friends_circle_size'] / (df['Post_frequency'].replace(0, 0.1) + 1e-5)
        df['Friend_Post_product'] = df['Friends_circle_size'] * df['Post_frequency']
        new_features.extend(['Friend_Post_ratio', 'Friend_Post_product'])
    
    if 'Stage_fear' in df.columns and 'Drained_after_socializing' in df.columns:
        df['Avoids_Interaction'] = np.where(
            (df['Stage_fear'] == 'Yes') | (df['Drained_after_socializing'] == 'Yes'), 1, 0)
        new_features.append('Avoids_Interaction')
    
    if available_social:
        df['Social_Index'] = df[available_social].mean(axis=1)
        df['Social_Std'] = df[available_social].std(axis=1).fillna(0)
        new_features.extend(['Social_Index', 'Social_Std'])
    
    if 'Time_spent_Alone' in df.columns:
        df['Alone_Ratio'] = df['Time_spent_Alone'] / 24
        df['Alone_Log'] = np.log1p(df['Time_spent_Alone'])
        new_features.extend(['Alone_Ratio', 'Alone_Log'])
    
    print(f"Created {len(new_features)} new features")
    return df, new_features

print("\nEngineering features for train data:")
train, new_features = create_features(train)
print("\nEngineering features for test data:")
test, _ = create_features(test)

print(f"\n{len(new_features)} new features created:")
print(new_features)




Starting Feature Engineering...

Engineering features for train data:
Created 9 new features

Engineering features for test data:
Created 9 new features

9 new features created:
['Social_Going_ratio', 'Social_Going_diff', 'Friend_Post_ratio', 'Friend_Post_product', 'Avoids_Interaction', 'Social_Index', 'Social_Std', 'Alone_Ratio', 'Alone_Log']


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


## EDA for new features

In [11]:
if new_features:
    print("\nAnalyzing new features...")
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(new_features[:min(6, len(new_features))], 1):
        plt.subplot(2, 3, i)
        if train[feature].nunique() > 10:
            sns.boxplot(x=target_col, y=feature, data=train, palette='viridis')
        else:
            sns.countplot(x=feature, hue=target_col, data=train, palette='viridis')
        plt.title(f'{feature} by Personality')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/new_features.png', bbox_inches='tight')
    plt.close()
    print("- New features plot saved")


Analyzing new features...
- New features plot saved


##  Preprocessing

In [12]:
print("\nStarting preprocessing...")
X = train.drop(columns=[target_col, 'id'], errors='ignore')
y = train[target_col].map({'introvert': 0, 'extrovert': 1})

num_features = [col for col in X.columns if X[col].dtype in ['float64', 'int64']]
cat_features = [col for col in X.columns if X[col].dtype == 'object']

print(f"\nPreprocessing features:")
print(f"Numerical features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")
print(f"Total features: {len(num_features) + len(cat_features)}")


Starting preprocessing...

Preprocessing features:
Numerical features: 14
Categorical features: 2
Total features: 16


## Create preprocessing pipelines

In [13]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])


## Apply preprocessing

In [14]:
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test.drop(columns=['id'], errors='ignore'))

print(f"Processed train shape: {X_processed.shape}")
print(f"Processed test shape: {X_test_processed.shape}")

Processed train shape: (18524, 16)
Processed test shape: (6175, 16)


## Split data

In [15]:
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.2, stratify=y, random_state=42
)

## Calculate class weights

In [16]:
class_0_count = np.sum(y == 0)
class_1_count = np.sum(y == 1)
class_0_weight = len(y) / (2 * class_0_count)
class_1_weight = len(y) / (2 * class_1_count)
class_weights = {0: class_0_weight, 1: class_1_weight}

print(f"\nClass distribution: Introvert={class_0_count}, Extrovert={class_1_count}")
print(f"Class weights: Introvert={class_0_weight:.2f}, Extrovert={class_1_weight:.2f}")


Class distribution: Introvert=4825, Extrovert=13699
Class weights: Introvert=1.92, Extrovert=0.68


## XGBoost Model

In [17]:
print("\nTraining XGBoost model...")
xgb_params = {
    'learning_rate': 0.015,
    'max_depth': 5,
    'min_child_weight': 3,
    'subsample': 0.85,
    'colsample_bytree': 0.75,
    'gamma': 0.3,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'n_estimators': 1500,
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'eval_metric': 'logloss',
    'random_state': 42,
    'scale_pos_weight': class_0_weight / class_1_weight
}

xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100,
    verbose=100
)

best_iter = xgb_model.best_iteration
print(f"XGBoost best iteration: {best_iter}")

xgb_final = XGBClassifier(
    **{**xgb_params, 'n_estimators': best_iter}
)
xgb_final.fit(X_processed, y)
print("XGBoost training complete!")


Training XGBoost model...
[0]	validation_0-logloss:0.58111




[100]	validation_0-logloss:0.19303
[200]	validation_0-logloss:0.16129
[300]	validation_0-logloss:0.16132
[346]	validation_0-logloss:0.16237
XGBoost best iteration: 246
XGBoost training complete!


## Feature importance plot

In [18]:
plt.figure(figsize=(12, 8))
sorted_idx = xgb_final.feature_importances_.argsort()
feature_names = num_features + cat_features
plt.barh(np.array(feature_names)[sorted_idx][-20:], 
        xgb_final.feature_importances_[sorted_idx][-20:])
plt.title("XGBoost Feature Importance (Top 20)")
plt.savefig('eda_plots/xgboost_feature_importance.png', bbox_inches='tight')
plt.close()

# LightGBM Model
print("\nTraining LightGBM model...")
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.018,
    'num_leaves': 41,
    'max_depth': 6,
    'min_child_samples': 25,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'n_estimators': 1200,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1
}

lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100)
    ]
)

best_iter_lgb = lgb_model.best_iteration_
print(f"LightGBM best iteration: {best_iter_lgb}")

lgb_final = LGBMClassifier(
    **{**lgb_params, 'n_estimators': best_iter_lgb}
)
lgb_final.fit(X_processed, y)
print("LightGBM training complete!")


Training LightGBM model...
[LightGBM] [Info] Number of positive: 10959, number of negative: 3860
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 610
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.198567
[200]	valid_0's binary_logloss: 0.159214
[300]	valid_0's binary_logloss: 0.157395
Early stopping, best iteration is:
[272]	valid_0's binary_logloss: 0.157221
LightGBM best iteration: 272
[LightGBM] [Info] Number of positive: 13699, number of negative: 4825
[LightGBM] [Info] Auto-choosing row-wise multi-threading, 

## Feature importance plot

In [19]:
plt.figure(figsize=(12, 8))
lgb_importances = lgb_final.feature_importances_
sorted_idx = lgb_importances.argsort()
plt.barh(np.array(feature_names)[sorted_idx][-20:], 
        lgb_importances[sorted_idx][-20:])
plt.title("LightGBM Feature Importance (Top 20)")
plt.savefig('eda_plots/lightgbm_feature_importance.png', bbox_inches='tight')
plt.close()



## Ensemble Model

In [20]:
print("\nCreating ensemble model...")
ensemble = VotingClassifier(
    estimators=[
        ('xgb', xgb_final),
        ('lgb', lgb_final)
    ],
    voting='soft',
    weights=[1, 1.3]
)

ensemble.fit(X_processed, y)
print("Ensemble training complete!")


Creating ensemble model...
[LightGBM] [Info] Number of positive: 13699, number of negative: 4825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 615
[LightGBM] [Info] Number of data points in the train set: 18524, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Ensemble training complete!


## Model Evaluation

In [21]:
print("\nModel Evaluation:")
def evaluate_model(model, name, X_val, y_val):
    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred)
    print(f"{name} Validation Accuracy: {val_acc:.4f}")
    print(f"{name} Validation F1 Score: {val_f1:.4f}")
    print(classification_report(y_val, val_pred))
    
    cm = confusion_matrix(y_val, val_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Introvert', 'Extrovert'], 
                yticklabels=['Introvert', 'Extrovert'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'{name} Confusion Matrix')
    plt.savefig(f'eda_plots/{name}_confusion_matrix.png', bbox_inches='tight')
    plt.close()
    return val_acc

xgb_acc = evaluate_model(xgb_final, 'XGBoost', X_val, y_val)
lgb_acc = evaluate_model(lgb_final, 'LightGBM', X_val, y_val)
ensemble_acc = evaluate_model(ensemble, 'Ensemble', X_val, y_val)


Model Evaluation:
XGBoost Validation Accuracy: 0.9665
XGBoost Validation F1 Score: 0.9775
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       965
           1       0.97      0.98      0.98      2740

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705

LightGBM Validation Accuracy: 0.9655
LightGBM Validation F1 Score: 0.9767
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       965
           1       0.97      0.98      0.98      2740

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.95      3705
weighted avg       0.97      0.97      0.97      3705

Ensemble Validation Accuracy: 0.9663
Ensemble Validation F1 Score: 0.9773
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       96

## Performance comparison

In [22]:
model_names = ['XGBoost', 'LightGBM', 'Ensemble']
acc_scores = [xgb_acc, lgb_acc, ensemble_acc]
f1_scores = [f1_score(y_val, xgb_final.predict(X_val)), 
             f1_score(y_val, lgb_final.predict(X_val)), 
             f1_score(y_val, ensemble.predict(X_val))]

plt.figure(figsize=(10, 6))
x = np.arange(len(model_names))
width = 0.35

plt.bar(x - width/2, acc_scores, width, label='Accuracy')
plt.bar(x + width/2, f1_scores, width, label='F1 Score')

plt.ylabel('Scores')
plt.title('Model Performance Comparison')
plt.xticks(x, model_names)
plt.ylim(0, 1)
plt.legend()
plt.savefig('eda_plots/model_performance.png', bbox_inches='tight')
plt.close()

## Select best model

In [23]:
if ensemble_acc > max(xgb_acc, lgb_acc):
    final_model = ensemble
    print("\nSelected Ensemble as final model")
elif lgb_acc > xgb_acc:
    final_model = lgb_final
    print("\nSelected LightGBM as final model")
else:
    final_model = xgb_final
    print("\nSelected XGBoost as final model")


Selected XGBoost as final model


## Generate predictions

In [24]:
test_pred = final_model.predict(X_test_processed)
test_labels = ['Introvert' if pred == 0 else 'Extrovert' for pred in test_pred]

## Create submission

In [25]:
submission = pd.DataFrame({
    'id': test['id'],
    'Personality': test_labels
})
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created: submission.csv")


Submission file created: submission.csv


## Save models

In [26]:
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(final_model, 'personality_model.pkl')
print("Models saved: preprocessor.pkl and personality_model.pkl")

print("\n" + "="*50)
print("MODEL TRAINING COMPLETE!")
print(f"Final Validation Accuracy: {ensemble_acc:.4f}")
print(f"Total Features Used: {len(feature_names)}")
print(f"New Features Created: {len(new_features)}")
print("EDA plots saved in 'eda_plots/' directory")
print("="*50)

Models saved: preprocessor.pkl and personality_model.pkl

MODEL TRAINING COMPLETE!
Final Validation Accuracy: 0.9663
Total Features Used: 16
New Features Created: 9
EDA plots saved in 'eda_plots/' directory


## function to preprocess new input data and make predictions

In [27]:
def predict_personality(input_data, preprocessor, model):
    """
    Predict personality type from input data using the trained model
    
    Args:
        input_data (dict): Dictionary of feature values
        preprocessor (ColumnTransformer): Trained preprocessing pipeline
        model: Trained classification model
        
    Returns:
        tuple: (personality_label, confidence_score)
    """
    
    input_df = pd.DataFrame([input_data])
    
    input_df = clean_column_names(input_df)
    
    input_df, _ = create_features(input_df)
    
    if 'id' in input_df.columns:
        input_df = input_df.drop(columns=['id'])
    
    input_processed = preprocessor.transform(input_df)
    
    pred_proba = model.predict_proba(input_processed)[0]
    prediction = model.predict(input_processed)[0]
    
    personality = 'Introvert' if prediction == 0 else 'Extrovert'
    
    confidence = pred_proba[prediction]
    
    return personality, confidence

if __name__ == "__main__":
    
    example_input = {
        'Time_spent_Alone': 8.5,
        'Social_event_attendance': 1,
        'Going_outside': 2,
        'Friends_circle_size': 5,
        'Post_frequency': 1,
        'Stage_fear': 'Yes',
        'Drained_after_socializing': 'Yes'
    }
    
    personality, confidence = predict_personality(
        example_input, 
        preprocessor, 
        final_model
    )
    
    print(f"\nPredicted Personality: {personality}")
    print(f"Confidence Score: {confidence:.2f}")

Created 9 new features

Predicted Personality: Introvert
Confidence Score: 0.83
