In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.ensemble import VotingClassifier
import joblib
import os

In [2]:
os.makedirs('eda_plots', exist_ok=True)

train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

def clean_column_names(df):
    df.columns = [re.sub(r'\W+', '_', col.strip()) for col in df.columns]
    return df

train = clean_column_names(train)
test = clean_column_names(test)

# Find target column
target_col = [col for col in train.columns if 'Personality' in col][0]

# Data cleaning and validation
train = train.dropna(subset=[target_col])
train[target_col] = train[target_col].str.strip().str.lower()
train[target_col] = train[target_col].apply(lambda x: 'introvert' if 'intro' in x else 'extrovert')

In [3]:
plt.figure(figsize=(10, 6))
sns.countplot(x=target_col, data=train, palette='viridis')
plt.title('Distribution of Personality Types')
plt.savefig('eda_plots/personality_distribution.png', bbox_inches='tight')
plt.close()

In [4]:
num_features = [col for col in train.columns if train[col].dtype in ['float64', 'int64'] and col != 'id']
if num_features:
    
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(num_features[:min(9, len(num_features))], 1):
        plt.subplot(3, 3, i)
        sns.histplot(train[feature], kde=True, bins=30, palette='coolwarm')
        plt.title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.savefig('eda_plots/numerical_distributions.png', bbox_inches='tight')
    plt.close()
    
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(num_features[:min(6, len(num_features))], 1):
        plt.subplot(2, 3, i)
        sns.boxplot(x=target_col, y=feature, data=train, palette='coolwarm')
        plt.title(f'{feature} by Personality')
    plt.tight_layout()
    plt.savefig('eda_plots/numerical_boxplots.png', bbox_inches='tight')
    plt.close()


  sns.histplot(train[feature], kde=True, bins=30, palette='coolwarm')
  with pd.option_context('mode.use_inf_as_na', True):
  sns.histplot(train[feature], kde=True, bins=30, palette='coolwarm')
  with pd.option_context('mode.use_inf_as_na', True):
  sns.histplot(train[feature], kde=True, bins=30, palette='coolwarm')
  with pd.option_context('mode.use_inf_as_na', True):
  sns.histplot(train[feature], kde=True, bins=30, palette='coolwarm')
  with pd.option_context('mode.use_inf_as_na', True):
  sns.histplot(train[feature], kde=True, bins=30, palette='coolwarm')
  with pd.option_context('mode.use_inf_as_na', True):


In [6]:
cat_features = [col for col in train.columns if train[col].dtype == 'object' and col != target_col]
if cat_features:
    
    plt.figure(figsize=(20, 15))
    for i, feature in enumerate(cat_features[:min(9, len(cat_features))], 1):
        plt.subplot(3, 3, i)
        sns.countplot(x=feature, hue=target_col, data=train, palette='viridis')
        plt.title(f'{feature} Distribution')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/categorical_distributions.png', bbox_inches='tight')
    plt.close()
    
    
    plt.figure(figsize=(20, 15))
    for i, feature in enumerate(cat_features[:min(6, len(cat_features))], 1):
        plt.subplot(2, 3, i)
        prop_df = (train.groupby(feature)[target_col]
                   .value_counts(normalize=True)
                   .rename('percentage')
                   .reset_index())
        sns.barplot(x=feature, y='percentage', hue=target_col, data=prop_df, palette='viridis')
        plt.title(f'Personality % by {feature}')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/categorical_percentages.png', bbox_inches='tight')
    plt.close()

In [7]:
if num_features:
    corr_matrix = train[num_features + [target_col]].copy()
    corr_matrix[target_col] = corr_matrix[target_col].map({'introvert': 0, 'extrovert': 1})
    corr = corr_matrix.corr()
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.savefig('eda_plots/correlation_matrix.png', bbox_inches='tight')
    plt.close()
    
    target_corr = corr[target_col].drop(target_col).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=target_corr.values, y=target_corr.index, palette='viridis')
    plt.title('Feature Correlation with Personality')
    plt.xlabel('Correlation Coefficient')
    plt.savefig('eda_plots/target_correlations.png', bbox_inches='tight')
    plt.close()

  xa[xa < 0] = -1


In [8]:
time_cols = [col for col in train.columns if 'time' in col.lower() or 'hours' in col.lower()]
if time_cols:
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(time_cols, 1):
        plt.subplot(1, len(time_cols), i)
        sns.boxplot(x=target_col, y=col, data=train, palette='coolwarm')
        plt.title(f'{col} by Personality')
    plt.tight_layout()
    plt.savefig('eda_plots/time_features.png', bbox_inches='tight')
    plt.close()

In [11]:
social_cols = [col for col in train.columns if 'social' in col.lower() or 'friend' in col.lower()]
social_numeric = [col for col in social_cols if train[col].dtype in ['float64', 'int64']]
social_categorical = [col for col in social_cols if train[col].dtype == 'object']

if social_numeric:
    plt.figure(figsize=(15, 10))
    for i, col in enumerate(social_numeric[:min(4, len(social_numeric))], 1):
        plt.subplot(2, 2, i)
        sns.violinplot(x=target_col, y=col, data=train, palette='viridis')
        plt.title(f'{col} Distribution')
    plt.tight_layout()
    plt.savefig('eda_plots/social_features_numeric.png', bbox_inches='tight')
    plt.close()

if social_categorical:
    plt.figure(figsize=(15, 10))
    for i, col in enumerate(social_categorical[:min(4, len(social_categorical))], 1):
        plt.subplot(2, 2, i)
        sns.countplot(x=col, hue=target_col, data=train, palette='viridis')
        plt.title(f'{col} Distribution')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/social_features_categorical.png', bbox_inches='tight')
    plt.close()


In [12]:
def create_features(df):
    df = df.copy()
    
    if 'Social_event_attendance' in df.columns and 'Going_outside' in df.columns:
        df['Social_Going_ratio'] = df['Social_event_attendance'] / (df['Going_outside'] + 1e-5)
        df['Social_Going_diff'] = df['Social_event_attendance'] - df['Going_outside']
    
    if 'Friends_circle_size' in df.columns and 'Post_frequency' in df.columns:
        df['Friend_Post_ratio'] = df['Friends_circle_size'] / (df['Post_frequency'] + 1e-5)
        df['Friend_Post_product'] = df['Friends_circle_size'] * df['Post_frequency']
    
    if 'Stage_fear' in df.columns and 'Drained_after_socializing' in df.columns:
        df['Avoids_Interaction'] = np.where(
            (df['Stage_fear'] == 'Yes') | (df['Drained_after_socializing'] == 'Yes'), 1, 0)
    
    social_cols = [c for c in ['Social_event_attendance', 'Going_outside', 
                               'Friends_circle_size', 'Post_frequency'] if c in df.columns]
    if social_cols:
        df['Social_Index'] = df[social_cols].mean(axis=1)
        df['Social_Std'] = df[social_cols].std(axis=1).fillna(0)
    
    if 'Drained_after_socializing' in df.columns and 'Going_outside' in df.columns:
        df['Drained_Going_Interaction'] = np.where(
            (df['Drained_after_socializing'] == 'Yes') & (df['Going_outside'] > 3), 1, 0)
    
    if 'Friends_circle_size' in df.columns and 'Social_event_attendance' in df.columns:
        df['Social_Engagement'] = (df['Friends_circle_size'] + 1) * (df['Social_event_attendance'] + 1)
    
    if 'Time_spent_Alone' in df.columns:
        df['Alone_Ratio'] = df['Time_spent_Alone'] / 24
        df['Alone_Log'] = np.log1p(df['Time_spent_Alone'])
    
    return df

train = create_features(train)
test = create_features(test)

  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [13]:
new_features = [col for col in train.columns if col not in num_features + cat_features + [target_col, 'id']]
if new_features:
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(new_features[:min(6, len(new_features))], 1):
        plt.subplot(2, 3, i)
        if train[feature].nunique() > 10:
            sns.boxplot(x=target_col, y=feature, data=train, palette='viridis')
        else:
            sns.countplot(x=feature, hue=target_col, data=train, palette='viridis')
        plt.title(f'{feature} by Personality')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('eda_plots/new_features.png', bbox_inches='tight')
    plt.close()

In [14]:
X = train.drop(columns=[target_col, 'id'], errors='ignore')
y = train[target_col].map({'introvert': 0, 'extrovert': 1})


In [15]:
num_features = [col for col in X.columns if X[col].dtype in ['float64', 'int64']]
cat_features = [col for col in X.columns if X[col].dtype == 'object']

print(f"Numerical features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")

Numerical features: 16
Categorical features: 2


In [16]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [17]:
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test.drop(columns=['id'], errors='ignore'))

X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.2, stratify=y, random_state=42
)

class_0_weight = len(y) / (2 * np.sum(y == 0))
class_1_weight = len(y) / (2 * np.sum(y == 1))
class_weights = {0: class_0_weight, 1: class_1_weight}

print(f"Class weights: 0={class_0_weight:.2f}, 1={class_1_weight:.2f}")

Class weights: 0=1.92, 1=0.68


In [18]:
xgb_params = {
    'learning_rate': 0.015,
    'max_depth': 5,
    'min_child_weight': 3,
    'subsample': 0.85,
    'colsample_bytree': 0.75,
    'gamma': 0.3,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'n_estimators': 1500,
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'use_label_encoder': False,
    'random_state': 42,
    'scale_pos_weight': class_0_weight / class_1_weight
}

xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100,
    verbose=100
)

[0]	validation_0-logloss:0.58109




[100]	validation_0-logloss:0.19279
[200]	validation_0-logloss:0.16121
[300]	validation_0-logloss:0.16136
[333]	validation_0-logloss:0.16227


In [19]:
best_iter = xgb_model.best_iteration
xgb_final = XGBClassifier(
    **{**xgb_params, 'n_estimators': best_iter, 'early_stopping_rounds': None}
)
xgb_final.fit(X_processed, y)

In [20]:
plt.figure(figsize=(12, 8))
sorted_idx = xgb_final.feature_importances_.argsort()
feature_names = num_features + cat_features
plt.barh(np.array(feature_names)[sorted_idx][-20:], 
        xgb_final.feature_importances_[sorted_idx][-20:])
plt.title("XGBoost Feature Importance (Top 20)")
plt.savefig('eda_plots/xgboost_feature_importance.png', bbox_inches='tight')
plt.close()

In [21]:
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.018,
    'num_leaves': 41,
    'max_depth': 6,
    'min_child_samples': 25,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'n_estimators': 1200,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1
}

# Identify categorical feature indices
cat_indices = [i for i, col in enumerate(preprocessor.transformers_[1][2]) 
               if col in cat_features]

lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    categorical_feature=cat_indices,
    callbacks=[early_stopping(100), log_evaluation(100)]
)


[LightGBM] [Info] Number of positive: 10959, number of negative: 3860
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.198909
[200]	valid_0's binary_logloss: 0.160046
[300]	valid_0's binary_logloss: 0.158194
Early stopping, best iteration is:
[296]	valid_0's binary_logloss: 0.158146


In [22]:
best_iter_lgb = lgb_model.best_iteration_
lgb_final = LGBMClassifier(
    **{**lgb_params, 'n_estimators': best_iter_lgb}
)
lgb_final.fit(X_processed, y, categorical_feature=cat_indices)  # Train on full dataset

# Plot feature importance for LightGBM
plt.figure(figsize=(12, 8))
lgb_importances = lgb_final.feature_importances_
sorted_idx = lgb_importances.argsort()
plt.barh(np.array(feature_names)[sorted_idx][-20:], 
        lgb_importances[sorted_idx][-20:])
plt.title("LightGBM Feature Importance (Top 20)")
plt.savefig('eda_plots/lightgbm_feature_importance.png', bbox_inches='tight')
plt.close()

[LightGBM] [Info] Number of positive: 13699, number of negative: 4825
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 701
[LightGBM] [Info] Number of data points in the train set: 18524, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [23]:
ensemble = VotingClassifier(
    estimators=[
        ('xgb', xgb_final),
        ('lgb', lgb_final)
    ],
    voting='soft',
    weights=[1, 1.3]
)

# Train ensemble on full dataset
ensemble.fit(X_processed, y)

[LightGBM] [Info] Number of positive: 13699, number of negative: 4825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 719
[LightGBM] [Info] Number of data points in the train set: 18524, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [24]:
def evaluate_model(model, name):
    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred)
    print(f"\n{name} Validation Accuracy: {val_acc:.4f}")
    print(f"{name} Validation F1 Score: {val_f1:.4f}")
    print(classification_report(y_val, val_pred))
    
    cm = confusion_matrix(y_val, val_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Introvert', 'Extrovert'], 
                yticklabels=['Introvert', 'Extrovert'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'{name} Confusion Matrix')
    plt.savefig(f'eda_plots/{name}_confusion_matrix.png', bbox_inches='tight')
    plt.close()
    
    return val_acc

xgb_acc = evaluate_model(xgb_final, 'XGBoost')
lgb_acc = evaluate_model(lgb_final, 'LightGBM')
ensemble_acc = evaluate_model(ensemble, 'Ensemble')



XGBoost Validation Accuracy: 0.9665
XGBoost Validation F1 Score: 0.9775
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       965
           1       0.97      0.98      0.98      2740

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705


LightGBM Validation Accuracy: 0.9655
LightGBM Validation F1 Score: 0.9767
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       965
           1       0.97      0.98      0.98      2740

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.95      3705
weighted avg       0.97      0.97      0.97      3705


Ensemble Validation Accuracy: 0.9665
Ensemble Validation F1 Score: 0.9774
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       965
           1  

In [26]:
model_names = ['XGBoost', 'LightGBM', 'Ensemble']
acc_scores = [xgb_acc, lgb_acc, ensemble_acc]
f1_scores = [f1_score(y_val, xgb_final.predict(X_val)), 
             f1_score(y_val, lgb_final.predict(X_val)), 
             f1_score(y_val, ensemble.predict(X_val))]

plt.figure(figsize=(10, 6))
x = np.arange(len(model_names))
width = 0.35

plt.bar(x - width/2, acc_scores, width, label='Accuracy')
plt.bar(x + width/2, f1_scores, width, label='F1 Score')

plt.ylabel('Scores')
plt.title('Model Performance Comparison')
plt.xticks(x, model_names)
plt.ylim(0, 1)
plt.legend()
plt.savefig('eda_plots/model_performance.png', bbox_inches='tight')
plt.close()

In [27]:
if ensemble_acc > max(xgb_acc, lgb_acc):
    final_model = ensemble
    print("Selected Ensemble as final model")
elif lgb_acc > xgb_acc:
    final_model = lgb_final
    print("Selected LightGBM as final model")
else:
    final_model = xgb_final
    print("Selected XGBoost as final model")

# Generate predictions
test_pred = final_model.predict(X_test_processed)
test_labels = ['Introvert' if pred == 0 else 'Extrovert' for pred in test_pred]

# Create submission
submission = pd.DataFrame({
    'Id': test['id'],
    'Personality': test_labels
})
submission.to_csv('submission.csv', index=False)


Selected XGBoost as final model


In [28]:
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(final_model, 'personality_model.pkl')

print("\nModel training complete!")
print(f"Final Validation Accuracy: {ensemble_acc:.4f}")
print("Submission file 'submission.csv' created")
print("Models saved as 'preprocessor.pkl' and 'personality_model.pkl'")
print("EDA plots saved in 'eda_plots/' directory")


Model training complete!
Final Validation Accuracy: 0.9665
Submission file 'submission.csv' created
Models saved as 'preprocessor.pkl' and 'personality_model.pkl'
EDA plots saved in 'eda_plots/' directory
