# Load libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Globally ignore every warning raised in this process from here on.
# This keeps cell output short, but it also hides deprecation notices.
warnings.filterwarnings('ignore')

In [26]:
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

In [5]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import joblib

# Load data and quick check

In [6]:
# Read the train and test CSV files from the current working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Quick check: frame sizes, then the class counts of the target column in train.
print("Train shape:", df_train.shape)
print("Test shape: ", df_test.shape)
print("\nTarget distribution (train):", df_train['Personality'].value_counts(), sep="\n")

Train shape: (18524, 9)
Test shape:  (6175, 8)

Target distribution (train):
Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64


# Define features and encode simple Yes/No columns

In [7]:
# Column the models are trained to predict.
target = 'Personality'

# Split the training columns by dtype: numeric vs. everything else.
numeric_features = df_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = df_train.select_dtypes(exclude=np.number).columns.tolist()

# The target and the 'id' column are not model inputs, so drop them from the lists.
categorical_features.remove(target)
numeric_features.remove('id')

# Final feature order: numeric columns first, then the categorical ones.
all_features = numeric_features + categorical_features



In [8]:
# Show which columns were classified as categorical (non-numeric) features.
print(categorical_features)

['Stage_fear', 'Drained_after_socializing']


In [9]:
# Show which columns were classified as numeric features.
print(numeric_features)

['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']


In [20]:
# Helper to map Yes/No to 1/0 (keeps things deterministic).
def encode_yes_no(column):
    """Encode a Yes/No column as integers (Yes -> 1, No -> 0).

    Text is matched case-insensitively after stripping surrounding whitespace.
    Values that are already 0 or 1 (int, float or bool) are kept as they are,
    so applying the function to an already-encoded column -- for example when
    the notebook cell is run a second time -- returns the same values instead
    of turning the whole column into zeros.

    Missing values and anything that is not recognised are encoded as 0.

    Parameters
    ----------
    column : pandas.Series
        Column holding 'Yes'/'No' text, 0/1 values, or missing values.

    Returns
    -------
    pandas.Series
        Integer series with the same index as ``column``.
    """
    text_mapping = {'yes': 1, 'no': 0}

    def _encode_value(value):
        # Text: normalise before the lookup; unknown text falls back to 0.
        if isinstance(value, str):
            return text_mapping.get(value.strip().lower(), 0)
        # Missing (None / NaN / pd.NA): fall back to 0.
        if pd.isna(value):
            return 0
        # Already encoded: keep 0/1 untouched so re-encoding is a no-op.
        return int(value) if value in (0, 1) else 0

    return column.map(_encode_value).astype(int)

# Encode every categorical column in both frames. When a frame lacks one of the
# columns, add it as an all-zero column so the column exists for later steps.
for col in categorical_features:
    for frame in (df_train, df_test):
        frame[col] = encode_yes_no(frame[col]) if col in frame.columns else 0

In [21]:
# Feature matrix (selected feature columns) and the target column from the training frame.
X, y = df_train[all_features], df_train[target]


In [22]:
# Fit the label encoder on the target, then convert the labels to integer codes.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Persist the fitted encoder so predictions can later be mapped back with inverse_transform.
joblib.dump(le, 'label_encoder.joblib')
print("Saved label encoder -> label_encoder.joblib")

Saved label encoder -> label_encoder.joblib


In [23]:
# One stratified shuffle split: 20% of the rows are held out for validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, val_idx = next(sss.split(X, y_encoded))

# Features: positional selection, with the index reset on each part.
X_train = X.iloc[train_idx].reset_index(drop=True)
X_val = X.iloc[val_idx].reset_index(drop=True)

# Targets: y_encoded is indexed directly with the same positions.
y_train = y_encoded[train_idx]
y_val = y_encoded[val_idx]

print("Train class distribution:", np.unique(y_train, return_counts=True))
print("Val   class distribution:", np.unique(y_val, return_counts=True))


Train class distribution: (array([0, 1]), array([10959,  3860]))
Val   class distribution: (array([0, 1]), array([2740,  965]))


In [24]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: already mapped to 0/1 earlier; the most-frequent imputer is a safety net.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [27]:
# 5-fold stratified cross-validation used by the grid searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model name -> (unfitted estimator, parameter grid).
# Grid keys carry the 'classifier__' prefix because each estimator is searched
# as the step named 'classifier' of a Pipeline.
models_and_grids = {
    # Logistic Regression
    'LogisticRegression': (
        LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
        {'classifier__C': [0.1, 1.0, 10.0]},
    ),

    # Random Forest
    'RandomForest': (
        RandomForestClassifier(random_state=42, class_weight='balanced'),
        {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [None, 10]},
    ),

    # Gradient Boosting (sklearn)
    'GradientBoosting': (
        GradientBoostingClassifier(random_state=42),
        {'classifier__n_estimators': [50, 100], 'classifier__learning_rate': [0.01, 0.1]},
    ),

    # XGBoost
    # - `use_label_encoder` is deprecated in XGBoost and is no longer needed, because
    #   the target is already integer-encoded before fitting; it is therefore dropped.
    # - The target has two classes, so the binary 'logloss' metric is used instead of
    #   the multiclass 'mlogloss'.
    'XGBoost': (
        xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
        {
            'classifier__n_estimators': [50, 100],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 6],
        },
    ),

    # LightGBM
    'LightGBM': (
        lgb.LGBMClassifier(random_state=42),
        {
            'classifier__n_estimators': [50, 100],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 6],
        },
    ),

    # CatBoost (uses `iterations` / `depth` instead of `n_estimators` / `max_depth`)
    'CatBoost': (
        CatBoostClassifier(random_state=42, verbose=False),
        {
            'classifier__iterations': [50, 100],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__depth': [3, 6],
        },
    ),
}

In [28]:
# Fitted best pipeline and hold-out accuracy per model name.
best_estimators = {}
validation_scores = {}

for name, (estimator, param_grid) in models_and_grids.items():
    print(f"\nRunning GridSearchCV for {name} ...")

    # Preprocessing is a pipeline step, so it is fitted inside each CV fold by the search.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', estimator)])
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='accuracy',
        cv=cv,
        refit=True,
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train, y_train)

    # With refit=True the best pipeline is re-trained on the whole of X_train.
    best = grid.best_estimator_
    best_estimators[name] = best

    # Score the refitted pipeline on the hold-out validation split.
    y_val_pred = best.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    validation_scores[name] = acc

    # Report: chosen parameters, accuracy, and the per-class report with original labels.
    print(f"{name} - best params: {grid.best_params_}")
    print(f"{name} - validation accuracy: {acc:.4f}")
    print("Classification report (validation):")
    print(classification_report(le.inverse_transform(y_val), le.inverse_transform(y_val_pred)))


Running GridSearchCV for LogisticRegression ...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
LogisticRegression - best params: {'classifier__C': 0.1}
LogisticRegression - validation accuracy: 0.9682
Classification report (validation):
              precision    recall  f1-score   support

   Extrovert       0.98      0.97      0.98      2740
   Introvert       0.93      0.95      0.94       965

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705


Running GridSearchCV for RandomForest ...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
RandomForest - best params: {'classifier__max_depth': 10, 'classifier__n_estimators': 50}
RandomForest - validation accuracy: 0.9709
Classification report (validation):
              precision    recall  f1-score   support

   Extrovert       0.98      0.98      0.98      2740
   Introvert       0.94      0.95      0.

In [29]:
# One row per model, ordered by validation accuracy (best first), with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame(list(validation_scores.items()), columns=['Model', 'ValidationAccuracy'])
    .sort_values('ValidationAccuracy', ascending=False)
    .reset_index(drop=True)
)

print("\nModel comparison (validation accuracy):")
print(results_df)

# Row 0 holds the top-ranked model; its pipeline was already fitted on X_train.
best_model_name = results_df.loc[0, 'Model']
best_pipeline = best_estimators[best_model_name]
print(f"\nSelected best model: {best_model_name} with validation accuracy {results_df.loc[0,'ValidationAccuracy']:.4f}")




Model comparison (validation accuracy):
                Model  ValidationAccuracy
0             XGBoost            0.971660
1            LightGBM            0.971660
2    GradientBoosting            0.971390
3            CatBoost            0.971120
4        RandomForest            0.970850
5  LogisticRegression            0.968151

Selected best model: XGBoost with validation accuracy 0.9717


In [30]:
# Save the fitted preprocessing step of the selected pipeline on its own.
fitted_preprocessor = best_pipeline['preprocessor']
joblib.dump(fitted_preprocessor, 'preprocessing.joblib')
print("Saved fitted preprocessor -> preprocessing.joblib")

# Save the whole selected pipeline (preprocessor + classifier) for direct predictions.
joblib.dump(best_pipeline, 'best_model.joblib')
print("Saved entire best pipeline -> best_model.joblib")

Saved fitted preprocessor -> preprocessing.joblib
Saved entire best pipeline -> best_model.joblib
