In [None]:
# Log in to Kaggle through kagglehub. The competition download in the next
# cell presumably needs these credentials — confirm for your environment.
import kagglehub
kagglehub.login()

In [None]:
# Fetch the 'playground-series-s5e7' competition files. `data_path` is later
# joined with 'train.csv' / 'test.csv' when the data is loaded.
data_path = kagglehub.competition_download('playground-series-s5e7')

print('Data source import complete.')

In [None]:
# Show the value returned by the download call (used below as the data directory).
print(data_path)

In [None]:
# Mount Google Drive under /content/drive (Colab-only import).
# NOTE(review): in the visible cells only the commented-out "original dataset"
# path points at /content/drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation warnings
# raised by later cells are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test splits from the download directory.
train_csv, test_csv = (
    os.path.join(data_path, file_name) for file_name in ('train.csv', 'test.csv')
)
data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

Original dataset

In [None]:
# df_orig = pd.read_csv('/content/drive/MyDrive/code/kaggle/S5E7_introvert_extrovert/personality_datasert.csv')

In [None]:
# df_orig.head()

In [None]:
# data = data.drop(columns=['id'])

# common_cols = data.columns.intersection(df_orig.columns)

# train_aligned = data[common_cols]
# df_orig_aligned = df_orig[common_cols]

# data = pd.concat([train_aligned, df_orig_aligned], axis=0).reset_index(drop=True)

# EDA

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Summary statistics of the training frame.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Per-column count and percentage of missing values, largest counts first.
is_missing = data.isna()
missing_counts = is_missing.sum()
missing_percent = (is_missing.mean() * 100).round(2)
missing_table = pd.DataFrame({
    'Missing Values': missing_counts,
    'Percentage (%)': missing_percent,
})

print(missing_table.sort_values(by='Missing Values', ascending=False))

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Report the number of fully duplicated rows, drop them, and report again.
n_duplicates_before = data.duplicated().sum()
print(f"Duplicates before dropping: {n_duplicates_before}")

data = data.drop_duplicates()

n_duplicates_after = data.duplicated().sum()
print(f"Duplicates after dropping: {n_duplicates_after}")


Check dataset balance

In [None]:
# Share of each target class, drawn as a pie chart.
personality_counts = data['Personality'].value_counts()
ax = personality_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Introvert or extrovert distribution')
ax.set_ylabel('')
plt.show()

# Data visualisations

In [None]:
# Frequency of each Time_spent_Alone value on a wide canvas.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=data, x="Time_spent_Alone", ax=ax)
ax.set_xlabel('Time spent alone')
ax.set_ylabel('')
plt.show()

In [None]:
# Share of each Stage_fear answer, drawn as a pie chart.
stage_fear_counts = data['Stage_fear'].value_counts()
ax = stage_fear_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Stage fear')
ax.set_ylabel('')
plt.show()

In [None]:
# Frequency of each Social_event_attendance value.
ax = sns.countplot(data=data, x="Social_event_attendance")
ax.set_xlabel('Social event attendance')
ax.set_ylabel('')
plt.show()

In [None]:
# Frequency of each Going_outside value.
ax = sns.countplot(data=data, x="Going_outside")
ax.set_xlabel('Going outside')
ax.set_ylabel('')
plt.show()

In [None]:
# Share of each Drained_after_socializing answer, drawn as a pie chart.
drained_counts = data['Drained_after_socializing'].value_counts()
ax = drained_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    explode=(0, 0.1),
    colors=['#4c72b0', '#aec7e8'],
)
ax.set_title('Drained after socializing')
ax.set_ylabel('')
plt.show()

In [None]:
# Frequency of each Friends_circle_size value.
ax = sns.countplot(data=data, x="Friends_circle_size")
ax.set_xlabel('Number of friends')
ax.set_ylabel('')
plt.show()

In [None]:
# Frequency of each Post_frequency value.
ax = sns.countplot(data=data, x="Post_frequency")
ax.set_xlabel('Frequency of posting')
ax.set_ylabel('')
plt.show()

In [None]:
plt.figure(figsize=(16, 6))

# Pairwise correlations between the numeric columns only.
data_corr = data.corr(numeric_only=True)

# Plot the correlation matrix itself. Calling .corr() on it again would show
# correlations *between the rows of the correlation matrix*, not between the
# features, and the annotated values would be wrong.
heatmap = sns.heatmap(data_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12})

plt.show()

# Data preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle

In [None]:
# Features are every column except the target and the row identifier.
non_feature_columns = ['Personality', 'id']
X = data.drop(columns=non_feature_columns)
y = data['Personality']

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Append hand-crafted interaction features to a DataFrame.

    Columns added by ``transform`` (all derived from existing columns):

    * ``Social_score`` = Social_event_attendance + Going_outside + Friends_circle_size
    * ``Introvert_score`` = Time_spent_Alone - Social_score
    * ``Alone_to_friends_ratio`` = Time_spent_Alone / (Friends_circle_size + 1)
    * ``Social_post_interaction`` = Post_frequency * Social_event_attendance

    The transformer has no hyper-parameters. Following the scikit-learn
    convention, the learned attribute ``feature_names_out_`` is created in
    ``fit`` (not in ``__init__``), and ``transform`` does not overwrite it, so
    transforming a frame with extra columns (e.g. a test set that still has
    ``id``) cannot change the names recorded for the training data.
    """

    # Names of the columns appended by `transform`, in the order they are added.
    _ENGINEERED_COLUMNS = (
        'Social_score',
        'Introvert_score',
        'Alone_to_friends_ratio',
        'Social_post_interaction',
    )

    def fit(self, X, y=None):
        """Record the output column names for ``X``; returns ``self``.

        ``X`` must be a DataFrame-like object exposing ``.columns``.
        ``y`` is accepted for pipeline compatibility and ignored.
        """
        names = list(X.columns)
        # An engineered column that already exists is overwritten in place by
        # `transform`, so it keeps its original position and is not appended.
        new_names = [col for col in self._ENGINEERED_COLUMNS if col not in names]
        self.feature_names_out_ = names + new_names
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Raises ``KeyError`` if one of the source columns is missing.
        """
        X = X.copy()

        X['Social_score'] = X['Social_event_attendance'] + X['Going_outside'] + X['Friends_circle_size']
        X['Introvert_score'] = X['Time_spent_Alone'] - X['Social_score']
        # +1 in the denominator avoids dividing by zero for an empty friends circle.
        X['Alone_to_friends_ratio'] = X['Time_spent_Alone'] / (X['Friends_circle_size'] + 1)
        X['Social_post_interaction'] = X['Post_frequency'] * X['Social_event_attendance']

        # Fallback for callers that use `transform` without `fit`: record the
        # names once, but never overwrite names learned during `fit`.
        if not hasattr(self, 'feature_names_out_'):
            self.feature_names_out_ = X.columns.tolist()

        return X

    def get_feature_names_out(self, input_features=None):
        """Return the recorded output column names as a list.

        Returns ``None`` if neither ``fit`` nor ``transform`` has been called.
        ``input_features`` is accepted for API compatibility and ignored.
        """
        return getattr(self, 'feature_names_out_', None)

In [None]:
# Numeric columns: fill missing values with the column mean, then apply StandardScaler.
numeric_imputer = SimpleImputer(strategy='mean', add_indicator=False)
numerical_pipeline = Pipeline(steps=[
    ('imputer', numeric_imputer),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical columns: fill missing values with the most frequent value,
# then map each category to an ordinal code.
categorical_imputer = SimpleImputer(strategy='most_frequent', add_indicator=False)
categorical_pipeline = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('encoder', OrdinalEncoder()),
])

In [None]:
# Route columns to a sub-pipeline by dtype. The output lists the 'num' block
# first and the 'cat' block second.
numeric_columns = make_column_selector(dtype_include=['int64', 'float64'])
object_columns = make_column_selector(dtype_include=['object'])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numeric_columns),
    ('cat', categorical_pipeline, object_columns),
])

In [None]:
# Encode the target labels as integer codes; `le` keeps the label <-> code mapping.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# Model building

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import shap
# Initialise SHAP's JavaScript support in the notebook (presumably needed only
# for SHAP's interactive plots — confirm against the plots actually used).
shap.initjs()

In [None]:
# Ratio of class-0 rows to class-1 rows in the encoded target.
# NOTE(review): in the visible cells this value is referenced only by the
# commented-out Optuna objective, not by the training loop.
counter = Counter(y)
n_class_0, n_class_1 = counter[0], counter[1]
scale_pos_weight = n_class_0 / n_class_1

In [None]:
# 10-fold stratified splitter with a fixed seed.
# NOTE(review): in the visible cells `cv` is referenced only by the
# commented-out Optuna objective; the training loop builds its own `skf`.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Tuned XGBoost hyper-parameters (the commented-out Optuna study below
# searches the same keys) plus fixed objective / metric settings.
best_params = dict(
    max_depth=5,
    learning_rate=0.016936843011325364,
    n_estimators=958,
    subsample=0.5385682613272109,
    colsample_bytree=0.5185754497137016,
    min_child_weight=1,
    gamma=0.1832798205532591,
    n_jobs=-1,
    objective="binary:logistic",
    eval_metric="logloss",
)

In [None]:
# Parameter set keyed with the 'classifier__' pipeline-step prefix.
# NOTE(review): not referenced by any other visible cell.
lgbm_params = dict(
    classifier__colsample_bytree=0.7,
    classifier__learning_rate=0.05,
    classifier__n_estimators=300,
    classifier__num_leaves=20,
    classifier__subsample=0.7,
)

In [None]:
# def objective(trial):
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 3, 18),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
#         'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'gamma': trial.suggest_float('gamma', 0, 5),
#     }


#     xgb_pipeline = Pipeline([
#     ('features', FeatureEngineer()),
#     ('preprocessing', preprocessor),
#     ('classifier', XGBClassifier(**params,
#                                  tree_method = 'hist',
#                                  device =  'cuda',
#                                  scale_pos_weight = scale_pos_weight))
#     ])


#     score = cross_val_score(xgb_pipeline, X, y, cv=cv,  scoring='accuracy').mean()
#     return score

In [None]:
# ! pip install optuna

In [None]:
# import optuna

# study = optuna.create_study(study_name="example_xgboost_study", direction='maximize')
# study.optimize(objective, n_trials=100, show_progress_bar=True, n_jobs=-1)

# best_params = study.best_params
# print(f"\nBest parameters: {best_params}")

In [None]:
# Feature engineering followed by the dtype-based preprocessing. There is no
# estimator step: the booster is trained separately on the transformed matrix.
pipeline_steps = [
    ('features', FeatureEngineer()),
    ('preprocessing', preprocessor),
]
xgb_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the feature/preprocessing pipeline on the training features.
X_preprocessed = xgb_pipeline.fit_transform(X)

# Transform the test set using exactly the training feature columns. The raw
# test frame still contains 'id'; passing it through would rely on the
# ColumnTransformer silently discarding the extra column and would let the
# identifier leak into the feature-engineering step's bookkeeping.
X_test_preprocessed = xgb_pipeline.transform(test_data[X.columns])

# Train and test matrices must have the same feature layout.
assert X_preprocessed.shape[1] == X_test_preprocessed.shape[1]

In [None]:
# 10-fold stratified CV with the native XGBoost API. Collects out-of-fold
# class predictions, fold-averaged test probabilities, and per-fold SHAP
# values for the validation rows.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_preprocessed))
test_preds = np.zeros(len(X_test_preprocessed))
all_shap_values = []  # one (n_val_rows, n_features) block per fold, in fold order

# `n_estimators` and `n_jobs` are scikit-learn-wrapper names that xgb.train
# does not use as booster parameters. Pass the tuned tree count through
# `num_boost_round` instead of silently dropping it (falls back to the
# previous hard-coded 100 if the key is absent).
sklearn_only_keys = ('n_estimators', 'n_jobs')
train_params = {key: value for key, value in best_params.items() if key not in sklearn_only_keys}
num_boost_round = best_params.get('n_estimators', 100)

# The test matrix is the same for every fold, so build it once.
dtest = xgb.DMatrix(X_test_preprocessed)

for train_idx, val_idx in skf.split(X_preprocessed, y):
    X_train, X_val = X_preprocessed[train_idx], X_preprocessed[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    xgb_model = xgb.train(train_params, dtrain, num_boost_round=num_boost_round,
                      evals=[(dval, "valid")],
                      early_stopping_rounds=10, verbose_eval=False)

    # Use the booster trained in THIS fold. The previous code referenced an
    # undefined name `model`, which raises NameError on a fresh kernel (or
    # silently uses a stale object left over from an earlier session).
    explainer = shap.TreeExplainer(xgb_model)
    shap_vals = explainer.shap_values(X_val)
    all_shap_values.append(shap_vals)

    # Hard class labels for the held-out rows; averaged probabilities for test.
    oof_preds[val_idx] = xgb_model.predict(dval) > 0.5
    test_preds += xgb_model.predict(dtest) / skf.n_splits

cv_acc = accuracy_score(y, oof_preds)
print(f"Cross-Validation Accuracy: {cv_acc:.4f}")

In [None]:
# Column names must come from the fitted ColumnTransformer: it emits the
# numeric block first and the categorical block second, so the order differs
# from the DataFrame produced by the feature-engineering step.
preprocessing_step = xgb_pipeline.named_steps['preprocessing']
feature_names = list(preprocessing_step.get_feature_names_out())

# Each fold contributed SHAP values for its own validation rows, so the
# stacked matrix is in fold order, not in original row order. Rebuild that
# order from the same seeded splitter so every SHAP row is paired with the
# feature values it was computed from.
val_order = np.concatenate([val_idx for _, val_idx in skf.split(X_preprocessed, y)])
combined_shap = np.vstack(all_shap_values)
X_shap = X_preprocessed[val_order]
assert combined_shap.shape == X_shap.shape, "SHAP values and feature matrix are misaligned"
assert len(feature_names) == X_shap.shape[1], "feature names do not match the matrix columns"

shap.summary_plot(combined_shap, X_shap, feature_names=feature_names)

In [None]:
# Same SHAP values, drawn with the "bar" plot type.
shap.summary_plot(
    combined_shap,
    X_preprocessed,
    feature_names=feature_names,
    plot_type="bar",
)

# Submission

In [None]:
# Turn the fold-averaged test probabilities into 0/1 class codes (cut-off 0.5, inclusive).
is_positive = test_preds >= 0.5
y_pred_test = is_positive.astype(int)

In [None]:
# Translate class codes back to the competition's label strings.
# NOTE(review): the code -> label pairs are hard-coded here; confirm they
# match the fitted LabelEncoder's class order.
mapping = dict(enumerate(["Extrovert", "Introvert"]))

predicted_codes = pd.Series(y_pred_test)
y_test_labels = predicted_codes.map(mapping)

In [None]:
# Two-column submission frame: test row id and predicted label.
submission_columns = {
    'id': test_data["id"],
    'Personality': y_test_labels,
}
submission = pd.DataFrame(submission_columns)

In [None]:
# Display the submission frame for a visual check before writing it out.
submission

In [None]:
# Write the submission to 'submission.csv' in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)