In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score

print("All libraries imported successfully!")

# Load the datasets
try:
    train_df = pd.read_csv('../data/raw/train.csv')
    test_df = pd.read_csv('../data/raw/test.csv')
    sample_submission = pd.read_csv('../data/raw/sample_submission.csv')

    print("\nData loaded successfully!")
    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")

except FileNotFoundError:
    print("Error: Make sure 'train.csv', 'test.csv', and 'sample_submission.csv' are in the 'data/raw/' directory.")
    # Exit or handle gracefully if files aren't found
    # For now, we'll assume they are found to proceed with instructions
test_ids = test_df['id'].copy() if test_df is not None and 'id' in test_df.columns else None
display(train_df.head())

features = ['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality']

All libraries imported successfully!

Data loaded successfully!
Train data shape: (18524, 9)
Test data shape: (6175, 8)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [6]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64
id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64


In [36]:
# Global variables to store fitted imputers and encoders
# This allows them to be used across different function calls (e.g., for train and then for validation/test)
global_numerical_imputer = None
global_categorical_imputers = {} # Dictionary to store imputers for each categorical column
global_categorical_encoders = {} # Dictionary to store encoders for each categorical column


In [35]:
def fit_and_transform_train_data(X_train_df):
    """
    Fits imputers and encoders on the training data and returns the transformed training data.
    Assumes 'id' column and target column are already removed from X_train_df.
    """
    X_train_processed = X_train_df.copy()

    # Identify columns based on their data types
    numerical_cols = X_train_processed.select_dtypes(include=np.number).columns.tolist()
    # Categorical columns are typically 'object' or 'category' dtype
    categorical_cols = X_train_processed.select_dtypes(include=['object', 'category']).columns.tolist()

    # --- Numerical Imputation ---
    if numerical_cols:
        # Fit numerical imputer on training data
        global global_numerical_imputer
        global_numerical_imputer = SimpleImputer(strategy='median')
        global_numerical_imputer.fit(X_train_processed[numerical_cols])
        # Transform training data
        X_train_processed[numerical_cols] = global_numerical_imputer.transform(X_train_processed[numerical_cols])

    # --- Categorical Imputation and Encoding ---
    if categorical_cols:
        for col in categorical_cols:
            # Impute missing categorical values with the mode of the training data
            imputer = SimpleImputer(strategy='most_frequent')
            imputer.fit(X_train_processed[[col]]) # Fit only on this column
            X_train_processed[[col]] = imputer.transform(X_train_processed[[col]])
            global_categorical_imputers[col] = imputer # Store the fitted imputer

            # Encode categorical values using OrdinalEncoder
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1) # Handles unseen categories in test data
            encoder.fit(X_train_processed[[col]]) # Fit only on this column
            X_train_processed[[col]] = encoder.transform(X_train_processed[[col]])
            global_categorical_encoders[col] = encoder # Store the fitted encoder

    return X_train_processed

            


In [37]:
def transform_data(X_data_df):
    """
    Transforms new data (validation/test) using the globally fitted imputers and encoders.
    Assumes 'id' column and target column are already removed from X_data_df.
    """
    X_data_processed = X_data_df.copy()

    # Identify columns (should be same as training data)
    numerical_cols = X_data_processed.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X_data_processed.select_dtypes(include=['object', 'category']).columns.tolist()

    # --- Numerical Imputation ---
    if numerical_cols and global_numerical_imputer:
        # Use the imputer fitted on training data to transform new data
        X_data_processed[numerical_cols] = global_numerical_imputer.transform(X_data_processed[numerical_cols])
    elif numerical_cols and not global_numerical_imputer:
        print("Warning: Numerical imputer not fitted. Run 'fit_and_transform_train_data' first.")

    # --- Categorical Imputation and Encoding ---
    if categorical_cols:
        for col in categorical_cols:
            if col in global_categorical_imputers and col in global_categorical_encoders:
                # Use imputers and encoders fitted on training data
                imputer = global_categorical_imputers[col]
                encoder = global_categorical_encoders[col]

                X_data_processed[[col]] = imputer.transform(X_data_processed[[col]])
                X_data_processed[[col]] = encoder.transform(X_data_processed[[col]])
            else:
                print(f"Warning: Imputer/Encoder not found for categorical column '{col}'. It might not have been in training data or preprocessing was skipped.")
                # Fallback: if not found, fillna and try to encode (might create issues if new categories are not handled by encoder)
                # For simplicity, we'll just print warning. In real code, you might want a more robust fallback.
                X_data_processed[col] = X_data_processed[col].fillna(X_data_processed[col].mode()[0]) # Basic fill if no imputer
                # If encoder not found, this will fail or pass through original object type if not handled.
                # It's crucial that test data columns match train data columns and types.

    return X_data_processed


In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from IPython.display import display # For display(train.head()) equivalent

# --- Define the Feature Engineering Function (as provided previously) ---
def feature_engineering(df):
    df_fe = df.copy()
    epsilon = 1e-6

    df_fe['alone_to_social_ratio'] = df_fe['Time_spent_Alone'] / (df_fe['Social_event_attendance'] + epsilon)
    df_fe['social_anxiety_score'] = df_fe['Drained_after_socializing_Yes'] + df_fe['Stage_fear_Yes']
    df_fe['outside_per_post'] = df_fe['Going_outside'] / (df_fe['Post_frequency'] + epsilon)
    df_fe['friends_to_social_events'] = df_fe['Friends_circle_size'] / (df_fe['Social_event_attendance'] + epsilon)
    df_fe['alone_x_drained'] = df_fe['Time_spent_Alone'] * df_fe['Drained_after_socializing_Yes']
    df_fe['stage_fear_x_outside'] = df_fe['Stage_fear_Yes'] * df_fe['Going_outside']
    df_fe['social_events_x_friends'] = df_fe['Social_event_attendance'] * df_fe['Friends_circle_size']
    df_fe['posts_x_friends'] = df_fe['Post_frequency'] * df_fe['Friends_circle_size']
    df_fe['alone_x_social_anxiety'] = df_fe['Time_spent_Alone'] * df_fe['social_anxiety_score']
    df_fe['outside_x_drained'] = df_fe['Going_outside'] * df_fe['Drained_after_socializing_Yes']
    df_fe['Time_spent_Alone_sq'] = df_fe['Time_spent_Alone']**2
    df_fe['Going_outside_sq'] = df_fe['Going_outside']**2
    df_fe['Friends_circle_size_sq'] = df_fe['Friends_circle_size']**2
    df_fe['Post_frequency_sq'] = df_fe['Post_frequency']**2

    for col in ['Time_spent_Alone', 'Going_outside', 'Friends_circle_size', 'Post_frequency', 'Social_event_attendance']:
        if col in df_fe.columns and (df_fe[col].min() >= 0).all() and df_fe[col][df_fe[col].notna()].std() > epsilon:
             df_fe[f'{col}_log'] = np.log1p(df_fe[col])

    df_fe['total_social_engagement'] = df_fe['Social_event_attendance'] + df_fe['Friends_circle_size'] + df_fe['Post_frequency']
    df_fe['posts_per_event'] = df_fe['Post_frequency'] / (df_fe['Social_event_attendance'] + epsilon)
    df_fe['alone_vs_outside'] = df_fe['Time_spent_Alone'] / (df_fe['Going_outside'] + epsilon)
    df_fe['net_social_battery'] = df_fe['Social_event_attendance'] - df_fe['Drained_after_socializing_Yes']

    return df_fe

# --- Define the Simplified Preprocessor Pipeline Function (as provided previously) ---
def simple_preprocessor_pipeline(X_df_train):
    """
    Creates and fits a preprocessing pipeline (ColumnTransformer) based on column dtypes
    from the training data. This handles imputation and scaling/encoding.

    Args:
        X_df_train (pd.DataFrame): The training DataFrame of features to fit the preprocessor on.

    Returns:
        ColumnTransformer: A fitted ColumnTransformer object.
    """
    numerical_cols = X_df_train.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X_df_train.select_dtypes(include=['object', 'category']).columns.tolist()

    transformers = []

    if numerical_cols:
        transformers.append(('num_pipeline',
                             Pipeline([
                                 ('imputer', SimpleImputer(strategy='median')),
                                 ('scaler', StandardScaler())
                             ]),
                             numerical_cols))

    if categorical_cols:
        transformers.append(('cat_pipeline',
                             Pipeline([
                                 ('imputer', SimpleImputer(strategy='most_frequent')),
                                 ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
                             ]),
                             categorical_cols))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='passthrough'
    )
    preprocessor.fit(X_df_train)
    return preprocessor

# --- Your Updated Code Block Starts Here ---

# Assume 'train_df' is your initial raw DataFrame containing 'id' and 'Personality' (as string).
# Example dummy train_df for demonstration (replace with your actual data loading)


# 1. Initial Load, Drop 'id', Separate Target (y)
if 'id' in train_df.columns:
    X_features_only = train_df.drop(columns='id').copy()
else:
    X_features_only = train_df.copy()

# Extract the target variable 'Personality'
y = X_features_only['Personality']
X_features_only = X_features_only.drop(columns='Personality') # Remove 'Personality' from features

# 2. Encode the target variable (y) using LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y) # This converts 'Extrovert'/'Introvert' to 0/1

# 3. Apply Feature Engineering to the raw features
X_features_engineered = feature_engineering(X_features_only)

# 4. Split data into training and validation sets (now with engineered features)
# y_encoded is used here as 'stratify' needs numerical values.
X_train, X_val, y_train, y_val = train_test_split(
    X_features_engineered, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# 5. Fit the final preprocessor on X_train (this handles NaNs, scaling, etc. for ALL features)
final_data_preprocessor = simple_preprocessor_pipeline(X_train)

# 6. Transform both training and validation data using the fitted preprocessor
X_train_final_processed = final_data_preprocessor.transform(X_train)
X_val_final_processed = final_data_preprocessor.transform(X_val)

# --- Equivalent to display(train.head()) but for the processed features ---
# ColumnTransformer outputs a NumPy array. If you want a DataFrame with column names,
# it's a bit more involved to get the exact names after OneHotEncoding, etc.
# For simple inspection, you can display the first few rows of the NumPy array:
print("First 5 rows of X_train_final_processed (NumPy array after all steps):")
display(X_train_final_processed[:5])

# You now have:
# X_train_final_processed (NumPy array): Your fully preprocessed and engineered training features
# y_train (NumPy array): Your encoded training target
# X_val_final_processed (NumPy array): Your fully preprocessed and engineered validation features
# y_val (NumPy array): Your encoded validation target


First 5 rows of X_train_final_processed (NumPy array after all steps):


array([[ 0.57735027,  1.71808243, -0.15329284, -0.13483997,  0.04607757,
        -0.57735027, -0.57735027, -0.45749597, -0.57735027, -0.27304851,
        -0.57419554, -0.57265629, -0.37796447,  1.32434032, -0.2173781 ,
        -0.57265629, -0.37796447,  0.23328474, -0.49829623, -0.44108109,
        -0.07675374,  0.74540043,  0.19103213,  0.1381395 ,  0.15980461,
         1.39960066,  0.79697229, -0.54469405, -0.45749531,  1.66302633],
       [ 1.15470054, -0.6381449 ,  1.07304987, -1.75291964, -1.4284046 ,
         1.73205081,  1.73205081, -0.45749379,  1.73205081,  1.67845629,
        -0.57419571,  1.46345495,  2.64575131, -0.85692609, -1.46493936,
         1.46345495,  2.64575131,  1.04978132,  1.10815131, -1.23502706,
        -1.27162277,  1.09392467,  0.91648581, -2.15504527, -1.58256485,
        -0.42312912, -1.64082531, -0.54469302, -0.45749552, -0.85398649],
       [-0.57735027, -1.03084946, -1.37963555,  1.4832397 ,  1.52055974,
        -0.57735027, -0.57735027,  0.15249817, -0

In [44]:

import joblib # Add this line
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV # GridSearchCV for tuning
from sklearn.linear_model import LogisticRegression # Our model
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score # More evaluation metrics
from IPython.display import display

print("--- Training Logistic Regression Model (Initial) ---")

model = LogisticRegression(random_state=42, solver='liblinear', C=1.0)
model.fit(X_train_final_processed, y_train)

print("\nInitial model training complete.")

# Make predictions on the validation set
y_val_pred = model.predict(X_val_final_processed)
y_val_prob = model.predict_proba(X_val_final_processed)[:, 1] # Probabilities for ROC AUC

# Evaluate the model
print("\n--- Initial Model Evaluation on Validation Set ---")
print("Classification Report:")
print(classification_report(y_val, y_val_pred))
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_val, y_val_prob):.4f}")

# --- Hyperparameter Tuning using GridSearchCV ---

print("\n--- Starting Hyperparameter Tuning with GridSearchCV ---")

# Define the parameter grid for Logistic Regression
# C: Inverse of regularization strength. Smaller C means stronger regularization.
# Values are chosen to explore different regularization levels.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear'] # Sticking with liblinear as it's good for small datasets
    # Add other parameters like 'penalty': ['l1', 'l2'] if exploring different regularization types with liblinear
}

# Create a GridSearchCV object
# estimator: The model to tune (LogisticRegression)
# param_grid: The grid of hyperparameters to search
# cv: Number of cross-validation folds (e.g., 5-fold cross-validation)
# scoring: Metric to optimize (e.g., 'f1' is often good for imbalanced classes, 'accuracy' for balanced)
# verbose: Controls the verbosity: 0 (silent), 1 (results on training), 2 (results on each fold)
# n_jobs: Number of CPU cores to use. -1 means use all available cores.
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    scoring='accuracy', # Or 'f1', 'roc_auc' depending on your primary metric
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV to the training data
grid_search.fit(X_train_final_processed, y_train)

print("\nHyperparameter tuning complete.")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score (e.g., Accuracy): {grid_search.best_score_:.4f}")

# Get the best model from GridSearchCV
best_model = grid_search.best_estimator_

# --- Re-evaluate with the Best Model on Validation Set ---
print("\n--- Evaluation with Best Model on Validation Set ---")

y_val_pred_best = best_model.predict(X_val_final_processed)
y_val_prob_best = best_model.predict_proba(X_val_final_processed)[:, 1]

print("Classification Report (Best Model):")
print(classification_report(y_val, y_val_pred_best))
print(f"Accuracy (Best Model): {accuracy_score(y_val, y_val_pred_best):.4f}")
print(f"ROC AUC Score (Best Model): {roc_auc_score(y_val, y_val_prob_best):.4f}")


# --- Model Persistence (Saving the Preprocessor and the Best Model) ---

print("\n--- Saving Preprocessor and Best Model ---")

preprocessor_filename = 'final_data_preprocessor.joblib'
model_filename = 'best_logistic_regression_model.joblib'

# Save the fitted preprocessor
joblib.dump(final_data_preprocessor, preprocessor_filename)
print(f"Preprocessor saved to: {preprocessor_filename}")

# Save the best trained model
joblib.dump(best_model, model_filename)
print(f"Best model saved to: {model_filename}")

print("\nDone with model creation, tuning, evaluation, and saving.")

# --- How to Load and Use Later ---
# print("\n--- Example: Loading and Using Saved Model and Preprocessor ---")
# loaded_preprocessor = joblib.load(preprocessor_filename)
# loaded_model = joblib.load(model_filename)

# # Assume you have new_raw_data_df (e.g., test data or data for deployment)
# # You need to apply the same steps as X_initial -> X_features_engineered
# # Example:
# # new_raw_data_df = pd.DataFrame(...) # Your new data
# # if 'id' in new_raw_data_df.columns:
# #     new_X_features_only = new_raw_data_df.drop(columns='id').copy()
# # else:
# #     new_X_features_only = new_raw_data_df.copy()
# #
# # # IMPORTANT: If 'Personality' column exists in new_raw_data_df, drop it here too
# # if 'Personality' in new_X_features_only.columns:
# #     new_X_features_only = new_X_features_only.drop(columns='Personality')
# #
# # new_X_features_engineered = feature_engineering(new_X_features_only)
# # new_X_processed = loaded_preprocessor.transform(new_X_features_engineered)
# # final_predictions = loaded_model.predict(new_X_processed)
# # print(f"Predictions for new data: {final_predictions[:5]}")


--- Training Logistic Regression Model (Initial) ---

Initial model training complete.

--- Initial Model Evaluation on Validation Set ---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Accuracy: 1.0000
ROC AUC Score: 1.0000

--- Starting Hyperparameter Tuning with GridSearchCV ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits

Hyperparameter tuning complete.
Best parameters found: {'C': 0.001, 'solver': 'liblinear'}
Best cross-validation score (e.g., Accuracy): 1.0000

--- Evaluation with Best Model on Validation Set ---
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
          

