In [None]:
# Predict Lifestyle Change - Full Classification Pipeline (with Enhancements)

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
import shap
import warnings
import time
import scipy

# Suppress every Python warning for the remainder of the run.
warnings.filterwarnings(action='ignore')

# Step 2: Load Dataset
# The timestamp taken here is the reference point for the runtime
# summary printed at the end of the script.
start_time = time.time()
df = pd.read_excel(io="Data of 300 people v2.xlsx")

# Step 3: Initial Exploration
# Print the basic shape, dtypes, per-column missing counts and summary
# statistics of the freshly loaded frame.
print("Data Shape:", df.shape)
df.info()
print("Missing Values:\n", df.isnull().sum())
print(df.describe())
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; here more code follows, so the preview must be printed.
print(df.head())

# Step 4: Handle Missing Values
# Rows without a target value are dropped instead of imputed: filling the
# label with a column mean / most frequent value would fabricate labels.
df = df.dropna(subset=['predict_lifestyle_change']).reset_index(drop=True)

# Column groups reused by the later EDA, winsorization and encoding steps.
cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(include=np.number).columns

# NOTE(review): both imputers are fitted on the full dataset, i.e. before
# any train/test split, so their statistics also see future test rows.
num_imputer = SimpleImputer(strategy='mean')
if len(num_cols) > 0:  # SimpleImputer rejects a frame with zero columns
    df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
if len(cat_cols) > 0:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Step 5: Exploratory Data Analysis (EDA)
# Correlation matrix of the numeric columns, drawn as an unannotated heatmap.
numeric_corr = df.select_dtypes(include=[np.number]).corr()
_, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(numeric_corr, annot=False, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

# First pass: one KDE curve per numeric column, each shown as its own figure.
for num_col in num_cols:
    kde_ax = sns.kdeplot(df[num_col])
    kde_ax.set_title(f"KDE Plot: {num_col}")
    plt.show()

# Second pass: one horizontal boxplot per numeric column.
for num_col in num_cols:
    box_ax = sns.boxplot(x=df[num_col])
    box_ax.set_title(f"Boxplot for {num_col}")
    plt.show()

# Violin plots of the target against the first five object-typed columns.
sample_cat = cat_cols[:5]
for cat_name in sample_cat:
    _, violin_ax = plt.subplots(figsize=(8, 4))
    sns.violinplot(data=df, x=cat_name, y='predict_lifestyle_change', ax=violin_ax)
    violin_ax.set_title(f"Violin plot: {cat_name} vs predict_lifestyle_change")
    plt.xticks(rotation=45)  # category labels are tilted to limit overlap
    plt.show()

# Step 6: Outlier Handling (Winsorization)
# Clip the lowest and highest 1% of each numeric feature to the nearest
# retained value.
from scipy.stats.mstats import winsorize
for col in num_cols:
    # The target must never be clipped: if it is stored as a number,
    # winsorizing it would overwrite class labels at the extremes.
    if col == 'predict_lifestyle_change':
        continue
    # winsorize returns a masked array; store a plain ndarray in the frame.
    df[col] = np.asarray(winsorize(df[col].to_numpy(), limits=[0.01, 0.01]))

# Step 7: Encode Categorical Columns
# Fit one LabelEncoder per object-typed column and keep it in
# `label_encoders`, keyed by column name.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}

# Replace each categorical column with its integer codes.
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])

# Step 8: Split Target and Features
X = df.drop(columns=['predict_lifestyle_change'])
y = df['predict_lifestyle_change'].astype(int)

# Step 12 (performed first on purpose): Train-Test Split
# The held-out set is carved off BEFORE any resampling, augmentation or
# scaler fitting. Splitting afterwards would place synthetic SMOTE points
# and noisy copies of the same source rows on both sides of the split,
# which leaks test information into training and inflates every score.
# `stratify=y` keeps the class proportions of the original data in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 9: Handle Class Imbalance with SMOTE (training rows only)
# SMOTE needs more samples per class than neighbours; shrink k when the
# rarest training class is small (k stays at the default 5 otherwise).
min_class_count = int(y_train_raw.value_counts().min())
smote_k = max(1, min(5, min_class_count - 1))
smote = SMOTE(random_state=42, k_neighbors=smote_k)
X_res, y_res = smote.fit_resample(X_train_raw, y_train_raw)

# Step 10: Data Augmentation (20x, training rows only)
# 19 jittered copies (Gaussian noise, sd 0.01 in raw feature units) are
# appended to the resampled training set. A seeded generator makes the
# augmentation reproducible between runs.
rng = np.random.default_rng(42)
X_res_frame = pd.DataFrame(X_res)
augmented = [X_res_frame + rng.normal(0, 0.01, X_res_frame.shape) for _ in range(19)]
X_aug = pd.concat([X_res_frame] + augmented, ignore_index=True)
y_aug = pd.concat([pd.Series(y_res)] * 20, ignore_index=True)

# Step 11: Feature Scaling
# The scaler learns mean/variance from the training data only and is then
# applied unchanged to the untouched test rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aug)
X_train, y_train = X_scaled, y_aug
X_test = scaler.transform(X_test_raw)

# Step 13: Define Models
# Candidate classifiers keyed by display name; insertion order is the
# order in which they are trained and reported.
models = {}
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_jobs=-1, random_state=42)
models["KNN"] = KNeighborsClassifier(n_jobs=-1)
# probability=True makes predict_proba available on the SVC.
models["SVM"] = SVC(probability=True, random_state=42)
models["AdaBoost"] = AdaBoostClassifier(random_state=42)
models["XGBoost"] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42,
)

# Step 14: Training and Evaluation
# Each model is fitted once on the training split and scored on the held-out
# test split; no cross-validation is run in this loop.
# `results` maps model name -> dict of weighted Accuracy/Precision/Recall/F1.
results = {}
# ROC curves are only drawn for two-class problems; the check does not
# depend on the model, so it is evaluated once.
is_binary = len(np.unique(y)) == 2
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Score of the second class (model.classes_[1]); only used for ROC below.
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1": f1_score(y_test, y_pred, average='weighted')
    }

    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

    # ROC Curve if binary classification
    if y_proba is not None and is_binary:
        # pos_label is tied to the class whose probability was extracted, so
        # label pairs other than {0, 1} / {-1, 1} are handled as well.
        fpr, tpr, _ = roc_curve(y_test, y_proba, pos_label=model.classes_[1])
        auc_value = roc_auc_score(y_test, y_proba)
        # The closing parenthesis was missing from the legend label.
        plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_value:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()

# One row per model, one column per metric; best weighted F1 first.
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Summary:")
ranked_by_f1 = results_df.sort_values(by="F1", ascending=False)
print(ranked_by_f1)

# Step 15: Explainable AI - SHAP (XGBoost)
# NOTE: `best_model` is always the XGBoost entry; it is not selected from
# the metrics table.
best_model = models['XGBoost']
explainer = shap.Explainer(best_model)
# X_test is a bare ndarray after scaling, so the plots would label features
# "Feature 0..N". Re-attach the original column names (scaling keeps the
# column order of X) so the plots are readable.
X_test_named = pd.DataFrame(X_test, columns=X.columns)
shap_values = explainer(X_test_named)
shap.summary_plot(shap_values, X_test_named)
shap.plots.beeswarm(shap_values)

# Step 16: Hyperparameter Tuning (RandomForest Example)
# Exhaustive search over 3 * 4 * 3 * 3 = 108 parameter combinations,
# each scored by 5-fold cross-validated weighted F1 on the training data.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_estimator = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rf_estimator,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid_search.fit(X_train, y_train)
print("\nBest Parameters (Random Forest):", grid_search.best_params_)
print("Best Cross-Validated F1:", grid_search.best_score_)

# Step 17: Runtime Summary
# Wall-clock time elapsed since `start_time` was recorded at data loading.
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"\nTotal execution time: {elapsed_seconds:.2f} seconds")

# Project Extensions:
# - Add MLflow/Weights & Biases for experiment tracking
# - Try CatBoost and LightGBM models
# - Use SHAP dependence and interaction plots
# - Build dashboard with Streamlit/Gradio
# - Integrate prediction into web application
# - Apply PCA/feature selection techniques for optimization
# - Use LIME or ELI5 for model interpretation
# - Compare boosting strategies: CatBoost vs LightGBM vs XGBoost
