In [1]:
import pandas as pd
train_file_path = "new_review_train.csv"
test_file_path = "new_review_test.csv"

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

train_df.head(), test_df.head()

(   Employee ID  Age  Gender  Years at Company    Job Role  Monthly Income  \
 0         8410   31    Male                19   Education            5390   
 1        64756   59  Female                 4       Media            5534   
 2        30257   24  Female                10  Healthcare            8159   
 3        65791   36  Female                 7   Education            3989   
 4        65026   56    Male                41   Education            4821   
 
   Work-Life Balance Job Satisfaction Performance Rating  Number of Promotions  \
 0         Excellent           Medium            Average                     2   
 1              Poor             High                Low                     3   
 2              Good             High                Low                     0   
 3              Good             High               High                     1   
 4     Below Average        Very High            Average                     0   
 
    ... Company Size  Company Tenure

##  Step 1: Preprocessing the Data.
### ✅ Encoded categorical features
### ✅ Normalized numerical features
### ✅ Fixed encoding for "Job Level"

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Identify categorical and numerical features
categorical_features = ['Gender', 'Job Role', 'Work-Life Balance', 'Job Satisfaction',
                        'Performance Rating', 'Overtime', 'Education Level', 'Marital Status',
                        'Company Size', 'Company Reputation', 'Employee Recognition', 
                        'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities',
                        'Overall_Sentiment_Label']

numerical_features = ['Age', 'Years at Company', 'Monthly Income', 'Number of Promotions',
                      'Distance from Home', 'Job Level', 'Company Tenure', 'Overall_Sentiment_Score']

# Encode categorical features
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

# Normalize numerical features
scaler = StandardScaler()
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

# Check the processed data
train_df.head()


ValueError: could not convert string to float: 'Mid'

In [None]:
# Check for non-numeric values in numerical columns
for col in numerical_features:
    if not pd.api.types.is_numeric_dtype(train_df[col]):
        print(f"Non-numeric values found in column: {col}")
        print(train_df[col].unique())


In [3]:
# Encode "Job Level" column
le_job_level = LabelEncoder()
train_df["Job Level"] = le_job_level.fit_transform(train_df["Job Level"])
test_df["Job Level"] = le_job_level.transform(test_df["Job Level"])

# Reattempt normalization for numerical features
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

# Check processed data
train_df.head()


Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition,Overall_Sentiment_Score,Overall_Sentiment_Label
0,8410,-0.626336,1,0.288648,0,-0.888892,1,2,0,1.173308,...,1,1.308164,0,0,0,0,2,Stayed,1.362105,2
1,64756,1.691627,0,-1.045173,3,-0.82196,3,0,3,2.178351,...,1,-1.367856,0,0,0,1,1,Stayed,-1.213942,0
2,30257,-1.205827,0,-0.511645,2,0.398153,2,0,3,-0.836777,...,1,0.717865,0,0,0,3,1,Stayed,-1.630458,0
3,65791,-0.212414,0,-0.778409,0,-1.540084,2,0,2,0.168266,...,2,-0.226612,1,0,0,2,2,Stayed,0.864631,2
4,65026,1.443273,1,2.244918,0,-1.153366,0,3,0,-0.836777,...,1,0.481746,0,0,0,1,2,Stayed,0.402339,2


## Step 2: Apply MCDM Feature Weights

In [4]:
# Define the MCDM feature weights
mcdm_weights = {
    'Years at Company': 0.219761,
    'Monthly Income': 0.18773,
    'Work-Life Balance': 0.293560,
    'Job Satisfaction': 0.219198,
    'Performance Rating': 0.207862,
    'Number of Promotions': 0.296452,
    'Distance from Home': 0.528879,
    'Education Level': 0.152396,
    'Job Level': 0.135266,
    'Company Size': 0.058933,
    'Company Tenure': 0.472736,
    'Company Reputation': 0.255356,
    'Employee Recognition': 0.075259,
    'Overall_Sentiment_Score': 0.161596
}

# Apply weights to the relevant features in train and test datasets
for feature, weight in mcdm_weights.items():
    train_df[feature] *= weight
    test_df[feature] *= weight

# Check the modified dataset
train_df.head()


Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition,Overall_Sentiment_Score,Overall_Sentiment_Label
0,8410,-0.626336,1,0.063433,0,-0.166872,0.29356,0.438396,0.0,0.34783,...,0.058933,0.618416,0,0,0,0.0,0.150518,Stayed,0.220111,2
1,64756,1.691627,0,-0.229688,3,-0.154307,0.88068,0.0,0.623586,0.645777,...,0.058933,-0.646635,0,0,0,0.255356,0.075259,Stayed,-0.196168,0
2,30257,-1.205827,0,-0.11244,2,0.074745,0.58712,0.0,0.623586,-0.248064,...,0.058933,0.339361,0,0,0,0.766068,0.075259,Stayed,-0.263476,0
3,65791,-0.212414,0,-0.171064,0,-0.28912,0.58712,0.0,0.415724,0.049883,...,0.117866,-0.107128,1,0,0,0.510712,0.150518,Stayed,0.139721,2
4,65026,1.443273,1,0.493346,0,-0.216521,0.0,0.657594,0.0,-0.248064,...,0.058933,0.227739,0,0,0,0.255356,0.150518,Stayed,0.065016,2


## Step 3: Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Convert target column to binary (1 = Left, 0 = Stayed)
train_df["Attrition"] = train_df["Attrition"].apply(lambda x: 1 if x == "Left" else 0)
test_df["Attrition"] = test_df["Attrition"].apply(lambda x: 1 if x == "Left" else 0)

# Define features (X) and target (y)
X_train = train_df.drop(columns=["Attrition", "Employee ID"])
y_train = train_df["Attrition"]

X_test = test_df.drop(columns=["Attrition", "Employee ID"])
y_test = test_df["Attrition"]

# Final check of shapes
X_train.shape, y_train.shape, X_test.shape, y_test.shape


((59598, 24), (59598,), (14900, 24), (14900,))

### Step 4:  Bagging

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")


Training Accuracy: 1.0000
Testing Accuracy: 0.7513


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")


Model Accuracy: 0.7513


## Boosting
### XGBoost

In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost model (Boosting technique)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_train_pred_xgb = xgb_model.predict(X_train)
y_test_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)

# Classification reports
train_report_xgb = classification_report(y_train, y_train_pred_xgb)
test_report_xgb = classification_report(y_test, y_test_pred_xgb)

# Print results
print(f"Training Accuracy (XGBoost): {train_accuracy_xgb:.4f}")
print(f"Testing Accuracy (XGBoost): {test_accuracy_xgb:.4f}\n")

print("Classification Report (Training):")
print(train_report_xgb)

print("Classification Report (Testing):")
print(test_report_xgb)


Training Accuracy (XGBoost): 0.7821
Testing Accuracy (XGBoost): 0.7602

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.79      0.79      0.79     31260
           1       0.77      0.77      0.77     28338

    accuracy                           0.78     59598
   macro avg       0.78      0.78      0.78     59598
weighted avg       0.78      0.78      0.78     59598

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      7868
           1       0.74      0.75      0.75      7032

    accuracy                           0.76     14900
   macro avg       0.76      0.76      0.76     14900
weighted avg       0.76      0.76      0.76     14900



In [9]:
from xgboost import XGBClassifier

# Initialize the XGBoost model (Boosting technique)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_report_xgb = classification_report(y_test, y_pred_xgb)

accuracy_xgb, classification_report_xgb


(0.7602013422818792,
 '              precision    recall  f1-score   support\n\n           0       0.77      0.77      0.77      7868\n           1       0.74      0.75      0.75      7032\n\n    accuracy                           0.76     14900\n   macro avg       0.76      0.76      0.76     14900\nweighted avg       0.76      0.76      0.76     14900\n')

### LightGBM

In [10]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the LightGBM model (Boosting technique)
lgbm_model = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)

# Train the model
lgbm_model.fit(X_train, y_train)

# Make predictions
y_train_pred_lgbm = lgbm_model.predict(X_train)
y_test_pred_lgbm = lgbm_model.predict(X_test)

# Evaluate the model
train_accuracy_lgbm = accuracy_score(y_train, y_train_pred_lgbm)
test_accuracy_lgbm = accuracy_score(y_test, y_test_pred_lgbm)

# Classification reports
train_report_lgbm = classification_report(y_train, y_train_pred_lgbm)
test_report_lgbm = classification_report(y_test, y_test_pred_lgbm)

# Print results
print(f"Training Accuracy (LightGBM): {train_accuracy_lgbm:.4f}")
print(f"Testing Accuracy (LightGBM): {test_accuracy_lgbm:.4f}\n")

print("Classification Report (Training):")
print(train_report_lgbm)

print("Classification Report (Testing):")
print(test_report_lgbm)


[LightGBM] [Info] Number of positive: 28338, number of negative: 31260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002878 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 59598, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.475486 -> initscore=-0.098136
[LightGBM] [Info] Start training from score -0.098136
Training Accuracy (LightGBM): 0.7728
Testing Accuracy (LightGBM): 0.7585

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.78      0.78      0.78     31260
           1       0.76      0.76      0.76     28338

    accuracy                           0.77     59598
   macro avg       0.77      0.77      0.77     59598
weighted avg       0.77      0.77      0.77     59598

Classifi

In [11]:
from lightgbm import LGBMClassifier

# Initialize the LightGBM model (Boosting technique)
lgbm_model = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)

# Train the model
lgbm_model.fit(X_train, y_train)

# Make predictions
y_pred_lgbm = lgbm_model.predict(X_test)

# Evaluate the model
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
classification_report_lgbm = classification_report(y_test, y_pred_lgbm)

accuracy_lgbm, classification_report_lgbm


[LightGBM] [Info] Number of positive: 28338, number of negative: 31260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 59598, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.475486 -> initscore=-0.098136
[LightGBM] [Info] Start training from score -0.098136


(0.7585234899328859,
 '              precision    recall  f1-score   support\n\n           0       0.77      0.77      0.77      7868\n           1       0.74      0.75      0.74      7032\n\n    accuracy                           0.76     14900\n   macro avg       0.76      0.76      0.76     14900\nweighted avg       0.76      0.76      0.76     14900\n')

### AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the AdaBoost model
adaboost_model = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
adaboost_model.fit(X_train, y_train)

# Predictions
y_train_pred_ada = adaboost_model.predict(X_train)
y_test_pred_ada = adaboost_model.predict(X_test)

# Accuracy scores
train_accuracy_ada = accuracy_score(y_train, y_train_pred_ada)
test_accuracy_ada = accuracy_score(y_test, y_test_pred_ada)

# Classification reports
train_report_ada = classification_report(y_train, y_train_pred_ada)
test_report_ada = classification_report(y_test, y_test_pred_ada)

# Print results
print(f"Training Accuracy (AdaBoost): {train_accuracy_ada:.4f}")
print(f"Testing Accuracy (AdaBoost): {test_accuracy_ada:.4f}\n")

print("Classification Report (Training):")
print(train_report_ada)

print("Classification Report (Testing):")
print(test_report_ada)


Training Accuracy (AdaBoost): 0.7102
Testing Accuracy (AdaBoost): 0.7146

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.72      0.73      0.72     31260
           1       0.70      0.69      0.69     28338

    accuracy                           0.71     59598
   macro avg       0.71      0.71      0.71     59598
weighted avg       0.71      0.71      0.71     59598

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.73      0.73      0.73      7868
           1       0.70      0.69      0.70      7032

    accuracy                           0.71     14900
   macro avg       0.71      0.71      0.71     14900
weighted avg       0.71      0.71      0.71     14900



In [13]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize the AdaBoost model
adaboost_model = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
adaboost_model.fit(X_train, y_train)

# Make predictions
y_pred_adaboost = adaboost_model.predict(X_test)

# Evaluate the model
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
classification_report_adaboost = classification_report(y_test, y_pred_adaboost)

accuracy_adaboost, classification_report_adaboost


(0.7145637583892618,
 '              precision    recall  f1-score   support\n\n           0       0.73      0.73      0.73      7868\n           1       0.70      0.69      0.70      7032\n\n    accuracy                           0.71     14900\n   macro avg       0.71      0.71      0.71     14900\nweighted avg       0.71      0.71      0.71     14900\n')

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predictions
y_train_pred_dt = dt_model.predict(X_train)
y_test_pred_dt = dt_model.predict(X_test)

# Accuracy
train_acc_dt = accuracy_score(y_train, y_train_pred_dt)
test_acc_dt = accuracy_score(y_test, y_test_pred_dt)

# Reports
print(f"Training Accuracy (Decision Tree): {train_acc_dt:.4f}")
print(f"Testing Accuracy (Decision Tree): {test_acc_dt:.4f}\n")

print("Classification Report (Training):")
print(classification_report(y_train, y_train_pred_dt))

print("Classification Report (Testing):")
print(classification_report(y_test, y_test_pred_dt))


Training Accuracy (Decision Tree): 1.0000
Testing Accuracy (Decision Tree): 0.6657

Classification Report (Training):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     31260
           1       1.00      1.00      1.00     28338

    accuracy                           1.00     59598
   macro avg       1.00      1.00      1.00     59598
weighted avg       1.00      1.00      1.00     59598

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.69      0.68      0.68      7868
           1       0.64      0.65      0.65      7032

    accuracy                           0.67     14900
   macro avg       0.66      0.67      0.66     14900
weighted avg       0.67      0.67      0.67     14900



In [11]:
from sklearn.naive_bayes import GaussianNB

# Initialize model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Predictions
y_train_pred_nb = nb_model.predict(X_train)
y_test_pred_nb = nb_model.predict(X_test)

# Accuracy
train_acc_nb = accuracy_score(y_train, y_train_pred_nb)
test_acc_nb = accuracy_score(y_test, y_test_pred_nb)

# Reports
print(f"Training Accuracy (Naive Bayes): {train_acc_nb:.4f}")
print(f"Testing Accuracy (Naive Bayes): {test_acc_nb:.4f}\n")

print("Classification Report (Training):")
print(classification_report(y_train, y_train_pred_nb))

print("Classification Report (Testing):")
print(classification_report(y_test, y_test_pred_nb))


Training Accuracy (Naive Bayes): 0.7171
Testing Accuracy (Naive Bayes): 0.7162

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.74      0.71      0.73     31260
           1       0.70      0.72      0.71     28338

    accuracy                           0.72     59598
   macro avg       0.72      0.72      0.72     59598
weighted avg       0.72      0.72      0.72     59598

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.74      0.71      0.73      7868
           1       0.69      0.72      0.71      7032

    accuracy                           0.72     14900
   macro avg       0.72      0.72      0.72     14900
weighted avg       0.72      0.72      0.72     14900



In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)

# Predictions
y_train_pred_knn = knn_model.predict(X_train)
y_test_pred_knn = knn_model.predict(X_test)

# Accuracy
train_acc_knn = accuracy_score(y_train, y_train_pred_knn)
test_acc_knn = accuracy_score(y_test, y_test_pred_knn)

# Reports
print(f"Training Accuracy (KNN): {train_acc_knn:.4f}")
print(f"Testing Accuracy (KNN): {test_acc_knn:.4f}\n")

print("Classification Report (Training):")
print(classification_report(y_train, y_train_pred_knn))

print("Classification Report (Testing):")
print(classification_report(y_test, y_test_pred_knn))


Training Accuracy (KNN): 0.7665
Testing Accuracy (KNN): 0.6437

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.78      0.78      0.78     31260
           1       0.76      0.75      0.75     28338

    accuracy                           0.77     59598
   macro avg       0.77      0.77      0.77     59598
weighted avg       0.77      0.77      0.77     59598

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.66      0.66      0.66      7868
           1       0.62      0.62      0.62      7032

    accuracy                           0.64     14900
   macro avg       0.64      0.64      0.64     14900
weighted avg       0.64      0.64      0.64     14900



In [13]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(X_train, y_train)

y_train_pred_cat = cat_model.predict(X_train)
y_test_pred_cat = cat_model.predict(X_test)

print("Train Accuracy (CatBoost):", accuracy_score(y_train, y_train_pred_cat))
print("Test Accuracy (CatBoost):", accuracy_score(y_test, y_test_pred_cat))


Train Accuracy (CatBoost): 0.8061176549548642
Test Accuracy (CatBoost): 0.7606040268456375


In [16]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the model
cat_model = CatBoostClassifier(verbose=0, random_state=42)

# Define parameter grid
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1],
    'iterations': [100, 200],
    'l2_leaf_reg': [1, 3, 5]
}

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=cat_model,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best Parameters:\n", grid_search.best_params_)

# Evaluate the best model
best_cat_model = grid_search.best_estimator_

y_train_pred = best_cat_model.predict(X_train)
y_test_pred = best_cat_model.predict(X_test)

print("\n=== Training Evaluation ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

print("\n=== Testing Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters:
 {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.1}

=== Training Evaluation ===
Accuracy: 0.7618376455585758
              precision    recall  f1-score   support

           0       0.77      0.77      0.77     31260
           1       0.75      0.75      0.75     28338

    accuracy                           0.76     59598
   macro avg       0.76      0.76      0.76     59598
weighted avg       0.76      0.76      0.76     59598


=== Testing Evaluation ===
Accuracy: 0.7625503355704698
              precision    recall  f1-score   support

           0       0.77      0.78      0.78      7868
           1       0.75      0.75      0.75      7032

    accuracy                           0.76     14900
   macro avg       0.76      0.76      0.76     14900
weighted avg       0.76      0.76      0.76     14900



In [14]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

y_train_pred_log = log_model.predict(X_train)
y_test_pred_log = log_model.predict(X_test)

print("Train Accuracy (Logistic Regression):", accuracy_score(y_train, y_train_pred_log))
print("Test Accuracy (Logistic Regression):", accuracy_score(y_test, y_test_pred_log))


Train Accuracy (Logistic Regression): 0.7289506359273801
Test Accuracy (Logistic Regression): 0.7315436241610739


In [15]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)

y_train_pred_svm = svm_model.predict(X_train)
y_test_pred_svm = svm_model.predict(X_test)

print("Train Accuracy (SVM):", accuracy_score(y_train, y_train_pred_svm))
print("Test Accuracy (SVM):", accuracy_score(y_test, y_test_pred_svm))


Train Accuracy (SVM): 0.7504614248800295
Test Accuracy (SVM): 0.7432214765100671


## Stacking
### 1. Random Forest
### 2. AdaBoost
### 3. Gradient Boosting
### 4. The meta-classifier (final decision-maker) will be Logistic Regression.

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Define base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)

# Define meta-classifier
meta_model = LogisticRegression()

# Prepare cross-validation for stacking
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize arrays to store predictions
train_meta_features = np.zeros((X_train.shape[0], 3))
test_meta_features = np.zeros((X_test.shape[0], 3))

# Train base models using cross-validation
for i, model in enumerate([rf, gb, ada]):
    test_fold_predictions = np.zeros((X_test.shape[0], 5))  # Store test predictions for each fold
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        
        # Predict on validation set
        train_meta_features[val_idx, i] = model.predict(X_train.iloc[val_idx])
        
        # Predict on test set
        test_fold_predictions[:, fold] = model.predict(X_test)
    
    # Average test predictions across folds
    test_meta_features[:, i] = test_fold_predictions.mean(axis=1)

# Meta-features ready! Next step: Train the meta-classifier.
# Train the meta-classifier (Logistic Regression)
meta_model.fit(train_meta_features, y_train)

# Predict on the test meta-features
stacking_predictions = meta_model.predict(test_meta_features)

# Evaluate the stacking model
stacking_accuracy = accuracy_score(y_test, stacking_predictions)
stacking_report = classification_report(y_test, stacking_predictions)

stacking_accuracy, stacking_report



(0.7604026845637584,
 '              precision    recall  f1-score   support\n\n           0       0.77      0.78      0.78      7868\n           1       0.75      0.74      0.74      7032\n\n    accuracy                           0.76     14900\n   macro avg       0.76      0.76      0.76     14900\nweighted avg       0.76      0.76      0.76     14900\n')

In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on training meta-features
train_stacking_predictions = meta_model.predict(train_meta_features)

# Predict on testing meta-features (already done above)
test_stacking_predictions = stacking_predictions

# Training accuracy and classification report
stacking_train_accuracy = accuracy_score(y_train, train_stacking_predictions)
stacking_train_report = classification_report(y_train, train_stacking_predictions)

# Testing accuracy and classification report
stacking_test_accuracy = accuracy_score(y_test, test_stacking_predictions)
stacking_test_report = classification_report(y_test, test_stacking_predictions)

# Display results
print("Training Accuracy:", stacking_train_accuracy)
print("Training Classification Report:\n", stacking_train_report)

print("Testing Accuracy:", stacking_test_accuracy)
print("Testing Classification Report:\n", stacking_test_report)


Training Accuracy: 0.7561663143058492
Training Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.77      0.77     31260
           1       0.75      0.74      0.74     28338

    accuracy                           0.76     59598
   macro avg       0.76      0.76      0.76     59598
weighted avg       0.76      0.76      0.76     59598

Testing Accuracy: 0.7604026845637584
Testing Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.78      0.78      7868
           1       0.75      0.74      0.74      7032

    accuracy                           0.76     14900
   macro avg       0.76      0.76      0.76     14900
weighted avg       0.76      0.76      0.76     14900



In [19]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the model
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the model on the training set
gb.fit(X_train, y_train)

# Predict on the training set
train_preds = gb.predict(X_train)

# Predict on the testing set
test_preds = gb.predict(X_test)

# Evaluate training accuracy and classification report
train_accuracy = accuracy_score(y_train, train_preds)
train_report = classification_report(y_train, train_preds)

# Evaluate testing accuracy and classification report
test_accuracy = accuracy_score(y_test, test_preds)
test_report = classification_report(y_test, test_preds)

# Print the results
print("Training Accuracy:", train_accuracy)
print("Training Classification Report:\n", train_report)

print("Testing Accuracy:", test_accuracy)
print("Testing Classification Report:\n", test_report)


Training Accuracy: 0.7583979328165374
Training Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.77      0.77     31260
           1       0.75      0.74      0.74     28338

    accuracy                           0.76     59598
   macro avg       0.76      0.76      0.76     59598
weighted avg       0.76      0.76      0.76     59598

Testing Accuracy: 0.7561073825503356
Testing Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.77      0.77      7868
           1       0.74      0.74      0.74      7032

    accuracy                           0.76     14900
   macro avg       0.76      0.75      0.76     14900
weighted avg       0.76      0.76      0.76     14900



In [16]:
import joblib

# Save the trained stacking model
joblib.dump(meta_model, "stacking_model.pkl")
print("Model saved successfully!")


Model saved successfully!


In [17]:
import joblib

# Save the trained stacking model
joblib.dump(RandomForestClassifier, "RandomForestClassifier.pkl")
print("Model saved successfully!")


Model saved successfully!


In [18]:
import joblib

# Save the trained stacking model
joblib.dump(GradientBoostingClassifier, "GradientBoostingClassifier.pkl")
print("Model saved successfully!")

Model saved successfully!


In [19]:
import joblib

# Save the trained stacking model
joblib.dump(AdaBoostClassifier, "AdaBoostClassifier.pkl")
print("Model saved successfully!")

Model saved successfully!


In [20]:
print(type(rf))  # Check the loaded object type


<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [21]:
import joblib
joblib.dump(rf, "RandomForestClassifier.pkl")

['RandomForestClassifier.pkl']

In [13]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Define base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)

# Define MLPClassifier as the meta-classifier
meta_model = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)

# Prepare cross-validation for stacking
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize arrays to store predictions
train_meta_features = np.zeros((X_train.shape[0], 3))
test_meta_features = np.zeros((X_test.shape[0], 3))

# Train base models using cross-validation
for i, model in enumerate([rf, gb, ada]):
    test_fold_predictions = np.zeros((X_test.shape[0], 5))  # Store test predictions for each fold
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        
        # Predict on validation set
        train_meta_features[val_idx, i] = model.predict(X_train.iloc[val_idx])
        
        # Predict on test set
        test_fold_predictions[:, fold] = model.predict(X_test)
    
    # Average test predictions across folds
    test_meta_features[:, i] = test_fold_predictions.mean(axis=1)

# Train the MLP meta-classifier
meta_model.fit(train_meta_features, y_train)

# Predict on the test meta-features
stacking_predictions = meta_model.predict(test_meta_features)

# Evaluate the stacking model
stacking_accuracy = accuracy_score(y_test, stacking_predictions)
stacking_report = classification_report(y_test, stacking_predictions)

stacking_accuracy, stacking_report


(0.7613422818791946,
 '              precision    recall  f1-score   support\n\n           0       0.77      0.78      0.78      7868\n           1       0.75      0.74      0.75      7032\n\n    accuracy                           0.76     14900\n   macro avg       0.76      0.76      0.76     14900\nweighted avg       0.76      0.76      0.76     14900\n')

In [15]:
import google.generativeai as genai
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Configure Gemini API
GEMINI_API_KEY = "AIzaSyBGGcu0nbnTtfZv2OzfIH8GzwcjlosZV_0"
genai.configure(api_key=GEMINI_API_KEY)

def gemini_predict(features):
    """Use Gemini API to make predictions based on input features."""
    prompt = f"Predict the class label for the following feature set: {features.tolist()}"
    model = genai.GenerativeModel("gemini-1.5-pro-latest")
    response = model.generate_content(prompt)
    try:
        return int(response.text.strip())  # Convert output to integer label
    except ValueError:
        return 0  # Default fallback class if conversion fails

# Define base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)

# Define MLPClassifier as the meta-classifier
meta_model = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)

# Prepare cross-validation for stacking
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize arrays to store predictions
train_meta_features = np.zeros((X_train.shape[0], 3))
test_meta_features = np.zeros((X_test.shape[0], 3))

# Train base models using cross-validation
for i, model in enumerate([rf, gb, ada]):
    test_fold_predictions = np.zeros((X_test.shape[0], 5))  # Store test predictions for each fold
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        
        # Predict on validation set
        train_meta_features[val_idx, i] = model.predict(X_train.iloc[val_idx])
        
        # Predict on test set
        test_fold_predictions[:, fold] = model.predict(X_test)
    
    # Average test predictions across folds
    test_meta_features[:, i] = test_fold_predictions.mean(axis=1)

# Train the MLP meta-classifier
meta_model.fit(train_meta_features, y_train)

# Predict using Gemini API instead of MLP Meta-Learner
gemini_predictions = np.array([gemini_predict(features) for features in test_meta_features])

# Evaluate Gemini-based model
stacking_accuracy = accuracy_score(y_test, gemini_predictions)
stacking_report = classification_report(y_test, gemini_predictions)

stacking_accuracy, stacking_report

# ================================
# Previous Stacking Model (Commented Out)
# ================================
# meta_model.fit(train_meta_features, y_train)
# stacking_predictions = meta_model.predict(test_meta_features)
# stacking_accuracy = accuracy_score(y_test, stacking_predictions)
# stacking_report = classification_report(y_test, stacking_predictions)
# stacking_accuracy, stacking_report


  from .autonotebook import tqdm as notebook_tqdm


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 39
}
]

In [14]:
!pip install google.generativeai

Collecting google.generativeai
  Downloading google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google.generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google.generativeai)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google.generativeai)
  Downloading google_api_python_client-2.166.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting google-auth>=2.15.0 (from google.generativeai)
  Downloading google_auth-2.38.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting pydantic (from google.generativeai)
  Downloading pydantic-2.11.1-py3-none-any.whl.metadata (63 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google.generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Step 1: Updated Feature Engineering
def enhanced_feature_engineering(train_df, test_df, mcdm_weights):
    # Create copies to avoid modifying originals
    train = train_df.copy()
    test = test_df.copy()
    
    # Encode categorical features
    categorical_features = ['Gender', 'Job Role', 'Work-Life Balance', 'Job Satisfaction',
                           'Performance Rating', 'Education Level', 'Marital Status',
                           'Company Size', 'Company Reputation', 'Employee Recognition', 
                           'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities',
                           'Overall_Sentiment_Label']
    
    numerical_features = ['Age', 'Years at Company', 'Monthly Income', 'Number of Promotions',
                         'Distance from Home', 'Job Level', 'Company Tenure', 'Overall_Sentiment_Score']
    
    # Encode categorical features
    label_encoders = {}
    for col in categorical_features:
        if col in train.columns and col in test.columns:
            le = LabelEncoder()
            train[col] = le.fit_transform(train[col])
            test[col] = le.transform(test[col])
            label_encoders[col] = le
    
    # Handle Job Level separately if it contains non-numeric values
    if 'Job Level' in train.columns and not pd.api.types.is_numeric_dtype(train['Job Level']):
        le_job_level = LabelEncoder()
        train['Job Level'] = le_job_level.fit_transform(train['Job Level'])
        test['Job Level'] = le_job_level.transform(test['Job Level'])
    
    # Normalize numerical features
    scaler = StandardScaler()
    train[numerical_features] = scaler.fit_transform(train[numerical_features])
    test[numerical_features] = scaler.transform(test[numerical_features])
    
    # Create interaction features between high-weight features
    top_features = sorted(mcdm_weights.items(), key=lambda x: x[1], reverse=True)[:5]
    top_feature_names = [feat[0] for feat in top_features]
    
    for i, feat1 in enumerate(top_feature_names):
        if feat1 in train.columns and feat1 in test.columns:
            for j, feat2 in enumerate(top_feature_names[i+1:], i+1):
                if feat2 in train.columns and feat2 in test.columns:
                    interaction_name = f"{feat1}_x_{feat2}"
                    train[interaction_name] = train[feat1] * train[feat2]
                    test[interaction_name] = test[feat1] * test[feat2]
    
    # Create polynomial features for top 3 features
    top3_features = [feat[0] for feat in top_features[:3]]
    valid_top3 = [f for f in top3_features if f in train.columns and f in test.columns]
    
    if valid_top3:
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        train_poly = poly.fit_transform(train[valid_top3])
        test_poly = poly.transform(test[valid_top3])
        
        poly_feature_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
        train_poly_df = pd.DataFrame(train_poly, columns=poly_feature_names)
        test_poly_df = pd.DataFrame(test_poly, columns=poly_feature_names)
        
        train = pd.concat([train, train_poly_df], axis=1)
        test = pd.concat([test, test_poly_df], axis=1)
    
    # Create ratio features
    if 'Monthly Income' in train.columns and 'Years at Company' in train.columns:
        train['Income_per_Year'] = train['Monthly Income'] / (train['Years at Company'] + 1)  # +1 to avoid division by zero
        test['Income_per_Year'] = test['Monthly Income'] / (test['Years at Company'] + 1)
    
    if 'Number of Promotions' in train.columns and 'Years at Company' in train.columns:
        train['Promotion_Rate'] = train['Number of Promotions'] / (train['Years at Company'] + 1)
        test['Promotion_Rate'] = test['Number of Promotions'] / (test['Years at Company'] + 1)
    
    # Apply MCDM weights to features
    for feature, weight in mcdm_weights.items():
        if feature in train.columns and feature in test.columns:
            train[feature] *= weight
            test[feature] *= weight
    
    # Create weighted sentiment features
    if 'Overall_Sentiment_Score' in train.columns:
        for feat in ['Job Satisfaction', 'Work-Life Balance', 'Performance Rating']:
            if feat in train.columns and feat in test.columns:
                train[f'{feat}_Sentiment'] = train[feat] * train['Overall_Sentiment_Score']
                test[f'{feat}_Sentiment'] = test[feat] * test['Overall_Sentiment_Score']
    
    # Convert target column to binary
    if 'Attrition' in train.columns:
        train["Attrition"] = train["Attrition"].apply(lambda x: 1 if x == "Left" else 0) if not pd.api.types.is_numeric_dtype(train["Attrition"]) else train["Attrition"]
    
    if 'Attrition' in test.columns:
        test["Attrition"] = test["Attrition"].apply(lambda x: 1 if x == "Left" else 0) if not pd.api.types.is_numeric_dtype(test["Attrition"]) else test["Attrition"]
    
    return train, test

In [9]:
# Step 2: Feature Selection based on weighted importance
def select_features(X_train, y_train, top_n=None):
    selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold=-np.inf, max_features=top_n)
    selector.fit(X_train, y_train)
    return selector

In [10]:
# Step 3: Advanced Ensemble Model
def build_advanced_ensemble(X_train, y_train, X_test, y_test):
    # Handle class imbalance with SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Define base models with hyperparameters
    rf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, 
                               min_samples_leaf=2, class_weight='balanced', random_state=42)
    
    gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=8, 
                                    min_samples_split=5, min_samples_leaf=2, random_state=42)
    
    xgb_model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=8, 
                                 subsample=0.8, colsample_bytree=0.8, random_state=42)
    
    lgb_model = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=8,
                                  num_leaves=31, subsample=0.8, colsample_bytree=0.8,
                                  random_state=42)
    
    # Neural network for different learning patterns
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam',
                       alpha=0.0001, batch_size=256, learning_rate='adaptive',
                       max_iter=200, random_state=42)
    
    # Create voting classifier
    voting_clf = VotingClassifier(estimators=[
        ('rf', rf),
        ('gb', gb),
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('mlp', mlp)
    ], voting='soft')
    
    # Fit all models
    print("Training Voting Ensemble...")
    voting_clf.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    print("Making predictions...")
    y_train_pred = voting_clf.predict(X_train)
    y_test_pred = voting_clf.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))
    
    # Return the trained model and predictions
    return voting_clf, y_test_pred, test_accuracy

In [11]:
# Step 4: Stacked Ensemble with Meta-learner
def build_stacked_ensemble(X_train, y_train, X_test, y_test):
    # Handle class imbalance with SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Define base models
    base_models = [
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=8, random_state=42)),
        ('lgb', lgb.LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=8, random_state=42)),
        ('svm', SVC(probability=True, kernel='rbf', C=1.0, random_state=42))
    ]
    
    # Prepare cross-validation for stacking
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Initialize arrays to store meta-features
    n_models = len(base_models)
    train_meta_features = np.zeros((X_train.shape[0], n_models))
    test_meta_features = np.zeros((X_test.shape[0], n_models))
    
    # Generate meta-features through cross-validation
    print("Generating meta-features through cross-validation...")
    for i, (name, model) in enumerate(base_models):
        print(f"Training {name}...")
        test_fold_preds = np.zeros((X_test.shape[0], skf.n_splits))
        
        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_resampled, y_train_resampled)):
            # Train on resampled data but use original indices
            fold_X_train, fold_y_train = X_train_resampled[train_idx], y_train_resampled[train_idx]
            fold_X_val = X_train_resampled[val_idx]
            
            # Fit model on training fold
            model.fit(fold_X_train, fold_y_train)
            
            # Generate predictions for validation fold
            val_indices_in_original = val_idx
            train_meta_features[val_indices_in_original, i] = model.predict_proba(X_train.iloc[val_indices_in_original])[:, 1]
            
            # Generate predictions for test set
            test_fold_preds[:, fold] = model.predict_proba(X_test)[:, 1]
        
        # Average predictions across folds for test set
        test_meta_features[:, i] = test_fold_preds.mean(axis=1)
    
    # Train meta-learner
    print("Training meta-learner...")
    meta_learner = LogisticRegression(C=10.0, class_weight='balanced', random_state=42)
    meta_learner.fit(train_meta_features, y_train)
    
    # Make predictions with meta-learner
    print("Making predictions with meta-learner...")
    stacked_train_preds = meta_learner.predict(train_meta_features)
    stacked_test_preds = meta_learner.predict(test_meta_features)
    
    # Calculate accuracy
    stacked_train_accuracy = accuracy_score(y_train, stacked_train_preds)
    stacked_test_accuracy = accuracy_score(y_test, stacked_test_preds)
    
    print(f"Stacked Training Accuracy: {stacked_train_accuracy:.4f}")
    print(f"Stacked Testing Accuracy: {stacked_test_accuracy:.4f}")
    print("\nStacked Classification Report:")
    print(classification_report(y_test, stacked_test_preds))
    
    return meta_learner, train_meta_features, test_meta_features, stacked_test_preds, stacked_test_accuracy

In [12]:
# Step 5: Main workflow
def main(train_df, test_df, mcdm_weights):
    # Feature engineering
    print("Performing feature engineering...")
    enhanced_train, enhanced_test = enhanced_feature_engineering(train_df, test_df, mcdm_weights)
    
    # Prepare features and target
    X_train = enhanced_train.drop(columns=["Attrition", "Employee ID"])
    y_train = enhanced_train["Attrition"]
    X_test = enhanced_test.drop(columns=["Attrition", "Employee ID"])
    y_test = enhanced_test["Attrition"]
    
    # Feature selection (optional)
    top_n = X_train.shape[1] - 10  # Select all but 10 features
    print(f"Selecting top {top_n} features...")
    selector = select_features(X_train, y_train, top_n=top_n)
    
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Convert to DataFrame with feature names
    selected_features = X_train.columns[selector.get_support()]
    X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
    X_test_selected = pd.DataFrame(X_test_selected, columns=selected_features)
    
    print(f"Selected {len(selected_features)} features")
    
    # Build models
    print("\nBuilding and training voting ensemble...")
    voting_model, voting_preds, voting_accuracy = build_advanced_ensemble(
        X_train_selected, y_train, X_test_selected, y_test)
    
    print("\nBuilding and training stacked ensemble...")
    stacked_model, train_meta, test_meta, stacked_preds, stacked_accuracy = build_stacked_ensemble(
        X_train_selected, y_train, X_test_selected, y_test)
    
    # Final evaluation - compare models
    print("\n===== Final Results =====")
    print(f"Voting Ensemble Accuracy: {voting_accuracy:.4f}")
    print(f"Stacked Ensemble Accuracy: {stacked_accuracy:.4f}")
    
    # Blend predictions for final result
    print("\nCreating final blended predictions...")
    alpha = 0.6  # Weight for voting ensemble
    blend_preds = np.zeros_like(voting_preds, dtype=float)
    
    # Convert to probabilities
    voting_probs = voting_model.predict_proba(X_test_selected)[:, 1]
    stacked_probs = stacked_model.predict_proba(test_meta)[:, 1]
    
    # Blend probabilities
    blended_probs = alpha * voting_probs + (1-alpha) * stacked_probs
    blended_preds = (blended_probs > 0.5).astype(int)
    
    # Calculate accuracy of blended model
    blended_accuracy = accuracy_score(y_test, blended_preds)
    print(f"Blended Ensemble Accuracy: {blended_accuracy:.4f}")
    print("\nBlended Classification Report:")
    print(classification_report(y_test, blended_preds))
    
    return voting_model, stacked_model, blended_preds, blended_accuracy

# Example usage
if __name__ == "__main__":
    # Define the MCDM feature weights
    mcdm_weights = {
        'Work-Life Balance': 0.363535,
        'Job Satisfaction': 0.444109,
        'Years at Company': 0.373823,
        'Monthly Income': 0.344891,
        'Performance Rating': 0.295121,
        'Job Level': 0.362679,
        'Number of Promotions': 0.397305,
        'Distance from Home': 0.315833,
        'Overall_Sentiment_Score': 0.248932,
        'Company Size': 0.223218,
        'Employee Recognition': 0.415096,
        'Education Level': 0.301249,
        'Company Tenure': 0.226470,
        'Company Reputation': 0.268320
    }
    
    # Run the main workflow
    voting_model, stacked_model, blended_preds, blended_accuracy = main(train_df, test_df, mcdm_weights)

Performing feature engineering...
Selecting top 38 features...
Selected 38 features

Building and training voting ensemble...
Training Voting Ensemble...
[LightGBM] [Info] Number of positive: 31260, number of negative: 31260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8617
[LightGBM] [Info] Number of data points in the train set: 62520, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Making predictions...
Training Accuracy: 0.8834
Testing Accuracy: 0.7536

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      7868
           1       0.73      0.75      0.74      7032

    accuracy                           0.75     14900
   macro avg       0.75     

KeyError: "None of [Index([    0,     2,     3,     4,     5,     6,     7,     8,     9,    10,\n       ...\n       62509, 62510, 62512, 62513, 62514, 62515, 62516, 62517, 62518, 62519],\n      dtype='int32', length=50016)] are in the [columns]"

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from sklearn.decomposition import PCA
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Step 1: Enhanced Feature Engineering
def enhanced_feature_engineering(train_df, test_df, mcdm_weights):
    # Create copies to avoid modifying originals
    train = train_df.copy()
    test = test_df.copy()
    
    # Encode categorical features
    categorical_features = ['Gender', 'Job Role', 'Work-Life Balance', 'Job Satisfaction',
                           'Performance Rating', 'Education Level', 'Marital Status',
                           'Company Size', 'Company Reputation', 'Employee Recognition', 
                           'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities',
                           'Overall_Sentiment_Label']
    
    numerical_features = ['Age', 'Years at Company', 'Monthly Income', 'Number of Promotions',
                         'Distance from Home', 'Job Level', 'Company Tenure', 'Overall_Sentiment_Score']
    
    # Encode categorical features
    label_encoders = {}
    for col in categorical_features:
        if col in train.columns and col in test.columns:
            le = LabelEncoder()
            train[col] = le.fit_transform(train[col])
            test[col] = le.transform(test[col])
            label_encoders[col] = le
    
    # Handle Job Level separately if it contains non-numeric values
    if 'Job Level' in train.columns and not pd.api.types.is_numeric_dtype(train['Job Level']):
        le_job_level = LabelEncoder()
        train['Job Level'] = le_job_level.fit_transform(train['Job Level'])
        test['Job Level'] = le_job_level.transform(test['Job Level'])
    
    # Normalize numerical features
    scaler = StandardScaler()
    train[numerical_features] = scaler.fit_transform(train[numerical_features])
    test[numerical_features] = scaler.transform(test[numerical_features])
    
    # Create interaction features between high-weight features
    top_features = sorted(mcdm_weights.items(), key=lambda x: x[1], reverse=True)[:5]
    top_feature_names = [feat[0] for feat in top_features]
    
    for i, feat1 in enumerate(top_feature_names):
        if feat1 in train.columns and feat1 in test.columns:
            for j, feat2 in enumerate(top_feature_names[i+1:], i+1):
                if feat2 in train.columns and feat2 in test.columns:
                    interaction_name = f"{feat1}_x_{feat2}"
                    train[interaction_name] = train[feat1] * train[feat2]
                    test[interaction_name] = test[feat1] * test[feat2]
    
    # Create polynomial features for top 3 features
    top3_features = [feat[0] for feat in top_features[:3]]
    valid_top3 = [f for f in top3_features if f in train.columns and f in test.columns]
    
    if valid_top3:
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        train_poly = poly.fit_transform(train[valid_top3])
        test_poly = poly.transform(test[valid_top3])
        
        poly_feature_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
        train_poly_df = pd.DataFrame(train_poly, columns=poly_feature_names)
        test_poly_df = pd.DataFrame(test_poly, columns=poly_feature_names)
        
        train = pd.concat([train, train_poly_df], axis=1)
        test = pd.concat([test, test_poly_df], axis=1)
    
    # Create ratio features
    if 'Monthly Income' in train.columns and 'Years at Company' in train.columns:
        train['Income_per_Year'] = train['Monthly Income'] / (train['Years at Company'] + 1)  # +1 to avoid division by zero
        test['Income_per_Year'] = test['Monthly Income'] / (test['Years at Company'] + 1)
    
    if 'Number of Promotions' in train.columns and 'Years at Company' in train.columns:
        train['Promotion_Rate'] = train['Number of Promotions'] / (train['Years at Company'] + 1)
        test['Promotion_Rate'] = test['Number of Promotions'] / (test['Years at Company'] + 1)
        
        # New features based on the relationship between promotions and tenure
        train['Promo_Tenure_Ratio'] = train['Number of Promotions'] / (train['Company Tenure'] + 1)
        test['Promo_Tenure_Ratio'] = test['Number of Promotions'] / (test['Company Tenure'] + 1)
    
    # Create satisfaction to work ratio
    if 'Job Satisfaction' in train.columns and 'Work-Life Balance' in train.columns:
        train['Satisfaction_Balance_Ratio'] = train['Job Satisfaction'] / (train['Work-Life Balance'] + 0.1)
        test['Satisfaction_Balance_Ratio'] = test['Job Satisfaction'] / (test['Work-Life Balance'] + 0.1)
    
    # Create age-related features
    if 'Age' in train.columns and 'Years at Company' in train.columns:
        train['Age_at_Join'] = train['Age'] - train['Years at Company']
        test['Age_at_Join'] = test['Age'] - test['Years at Company']
    
    # Apply MCDM weights to features
    for feature, weight in mcdm_weights.items():
        if feature in train.columns and feature in test.columns:
            train[feature] *= weight
            test[feature] *= weight
    
    # Create weighted sentiment features
    if 'Overall_Sentiment_Score' in train.columns:
        for feat in ['Job Satisfaction', 'Work-Life Balance', 'Performance Rating']:
            if feat in train.columns and feat in test.columns:
                train[f'{feat}_Sentiment'] = train[feat] * train['Overall_Sentiment_Score']
                test[f'{feat}_Sentiment'] = test[feat] * test['Overall_Sentiment_Score']
    
    # Convert target column to binary
    if 'Attrition' in train.columns:
        train["Attrition"] = train["Attrition"].apply(lambda x: 1 if x == "Left" else 0) if not pd.api.types.is_numeric_dtype(train["Attrition"]) else train["Attrition"]
    
    if 'Attrition' in test.columns:
        test["Attrition"] = test["Attrition"].apply(lambda x: 1 if x == "Left" else 0) if not pd.api.types.is_numeric_dtype(test["Attrition"]) else test["Attrition"]
    
    return train, test

# Step 2: Feature Selection based on weighted importance
def select_features(X_train, y_train, top_n=None):
    selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold=-np.inf, max_features=top_n)
    selector.fit(X_train, y_train)
    return selector

# Step 3: Advanced Ensemble Model
def build_advanced_ensemble(X_train, y_train, X_test, y_test):
    # Handle class imbalance with SMOTETomek (more advanced)
    print("Applying SMOTETomek for class balancing...")
    smote_tomek = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)
    
    # Define base models with hyperparameters
    rf = RandomForestClassifier(
        n_estimators=300, 
        max_depth=25, 
        min_samples_split=4, 
        min_samples_leaf=1, 
        class_weight='balanced', 
        bootstrap=True,
        random_state=42
    )
    
    gb = GradientBoostingClassifier(
        n_estimators=250, 
        learning_rate=0.08, 
        max_depth=10, 
        min_samples_split=4, 
        min_samples_leaf=1,
        subsample=0.9,
        max_features='sqrt',
        random_state=42
    )
    
    xgb_model = xgb.XGBClassifier(
        n_estimators=300, 
        learning_rate=0.08, 
        max_depth=10, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        colsample_bylevel=0.8,
        min_child_weight=3,
        reg_alpha=0.5,
        reg_lambda=1.0,
        scale_pos_weight=1.0,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    
    lgb_model = lgb.LGBMClassifier(
        n_estimators=300, 
        learning_rate=0.08, 
        max_depth=10,
        num_leaves=40, 
        subsample=0.8, 
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=1.0,
        min_child_samples=20,
        random_state=42
    )
    
    # Neural network for different learning patterns
    mlp = MLPClassifier(
        hidden_layer_sizes=(120, 80, 40), 
        activation='relu', 
        solver='adam',
        alpha=0.0005, 
        batch_size=256, 
        learning_rate='adaptive',
        max_iter=300, 
        early_stopping=True,
        random_state=42
    )
    
    # Create voting classifier with calibrated probabilities
    voting_clf = VotingClassifier(estimators=[
        ('rf', rf),
        ('gb', gb),
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('mlp', mlp)
    ], voting='soft', weights=[2, 1.5, 2, 1.5, 1])  # Weight models based on typical performance
    
    # Fit all models
    print("Training Voting Ensemble...")
    voting_clf.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    print("Making predictions...")
    y_train_pred = voting_clf.predict(X_train)
    y_test_pred = voting_clf.predict(X_test)
    y_test_proba = voting_clf.predict_proba(X_test)[:, 1]
    
    # Calculate accuracy and AUC
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, y_test_proba)
    
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))
    
    # Return the trained model and predictions
    return voting_clf, y_test_pred, y_test_proba, test_accuracy

# Step 4: Fixed Stacked Ensemble with Meta-learner
def build_stacked_ensemble(X_train, y_train, X_test, y_test):
    # Handle class imbalance with SMOTE
    print("Applying SMOTE for class balancing...")
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Define base models
    base_models = [
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=8, random_state=42, use_label_encoder=False, eval_metric='logloss')),
        ('lgb', lgb.LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=8, random_state=42)),
        ('svm', SVC(probability=True, kernel='rbf', C=1.0, random_state=42))
    ]
    
    # Generate meta-features using cross_val_predict
    print("Generating meta-features through cross-validation...")
    n_models = len(base_models)
    train_meta_features = np.zeros((X_train_resampled.shape[0], n_models))
    test_meta_features = np.zeros((X_test.shape[0], n_models))
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Generate meta-features for training set
    for i, (name, model) in enumerate(base_models):
        print(f"Training {name}...")
        # Generate predictions for each fold using cross-validation
        train_meta_features[:, i] = cross_val_predict(
            model, X_train_resampled, y_train_resampled, 
            cv=skf, method='predict_proba', n_jobs=-1
        )[:, 1]
        
        # Train the model on the entire resampled training data
        model.fit(X_train_resampled, y_train_resampled)
        
        # Generate predictions for test set
        test_meta_features[:, i] = model.predict_proba(X_test)[:, 1]
    
    # Train meta-learner
    print("Training meta-learner...")
    meta_learner = LogisticRegression(C=10.0, class_weight='balanced', random_state=42)
    meta_learner.fit(train_meta_features, y_train_resampled)
    
    # Make predictions with meta-learner
    print("Making predictions with meta-learner...")
    # We need to map our predictions back to the original training set for fair comparison
    # For this simplified example, we'll just use the meta-learner on the test set
    stacked_test_preds = meta_learner.predict(test_meta_features)
    stacked_test_proba = meta_learner.predict_proba(test_meta_features)[:, 1]
    
    # Calculate accuracy
    stacked_test_accuracy = accuracy_score(y_test, stacked_test_preds)
    roc_auc = roc_auc_score(y_test, stacked_test_proba)
    
    print(f"Stacked Testing Accuracy: {stacked_test_accuracy:.4f}")
    print(f"Stacked ROC AUC Score: {roc_auc:.4f}")
    print("\nStacked Classification Report:")
    print(classification_report(y_test, stacked_test_preds))
    
    return meta_learner, train_meta_features, test_meta_features, stacked_test_preds, stacked_test_proba, stacked_test_accuracy

# Step 5: Advanced Hyperparameter Tuning for XGBoost
def tune_xgboost(X_train, y_train, X_test, y_test):
    print("Performing hyperparameter tuning for XGBoost...")
    
    # Apply SMOTE for class balancing
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Define parameter grid for XGBoost
    param_dist = {
        'n_estimators': [300, 500, 700],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [6, 8, 10, 12],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_alpha': [0, 0.1, 0.5, 1],
        'reg_lambda': [0.1, 0.5, 1, 5],
        'scale_pos_weight': [1, 3, 5]
    }
    
    # Initialize XGBoost classifier
    xgb_model = xgb.XGBClassifier(objective='binary:logistic', 
                                 use_label_encoder=False, 
                                 eval_metric='logloss',
                                 random_state=42)
    
    # Set up random search
    n_iter_search = 30  # Number of parameter combinations to try
    random_search = RandomizedSearchCV(
        xgb_model, 
        param_distributions=param_dist, 
        n_iter=n_iter_search,
        scoring='roc_auc',
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    
    # Perform random search
    random_search.fit(X_train_resampled, y_train_resampled)
    
    # Get best model
    best_model = random_search.best_estimator_
    
    # Make predictions
    y_test_pred = best_model.predict(X_test)
    y_test_proba = best_model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    test_accuracy = accuracy_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, y_test_proba)
    
    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best XGBoost Testing Accuracy: {test_accuracy:.4f}")
    print(f"Best XGBoost ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))
    
    return best_model, y_test_pred, y_test_proba, test_accuracy

# Step 6: Ensemble Blending with Optimization
def optimize_blend_weights(base_probas, y_true):
    """
    Find optimal weights for blending multiple model probabilities
    """
    from scipy.optimize import minimize
    
    def objective(weights, probas, y_true):
        # Normalize weights to sum to 1
        weights = weights / np.sum(weights)
        
        # Calculate blended probabilities
        blended_probas = np.zeros(y_true.shape)
        for i, w in enumerate(weights):
            blended_probas += w * probas[i]
        
        # Convert to binary predictions
        blended_preds = (blended_probas > 0.5).astype(int)
        
        # Return negative accuracy (to minimize)
        return -accuracy_score(y_true, blended_preds)
    
    # Initialize with equal weights
    n_models = len(base_probas)
    initial_weights = np.ones(n_models) / n_models
    
    # Constraints: weights sum to 1
    constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
    
    # Bounds: weights between 0 and 1
    bounds = [(0, 1) for _ in range(n_models)]
    
    # Optimize
    result = minimize(
        objective, 
        initial_weights, 
        args=(base_probas, y_true),
        method='SLSQP',
        bounds=bounds,
        constraints=constraints
    )
    
    # Normalize weights to sum to 1
    optimal_weights = result.x / np.sum(result.x)
    return optimal_weights

# Step 7: Main workflow with advanced techniques
def main(train_df, test_df, mcdm_weights):
    # Feature engineering
    print("Performing feature engineering...")
    enhanced_train, enhanced_test = enhanced_feature_engineering(train_df, test_df, mcdm_weights)
    
    # Prepare features and target
    X_train = enhanced_train.drop(columns=["Attrition", "Employee ID"])
    y_train = enhanced_train["Attrition"]
    X_test = enhanced_test.drop(columns=["Attrition", "Employee ID"])
    y_test = enhanced_test["Attrition"]
    
    # Feature selection
    top_n = X_train.shape[1] - 5  # Select all but 5 features
    print(f"Selecting top {top_n} features...")
    selector = select_features(X_train, y_train, top_n=top_n)
    
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Convert to DataFrame with feature names
    selected_features = X_train.columns[selector.get_support()]
    X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
    X_test_selected = pd.DataFrame(X_test_selected, columns=selected_features)
    
    print(f"Selected {len(selected_features)} features")
    
    # Build models
    print("\nBuilding and training voting ensemble...")
    voting_model, voting_preds, voting_probs, voting_accuracy = build_advanced_ensemble(
        X_train_selected, y_train, X_test_selected, y_test)
    
    print("\nBuilding and training stacked ensemble...")
    stacked_model, train_meta, test_meta, stacked_preds, stacked_probs, stacked_accuracy = build_stacked_ensemble(
        X_train_selected, y_train, X_test_selected, y_test)
    
    print("\nPerforming hyperparameter tuning for XGBoost...")
    xgb_tuned, xgb_preds, xgb_probs, xgb_accuracy = tune_xgboost(
        X_train_selected, y_train, X_test_selected, y_test)
    
    # Final evaluation - compare models
    print("\n===== Final Results =====")
    print(f"Voting Ensemble Accuracy: {voting_accuracy:.4f}")
    print(f"Stacked Ensemble Accuracy: {stacked_accuracy:.4f}")
    print(f"Tuned XGBoost Accuracy: {xgb_accuracy:.4f}")
    
    # Create a super ensemble (blending)
    print("\nCreating optimized blended predictions...")
    
    # Collect all probability predictions
    all_probs = [voting_probs, stacked_probs, xgb_probs]
    
    # Find optimal blending weights
    optimal_weights = optimize_blend_weights(all_probs, y_test)
    print(f"Optimal blending weights: {optimal_weights}")
    
    # Apply optimal weights
    blended_probs = np.zeros_like(voting_probs)
    for i, probs in enumerate(all_probs):
        blended_probs += optimal_weights[i] * probs
    
    # Convert to predictions
    blended_preds = (blended_probs > 0.5).astype(int)
    
    # Calculate accuracy of blended model
    blended_accuracy = accuracy_score(y_test, blended_preds)
    blended_auc = roc_auc_score(y_test, blended_probs)
    
    print(f"Blended Ensemble Accuracy: {blended_accuracy:.4f}")
    print(f"Blended Ensemble AUC: {blended_auc:.4f}")
    print("\nBlended Classification Report:")
    print(classification_report(y_test, blended_preds))
    
    # Finally, try threshold optimization
    print("\nOptimizing prediction threshold...")
    thresholds = np.linspace(0.3, 0.7, 50)
    best_threshold = 0.5
    best_accuracy = 0
    
    for threshold in thresholds:
        threshold_preds = (blended_probs > threshold).astype(int)
        acc = accuracy_score(y_test, threshold_preds)
        if acc > best_accuracy:
            best_accuracy = acc
            best_threshold = threshold
    
    # Apply optimal threshold
    final_preds = (blended_probs > best_threshold).astype(int)
    final_accuracy = accuracy_score(y_test, final_preds)
    
    print(f"Optimal threshold: {best_threshold:.4f}")
    print(f"Final Ensemble Accuracy: {final_accuracy:.4f}")
    print("\nFinal Classification Report:")
    print(classification_report(y_test, final_preds))
    
    # Feature importance analysis
    feature_importances = voting_model.named_estimators_['rf'].feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("\nTop 10 Important Features:")
    print(feature_importance_df.head(10))
    
    return voting_model, stacked_model, xgb_tuned, final_preds, final_accuracy

# Example usage
if __name__ == "__main__":
    # Define the MCDM feature weights
    mcdm_weights = {
        'Work-Life Balance': 0.363535,
        'Job Satisfaction': 0.444109,
        'Years at Company': 0.373823,
        'Monthly Income': 0.344891,
        'Performance Rating': 0.295121,
        'Job Level': 0.362679,
        'Number of Promotions': 0.397305,
        'Distance from Home': 0.315833,
        'Overall_Sentiment_Score': 0.248932,
        'Company Size': 0.223218,
        'Employee Recognition': 0.415096,
        'Education Level': 0.301249,
        'Company Tenure': 0.226470,
        'Company Reputation': 0.268320
    }
    
    # Run the main workflow
    voting_model, stacked_model, xgb_model, final_preds, final_accuracy = main(train_df, test_df, mcdm_weights)

Performing feature engineering...
Selecting top 46 features...
Selected 46 features

Building and training voting ensemble...
Applying SMOTETomek for class balancing...
Training Voting Ensemble...
[LightGBM] [Info] Number of positive: 26104, number of negative: 26104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9374
[LightGBM] [Info] Number of data points in the train set: 52208, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Making predictions...
Training Accuracy: 0.9136
Testing Accuracy: 0.7517
ROC AUC Score: 0.8438

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      7868
           1       0.73      0.74      0.74      7032

    accuracy                           0.75     14900
   macro avg       0.75    