### Model Building and Saving  

1. **Training Multiple Models**
- Implemented **Logistic Regression, Random Forest, and XGBoost** for churn prediction.  
- Evaluated performance using **accuracy, precision, recall, and F1 score**.  
- Selected the **best-performing model** for deployment.  

2. **Model Evaluation**
- Compared the models on a held-out 20% test split.  
- Chose the final model based on the **highest F1 score**; the other metrics are recorded in the comparison report.  

3. **Model Saving**
- Saved the best model with `pickle` under a timestamped filename for future inference:  
  ```python
  import pickle
  with open(model_path, "wb") as model_file:
      pickle.dump(best_model, model_file)
  ```


#### Extracting Data for training

In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Database connection configuration.
# The password is never stored in the notebook: it is taken from the
# BANK_CHURN_DB_PASSWORD environment variable, and if that is unset or empty
# the user is prompted for it (the prompt does not echo the input).
db_password = os.environ.get("BANK_CHURN_DB_PASSWORD") or getpass("PostgreSQL password: ")
password = quote_plus(db_password)  # URL-encode special characters in password
DB_URI = f"postgresql+psycopg2://postgres:{password}@localhost:5432/bank_churn"
engine = create_engine(DB_URI)

# Fetch the rows of the latest training version.
# MAX(version) is used (same form as the API query later in this notebook)
# because MAX ignores NULLs, whereas ORDER BY ... DESC LIMIT 1 would return a
# NULL version first in PostgreSQL and make the outer comparison match nothing.
query = """
SELECT *
FROM feature_values
WHERE data_source = 'train'
  AND version = (
      SELECT MAX(version)
      FROM feature_values
      WHERE data_source = 'train'
  );
"""

# Load the latest training data into a Pandas DataFrame
df_train = pd.read_sql(query, engine)

# Drop bookkeeping columns that are not model features; the target column
# 'exited' is kept. Rebinding (instead of inplace=True) keeps this cell
# idempotent when it is re-run.
columns_to_drop = ['id', 'data_source', 'version']
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')

print("✅ Cleaned Training Data Preview:")
print(df_train.head())


✅ Cleaned Training Data Preview:
   creditscore       age  tenure   balance  numofproducts  isactivemember  \
0     0.569231 -0.596932       3 -0.883161              2               0   
1     0.472189 -0.596932       1 -0.883161              2               1   
2     0.592899  0.258795      10 -0.883161              2               0   
3     0.363314 -0.474685       2  1.486914              1               1   
4     0.682840 -0.596932       5 -0.883161              2               1   

   geography_france  geography_germany  geography_spain  balanceperproduct  \
0              True              False            False          -0.441580   
1              True              False            False          -0.441580   
2              True              False            False          -0.441580   
3              True              False            False           1.486914   
4             False              False             True          -0.441580   

   exited  
0       0  
1       0  

#### Splitting the data for training

In [2]:
from sklearn.model_selection import train_test_split

# Remove any identifier columns that are still present in the training frame
cols_to_drop = [name for name in ('id', 'version') if name in df_train.columns]
df_train.drop(columns=cols_to_drop, inplace=True)

# The 'exited' column is the target; every remaining column is a feature
y = df_train["exited"]
X = df_train.drop(columns=["exited"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train-test split completed:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


Train-test split completed:
X_train shape: (132027, 10)
X_test shape: (33007, 10)


#### Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Fit the logistic-regression baseline on the training split
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Predict on the held-out split and compute the four headline metrics in one pass
y_pred_lr = lr_model.predict(X_test)
lr_acc, lr_prec, lr_rec, lr_f1 = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Summary of the metrics followed by the per-class report
print("Logistic Regression Performance:")
print("Accuracy:", lr_acc)
print("Precision:", lr_prec)
print("Recall:", lr_rec)
print("F1 Score:", lr_f1)
print(classification_report(y_test, y_pred_lr))


Logistic Regression Performance:
Accuracy: 0.837307237858636
Precision: 0.6867126218207749
Recall: 0.41622244633338135
F1 Score: 0.5182992465016146
              precision    recall  f1-score   support

           0       0.86      0.95      0.90     26066
           1       0.69      0.42      0.52      6941

    accuracy                           0.84     33007
   macro avg       0.77      0.68      0.71     33007
weighted avg       0.82      0.84      0.82     33007



#### Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with library-default hyperparameters and a fixed seed
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out split and compute the four headline metrics in one pass
y_pred_rf = rf_model.predict(X_test)
rf_acc, rf_prec, rf_rec, rf_f1 = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Summary of the metrics followed by the per-class report
print("Random Forest Performance:")
print("Accuracy:", rf_acc)
print("Precision:", rf_prec)
print("Recall:", rf_rec)
print("F1 Score:", rf_f1)
print(classification_report(y_test, y_pred_rf))


Random Forest Performance:
Accuracy: 0.8421546944587511
Precision: 0.6548577563070317
Recall: 0.5273015415646161
F1 Score: 0.5841979249800479
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     26066
           1       0.65      0.53      0.58      6941

    accuracy                           0.84     33007
   macro avg       0.77      0.73      0.74     33007
weighted avg       0.83      0.84      0.84     33007



#### XGBoost

In [5]:
import xgboost as xgb
import lightgbm as lgb  # NOTE(review): not referenced anywhere in this cell — confirm it is still needed
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### 🔹 Train XGBoost Model
# `use_label_encoder` is deliberately not passed: the recorded output of this
# cell shows the installed XGBoost reporting that the parameter is not used,
# so it only produced a warning and had no effect on the fitted model.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# XGBoost performance on the held-out split; these names are consumed by the
# model-comparison cell further down.
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_prec = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

print("\n🔹 XGBoost Performance:")
print(f"Accuracy: {xgb_acc:.6f}")
print(f"Precision: {xgb_prec:.6f}")
print(f"Recall: {xgb_recall:.6f}")
print(f"F1 Score: {xgb_f1:.6f}")
print("\n", classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.




🔹 XGBoost Performance:
Accuracy: 0.862302
Precision: 0.732802
Recall: 0.543293
F1 Score: 0.623976

               precision    recall  f1-score   support

           0       0.89      0.95      0.92     26066
           1       0.73      0.54      0.62      6941

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.85      0.86      0.85     33007



#### Comparing all three models and storing report and best model with versioning

In [8]:
import os
import pickle
import pandas as pd
from datetime import datetime

# Models and comparison reports share one output folder
ARTIFACTS_DIR = "ModelnReports"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# A single timestamp per run versions both the report and the model file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Held-out metrics from the three training cells, one list position per model
model_performance = {
    "Model": ["Logistic Regression", "Random Forest", "XGBoost"],
    "Accuracy": [lr_acc, rf_acc, xgb_acc],
    "Precision": [lr_prec, rf_prec, xgb_prec],
    "Recall": [lr_rec, rf_rec, xgb_recall],
    "F1 Score": [lr_f1, rf_f1, xgb_f1]
}
performance_df = pd.DataFrame(model_performance)

# Write the comparison table to a versioned CSV
report_filename = f"model_comparison_report_{timestamp}.csv"
report_path = os.path.join(ARTIFACTS_DIR, report_filename)
performance_df.to_csv(report_path, index=False)
print(f"✅ Model comparison report saved at: {report_path}")

# The winner is the row with the highest F1 Score (first row wins on a tie)
best_row = performance_df["F1 Score"].idxmax()
best_model_name = performance_df.at[best_row, "Model"]
trained_models = {
    "Logistic Regression": lr_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model,
}
best_model = trained_models[best_model_name]

# File name: lower-cased model name with underscores, plus the run timestamp
model_slug = best_model_name.lower().replace(' ', '_')
model_filename = f"{model_slug}_{timestamp}.pkl"
model_path = os.path.join(ARTIFACTS_DIR, model_filename)

# Persist the winning estimator with pickle
with open(model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"✅ Best model ({best_model_name}) saved as: {model_path}")


✅ Model comparison report saved at: ModelnReports\model_comparison_report_20250311_203553.csv
✅ Best model (XGBoost) saved as: ModelnReports\xgboost_20250311_203553.pkl


#### Fetching the API data to simulate a real-world scenario: predicting customer churn on unseen data

In [10]:
# Pull the rows of the highest 'api' version from the feature_values table
query_api = """
SELECT *
FROM feature_values
WHERE data_source = 'api'
  AND version = (
      SELECT MAX(version)
      FROM feature_values
      WHERE data_source = 'api'
  );
"""
df_api = pd.read_sql(query_api, engine)

# Keep only the model inputs: bookkeeping columns and the target are removed
# when present (errors='ignore' skips any name that is not in the frame)
cols_to_drop = ['id', 'data_source', 'version', 'exited']
df_api_clean = df_api.drop(columns=cols_to_drop, errors='ignore')

# Preview of the cleaned API rows
print("✅ API Data Extracted and Cleaned:")
print(df_api_clean.head())


✅ API Data Extracted and Cleaned:
   creditscore       age  tenure   balance  numofproducts  isactivemember  \
0     0.375148 -1.822121       2 -0.881270              2               1   
1     0.604734  0.994853       2 -0.881270              1               0   
2     0.540828 -0.474873       7 -0.881270              2               0   
3     0.600000 -0.229919       8 -0.881270              1               0   
4     0.768047  0.015036      10  1.050033              1               0   

   geography_france  geography_germany  geography_spain  balanceperproduct  
0              True              False            False          -0.440635  
1              True              False            False          -0.881270  
2              True              False            False          -0.440635  
3              True              False            False          -0.881270  
4             False               True            False           1.050033  


#### Applying the best saved model on unseen data to predict customer churn

In [12]:
import os
import pickle
import glob

# Directory where models are saved
MODEL_DIR = "ModelnReports"  # Ensure it matches the directory in the saving script


def get_latest_model(model_dir):
    """Return the path of the most recently modified ``.pkl`` file in ``model_dir``.

    Every ``*.pkl`` file directly inside ``model_dir`` is a candidate, so the
    directory should contain only model pickles.

    Args:
        model_dir: Directory to search for pickled models.

    Returns:
        Path (as produced by ``glob``) of the newest ``.pkl`` file.

    Raises:
        FileNotFoundError: If ``model_dir`` contains no ``.pkl`` file.
    """
    model_files = glob.glob(os.path.join(model_dir, "*.pkl"))  # Match any .pkl file
    if not model_files:
        # Report the directory that was actually searched
        raise FileNotFoundError(f"❌ No saved model found in '{model_dir}' directory!")

    # Modification time is used rather than getctime: on Unix, ctime is the
    # last metadata change, not the creation time, so it is not a reliable
    # "newest file" signal across platforms.
    latest_model = max(model_files, key=os.path.getmtime)
    print(f"✅ Latest model found: {latest_model}")
    return latest_model

# Locate the newest saved model and deserialize it.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that this project wrote itself.
latest_model_filename = get_latest_model(MODEL_DIR)
with open(latest_model_filename, "rb") as model_file:
    best_model = pickle.load(model_file)

# Score the cleaned API rows with the loaded model
predictions = best_model.predict(df_api_clean)

# Share of rows predicted as churn, as a percentage (assuming churn is encoded as 1)
churn_fraction = predictions.sum() / len(predictions)
churn_percentage = churn_fraction * 100
print(f"🔹 Percentage of customers predicted to churn: {churn_percentage:.2f}%")


✅ Latest model found: ModelnReports\xgboost_20250311_203553.pkl
🔹 Percentage of customers predicted to churn: 15.41%
