### Importing Libraries

In [None]:
import os
from pathlib import Path

import catboost
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

## Modelling
### 1. CatBoost

##### Loading Datasets

In [None]:
# Directory holding the pre-split CSV files. It is resolved once so the
# notebook can be pointed at another checkout by setting CHURN_DATA_DIR
# instead of editing six paths; the default is the original location.
DATA_DIR = Path(os.environ.get(
    'CHURN_DATA_DIR',
    '/Users/nandaniyadav/McGill MMA/Winter 2024/INSY 695/Group Project/customer_churn_2024/data',
))

# Feature matrices for the train / validation / test splits.
X_train_res = pd.read_csv(DATA_DIR / 'X_train_res.csv')
X_test = pd.read_csv(DATA_DIR / 'X_test.csv')
X_val = pd.read_csv(DATA_DIR / 'X_val.csv')

# Matching target files (read as DataFrames, exactly as before).
y_train_res = pd.read_csv(DATA_DIR / 'y_train_res.csv')
y_test = pd.read_csv(DATA_DIR / 'y_test.csv')
y_val = pd.read_csv(DATA_DIR / 'y_val.csv')

# Fail early, with a readable message, if a feature file and its target file
# are out of sync rather than deep inside a later fit/score call.
assert len(X_train_res) == len(y_train_res), 'X_train_res / y_train_res row counts differ'
assert len(X_val) == len(y_val), 'X_val / y_val row counts differ'
assert len(X_test) == len(y_test), 'X_test / y_test row counts differ'

In [None]:
# Baseline CatBoost configuration, collected in one place so the settings
# are easy to read and tweak.
baseline_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'verbose': 200,
}
catboost_model = CatBoostClassifier(**baseline_params)

# Fit on the training split only (X_train_res / y_train_res); the validation
# and test splits are not passed to fit() here.
catboost_model.fit(X_train_res, y_train_res)

# %%

# Score the baseline model on the validation split.
y_val_pred = catboost_model.predict(X_val)

val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)

print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation Report: {val_report}')

# Confusion matrix of validation labels vs. baseline predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_val, y_pred=y_val_pred)


In [None]:
# Baseline predictions for the test split.
y_pred = catboost_model.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")
print(report)

# Confusion matrix of test labels vs. baseline predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

## Hyperparameter Tuning


### 1. Using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator handed to the search; verbose=0 keeps CatBoost quiet for
# each of the many fits the search performs.
model = CatBoostClassifier(verbose=0)

# Candidate values per hyperparameter (3 x 3 x 3 combinations in total).
param_grid = dict(
    iterations=[100, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[4, 6, 8],
)

# 5-fold cross-validated grid search ranked by accuracy; n_jobs=-1 asks
# scikit-learn to run the fits in parallel.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split; the value returned by fit() is kept
# under its original name.
final_model_cat_grid = grid_search.fit(X_train_res, y_train_res)

# Best combination found and its mean cross-validated score.
best_parameters, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best parameters: {best_parameters}")
print(f"Best score: {best_score}")

### 2. Using Optuna

In [None]:
import optuna
from sklearn.metrics import accuracy_score

def objective(trial):
    """Optuna objective: validation accuracy of one sampled CatBoost configuration.

    Parameters
    ----------
    trial
        Optuna trial used to draw ``iterations``, ``learning_rate``,
        ``depth`` and ``l2_leaf_reg``.

    Returns
    -------
    Accuracy of the fitted model's predictions on ``X_val`` against ``y_val``.
    """
    # Hyperparameters are drawn in the same order as they are listed here.
    candidate = CatBoostClassifier(
        iterations=trial.suggest_int('iterations', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        verbose=0,
    )

    # The validation split doubles as the early-stopping eval set.
    candidate.fit(
        X_train_res,
        y_train_res,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        verbose=0,
    )

    return accuracy_score(y_val, candidate.predict(X_val))

# Seed the sampler so a fresh kernel run proposes the same sequence of trials
# and therefore reports the same best parameters. TPESampler is what
# create_study() uses by default for a single objective, so only the seed
# changes, not the search algorithm. 42 is an arbitrary fixed value.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximise the accuracy returned by objective() over 100 sequential trials.
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

# Hyperparameters of the best-scoring trial, reused by later cells.
best_params = study.best_trial.params
print('Best parameters:', best_params)

In [None]:
# Rebuild a classifier from the best Optuna parameters and fit it the same
# way the objective did (validation split as the early-stopping eval set).
final_model = CatBoostClassifier(**best_params, verbose=0)
final_model.fit(
    X_train_res,
    y_train_res,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100,
    verbose=0,
)

# Validation predictions and metrics for the tuned model.
y_val_pred = final_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)

# Report accuracy and the per-class breakdown.
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation Report: {val_report}')

# Confusion matrix of validation labels vs. tuned-model predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_val, y_pred=y_val_pred)

In [None]:
# Tuned-model predictions and accuracy on the test split.
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Report accuracy and the per-class breakdown.
print(f'Test Accuracy: {test_accuracy}')
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)

# Confusion matrix of test labels vs. tuned-model predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_pred)


## PR Curve

In [None]:
from yellowbrick.classifier import PrecisionRecallCurve
import matplotlib.pyplot as plt

# Precision-recall curve for final_model, drawn with yellowbrick's visualizer.
# is_fitted=True flags the estimator as already trained
# (presumably so the fit() call below does not retrain it - confirm against the yellowbrick docs).
viz = PrecisionRecallCurve(final_model, is_fitted=True)
# The visualizer is fitted with the training split before scoring.
viz.fit(X_train_res, y_train_res)
# score() is given the validation split, so the plotted curve reflects X_val / y_val.
viz.score(X_val, y_val)
# Render the figure.
viz.show()