### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score,precision_score,accuracy_score,recall_score,classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold, RandomizedSearchCV,RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler,LabelEncoder,Normalizer,MaxAbsScaler
from sklearn.decomposition import PCA
import random
#from imblearn.combine import SMOTEENN
from feature_engine.encoding import CountFrequencyEncoder,OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import pickle
from datetime import date
import datetime
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import learning_curve
from sklearn.calibration import calibration_curve



### Functions


In [4]:
def hist_group_by_exit(df,var):
    """Show side-by-side histograms of ``var`` split by the ``Exited`` flag.

    The left panel uses the rows where ``df['Exited'] == 1`` and the right
    panel the rows where ``df['Exited'] == 0``.

    Args:
        df: DataFrame containing an ``Exited`` column and the column ``var``.
        var: Name of the column whose distribution is drawn.
    """
    _, (churn_ax, retained_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # One tuple per panel: axis, Exited value, colour, title, legend entry.
    panels = (
        (churn_ax, 1, 'salmon',
         f'Distribución de {var} en Clientes que Hicieron Churn', 'Churned'),
        (retained_ax, 0, 'steelblue',
         f'Distribución de {var} en Clientes Retenidos', 'Not churned'),
    )

    for axis, exited_value, colour, title, legend_entry in panels:
        subset = df[df['Exited'] == exited_value]
        sns.histplot(subset[var], ax=axis, color=colour, alpha=0.7)
        axis.set_title(title)
        axis.set_xlabel(var)
        axis.set_ylabel('Cantidad')
        axis.legend([legend_entry], loc='upper right')

    plt.tight_layout()
    plt.show()

def boxplot_by_exit(df,var):
    """Show side-by-side boxplots of ``var`` split by the ``Exited`` flag.

    The left panel uses the rows where ``df['Exited'] == 1`` and the right
    panel the rows where ``df['Exited'] == 0``.

    Args:
        df: DataFrame containing an ``Exited`` column and the column ``var``.
        var: Name of the column drawn on the y axis of each boxplot.
    """
    _, (churn_ax, retained_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # One tuple per panel: axis, Exited value, colour, title, legend entry.
    panels = (
        (churn_ax, 1, 'salmon',
         f'Boxplot de {var} en Clientes que Hicieron Churn', 'Churned'),
        (retained_ax, 0, 'steelblue',
         f'Boxplot de {var} en Clientes Retenidos', 'Not churned'),
    )

    for axis, exited_value, colour, title, legend_entry in panels:
        sns.boxplot(y=var, data=df[df['Exited'] == exited_value],
                    ax=axis, color=colour)
        axis.set_title(title)
        # The boxplot is drawn with y=var, so the y axis carries the values
        # of ``var`` (not a count) and the x axis has no variable at all.
        axis.set_xlabel('')
        axis.set_ylabel(var)
        axis.legend([legend_entry], loc='upper right')

    plt.tight_layout()
    plt.show()

def new_variables(df):
    """Add the engineered ratio/product features to ``df`` in place.

    All features are computed with vectorized column arithmetic, so missing
    values propagate as NaN and a zero denominator yields ``inf``/NaN instead
    of raising. An empty frame is handled as well.

    Columns added (note that some names do not match the formula):
        CreditScore_x_Age:          CreditScore / Age
        CreditScore_x_Balance:      Balance / CreditScore
        NumOfProducts_x_Age:        NumOfProducts / Age
        Tenure_x_Age:               Tenure / Age
        %SalaryInBank:              (Tenure * Balance) / EstimatedSalary
        Balance_x_EstimatedSalary:  Balance / EstimatedSalary
        AgeofEntry:                 Age - Tenure
        CustomerEngagement:         Age * CreditScore * NumOfProducts
        EducationProduct:           Age * EducationYears * NumOfProducts

    Args:
        df: DataFrame with the numeric columns ``CreditScore``, ``Age``,
            ``Tenure``, ``Balance``, ``NumOfProducts``, ``EstimatedSalary``
            and ``EducationYears``.

    Returns:
        The same DataFrame object, with the new columns appended.
    """
    age = df['Age']
    credit_score = df['CreditScore']
    balance = df['Balance']
    tenure = df['Tenure']
    products = df['NumOfProducts']
    salary = df['EstimatedSalary']

    df['CreditScore_x_Age'] = credit_score / age
    df['CreditScore_x_Balance'] = balance / credit_score
    df['NumOfProducts_x_Age'] = products / age
    # The previous row-wise version guarded this with ``Age != None``, which
    # is always true for NaN, so a missing Age has always produced NaN here.
    # That behaviour is kept so the features match what the fitted
    # preprocessing objects were trained on.
    df['Tenure_x_Age'] = tenure / age
    df['%SalaryInBank'] = (tenure * balance) / salary
    df['Balance_x_EstimatedSalary'] = balance / salary
    df['AgeofEntry'] = age - tenure
    df['CustomerEngagement'] = age * credit_score * products
    df['EducationProduct'] = age * df['EducationYears'] * products

    return df

def encoding(df):
    """Return a copy of ``df`` with the ``Geography`` column one-hot encoded.

    Args:
        df: DataFrame containing a ``Geography`` column.

    Returns:
        A new DataFrame in which ``Geography`` is replaced by one indicator
        column per distinct value (``Geography_<value>``).
    """
    one_hot_columns = ['Geography']
    encoded = pd.get_dummies(df, columns=one_hot_columns)
    return encoded

def imputer(df,cat_imputer,num_imputer,train=True):
    """Impute the feature columns of ``df`` with the given imputers.

    The ``Id`` column is dropped when present. Every remaining column except
    ``Exited`` (the target, left untouched if present) is treated as a
    feature: columns with ``object`` dtype go through ``cat_imputer`` and all
    others through ``num_imputer``. A group with no columns is skipped, so
    its imputer is never called on an empty selection.

    Args:
        df: Input DataFrame. When it has no ``Id`` column it is modified in
            place; otherwise the frame returned by ``drop`` is modified.
        cat_imputer: Object exposing ``fit_transform``/``transform``, applied
            to the ``object``-dtype feature columns.
        num_imputer: Object exposing ``fit_transform``/``transform``, applied
            to the remaining feature columns.
        train: When truthy the imputers are fitted on ``df``
            (``fit_transform``); otherwise already-fitted imputers are
            applied (``transform``).

    Returns:
        The DataFrame with imputed feature columns.
    """
    if 'Id' in df.columns:
        df = df.drop('Id', axis=1)

    features = [col for col in df.columns if col != 'Exited']
    cat = [col for col in features if df[col].dtype == 'object']
    num = [col for col in features if df[col].dtype != 'object']

    # Categorical columns first, then numeric, as before.
    for columns, col_imputer in ((cat, cat_imputer), (num, num_imputer)):
        if not columns:
            # Nothing of this kind to impute; calling the imputer on a
            # zero-column selection would fail.
            continue
        if train:
            df[columns] = col_imputer.fit_transform(df[columns])
        else:
            df[columns] = col_imputer.transform(df[columns])

    return df

def print_metrics(y_val, y_pred,clf,X_val,skf):
    """Print classification metrics for a set of validation predictions.

    Prints accuracy, precision, recall, F1, ROC AUC, the area under the
    precision-recall curve, the mean cross-validated accuracy of ``clf`` and
    the scikit-learn classification report.

    Args:
        y_val: True labels.
        y_pred: Predictions compared against ``y_val``.
        clf: Estimator handed to ``cross_val_score``.
        X_val: Feature matrix handed to ``cross_val_score`` with ``y_val``.
        skf: Cross-validation splitter (or fold count) used as ``cv``.
    """
    # NOTE(review): the same y_pred feeds both the label-based metrics and the
    # curve-based ones (PR AUC, ROC AUC); if callers pass hard 0/1 labels the
    # curves only have a single operating point. Confirm against callers.
    curve_precision, curve_recall, _ = precision_recall_curve(y_val, y_pred)
    pr_auc = auc(curve_recall, curve_precision)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1_value = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred)
    report_text = classification_report(y_val, y_pred)

    print(f"Accuracy: {acc}")
    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"F1-Score: {f1_value}")
    print(f"AUC-Score: {roc_auc}")
    print(f"PRAUC: {pr_auc:.4f}")

    # cross_val_score runs on (X_val, y_val); it is evaluated only after the
    # metrics above have been printed.
    cv_scores = cross_val_score(clf, X_val, y_val, cv=skf, scoring='accuracy')
    print(f"Cross-Validation Accuracy: {cv_scores.mean()}")
    print(f"Classification Report:\n{report_text}")


  

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
    """
    matrix = confusion_matrix(y_true, y_pred)
    class_labels = ['0', '1']

    sns.heatmap(
        matrix,
        annot=True,
        fmt='g',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.ylabel('Actual', fontsize=13)
    plt.xlabel('Prediction', fontsize=13)
    plt.title('Confusion Matrix', fontsize=17)
    plt.show()

def plot_roc_curve(y_true, y_pred):
    """Plot the ROC curve of ``y_pred`` against ``y_true`` with its AUC.

    Args:
        y_true: True binary labels.
        y_pred: Predictions or scores passed to ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred)
    area = auc(false_pos_rate, true_pos_rate)

    _, axis = plt.subplots(figsize=(8, 6))
    axis.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
              label=f'ROC curve (area = {area:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    axis.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axis.set_xlabel('False Positive Rate')
    axis.set_ylabel('True Positive Rate')
    axis.set_title('ROC Curve')
    axis.legend(loc='lower right')
    plt.show()

def plot_learning_curve(model, X, y,skf):
    """Plot training and cross-validation accuracy against training-set size.

    Scores come from ``learning_curve`` at five sizes spaced evenly between
    10% and 100%. The curves are drawn on the current axes; this function
    neither creates a figure nor calls ``plt.show()``.

    Args:
        model: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target values.
        skf: Cross-validation splitter (or fold count) used as ``cv``.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        model, X, y,
        cv=skf,
        scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 5),
    )

    # (mean, std across folds, colour, legend label) for each curve.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.title("Learning Curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    # Shaded band of one standard deviation around each mean curve.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)

    plt.legend(loc="best")
    plt.grid()
    
def plot_precision_recall_curve(y_true, y_pred):
    """Plot the precision-recall curve, labelled with its average precision.

    Args:
        y_true: True binary labels.
        y_pred: Predictions or scores passed to ``precision_recall_curve``.
    """
    precisions, recalls, _ = precision_recall_curve(y_true, y_pred)
    avg_precision = average_precision_score(y_true, y_pred)

    _, axis = plt.subplots(figsize=(8, 6))
    axis.plot(recalls, precisions, marker='.', label=f'AP={avg_precision:.2f}')
    axis.set_xlabel('Recall')
    axis.set_ylabel('Precision')
    axis.set_title('Precision-Recall Curve')
    axis.legend()
    axis.grid()
    plt.show()
    
    
def plot_calibration_curve(y_true, y_pred, n_bins=10):
    """Plot the calibration (reliability) curve of ``y_pred``.

    Args:
        y_true: True binary labels.
        y_pred: Values passed to ``calibration_curve`` as predictions.
        n_bins: Number of bins passed to ``calibration_curve``.
    """
    fraction_positive, mean_predicted = calibration_curve(
        y_true, y_pred, n_bins=n_bins
    )

    _, axis = plt.subplots(figsize=(8, 6))
    axis.plot(mean_predicted, fraction_positive, marker='o',
              label='Calibration Curve')
    # Diagonal reference line from (0, 0) to (1, 1).
    axis.plot([0, 1], [0, 1], linestyle='--', label='Calibrated')
    axis.set_xlabel('Mean Predicted Probability')
    axis.set_ylabel('Fraction of Positives')
    axis.set_title('Calibration Curve')
    axis.legend()
    axis.grid()
    plt.show()
    
def plot_feature_importances(clf, X):
    """Draw a bar chart of ``clf.feature_importances_`` in descending order.

    Args:
        clf: Fitted estimator exposing ``feature_importances_``.
        X: DataFrame whose ``columns`` supply the feature names, indexed in
            the same order as the importances.
    """
    scores = clf.feature_importances_

    # argsort is ascending; reverse it to put the largest importance first.
    order = np.argsort(scores)[::-1]
    labels = X.columns[order]
    positions = range(len(order))

    plt.figure(figsize=(12, 8))
    plt.title("Feature Importances")
    plt.bar(positions, scores[order], color="b", align="center")
    plt.xticks(positions, labels, rotation=90)
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.tight_layout()
    plt.show()
  
def plot_metrics(y_true,y_pred,clf,X_val,X_train,skf):
    """Render every diagnostic plot for a fitted classifier, one after another.

    Draws, in order: confusion matrix, ROC curve, learning curve,
    precision-recall curve, calibration curve and feature importances.

    Args:
        y_true: True labels.
        y_pred: Predictions compared against ``y_true``.
        clf: Estimator used for the learning curve and feature importances.
        X_val: Feature matrix used for the learning curve (together with
            ``y_true``) and for the feature names.
        X_train: Not used by this function.
        skf: Cross-validation splitter forwarded to the learning curve.
    """
    plot_confusion_matrix(y_true=y_true, y_pred=y_pred)
    plot_roc_curve(y_true=y_true, y_pred=y_pred)
    plot_learning_curve(model=clf, X=X_val, y=y_true, skf=skf)
    plot_precision_recall_curve(y_true=y_true, y_pred=y_pred)
    plot_calibration_curve(y_true=y_true, y_pred=y_pred)
    plot_feature_importances(clf=clf, X=X_val)
  

### Pickles

In [6]:
def _load_pickle(path):
    """Unpickle the object stored at ``path``, print it and return it."""
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only load files that come from a trusted source.
    with open(path, 'rb') as f:
        loaded = pickle.load(f)
    print(loaded)
    return loaded


# Fitted model and preprocessing objects saved by the training notebook.
clf = _load_pickle('Pickle_ipynb/model_LGBM.pkl')
scaler = _load_pickle('Pickle_ipynb/scaler.pkl')
encoder = _load_pickle('Pickle_ipynb/encoder.pkl')
# Each object is printed by the helper, so this shows encoder2 itself
# (the previous code printed `encoder` a second time here).
encoder2 = _load_pickle('Pickle_ipynb/encoder2.pkl')

LGBMClassifier(bagging_fraction=0.6, bagging_freq=10, boosting_type='dart',
               feature_freaction=0.8, lambda_l1=1, lambda_l2=1, max_depth=10,
               metric='binary_logloss', min_data_in_leaf=20,
               min_gain_to_split=0.1, num_leaves=20, objective='binary')
StandardScaler()
CountFrequencyEncoder(encoding_method='frequency', variables=['Geography'])
CountFrequencyEncoder(encoding_method='frequency', variables=['Geography'])


### Load data and predict

In [8]:
# `tests` keeps the raw rows (including Id) so the predictions can be attached
# to them at the end; `test` is the working feature matrix.
tests = pd.read_csv('base_val.csv')
test = tests.drop(columns=['Id', 'Surname', 'Passport'])

# Feature Engineering
test = new_variables(test)
# NOTE(review): both loaded encoders are applied before the one-hot step, and
# the printed reprs show each configured for 'Geography' - confirm that this
# double encoding followed by get_dummies mirrors the training pipeline.
test = encoder.transform(test)
test = encoder2.transform(test)
test = encoding(test)
test = test.drop(columns='Gender')

# Scaling returns a bare array, so the column names are restored afterwards.
test_columns = test.columns
test_scaled = scaler.transform(test)
test = pd.DataFrame(test_scaled, columns=test_columns)

# Prediction
# Probability of the positive class, cut at 0.49 to obtain 0/1 labels.
y_pred = (clf.predict_proba(test)[:, 1] > 0.49).astype(int)
tests['Exited'] = y_pred
sample_data = tests.loc[:, ['Id', 'Exited']]

# File
# The timestamp in the name keeps successive runs from overwriting each other.
filename = f'Predictions - {datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S")} - .csv'

# Written with DataFrame.to_csv instead of a manual row loop: values are
# quoted/escaped properly, the rows do not have to carry a default 0..n-1
# index, and the file is created fresh (mode 'w') rather than appended to, so
# the header can never be duplicated. The header stays "ID,Predictions".
sample_data.rename(columns={'Id': 'ID', 'Exited': 'Predictions'}).to_csv(
    filename,
    index=False,
    encoding='utf-8',
)

