#### Model Selection

In [70]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score,recall_score,precision_score

##### Consumer Loan - Product Sale

In [71]:
# Loading the pickle file Customer_Loan

with open('Pickle_Files/Customer_Loan.pkl', 'rb') as cl:
     df_train_cl= pickle.load(cl)


In [72]:
# Drop the target variables
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_sale_cl = df_train_cl['Sale_CL']
#y_revenue_cl = df_train_cl['Revenue_CL']

X_train,X_test,y_train,y_test = train_test_split(X_cl,y_sale_cl,test_size=0.2,random_state=0)

##### Normalize The Features

In [73]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Cross Validation

In [74]:
# Verifying which model has better score
models = {
    'KNeighbors': KNeighborsClassifier(),
    'Logistic': LogisticRegression(max_iter=10000,solver = 'sag'),
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'XGBoost':XGBClassifier()
}

scores = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_sale_cl, cv=10)
    scores.append({
        'model': model_name,
        'mean_score': cv_scores.mean(),
        'std_score': cv_scores.std(),
    })


In [75]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores = pd.DataFrame(scores)
scores

Unnamed: 0,model,mean_score,std_score
0,KNeighbors,0.655284,0.039991
1,Logistic,0.696596,0.02016
2,RandomForest,0.707947,0.031942
3,DecisionTree,0.63256,0.0336
4,XGBoost,0.703748,0.041342


RandomForest seems to have the best score. Hence, I will be using this for model prediction. It is also more robust towards imbalanced datasets compared to KNeighbors + Logistic + DecisionTrees, as they are more biased towards majority data

In [76]:
rf = RandomForestClassifier()

rf.fit(X_train,y_train)

In [77]:
y_pred = rf.predict(X_test)

#### Evaluation

In [79]:
confusion_matrix(y_pred,y_test)

array([[134,  45],
       [  3,  12]])

In [87]:
print('Evaluation metrics:')
print('Accuracy:', accuracy_score(y_test,y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('AUC:', roc_auc_score(y_test, y_pred))

Evaluation metrics:
Accuracy: 0.7525773195876289
Precision: 0.8
Recall: 0.21052631578947367
AUC: 0.5943142527852477


#### Targeting