#### Model Selection

In [45]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier,XGBRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score, make_scorer, mean_squared_error

In [46]:
# Loading the pickle file Customer_Loan

with open('Pickle_Files/Customer_Loan.pkl', 'rb') as cl:
     df_train_cl= pickle.load(cl)


##### Train Test Split

In [47]:
# Product - Sale
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_sale_cl = df_train_cl['Sale_CL']

X_train_sl,X_test_sl,y_train_sl,y_test_sl = train_test_split(X_cl,y_sale_cl,test_size=0.2,random_state=0)

In [48]:
# Revenue
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_revenue_cl = df_train_cl['Revenue_CL']

X_train_rev,X_test_rev,y_train_rev,y_test_rev = train_test_split(X_cl,y_revenue_cl,test_size=0.2,random_state=0)

##### Normalize The Features

In [49]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_sl)
X_test = scaler.transform(X_test_sl)

#### Cross Validation

##### Classification

In [50]:
# Verifying which model has better score for Classification
models = {
    'KNeighbors': KNeighborsClassifier(),
    'Logistic': LogisticRegression(max_iter=10000,solver = 'sag'),
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'XGBoost':XGBClassifier()
}

scores_sl = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_sale_cl, cv=10)
    scores_sl.append({
        'model': model_name,
        'mean_score': cv_scores.mean(),
        'std_score': cv_scores.std(),
    })


In [51]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_sl = pd.DataFrame(scores_sl)
scores_sl

Unnamed: 0,model,mean_score,std_score
0,KNeighbors,0.652201,0.030784
1,Logistic,0.697605,0.015734
2,RandomForest,0.705874,0.023708
3,DecisionTree,0.615034,0.023623
4,XGBoost,0.699656,0.028221


RandomForest seems to have the best score. Hence, I will be using this for model prediction. It is also more robust towards imbalanced datasets compared to KNeighbors + Logistic + DecisionTrees, as they are more biased towards majority data

In [52]:
rf = RandomForestClassifier()

rf.fit(X_train_sl,y_train_sl)

In [53]:
y_pred_sl = rf.predict(X_test_sl)

##### Evaluation

In [54]:
confusion_matrix(y_pred_sl,y_test_sl)

array([[134,  47],
       [  3,  10]])

In [55]:
print('Evaluation metrics:')
print('Accuracy:', accuracy_score(y_test_sl,y_pred_sl))
print('Precision:', precision_score(y_test_sl, y_pred_sl))
print('Recall:', recall_score(y_test_sl, y_pred_sl))
print('AUC:', roc_auc_score(y_test_sl, y_pred_sl))

Evaluation metrics:
Accuracy: 0.7422680412371134
Precision: 0.7692307692307693
Recall: 0.17543859649122806
AUC: 0.5767703931361251


##### Regression

In [56]:
# Define models
models = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor()
}

scores_rev = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_revenue_cl, cv=10)
    scores_rev.append({
        'model': model_name,
        'mean_score': cv_scores.mean() * -1,
        'std_score': cv_scores.std(),
        
    })


In [57]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_rev = pd.DataFrame(scores_rev)
scores_rev

Unnamed: 0,model,mean_score,std_score
0,Linear,0.14035,0.253244
1,RandomForest,0.097835,0.130755
2,DecisionTree,1.936547,1.999629
3,XGBoost,0.391543,0.423703


I will choose RandomForest as it has a good balance between accuracy and stability

In [58]:
rf = RandomForestRegressor()

rf.fit(X_train_rev,y_train_rev)

In [59]:
y_pred_rev = rf.predict(X_test_rev)

##### Evaluation

In [60]:
mse=mean_squared_error(y_test_rev,y_pred_rev)
mse

47.39266056794714

In [61]:
rmse=np.sqrt(mse)
rmse

6.884232750855184

#### Targeting

In [67]:
cl_model_df = pd.DataFrame({
    'Client':  X_test_sl['Client'].values,
    'Revenue_CL':  y_pred_rev
}).sort_values('Revenue_CL', ascending= False)

cl_model_df.head(10)

Unnamed: 0,Client,Revenue_CL
67,219,24.659179
122,734,13.468121
34,102,12.757757
40,642,12.063982
161,1513,11.975121
153,1091,11.420804
150,1220,11.380989
168,1529,11.380279
18,1595,11.062982
56,928,10.643079
