#### Model Selection

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score, mean_squared_error

In [2]:
# Loading the pickle file Customer_Loan

with open('Pickle_Files/Customer_Loan.pkl', 'rb') as f1:
     df_train_cl= pickle.load(f1)
with open('Pickle_Files/40pct.pkl', 'rb') as f2:
     df_40= pickle.load(f2)

##### Train Test Split

In [3]:
# Product - Sale
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_sale_cl = df_train_cl['Sale_CL']

X_train_sl,X_test_sl,y_train_sl,y_test_sl = train_test_split(X_cl,y_sale_cl,test_size=0.2,random_state=0)

In [4]:
X_cl.shape

(969, 28)

In [5]:
X_train_sl.head()

Unnamed: 0,Client,Sex,Age,Tenure,Count_CA,Count_SA,Count_MF,Count_OVD,Count_CC,ActBal_CA,...,VolumeDeb,VolumeDeb_CA,VolumeDebCash_Card,VolumeDebCashless_Card,VolumeDeb_PaymentOrder,TransactionsDeb,TransactionsDeb_CA,TransactionsDebCash_Card,TransactionsDebCashless_Card,TransactionsDeb_PaymentOrder
311,735,1,23,53,1,0.0,0.0,0.0,0.0,188.994643,...,339.935714,339.935714,214.285714,104.757143,16.5,20.0,20.0,6.0,8.0,1.0
773,442,1,13,38,1,0.0,0.0,1.0,0.0,1083.015357,...,1739.75,1739.75,1178.571429,243.535714,311.392857,14.0,14.0,3.0,1.0,6.0
1347,955,0,19,62,1,0.0,0.0,0.0,0.0,3131.223929,...,246.528571,246.528571,0.0,0.0,129.642857,6.0,6.0,0.0,0.0,3.0
99,469,1,38,183,1,0.0,0.0,0.0,0.0,4415.646071,...,1549.925,1549.925,500.0,0.0,789.928571,13.0,13.0,1.0,0.0,7.0
128,536,1,29,14,1,0.0,0.0,0.0,0.0,663.843571,...,185.642857,185.642857,0.0,0.0,183.464286,2.0,2.0,0.0,0.0,1.0


In [6]:
# Revenue
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_revenue_cl = df_train_cl['Revenue_CL']

X_train_rev,X_test_rev,y_train_rev,y_test_rev = train_test_split(X_cl,y_revenue_cl,test_size=0.2,random_state=0)

##### Normalize The Features

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_sl)
X_test = scaler.transform(X_test_sl)

#### Cross Validation

##### Classification

In [8]:
# Verifying which model has better score for Classification
models = {
    'KNeighbors': KNeighborsClassifier(),
    'Logistic': LogisticRegression(max_iter=10000,solver = 'sag'),
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
}

scores_sl = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_sale_cl, cv=10)
    scores_sl.append({
        'model': model_name,
        'mean_score': cv_scores.mean(),
        'std_score': cv_scores.std(),
    })


In [9]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_sl = pd.DataFrame(scores_sl)
scores_sl

Unnamed: 0,model,mean_score,std_score
0,KNeighbors,0.653232,0.030502
1,Logistic,0.697605,0.015734
2,RandomForest,0.7234,0.017559
3,DecisionTree,0.641881,0.046012


RandomForest seems to have the best score. Hence, I will be using this for model prediction. It is also more robust towards imbalanced datasets compared to KNeighbors + Logistic + DecisionTrees, as they are more biased towards majority data

In [10]:
rf = RandomForestClassifier()

rf.fit(X_train_sl,y_train_sl)

In [11]:
y_pred_sl = rf.predict(X_test_sl)

##### Evaluation

In [12]:
confusion_matrix(y_pred_sl,y_test_sl)

array([[133,  47],
       [  4,  10]])

In [13]:
print('Evaluation metrics:')
print('Accuracy:', accuracy_score(y_test_sl,y_pred_sl))
print('Precision:', precision_score(y_test_sl, y_pred_sl))
print('Recall:', recall_score(y_test_sl, y_pred_sl))
print('AUC:', roc_auc_score(y_test_sl, y_pred_sl))

Evaluation metrics:
Accuracy: 0.7371134020618557
Precision: 0.7142857142857143
Recall: 0.17543859649122806
AUC: 0.5731207580996287


##### Regression

In [14]:
# Define models
models = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
}

scores_rev = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_revenue_cl, cv=10)
    scores_rev.append({
        'model': model_name,
        'mean_score': cv_scores.mean() * -1,
        'std_score': cv_scores.std(),
        
    })


In [15]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_rev = pd.DataFrame(scores_rev)
scores_rev

Unnamed: 0,model,mean_score,std_score
0,Linear,0.135607,0.266524
1,RandomForest,0.082174,0.171804
2,DecisionTree,1.622186,2.073639


I will choose RandomForest as it has a good balance between accuracy and stability

In [16]:
rf = RandomForestRegressor()

rf.fit(X_train_rev,y_train_rev)

In [17]:
y_pred_rev = rf.predict(X_test_rev)

##### Evaluation

In [18]:
mse=mean_squared_error(y_test_rev,y_pred_rev)
rmse=np.sqrt(mse)

print('MSE:', mse)
print('RMSE:', rmse)


MSE: 50.44216239176987
RMSE: 7.102264596012308


#### Targeting

In [19]:
df_40_cl = df_40.drop(['Count_CL','ActBal_CL','Revenue_CL','Sale_CL','Sale_CC','Sale_MF','Revenue_CC','Revenue_MF'], axis=1)

df_40_cl.shape

(646, 28)

In [20]:
scaler.transform(df_40_cl)

array([[ 0.2148316 ,  0.95334516, -1.18006758, ..., -0.36767449,
        -0.64317598, -0.09071104],
       [-0.23555346,  0.95334516, -0.31503814, ..., -0.00560622,
        -0.64317598,  0.10456967],
       [-0.60229558, -1.04893804, -1.28819626, ...,  0.71853032,
        -0.26411789, -0.48127247],
       ...,
       [-0.73955579,  0.95334516,  1.3609564 , ..., -0.00560622,
        -0.26411789,  0.29985039],
       [-0.85751378,  0.95334516, -0.63942418, ..., -0.36767449,
        -0.51682329,  0.88569253],
       [ 1.40942436,  0.95334516,  1.0906347 , ..., -0.36767449,
        -0.51682329, -0.28599176]])

In [21]:

rev_40=rf.predict(df_40_cl)

consumer_loan = pd.DataFrame({
    'Client':  df_40['Client'].values,
    'Revenue_CL':  rev_40
}).sort_values('Revenue_CL', ascending= False)

consumer_loan.head(10)


Unnamed: 0,Client,Revenue_CL
63,217,24.544382
587,532,17.331321
168,1530,16.663521
525,243,16.270757
496,785,16.204489
286,1241,15.498529
197,878,15.377132
404,350,15.034875
306,1493,14.613796
560,1487,14.581625
