#### Model Selection

In [59]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score, mean_squared_error, f1_score

In [60]:
# Loading the pickle file Customer_Loan

with open('Pickle_Files/Customer_Loan.pkl', 'rb') as f1:
     df_train_cl= pickle.load(f1)
with open('Pickle_Files/40pct.pkl', 'rb') as f2:
     df_40= pickle.load(f2)

##### Train Test Split

In [61]:
# Split the dataset into training and test sets based on product
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_sale_cl = df_train_cl['Sale_CL']

X_train_sl,X_test_sl,y_train_sl,y_test_sl = train_test_split(X_cl,y_sale_cl,test_size=0.2,random_state=0)

In [62]:
# Split the dataset into training and test sets based on revenue
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_revenue_cl = df_train_cl['Revenue_CL']

X_train_rev,X_test_rev,y_train_rev,y_test_rev = train_test_split(X_cl,y_revenue_cl,test_size=0.2,random_state=0)

##### Normalize The Features

In [63]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_sl)
X_test = scaler.transform(X_test_sl)

#### Cross Validation

##### Classification

In [64]:
# Verifying which model has better score for Classification
model_param = {
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'param' : {'n_neighbors':[3,5,10,100],
                   'weights':['uniform','distance']}
        },
    'Logistic':{
        'model':LogisticRegression(max_iter=10000,solver='sag'), # converge warning hence sag
        'param': { 'C':[1,5,10]}
        },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'param' : {'n_estimators':[5,10,50,100]}
        },
    'DecisionTree':{
        'model': DecisionTreeClassifier(),
        'param' : {'criterion': ['gini', 'entropy'],
            'max_depth': [None, 10, 20, 30],  
            'min_samples_split': [2, 5, 10],  
            'min_samples_leaf': [1, 2, 4]}
        }
}


In [65]:
scores_class = []

for model_name, mp in model_param.items():
    hyperpara_model = GridSearchCV(mp['model'],mp['param'],cv=5,return_train_score=False)
    hyperpara_model.fit(X_cl,y_sale_cl)
    scores_class.append({
        'model': model_name,
        'best_score':hyperpara_model.best_score_,
        'best_param':hyperpara_model.best_params_
    })

In [66]:
scores_cl = pd.DataFrame(scores_class)
scores_cl

Unnamed: 0,model,best_score,best_param
0,KNeighbors,0.700721,"{'n_neighbors': 100, 'weights': 'uniform'}"
1,Logistic,0.695566,{'C': 1}
2,RandomForest,0.715159,{'n_estimators': 100}
3,DecisionTree,0.686272,"{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}"


RandomForest seems to have the best score. Hence, I will be using this for model prediction. It is also more robust towards imbalanced datasets compared to KNeighbors + Logistic + DecisionTrees, as they are more biased towards majority data

In [67]:
rf = RandomForestClassifier(n_estimators=100)

rf.fit(X_train_sl,y_train_sl)

In [68]:
y_pred_sl = rf.predict(X_test_sl)

In [69]:
# Probability of being correctly classified
y_pred_sl_prob=rf.predict_proba(X_test_sl)

In [70]:
type(y_pred_sl_prob)

numpy.ndarray

##### Evaluation

In [71]:
confusion_matrix(y_pred_sl,y_test_sl)

array([[134,  48],
       [  3,   9]])

In [72]:
print('Evaluation metrics:')
print('Accuracy:', accuracy_score(y_test_sl,y_pred_sl))
print('Precision:', precision_score(y_test_sl, y_pred_sl))
print('Recall:', recall_score(y_test_sl, y_pred_sl))
print('F1:', f1_score(y_test_sl, y_pred_sl))
print('AUC:', roc_auc_score(y_test_sl, y_pred_sl))

Evaluation metrics:
Accuracy: 0.7371134020618557
Precision: 0.75
Recall: 0.15789473684210525
F1: 0.2608695652173913
AUC: 0.5679984633115637


##### Regression

In [73]:
# # Define models
# models = {
#     'Linear': LinearRegression(),
#     'RandomForest': RandomForestRegressor(),
#     'DecisionTree': DecisionTreeRegressor(),
# }

# scores_rev = []

# for model_name, model in models.items():
#     # Use cross_val_score for simplicity
#     cv_scores = cross_val_score(model, X_cl, y_revenue_cl, cv=10)
#     scores_rev.append({
#         'model': model_name,
#         'mean_score': cv_scores.mean() * -1,
#         'std_score': cv_scores.std(),
        
#     })


In [74]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

# scores_rev = pd.DataFrame(scores_rev)
# scores_rev

Unnamed: 0,model,mean_score,std_score
0,Linear,0.135509,0.251869
1,RandomForest,0.111269,0.190512
2,DecisionTree,1.963564,2.372205


In [92]:
model_param = {
    'Linear': {
        'model': LinearRegression(),
        'param' : {}
        },
    'RandomForest': {
        'model': RandomForestRegressor(),
        'param' : {'n_estimators':[5,10,50,100]}
        },
    'DecisionTree':{
        'model': DecisionTreeRegressor(),
        'param' : {'criterion': ['friedman_mse', 'squared_error', 'absolute_error', 'poisson'],
            'max_depth': [None, 10, 20, 30],  
            'min_samples_split': [2, 5, 10],  
            'min_samples_leaf': [1, 2, 4]}
        }
}

In [93]:
scores_regression = []

for model_name, mp in model_param.items():
    hyperpara_model = GridSearchCV(mp['model'],mp['param'],cv=5,return_train_score=False)
    hyperpara_model.fit(X_cl,y_sale_cl)
    scores_regression.append({
        'model': model_name,
        'best_score':hyperpara_model.best_score_,
        'best_param':hyperpara_model.best_params_
    })

In [94]:
scores_cl_reg = pd.DataFrame(scores_regression)
scores_cl_reg

Unnamed: 0,model,best_score,best_param
0,Linear,0.039677,{}
1,RandomForest,0.034139,{'n_estimators': 50}
2,DecisionTree,-0.262529,"{'criterion': 'poisson', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}"


I will choose RandomForest as it has a good balance between accuracy and stability

In [96]:
rf_1 = RandomForestRegressor(n_estimators=50)

rf_1.fit(X_train_rev,y_train_rev)

In [97]:
y_pred_rev = rf_1.predict(X_test_rev)

##### Evaluation

In [98]:
mse=mean_squared_error(y_test_rev,y_pred_rev)
rmse=np.sqrt(mse)

print('MSE:', mse)
print('RMSE:', rmse)


MSE: 50.531381566687095
RMSE: 7.1085428581874


##### Feature Importance

In [99]:
feature_importances = pd.DataFrame(rf_1.feature_importances_,
                                   index = X_cl.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
feature_importances.head(10)

Unnamed: 0,importance
VolumeDeb_PaymentOrder,0.105544
Tenure,0.105165
Age,0.088647
ActBal_SA,0.077987
VolumeDeb_CA,0.077445
VolumeDeb,0.07196
VolumeCred,0.06714
ActBal_CA,0.060148
VolumeDebCash_Card,0.042444
VolumeCred_CA,0.041729


#### Targeting

In [100]:
df_40_cl = df_40.drop(['Count_CL','ActBal_CL','Revenue_CL','Sale_CL','Sale_CC','Sale_MF','Revenue_CC','Revenue_MF'], axis=1)

df_40_cl.shape

(646, 27)

In [101]:
scaler.transform(df_40_cl)

array([[1.        , 0.1375    , 0.099631  , ..., 0.04      , 0.        ,
        0.11764706],
       [1.        , 0.3375    , 0.64575646, ..., 0.08      , 0.        ,
        0.14705882],
       [0.        , 0.1125    , 0.25830258, ..., 0.16      , 0.05084746,
        0.05882353],
       ...,
       [1.        , 0.725     , 0.56457565, ..., 0.08      , 0.05084746,
        0.17647059],
       [1.        , 0.2625    , 0.33579336, ..., 0.04      , 0.01694915,
        0.26470588],
       [1.        , 0.6625    , 0.35793358, ..., 0.04      , 0.01694915,
        0.08823529]])

In [102]:
prob_cl=rf.predict_proba(df_40_cl)
#rev_40_classifier = rf_1.predict(df_40_cl)

In [103]:
rev_40=rf_1.predict(df_40_cl)

consumer_loan = pd.DataFrame({
    'Client': df_40_cl.index,
    'Revenue_CL':  rev_40,
    'Probability': prob_cl[:,1]
}).sort_values('Probability', ascending= False)

consumer_loan.rename(columns={'Revenue_CL':'Revenue'},inplace=True)
consumer_loan['Product']='CL'
consumer_loan.head(10)

Unnamed: 0,Client,Revenue,Probability,Product
252,595,6.831179,0.76,CL
140,485,8.740693,0.69,CL
394,41,12.772829,0.69,CL
178,706,6.549607,0.67,CL
568,498,10.33785,0.64,CL
44,889,2.516114,0.62,CL
585,731,12.571386,0.6,CL
573,1210,12.209357,0.6,CL
261,239,8.423786,0.6,CL
83,852,5.1056,0.59,CL


In [83]:
file_path1 = 'Pickle_Files/Revenue_CL.pkl'

with open(file_path1, 'wb') as f1:
    pickle.dump(consumer_loan, f1)