#### Model Selection

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier,XGBRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score, make_scorer, mean_squared_error

In [2]:
# Loading the pickle file Customer_Loan

with open('Pickle_Files/Customer_Loan.pkl', 'rb') as cl:
     df_train_cl= pickle.load(cl)


##### Train Test Split

In [3]:
# Product - Sale
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_sale_cl = df_train_cl['Sale_CL']

X_train_sl,X_test_sl,y_train_sl,y_test_sl = train_test_split(X_cl,y_sale_cl,test_size=0.2,random_state=0)

In [4]:
X_cl.shape

(969, 28)

In [5]:
# Revenue
X_cl = df_train_cl.copy()
X_cl.drop(['Revenue_CL','Sale_CL'],inplace=True,axis=1)

y_revenue_cl = df_train_cl['Revenue_CL']

X_train_rev,X_test_rev,y_train_rev,y_test_rev = train_test_split(X_cl,y_revenue_cl,test_size=0.2,random_state=0)

##### Normalize The Features

In [6]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_sl)
X_test = scaler.transform(X_test_sl)

#### Cross Validation

##### Classification

In [7]:
# Verifying which model has better score for Classification
models = {
    'KNeighbors': KNeighborsClassifier(),
    'Logistic': LogisticRegression(max_iter=10000,solver = 'sag'),
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'XGBoost':XGBClassifier()
}

scores_sl = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_sale_cl, cv=10)
    scores_sl.append({
        'model': model_name,
        'mean_score': cv_scores.mean(),
        'std_score': cv_scores.std(),
    })


In [8]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_sl = pd.DataFrame(scores_sl)
scores_sl

Unnamed: 0,model,mean_score,std_score
0,KNeighbors,0.652201,0.030784
1,Logistic,0.697605,0.015734
2,RandomForest,0.706927,0.024327
3,DecisionTree,0.617096,0.025561
4,XGBoost,0.699656,0.028221


RandomForest seems to have the best score. Hence, I will be using this for model prediction. It is also more robust towards imbalanced datasets compared to KNeighbors + Logistic + DecisionTrees, as they are more biased towards majority data

In [9]:
rf = RandomForestClassifier()

rf.fit(X_train_sl,y_train_sl)

In [10]:
y_pred_sl = rf.predict(X_test_sl)

##### Evaluation

In [11]:
confusion_matrix(y_pred_sl,y_test_sl)

array([[133,  47],
       [  4,  10]])

In [12]:
print('Evaluation metrics:')
print('Accuracy:', accuracy_score(y_test_sl,y_pred_sl))
print('Precision:', precision_score(y_test_sl, y_pred_sl))
print('Recall:', recall_score(y_test_sl, y_pred_sl))
print('AUC:', roc_auc_score(y_test_sl, y_pred_sl))

Evaluation metrics:
Accuracy: 0.7371134020618557
Precision: 0.7142857142857143
Recall: 0.17543859649122806
AUC: 0.5731207580996287


##### Regression

In [13]:
# Define models
models = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor()
}

scores_rev = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_cl, y_revenue_cl, cv=10)
    scores_rev.append({
        'model': model_name,
        'mean_score': cv_scores.mean() * -1,
        'std_score': cv_scores.std(),
        
    })


In [14]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_rev = pd.DataFrame(scores_rev)
scores_rev

Unnamed: 0,model,mean_score,std_score
0,Linear,0.14035,0.253244
1,RandomForest,0.125493,0.172994
2,DecisionTree,1.85514,1.712336
3,XGBoost,0.391543,0.423703


I will choose RandomForest as it has a good balance between accuracy and stability

In [15]:
rf = RandomForestRegressor()

rf.fit(X_train_rev,y_train_rev)

In [16]:
y_pred_rev = rf.predict(X_test_rev)

##### Evaluation

In [17]:
mse=mean_squared_error(y_test_rev,y_pred_rev)
mse

46.70703502789902

In [18]:
rmse=np.sqrt(mse)
rmse

6.8342545334439375

In [19]:
cl_model_df = pd.DataFrame({
    'Client':  X_test_sl['Client'].values,
    'Revenue_CL':  y_pred_rev
}).sort_values('Revenue_CL', ascending= False)

cl_model_df.head(10)

Unnamed: 0,Client,Revenue_CL
67,219,25.427936
122,734,15.123457
191,36,13.305629
40,642,13.007879
153,1091,12.527775
161,1513,12.189361
34,102,11.733986
150,1220,9.877193
83,413,9.729025
18,1595,9.205671


#### Targeting

In [20]:
# Loading the pickle file Customer_Loan

with open('Pickle_Files/40pct.pkl', 'rb') as cl:
     df_40= pickle.load(cl)

In [21]:
sex_mapping = {'M': 1, 'F': 0}
df_40['Sex'] = df_40['Sex'].replace(sex_mapping)

In [22]:
df_40 = df_40.replace(np.nan,0)

In [23]:
df_40.head()

Unnamed: 0,Client,Sex,Age,Tenure,Count_CA,Count_SA,Count_MF,Count_OVD,Count_CC,ActBal_CA,ActBal_SA,ActBal_MF,ActBal_OVD,ActBal_CC,VolumeCred,VolumeCred_CA,TransactionsCred,TransactionsCred_CA,VolumeDeb,VolumeDeb_CA,VolumeDebCash_Card,VolumeDebCashless_Card,VolumeDeb_PaymentOrder,TransactionsDeb,TransactionsDeb_CA,TransactionsDebCash_Card,TransactionsDebCashless_Card,TransactionsDeb_PaymentOrder
0,909,1.0,21,27,1,0.0,0.0,1.0,0.0,4.710714,0.0,0.0,0.0,0.0,789.129643,738.23,4.0,3.0,450.678571,448.892857,178.571429,0.0,166.571429,8.0,7.0,1.0,0.0,4.0
6,699,1.0,37,175,1,0.0,4.0,1.0,0.0,1823.057143,0.0,18491.444286,0.0,0.0,1033.496071,778.37,8.0,6.0,661.483214,566.126071,89.285714,0.0,216.892857,13.0,10.0,2.0,0.0,5.0
9,528,0.0,19,70,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,435.682143,435.682143,2.0,2.0,390.056429,390.056429,125.0,70.842143,190.821429,10.0,10.0,4.0,3.0,2.0
10,1145,1.0,61,45,1,0.0,0.0,0.0,0.0,324.71,0.0,0.0,0.0,0.0,132.158929,132.158929,3.0,3.0,3.392857,3.392857,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
13,517,0.0,41,28,1,0.0,0.0,0.0,0.0,17.051071,0.0,0.0,0.0,0.0,570.157857,570.157857,2.0,2.0,557.896429,557.896429,71.428571,224.146429,251.178571,17.0,17.0,2.0,10.0,3.0


In [24]:
df_40_cl = scaler.transform(df_40)


In [36]:

rev_40=rf.predict(df_40_cl)

features_df = pd.DataFrame({
    'Client':  df_40['Client'].values,
    'Revenue_CL':  rev_40
}).sort_values('Revenue_CL', ascending= False)

features_df.head(10)




Unnamed: 0,Client,Revenue_CL
631,1072,10.304943
108,957,9.98615
487,1112,9.791104
599,307,9.675218
520,1252,9.675218
551,530,9.675218
343,153,9.634493
534,1569,9.62665
387,1260,9.614621
78,1227,9.573936


In [27]:
scaler.feature_names_in_

array(['Client', 'Sex', 'Age', 'Tenure', 'Count_CA', 'Count_SA',
       'Count_MF', 'Count_OVD', 'Count_CC', 'ActBal_CA', 'ActBal_SA',
       'ActBal_MF', 'ActBal_OVD', 'ActBal_CC', 'VolumeCred',
       'VolumeCred_CA', 'TransactionsCred', 'TransactionsCred_CA',
       'VolumeDeb', 'VolumeDeb_CA', 'VolumeDebCash_Card',
       'VolumeDebCashless_Card', 'VolumeDeb_PaymentOrder',
       'TransactionsDeb', 'TransactionsDeb_CA',
       'TransactionsDebCash_Card', 'TransactionsDebCashless_Card',
       'TransactionsDeb_PaymentOrder'], dtype=object)

In [34]:
import pandas as pd
import numpy as np

# Assuming your array is named 'features_array'
features_array = np.array(['Client', 'Sex', 'Age', 'Tenure', 'Count_CA', 'Count_SA',
                           'Count_MF', 'Count_OVD', 'Count_CC', 'ActBal_CA', 'ActBal_SA',
                           'ActBal_MF', 'ActBal_OVD', 'ActBal_CC', 'VolumeCred',
                           'VolumeCred_CA', 'TransactionsCred', 'TransactionsCred_CA',
                           'VolumeDeb', 'VolumeDeb_CA', 'VolumeDebCash_Card',
                           'VolumeDebCashless_Card', 'VolumeDeb_PaymentOrder',
                           'TransactionsDeb', 'TransactionsDeb_CA',
                           'TransactionsDebCash_Card', 'TransactionsDebCashless_Card',
                           'TransactionsDeb_PaymentOrder'])

# Create a DataFrame
features_df = pd.DataFrame(features_array, columns=['Features'])

# Now you can access the features using features_df
print(features_df)




                        Features
0                         Client
1                            Sex
2                            Age
3                         Tenure
4                       Count_CA
5                       Count_SA
6                       Count_MF
7                      Count_OVD
8                       Count_CC
9                      ActBal_CA
10                     ActBal_SA
11                     ActBal_MF
12                    ActBal_OVD
13                     ActBal_CC
14                    VolumeCred
15                 VolumeCred_CA
16              TransactionsCred
17           TransactionsCred_CA
18                     VolumeDeb
19                  VolumeDeb_CA
20            VolumeDebCash_Card
21        VolumeDebCashless_Card
22        VolumeDeb_PaymentOrder
23               TransactionsDeb
24            TransactionsDeb_CA
25      TransactionsDebCash_Card
26  TransactionsDebCashless_Card
27  TransactionsDeb_PaymentOrder


In [None]:
# scores = []

# for model_name, mp in model_param.items():
#     hyperpara_model = GridSearchCV(mp['model'],cv=5,return_train_score=False)
#     hyperpara_model.fit(X_cl,y_sale_cl)
#     scores.append({
#         'model': model_name,
#         'best_score':hyperpara_model.best_score_,
#         'best_param':hyperpara_model.best_params_
#     })