#### Model Selection

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, precision_score, mean_squared_error,f1_score

In [2]:
# Loading the pickle file Customer_Loan

with open('/Users/jonathanrabbi/Desktop/KBC_ML/Pickle_Files/Mutual_Fund.pkl', 'rb') as f1:
     df_train_mf= pickle.load(f1)
with open('/Users/jonathanrabbi/Desktop/KBC_ML/Pickle_Files/40pct.pkl', 'rb') as f2:
     df_40= pickle.load(f2)

##### Train Test Split

In [3]:
# Split the dataset into training and test sets based on product
X_mf = df_train_mf.copy()
X_mf.drop(['Revenue_MF','Sale_MF'],inplace=True,axis=1)

y_sale_mf = df_train_mf['Sale_MF']

X_train_sl,X_test_sl,y_train_sl,y_test_sl = train_test_split(X_mf,y_sale_mf,test_size=0.2,random_state=0)

In [4]:
# Split the dataset into training and test sets based on revenue
X_mf = df_train_mf.copy()
X_mf.drop(['Revenue_MF','Sale_MF'],inplace=True,axis=1)

y_revenue_mf = df_train_mf['Revenue_MF']

X_train_rev,X_test_rev,y_train_rev,y_test_rev = train_test_split(X_mf,y_revenue_mf,test_size=0.2,random_state=0)

##### Normalize The Features

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_sl)
X_test = scaler.transform(X_test_sl)

#### Cross Validation

##### Classification

In [6]:
model_param = {
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'param' : {'n_neighbors':[3,5,10,100],
                   'weights':['uniform','distance']
                   }
        },
    'Logistic':{
        'model':LogisticRegression(max_iter=10000,solver='sag'), # converge warning hence sag
        'param': { 'C':[1,5,10]}
        },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'param' : {'n_estimators':[5,10,50,100]}
        },
    'DecisionTree':{
        'model': DecisionTreeClassifier(),
        'param' : {'criterion': ['gini', 'entropy'],
            'max_depth': [None, 10, 20, 30],  
            'min_samples_split': [2, 5, 10],  
            'min_samples_leaf': [1, 2, 4]}
        }
}

In [7]:
scores_class = []

for model_name, mp in model_param.items():
    hyperpara_model = GridSearchCV(mp['model'],mp['param'],cv=5,return_train_score=False)
    hyperpara_model.fit(X_mf,y_sale_mf)
    scores_class.append({
        'model': model_name,
        'best_score':hyperpara_model.best_score_,
        'best_param':hyperpara_model.best_params_
    })

In [8]:
scores_mf = pd.DataFrame(scores_class)
scores_mf

Unnamed: 0,model,best_score,best_param
0,KNeighbors,0.808039,"{'n_neighbors': 10, 'weights': 'uniform'}"
1,Logistic,0.802874,{'C': 1}
2,RandomForest,0.80907,{'n_estimators': 100}
3,DecisionTree,0.775007,"{'criterion': 'entropy', 'max_depth': 10, 'min..."


RandomForest seems to have the best score. Hence, I will be using this for model prediction. It is also more robust towards imbalanced datasets compared to KNeighbors + Logistic + DecisionTrees, as they are more biased towards majority data

In [24]:
rf = RandomForestClassifier(n_estimators=50)

rf.fit(X_train_sl,y_train_sl)

In [25]:
y_pred_sl = rf.predict(X_test_sl)

In [26]:
# Probability of being correctly classified
y_pred_sl_prob=rf.predict_proba(X_test_sl)

##### Evaluation

In [12]:
confusion_matrix(y_pred_sl,y_test_sl)

array([[149,  42],
       [  1,   2]])

In [13]:
print('Evaluation metrics:')
print('Accuracy:', accuracy_score(y_test_sl,y_pred_sl))
print('Precision:', precision_score(y_test_sl, y_pred_sl))
print('Recall:', recall_score(y_test_sl, y_pred_sl))
print('F-1:', f1_score(y_test_sl, y_pred_sl))
print('AUC:', roc_auc_score(y_test_sl, y_pred_sl))

Evaluation metrics:
Accuracy: 0.7783505154639175
Precision: 0.6666666666666666
Recall: 0.045454545454545456
F-1: 0.08510638297872342
AUC: 0.5193939393939393


##### Regression

In [14]:
# Define models
models = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
}

scores_rev = []

for model_name, model in models.items():
    # Use cross_val_score for simplicity
    cv_scores = cross_val_score(model, X_mf, y_revenue_mf, cv=10)
    scores_rev.append({
        'model': model_name,
        'mean_score': cv_scores.mean() * -1,
        'std_score': cv_scores.std(),
        
    })


In [15]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

scores_rev = pd.DataFrame(scores_rev)
scores_rev

Unnamed: 0,model,mean_score,std_score
0,Linear,0.523526,0.51234
1,RandomForest,0.682435,0.65682
2,DecisionTree,7.444255,10.459197


I will choose RandomForest as it has a good balance between accuracy and stability

In [16]:
rf = RandomForestRegressor(n_estimators=50)

rf.fit(X_train_rev,y_train_rev)

In [17]:
y_pred_rev = rf.predict(X_test_rev)

In [None]:
y_pred_sl_prob=rf.predict_proba(X_test_sl)

##### Evaluation

In [18]:
mse=mean_squared_error(y_test_rev,y_pred_rev)
rmse=np.sqrt(mse)

print('MSE:', mse)
print('RMSE:', rmse)


MSE: 81.57237013193193
RMSE: 9.031742364125094


##### Feature Importance

In [19]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_mf.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
feature_importances.head()

Unnamed: 0,importance
VolumeCred_CA,0.121574
Age,0.09213
TransactionsDeb,0.089867
TransactionsDeb_PaymentOrder,0.072539
VolumeCred,0.068583


#### Targeting

In [20]:
df_40_mf = df_40.drop(['Count_MF','ActBal_MF','Revenue_CL','Sale_CL','Sale_CC','Sale_MF','Revenue_CC','Revenue_MF'], axis=1)

df_40_mf.shape

(646, 27)

In [21]:
scaler.transform(df_40_mf)

array([[ 0.95334516, -1.34751402, -1.15593713, ..., -0.36767449,
        -0.64317598, -0.09071104],
       [ 0.95334516, -0.41274672,  1.14550103, ..., -0.00560622,
        -0.64317598,  0.10456967],
       [-1.04893804, -1.46435993, -0.48727605, ...,  0.71853032,
        -0.26411789, -0.48127247],
       ...,
       [ 0.95334516,  1.39836492,  0.80339536, ..., -0.00560622,
        -0.26411789,  0.29985039],
       [ 0.95334516, -0.76328446, -0.16072063, ..., -0.36767449,
        -0.51682329,  0.88569253],
       [ 0.95334516,  1.10625014, -0.06741908, ..., -0.36767449,
        -0.51682329, -0.28599176]])

In [27]:
prob_cl=rf.predict_proba(df_40_mf)

In [28]:

rev_40=rf.predict(df_40_mf)

mutual_fund = pd.DataFrame({
    'Client': df_40_mf.index,
    'Revenue_MF':  rev_40,
    'Probability': prob_cl[:,1]
}).sort_values('Probability', ascending= False)


mutual_fund.rename(columns={'Revenue_CC':'Revenue'},inplace=True)
mutual_fund['Product']='MF'
mutual_fund.head(10)
mutual_fund.head(10)


Unnamed: 0,Client,Revenue_MF,Probability,Product
559,1480,1.0,0.66,MF
61,154,1.0,0.58,MF
586,1435,1.0,0.56,MF
146,313,1.0,0.54,MF
579,1468,1.0,0.54,MF
24,109,1.0,0.54,MF
207,910,1.0,0.54,MF
418,354,1.0,0.52,MF
409,1229,1.0,0.52,MF
431,1518,0.0,0.5,MF


In [31]:
mutual_fund['Revenue_prob'] = mutual_fund['Revenue_MF']*mutual_fund['Probability']

mutual_fund.head()

Unnamed: 0,Client,Revenue_MF,Probability,Product,Revenue_prob
559,1480,1.0,0.66,MF,0.66
61,154,1.0,0.58,MF,0.58
586,1435,1.0,0.56,MF,0.56
146,313,1.0,0.54,MF,0.54
579,1468,1.0,0.54,MF,0.54


In [33]:
file_path1 = '/Users/jonathanrabbi/Desktop/KBC_ML/Pickle_Files/Revenue_MF.pkl'

with open(file_path1, 'wb') as f1:
    pickle.dump(mutual_fund, f1)