In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer, precision_score, f1_score

In [2]:
file_id = '1nCbwq2PktId7IlJmlR5kGn7vvpgiE8vW'
file_url = f'https://drive.google.com/uc?id={file_id}'
data = pd.read_csv(file_url)

In [3]:
data

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0
...,...,...,...,...,...,...,...,...,...,...,...
245720,BPAWWXZN,Male,51,RG284,Self_Employed,X3,109,,1925586,No,0
245721,HFNB7JY8,Male,27,RG268,Salaried,X1,15,No,862952,Yes,0
245722,GEHAUCWT,Female,26,RG281,Salaried,X1,13,No,670659,No,0
245723,GE7V8SAH,Female,28,RG273,Salaried,X1,31,No,407504,No,0


In [4]:
data_is_lead = data[data['Is_Lead'] == 1].copy()
data_is_lead

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
6,ETQCZFEJ,Male,62,RG282,Other,X3,20,,1056750,Yes,1
15,UJ2NJKKL,Male,33,RG268,Self_Employed,X2,69,,517063,Yes,1
16,CNGSPYWS,Female,46,RG268,Other,X3,97,Yes,2282502,No,1
17,VH7NBNNQ,Female,59,RG283,Other,X3,15,Yes,2384692,No,1
20,7PMYNWB3,Male,44,RG269,Self_Employed,X2,19,Yes,1001650,No,1
...,...,...,...,...,...,...,...,...,...,...,...
245697,DTQJBXK6,Male,43,RG257,Salaried,X1,50,No,551203,Yes,1
245699,MARTXU7T,Male,53,RG254,Other,X2,19,,1141825,Yes,1
245709,4GZKAUQC,Male,36,RG284,Self_Employed,X3,15,Yes,608110,Yes,1
245713,BZ7NHPMJ,Female,41,RG277,Self_Employed,X2,13,Yes,898170,Yes,1


In [5]:
data.nunique()

ID                     245725
Gender                      2
Age                        63
Region_Code                35
Occupation                  4
Channel_Code                4
Vintage                    66
Credit_Product              2
Avg_Account_Balance    135292
Is_Active                   2
Is_Lead                     2
dtype: int64

In [6]:
data.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [7]:
data=data.drop('ID',axis=1)

In [8]:
# Assuming 'data' is the DataFrame that contains both  lead_1_data and lead_0_data

# Fill missing values in 'Credit_Product' column with 'Yes' where Lead is 1
data.loc[data['Is_Lead']==1,'Credit_Product']=data.loc[data['Is_Lead'] == 1, 'Credit_Product'].fillna('Yes')

# Fill missing values in 'Credit_Product' column with 'No' where Lead is 0
data.loc[data['Is_Lead']==0,'Credit_Product']=data.loc[data['Is_Lead'] == 0, 'Credit_Product'].fillna('No')

In [9]:
data['Region_Code'] = data['Region_Code'].str.replace('RG', '').astype(int)

In [10]:
data['Channel_Code'].nunique()

4

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Select the categorical columns to encode
categorical_columns = ['Occupation', 'Channel_Code']

# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse_output=True)

# Fit and transform the encoder on the selected categorical columns
encoder.fit(data[categorical_columns])


In [12]:
encoder.categories_

[array(['Entrepreneur', 'Other', 'Salaried', 'Self_Employed'], dtype=object),
 array(['X1', 'X2', 'X3', 'X4'], dtype=object)]

In [13]:
encoded_column_names = ['Entrepreneur', 'Other', 'Salaried', 'Self_Employed','X1', 'X2', 'X3', 'X4']

In [14]:
encoded_columns = pd.DataFrame(encoder.transform(data[categorical_columns]).todense(), columns=encoded_column_names)

In [15]:
encoded_columns

Unnamed: 0,Entrepreneur,Other,Salaried,Self_Employed,X1,X2,X3,X4
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
245720,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
245721,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245722,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245723,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [16]:
new_df = pd.concat([data, encoded_columns], axis=1, ignore_index=True)
new_df.columns = data.columns.to_list() + encoded_column_names
new_df.drop(columns=['Occupation', 'Channel_Code'], inplace=True)

In [17]:
new_df['Gender'] = ((new_df['Gender'] == 'Female')).astype(int)
new_df['Is_Active'] = ((new_df['Is_Active'] == 'Yes')).astype(int)
new_df['Credit_Product'] = ((new_df['Credit_Product'] == 'Yes')).astype(int)

In [18]:
new_df

Unnamed: 0,Gender,Age,Region_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,Entrepreneur,Other,Salaried,Self_Employed,X1,X2,X3,X4
0,1,73,268,43,0,1045696,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,30,277,32,0,581988,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,56,268,26,0,1484315,1,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,34,270,19,0,470454,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1,30,282,33,0,886787,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,0,51,284,109,0,1925586,0,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
245721,0,27,268,15,0,862952,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245722,1,26,281,13,0,670659,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245723,1,28,273,31,0,407504,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [19]:
#transformation technique
#log transformation:

# Log transformation on the "avg_account_balance" column
new_df["Avg_Account_Balance"] = new_df["Avg_Account_Balance"].map(lambda i: np.log(i) if i > 0 else 0)

In [20]:
x=new_df.drop('Is_Lead',axis=1)
y=new_df['Is_Lead']

In [21]:
x

Unnamed: 0,Gender,Age,Region_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Entrepreneur,Other,Salaried,Self_Employed,X1,X2,X3,X4
0,1,73,268,43,0,13.860193,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,30,277,32,0,13.274205,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,56,268,26,0,14.210464,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,34,270,19,0,13.061453,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1,30,282,33,0,13.695360,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,0,51,284,109,0,14.470741,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
245721,0,27,268,15,0,13.668114,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245722,1,26,281,13,0,13.416016,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245723,1,28,273,31,0,12.917806,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [22]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler(feature_range=(0,1))

In [23]:
x.iloc[:, [1,2,3]]

Unnamed: 0,Age,Region_Code,Vintage
0,73,268,43
1,30,277,32
2,56,268,26
3,34,270,19
4,30,282,33
...,...,...,...
245720,51,284,109
245721,27,268,15
245722,26,281,13
245723,28,273,31


In [24]:
minmax.fit(x.iloc[:, [1,2,3]])

In [25]:
x.iloc[:, [1,2,3]] = minmax.transform(x.iloc[:, [1,2,3]])

In [26]:
x

Unnamed: 0,Gender,Age,Region_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Entrepreneur,Other,Salaried,Self_Employed,X1,X2,X3,X4
0,1,0.806452,0.529412,0.281250,0,13.860193,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0.112903,0.794118,0.195312,0,13.274205,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,0.532258,0.529412,0.148438,0,14.210464,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,0.177419,0.588235,0.093750,0,13.061453,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1,0.112903,0.941176,0.203125,0,13.695360,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245720,0,0.451613,1.000000,0.796875,0,14.470741,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
245721,0,0.064516,0.529412,0.062500,0,13.668114,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245722,1,0.048387,0.911765,0.046875,0,13.416016,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
245723,1,0.080645,0.676471,0.187500,0,12.917806,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


# GB Using balanced bagging

In [27]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import make_scorer, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier

In [28]:

# Create a SMOTE instance
smote = SMOTE(random_state=42)

# Initialize StratifiedKFold for stratified cross-validation with 5 folds
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gb_params = {'subsample': 1.0,
             'random_state': None,
             'n_estimators': 140,
             'min_samples_split': 10,
             'min_samples_leaf': 4,
             'max_features': None,
             'max_depth': 3,
             'learning_rate': 0.2
             }

# Initialize lists to store precision and F1 scores
precision_scores_gb = []
f1_scores_gb = []

# Perform stratified cross-validation
for train_index, test_index in stratified_kfold.split(x, y):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to the training data
    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

    # Initialize the Gradient Boosting classifier
    gb_classifier = GradientBoostingClassifier(**gb_params)

    # Fit the classifier on the SMOTE-augmented training data
    gb_classifier.fit(x_train_smote, y_train_smote)

    # Predict on the test data
    y_pred = gb_classifier.predict(x_test)

    # Calculate precision and F1 scores
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision_scores_gb.append(precision)
    f1_scores_gb.append(f1)

# Print cross-validation precision scores for Gradient Boosting
print("Cross-Validation Precision Scores (Gradient Boosting):", precision_scores_gb)
print("Mean Precision (Gradient Boosting):", sum(precision_scores_gb) / len(precision_scores_gb))
print("Standard Deviation (Precision) (Gradient Boosting):", np.std(precision_scores_gb))

# Print cross-validation F1 scores for Gradient Boosting
print("Cross-Validation F1 Scores (Gradient Boosting):", f1_scores_gb)
print("Mean F1 Score (Gradient Boosting):", sum(f1_scores_gb) / len(f1_scores_gb))
print("Standard Deviation (F1) (Gradient Boosting):", np.std(f1_scores_gb))

Cross-Validation Precision Scores (Gradient Boosting): [0.6012014134275618, 0.6065763743028898, 0.6072534276868642, 0.6042311290206519, 0.6117629803461425]
Mean Precision (Gradient Boosting): 0.606205064956822
Standard Deviation (Precision) (Gradient Boosting): 0.0034945760894688187
Cross-Validation F1 Scores (Gradient Boosting): [0.6592784903320804, 0.6577913917687715, 0.6531874405328257, 0.6571708080610449, 0.6596030679212461]
Mean F1 Score (Gradient Boosting): 0.6574062397231938
Standard Deviation (F1) (Gradient Boosting): 0.0022949254126720226


In [29]:
import pickle

pickle.dump(gb_classifier, open('gb_model.pickle','wb'))
pickle.dump(encoder, open('encoder.pickle','wb'))
pickle.dump(minmax, open('minmax.pickle','wb'))