# Problem Statement

### Credit Card Lead Prediction

Happy Customer Bank is a mid-sized private bank that deals in all kinds of banking products, like Savings accounts, Current accounts, investment products, credit products, among other offerings.


The bank also cross-sells products to its existing customers; to do so, it uses different channels of communication, such as tele-calling, e-mails, recommendations on net banking, mobile banking, etc.


In this case, the Happy Customer Bank wants to cross-sell its credit cards to its existing customers. The bank has identified a set of customers that are eligible for taking these credit cards.


Now, the bank is looking for your help in identifying customers that could show higher intent towards a recommended credit card, given:

    Customer details (gender, age, region, etc.)
    Details of his/her relationship with the bank (Channel_Code, Vintage, Avg_Asset_Value, etc.)


In [1]:
# One-time setup, kept commented out so that a normal run does not re-download anything.
# Uncomment to fetch the train/test CSV files that this notebook reads:
# !wget https://datahack-prod.s3.amazonaws.com/train_file/train_s3TEQDk.csv
# !wget https://datahack-prod.s3.amazonaws.com/test_file/test_mSzZ8RL.csv
# Uncomment to upgrade scikit-learn:
# pip install -U scikit-learn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Let pandas render up to 300 columns before truncating a displayed frame.
pd.set_option("display.max_columns", 300)

In [3]:
# Read the labelled training file into `data` and the unlabelled file into `test`.
data, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_s3TEQDk.csv", "test_mSzZ8RL.csv")
)
print(f"Train shape {data.shape}, Test Shape {test.shape}")

Train shape (245725, 11), Test Shape (105312, 10)


In [4]:
# Hold out 20% of the labelled rows for validation, stratified on the target so
# both parts keep the same Is_Lead ratio. Each part is copied so that later
# column assignments act on independent frames rather than on slices of `data`.
train, valid = (
    part.copy()
    for part in train_test_split(
        data,
        test_size=0.20,
        random_state=345,
        stratify=data['Is_Lead'],
    )
)
print(f"Train shape {train.shape} Validation shape {valid.shape}")

Train shape (196580, 11) Validation shape (49145, 11)


In [5]:
# Peek at the first three rows of the training split.
train.head(3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
53078,N3ZQ84QR,Female,46,RG280,Self_Employed,X2,51,No,863584,Yes,0
213644,JWGAMK7P,Male,67,RG258,Other,X2,43,Yes,706126,No,0
131870,CX9NGNQT,Male,46,RG279,Self_Employed,X2,26,Yes,422207,Yes,0


In [6]:
# Peek at the first three rows of the test file.
test.head(3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No


In [7]:
# Peek at the first three rows of the validation split.
valid.head(3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
148453,OK9KJGZ2,Female,55,RG268,Self_Employed,X1,37,No,929257,No,0
117997,TTC7CPSI,Male,57,RG283,Self_Employed,X3,87,,909740,No,0
5432,MPUWVRAX,Male,39,RG275,Salaried,X1,8,Yes,961742,Yes,0


In [8]:
# Count distinct IDs; compare with the number of training rows to check that
# ID identifies each row uniquely.
train['ID'].nunique()

196580

In [9]:
# Number of missing values in each column of the training split.
train.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         23525
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [10]:
# Relative frequency of each Gender value in the training split.
train['Gender'].value_counts(normalize=True)

Male      0.546693
Female    0.453307
Name: Gender, dtype: float64

In [11]:
# Number of distinct region codes in the training split.
train['Region_Code'].nunique()

35

In [12]:
# Relative frequency of each Occupation value in the training split.
train['Occupation'].value_counts(normalize=True)

Self_Employed    0.411532
Salaried         0.292731
Other            0.284866
Entrepreneur     0.010871
Name: Occupation, dtype: float64

In [13]:
# Relative frequency of each Channel_Code value in the training split.
train['Channel_Code'].value_counts(normalize=True)

X1    0.421279
X3    0.280359
X2    0.275669
X4    0.022693
Name: Channel_Code, dtype: float64

In [14]:
# Relative frequency of each Credit_Product value. value_counts() leaves out
# missing values by default, so the shares are over non-missing rows only.
train['Credit_Product'].value_counts(normalize=True)

No     0.667019
Yes    0.332981
Name: Credit_Product, dtype: float64

In [15]:
# Summary statistics of the raw (untransformed) account balance.
train['Avg_Account_Balance'].describe()

count    1.965800e+05
mean     1.129489e+06
std      8.532486e+05
min      2.079000e+04
25%      6.042470e+05
50%      8.954865e+05
75%      1.368733e+06
max      1.035201e+07
Name: Avg_Account_Balance, dtype: float64

In [16]:
# Relative frequency of each Is_Active value in the training split.
train['Is_Active'].value_counts(normalize=True)

No     0.611375
Yes    0.388625
Name: Is_Active, dtype: float64

In [17]:
# Class balance of the target column Is_Lead in the training split.
train['Is_Lead'].value_counts(normalize=True)

0    0.762794
1    0.237206
Name: Is_Lead, dtype: float64

In [18]:
# Summary statistics of Age in the training split.
train['Age'].describe()

count    196580.000000
mean         43.864971
std          14.821238
min          23.000000
25%          30.000000
50%          43.000000
75%          54.000000
max          85.000000
Name: Age, dtype: float64

In [19]:
# Summary statistics of Vintage in the training split.
train['Vintage'].describe()

count    196580.000000
mean         46.978121
std          32.346981
min           7.000000
25%          20.000000
50%          32.000000
75%          73.000000
max         135.000000
Name: Vintage, dtype: float64

In [20]:
# Median Age and Avg_Account_Balance for each value of the target Is_Lead.
train.groupby(['Is_Lead'])[['Age','Avg_Account_Balance']].median()

Unnamed: 0_level_0,Age,Avg_Account_Balance
Is_Lead,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38,871158
1,49,980686


In [21]:
# Apply the same two clean-up steps to every split:
#   * missing Credit_Product becomes its own explicit category, 'NA';
#   * Avg_Account_Balance is replaced by log(1 + balance).
# NOTE: this overwrites the columns in place, so running the cell a second time
# would log-transform the balance again.
for frame in (train, test, valid):
    frame['Credit_Product'] = frame['Credit_Product'].fillna('NA')
    frame['Avg_Account_Balance'] = np.log(1 + frame['Avg_Account_Balance'])


In [22]:
# Spot-check three random training rows. sample() is called without a seed, so
# the rows shown differ from run to run.
train.sample(3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
31990,PKVKG43V,Male,33,RG283,Self_Employed,X2,37,Yes,13.621458,Yes,1
33526,YYYAHISF,Male,56,RG269,Other,X4,15,No,13.66138,Yes,0
74313,HDXFTDTW,Male,38,RG261,Salaried,X3,19,Yes,13.920527,No,0


In [23]:
# Shorten the Is_Active labels ('No' -> 'N', 'Yes' -> 'Y') in every split.
# replace() leaves any value that is not a key of the mapping untouched.
is_active_codes = {'No': 'N', 'Yes': 'Y'}
for frame in (train, test, valid):
    frame['Is_Active'] = frame['Is_Active'].replace(is_active_codes)

In [24]:
def bucket_age(age):
    """Return the age-band label for a numeric age.

    Bands are closed on the right: ``age <= 25`` gives ``'0-25'``,
    ``25 < age <= 30`` gives ``'25-30'``, and so on in 5-year steps up to
    ``'65-70'``. A value that fits no band (an age above 70, or a value such
    as NaN for which every comparison is false) gives ``'70+'``.
    """
    # (inclusive upper bound, label) in ascending order; the first match wins,
    # which makes the explicit lower-bound checks unnecessary.
    bands = (
        (25, '0-25'),
        (30, '25-30'),
        (35, '30-35'),
        (40, '35-40'),
        (45, '40-45'),
        (50, '45-50'),
        (55, '50-55'),
        (60, '55-60'),
        (65, '60-65'),
        (70, '65-70'),
    )
    for upper, label in bands:
        if age <= upper:
            return label
    return '70+'

# Add the age-band column to every split, then spot-check three random rows.
for frame in (train, test, valid):
    frame['age_cat'] = frame['Age'].apply(bucket_age)
train[['Age','age_cat']].sample(3)

Unnamed: 0,Age,age_cat
108575,41,40-45
238977,48,45-50
142863,29,25-30


In [27]:
# Another unseeded spot-check of Age against its age_cat band.
train[['Age','age_cat']].sample(3)

Unnamed: 0,Age,age_cat
74574,58,55-60
99151,48,45-50
180014,43,40-45


In [28]:
cat_cols = ['age_cat','Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active']

# Every unordered pair of categorical columns, in the order (i, j) with i < j.
col_pairs = [
    (left, right)
    for pos, left in enumerate(cat_cols)
    for right in cat_cols[pos + 1:]
]
featured_cols = [f"{left}-{right}" for left, right in col_pairs]

# A crossed feature is the two string values joined by "-"; it is built the
# same way in every split.
for (left, right), pair_name in zip(col_pairs, featured_cols):
    for frame in (train, test, valid):
        frame[pair_name] = frame[left] + "-" + frame[right]

train.sample(3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,age_cat,age_cat-Gender,age_cat-Region_Code,age_cat-Occupation,age_cat-Channel_Code,age_cat-Credit_Product,age_cat-Is_Active,Gender-Region_Code,Gender-Occupation,Gender-Channel_Code,Gender-Credit_Product,Gender-Is_Active,Region_Code-Occupation,Region_Code-Channel_Code,Region_Code-Credit_Product,Region_Code-Is_Active,Occupation-Channel_Code,Occupation-Credit_Product,Occupation-Is_Active,Channel_Code-Credit_Product,Channel_Code-Is_Active,Credit_Product-Is_Active
220460,3DZJLMZW,Female,51,RG254,Self_Employed,X2,86,No,13.666164,Y,0,50-55,50-55-Female,50-55-RG254,50-55-Self_Employed,50-55-X2,50-55-No,50-55-Y,Female-RG254,Female-Self_Employed,Female-X2,Female-No,Female-Y,RG254-Self_Employed,RG254-X2,RG254-No,RG254-Y,Self_Employed-X2,Self_Employed-No,Self_Employed-Y,X2-No,X2-Y,No-Y
191766,5NMCCFZ4,Female,69,RG258,Other,X2,74,Yes,13.677335,N,1,65-70,65-70-Female,65-70-RG258,65-70-Other,65-70-X2,65-70-Yes,65-70-N,Female-RG258,Female-Other,Female-X2,Female-Yes,Female-N,RG258-Other,RG258-X2,RG258-Yes,RG258-N,Other-X2,Other-Yes,Other-N,X2-Yes,X2-N,Yes-N
99947,EOT4S7CG,Female,32,RG272,Salaried,X1,26,,13.650617,N,1,30-35,30-35-Female,30-35-RG272,30-35-Salaried,30-35-X1,30-35-NA,30-35-N,Female-RG272,Female-Salaried,Female-X1,Female-NA,Female-N,RG272-Salaried,RG272-X1,RG272-NA,RG272-N,Salaried-X1,Salaried-NA,Salaried-N,X1-NA,X1-N,NA-N


In [29]:
all_cat_cols = cat_cols + featured_cols
num_col = ['Vintage','Avg_Account_Balance']

# For every categorical column and every numeric column, compute per-category
# statistics on the TRAINING split only and attach them to all three splits.
for idx, col in enumerate(all_cat_cols):
    print(f"\rWorking Cat Col {idx}/{len(all_cat_cols)}",end='')
    for num in num_col:
        stats = train.groupby([col])[num].agg(['mean','std'])
        # Variation ratio; the denominator is 1 + mean, not the plain mean.
        stats['cv'] = stats['std'] / (1 + stats['mean'])
        # Columns become "<col>-<num>-mean|std|cv". A statistic that is
        # undefined for a category is stored as -1.
        stats = stats.add_prefix(f'{col}-{num}-').fillna(-1).reset_index()

        # Left merge keeps every row of each split; a category that is absent
        # from the training split gets NaN statistics.
        train, test, valid = (
            frame.merge(stats, on=[col], how='left')
            for frame in (train, test, valid)
        )

Working Cat Col 27/28

In [30]:
# Inspect the first three training rows with the merged group-statistic columns.
train.head(3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead,age_cat,age_cat-Gender,age_cat-Region_Code,age_cat-Occupation,age_cat-Channel_Code,age_cat-Credit_Product,age_cat-Is_Active,Gender-Region_Code,Gender-Occupation,Gender-Channel_Code,Gender-Credit_Product,Gender-Is_Active,Region_Code-Occupation,Region_Code-Channel_Code,Region_Code-Credit_Product,Region_Code-Is_Active,Occupation-Channel_Code,Occupation-Credit_Product,Occupation-Is_Active,Channel_Code-Credit_Product,Channel_Code-Is_Active,Credit_Product-Is_Active,age_cat-Vintage-mean,age_cat-Vintage-std,age_cat-Vintage-cv,age_cat-Avg_Account_Balance-mean,age_cat-Avg_Account_Balance-std,age_cat-Avg_Account_Balance-cv,Gender-Vintage-mean,Gender-Vintage-std,Gender-Vintage-cv,Gender-Avg_Account_Balance-mean,Gender-Avg_Account_Balance-std,Gender-Avg_Account_Balance-cv,Region_Code-Vintage-mean,Region_Code-Vintage-std,Region_Code-Vintage-cv,Region_Code-Avg_Account_Balance-mean,Region_Code-Avg_Account_Balance-std,Region_Code-Avg_Account_Balance-cv,Occupation-Vintage-mean,Occupation-Vintage-std,Occupation-Vintage-cv,Occupation-Avg_Account_Balance-mean,Occupation-Avg_Account_Balance-std,Occupation-Avg_Account_Balance-cv,Channel_Code-Vintage-mean,Channel_Code-Vintage-std,Channel_Code-Vintage-cv,Channel_Code-Avg_Account_Balance-mean,Channel_Code-Avg_Account_Balance-std,Channel_Code-Avg_Account_Balance-cv,Credit_Product-Vintage-mean,Credit_Product-Vintage-std,Credit_Product-Vintage-cv,Credit_Product-Avg_Account_Balance-mean,Credit_Product-Avg_Account_Balance-std,Credit_Product-Avg_Account_Balance-cv,Is_Active-Vintage-mean,Is_Active-Vintage-std,Is_Active-Vintage-cv,Is_Active-Avg_Account_Balance-mean,Is_Active-Avg_Account_Balance-std,Is_Active-Avg_Account_Balance-cv,age_cat-Gender-Vintage-mean,age_cat-Gender-Vintage-std,age_cat-Gender-Vintage-cv,age_cat-Gender-Avg_Account_Balance-mean,age_cat-Gender-Avg_Account_Balance-std,age_cat-Gender-Avg_Account_Balance-cv,age_
cat-Region_Code-Vintage-mean,age_cat-Region_Code-Vintage-std,age_cat-Region_Code-Vintage-cv,age_cat-Region_Code-Avg_Account_Balance-mean,age_cat-Region_Code-Avg_Account_Balance-std,age_cat-Region_Code-Avg_Account_Balance-cv,age_cat-Occupation-Vintage-mean,age_cat-Occupation-Vintage-std,age_cat-Occupation-Vintage-cv,age_cat-Occupation-Avg_Account_Balance-mean,age_cat-Occupation-Avg_Account_Balance-std,age_cat-Occupation-Avg_Account_Balance-cv,age_cat-Channel_Code-Vintage-mean,age_cat-Channel_Code-Vintage-std,age_cat-Channel_Code-Vintage-cv,age_cat-Channel_Code-Avg_Account_Balance-mean,age_cat-Channel_Code-Avg_Account_Balance-std,age_cat-Channel_Code-Avg_Account_Balance-cv,age_cat-Credit_Product-Vintage-mean,age_cat-Credit_Product-Vintage-std,age_cat-Credit_Product-Vintage-cv,age_cat-Credit_Product-Avg_Account_Balance-mean,age_cat-Credit_Product-Avg_Account_Balance-std,age_cat-Credit_Product-Avg_Account_Balance-cv,age_cat-Is_Active-Vintage-mean,age_cat-Is_Active-Vintage-std,age_cat-Is_Active-Vintage-cv,age_cat-Is_Active-Avg_Account_Balance-mean,age_cat-Is_Active-Avg_Account_Balance-std,age_cat-Is_Active-Avg_Account_Balance-cv,Gender-Region_Code-Vintage-mean,Gender-Region_Code-Vintage-std,Gender-Region_Code-Vintage-cv,Gender-Region_Code-Avg_Account_Balance-mean,Gender-Region_Code-Avg_Account_Balance-std,Gender-Region_Code-Avg_Account_Balance-cv,Gender-Occupation-Vintage-mean,Gender-Occupation-Vintage-std,Gender-Occupation-Vintage-cv,Gender-Occupation-Avg_Account_Balance-mean,Gender-Occupation-Avg_Account_Balance-std,Gender-Occupation-Avg_Account_Balance-cv,Gender-Channel_Code-Vintage-mean,Gender-Channel_Code-Vintage-std,Gender-Channel_Code-Vintage-cv,Gender-Channel_Code-Avg_Account_Balance-mean,Gender-Channel_Code-Avg_Account_Balance-std,Gender-Channel_Code-Avg_Account_Balance-cv,Gender-Credit_Product-Vintage-mean,Gender-Credit_Product-Vintage-std,Gender-Credit_Product-Vintage-cv,Gender-Credit_Product-Avg_Account_Balance-mean,Gender-Credit_Product-Avg_Account_Balance-s
td,Gender-Credit_Product-Avg_Account_Balance-cv,Gender-Is_Active-Vintage-mean,Gender-Is_Active-Vintage-std,Gender-Is_Active-Vintage-cv,Gender-Is_Active-Avg_Account_Balance-mean,Gender-Is_Active-Avg_Account_Balance-std,Gender-Is_Active-Avg_Account_Balance-cv,Region_Code-Occupation-Vintage-mean,Region_Code-Occupation-Vintage-std,Region_Code-Occupation-Vintage-cv,Region_Code-Occupation-Avg_Account_Balance-mean,Region_Code-Occupation-Avg_Account_Balance-std,Region_Code-Occupation-Avg_Account_Balance-cv,Region_Code-Channel_Code-Vintage-mean,Region_Code-Channel_Code-Vintage-std,Region_Code-Channel_Code-Vintage-cv,Region_Code-Channel_Code-Avg_Account_Balance-mean,Region_Code-Channel_Code-Avg_Account_Balance-std,Region_Code-Channel_Code-Avg_Account_Balance-cv,Region_Code-Credit_Product-Vintage-mean,Region_Code-Credit_Product-Vintage-std,Region_Code-Credit_Product-Vintage-cv,Region_Code-Credit_Product-Avg_Account_Balance-mean,Region_Code-Credit_Product-Avg_Account_Balance-std,Region_Code-Credit_Product-Avg_Account_Balance-cv,Region_Code-Is_Active-Vintage-mean,Region_Code-Is_Active-Vintage-std,Region_Code-Is_Active-Vintage-cv,Region_Code-Is_Active-Avg_Account_Balance-mean,Region_Code-Is_Active-Avg_Account_Balance-std,Region_Code-Is_Active-Avg_Account_Balance-cv,Occupation-Channel_Code-Vintage-mean,Occupation-Channel_Code-Vintage-std,Occupation-Channel_Code-Vintage-cv,Occupation-Channel_Code-Avg_Account_Balance-mean,Occupation-Channel_Code-Avg_Account_Balance-std,Occupation-Channel_Code-Avg_Account_Balance-cv,Occupation-Credit_Product-Vintage-mean,Occupation-Credit_Product-Vintage-std,Occupation-Credit_Product-Vintage-cv,Occupation-Credit_Product-Avg_Account_Balance-mean,Occupation-Credit_Product-Avg_Account_Balance-std,Occupation-Credit_Product-Avg_Account_Balance-cv,Occupation-Is_Active-Vintage-mean,Occupation-Is_Active-Vintage-std,Occupation-Is_Active-Vintage-cv,Occupation-Is_Active-Avg_Account_Balance-mean,Occupation-Is_Active-Avg_Account_Balance-std,Occupation-Is_Active-A
vg_Account_Balance-cv,Channel_Code-Credit_Product-Vintage-mean,Channel_Code-Credit_Product-Vintage-std,Channel_Code-Credit_Product-Vintage-cv,Channel_Code-Credit_Product-Avg_Account_Balance-mean,Channel_Code-Credit_Product-Avg_Account_Balance-std,Channel_Code-Credit_Product-Avg_Account_Balance-cv,Channel_Code-Is_Active-Vintage-mean,Channel_Code-Is_Active-Vintage-std,Channel_Code-Is_Active-Vintage-cv,Channel_Code-Is_Active-Avg_Account_Balance-mean,Channel_Code-Is_Active-Avg_Account_Balance-std,Channel_Code-Is_Active-Avg_Account_Balance-cv,Credit_Product-Is_Active-Vintage-mean,Credit_Product-Is_Active-Vintage-std,Credit_Product-Is_Active-Vintage-cv,Credit_Product-Is_Active-Avg_Account_Balance-mean,Credit_Product-Is_Active-Avg_Account_Balance-std,Credit_Product-Is_Active-Avg_Account_Balance-cv
0,N3ZQ84QR,Female,46,RG280,Self_Employed,X2,51,No,13.668848,Y,0,45-50,45-50-Female,45-50-RG280,45-50-Self_Employed,45-50-X2,45-50-No,45-50-Y,Female-RG280,Female-Self_Employed,Female-X2,Female-No,Female-Y,RG280-Self_Employed,RG280-X2,RG280-No,RG280-Y,Self_Employed-X2,Self_Employed-No,Self_Employed-Y,X2-No,X2-Y,No-Y,63.827208,30.307173,0.467507,13.78569,0.590627,0.039946,41.827474,30.01163,0.700756,13.713333,0.622809,0.04233,43.188539,32.184806,0.728352,13.411736,0.548784,0.038079,55.55107,32.595353,0.576388,13.745605,0.607609,0.041206,54.598919,28.159967,0.506484,13.75621,0.604981,0.040998,40.581871,28.885153,0.694657,13.684008,0.624317,0.042517,55.489554,34.657305,0.613517,13.802428,0.625565,0.042261,62.977699,30.122365,0.470826,13.811018,0.588057,0.039704,60.989229,30.628936,0.494101,13.417902,0.521058,0.03614,62.908421,30.122425,0.471337,13.770971,0.587852,0.039798,59.700782,27.3079,0.449877,13.750146,0.582647,0.039501,60.597707,28.948966,0.469968,13.733113,0.597245,0.040538,65.428105,30.664196,0.461615,13.803724,0.598925,0.040458,37.097365,28.727333,0.75405,13.401521,0.550985,0.038259,52.403134,31.859805,0.596591,13.756265,0.610664,0.041383,52.13468,26.875135,0.505793,13.764786,0.605232,0.040992,36.246971,26.02971,0.698841,13.658229,0.62249,0.042467,50.468969,33.627966,0.653364,13.803423,0.626325,0.042309,52.210358,32.865881,0.617659,13.383783,0.528443,0.036739,52.755913,29.125233,0.541805,13.392667,0.531946,0.03696,38.642549,29.712687,0.749515,13.379518,0.551894,0.038381,53.452126,34.630722,0.635985,13.427537,0.552327,0.038283,53.313132,27.966771,0.514917,13.726652,0.59804,0.040609,51.904785,31.022695,0.586387,13.705602,0.616762,0.041941,59.285561,33.368966,0.553515,13.769124,0.617108,0.041784,53.602277,26.623489,0.487589,13.728676,0.618954,0.042024,53.959872,28.68473,0.521921,13.748476,0.61782,0.04189,48.25175,32.637442,0.662666,13.763813,0.63022,0.042687
1,JWGAMK7P,Male,67,RG258,Other,X2,43,Yes,13.46755,N,0,65-70,65-70-Male,65-70-RG258,65-70-Other,65-70-X2,65-70-Yes,65-70-N,Male-RG258,Male-Other,Male-X2,Male-Yes,Male-N,RG258-Other,RG258-X2,RG258-Yes,RG258-N,Other-X2,Other-Yes,Other-N,X2-Yes,X2-N,Yes-N,70.515147,32.091662,0.448739,13.924694,0.606062,0.040608,51.248928,33.568613,0.642475,13.751552,0.617448,0.041856,38.40629,27.976339,0.709946,13.331448,0.489656,0.034167,54.886123,34.306833,0.61387,13.798887,0.630414,0.042599,54.598919,28.159967,0.506484,13.75621,0.604981,0.040998,51.701079,34.259627,0.650074,13.794534,0.608655,0.041141,41.567754,29.540018,0.693953,13.690874,0.612792,0.041712,72.823574,31.875641,0.431781,13.918012,0.600994,0.040286,65.357143,31.327676,0.472107,13.567054,0.50631,0.034757,70.515147,32.091662,0.448739,13.924694,0.606062,0.040608,61.946872,27.049487,0.429719,13.879703,0.608501,0.040895,70.56758,33.537921,0.468619,13.953239,0.600272,0.040143,67.380449,31.610169,0.462269,13.923153,0.594644,0.039847,44.967431,30.760728,0.669185,13.354686,0.494156,0.034425,59.464858,34.764459,0.574953,13.818952,0.62599,0.042243,55.983858,28.764827,0.504789,13.751391,0.604796,0.040999,55.172474,34.830128,0.620057,13.796677,0.604623,0.040862,45.681388,31.394054,0.672518,13.715595,0.609421,0.041413,46.258883,31.807772,0.673054,13.339823,0.496179,0.034601,50.216157,28.161833,0.549862,13.374351,0.44384,0.030877,41.193853,29.966969,0.710221,13.372748,0.466577,0.032463,33.233302,24.184362,0.706457,13.314882,0.496401,0.034677,58.589195,27.653358,0.464067,13.804688,0.612041,0.041341,59.013508,35.438797,0.590514,13.866254,0.620115,0.041713,49.045805,32.594454,0.651292,13.75803,0.625097,0.042356,52.7749,28.80404,0.535641,13.783989,0.589136,0.03985,55.277626,27.576043,0.49,13.764424,0.590941,0.040025,45.636257,32.006965,0.686311,13.761389,0.602281,0.040801
2,CX9NGNQT,Male,46,RG279,Self_Employed,X2,26,Yes,12.953253,Y,0,45-50,45-50-Male,45-50-RG279,45-50-Self_Employed,45-50-X2,45-50-Yes,45-50-Y,Male-RG279,Male-Self_Employed,Male-X2,Male-Yes,Male-Y,RG279-Self_Employed,RG279-X2,RG279-Yes,RG279-Y,Self_Employed-X2,Self_Employed-Yes,Self_Employed-Y,X2-Yes,X2-Y,Yes-Y,63.827208,30.307173,0.467507,13.78569,0.590627,0.039946,51.248928,33.568613,0.642475,13.751552,0.617448,0.041856,40.181242,29.860427,0.725098,13.336136,0.525134,0.03663,55.55107,32.595353,0.576388,13.745605,0.607609,0.041206,54.598919,28.159967,0.506484,13.75621,0.604981,0.040998,51.701079,34.259627,0.650074,13.794534,0.608655,0.041141,55.489554,34.657305,0.613517,13.802428,0.625565,0.042261,64.368372,30.412947,0.465255,13.769555,0.591713,0.040063,55.836858,30.886851,0.54343,13.368687,0.467958,0.032568,62.908421,30.122425,0.471337,13.770971,0.587852,0.039798,59.700782,27.3079,0.449877,13.750146,0.582647,0.039501,63.361925,31.321242,0.486642,13.823748,0.580282,0.039145,65.428105,30.664196,0.461615,13.803724,0.598925,0.040458,44.668284,31.754611,0.695332,13.367298,0.540254,0.037603,57.670355,32.912512,0.560973,13.738429,0.605444,0.041079,55.983858,28.764827,0.504789,13.751391,0.604796,0.040999,55.172474,34.830128,0.620057,13.796677,0.604623,0.040862,59.018389,34.934066,0.582056,13.801728,0.625037,0.042227,48.938944,31.888579,0.638551,13.337831,0.491703,0.034294,49.232365,27.87216,0.554865,13.350726,0.507539,0.035367,43.177994,32.18376,0.728502,13.368497,0.529219,0.036832,47.788931,33.87039,0.694223,13.362416,0.543938,0.037872,53.313132,27.966771,0.514917,13.726652,0.59804,0.040609,56.46292,33.669439,0.585933,13.784911,0.595917,0.040306,59.285561,33.368966,0.553515,13.769124,0.617108,0.041784,52.7749,28.80404,0.535641,13.783989,0.589136,0.03985,53.959872,28.68473,0.521921,13.748476,0.61782,0.04189,65.513366,35.202327,0.529252,13.870021,0.616328,0.041448


In [41]:
# Replace every categorical column (base and crossed) by integer codes.
all_cat_cols = cat_cols + featured_cols
encoder = LabelEncoder()
for col in all_cat_cols:
    # Fit on the labels of all three splits. LabelEncoder.transform raises
    # ValueError for a label it was not fitted on, and the crossed columns are
    # high-cardinality enough that test/valid may hold a combination that never
    # occurs in train. Only the feature's own labels are used here (no target),
    # and when train already contains every label the codes are exactly the
    # ones a train-only fit would produce.
    encoder.fit(pd.concat([train[col], test[col], valid[col]], ignore_index=True))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])
    valid[col] = encoder.transform(valid[col])

In [42]:
# Features are every column except the row identifier and the target.
y_train = train['Is_Lead']
x_train = train.drop(columns=['ID', 'Is_Lead'])

y_valid = valid['Is_Lead']
x_valid = valid.drop(columns=['ID', 'Is_Lead'])

# The test file carries no Is_Lead column, so only ID is removed.
x_test = test.drop(columns=['ID'])

In [45]:
# Positions, within x_train's columns, of the label-encoded categorical features.
cat_indx = [
    position
    for position, name in enumerate(x_train.columns)
    if name in all_cat_cols
]

In [None]:
# Hyper-parameters for the histogram-based gradient boosting model.
# No categorical_features argument is given, so the label-encoded columns are
# passed to the model like any other numeric column.
hist_params = dict(
    max_iter=1000,
    learning_rate=0.003,
    max_depth=30,
    early_stopping='auto',
    verbose=1,
    max_bins=255,
    random_state=636,
)

# fit() returns the estimator itself, so construction and training can be chained.
hist = HistGradientBoostingClassifier(**hist_params).fit(x_train, y_train)

# Score the hold-out split with ROC AUC on column 1 of predict_proba.
pred = hist.predict_proba(x_valid)[:, 1]
roc_score = roc_auc_score(y_valid, pred)
print(f"roc_auc_score: {roc_score}")

In [54]:
# Submission frame: one row per test ID with column 1 of the model's
# predict_proba output as the Is_Lead score.
submit = pd.DataFrame({
    'ID': test['ID'],
    'Is_Lead': hist.predict_proba(x_test)[:, 1],
})
submit.head(3)

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.059454
1,CCMEWNKY,0.822981
2,VK3KGA9M,0.062201


In [55]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf="Hist_submit_v2.csv", index=False)