In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [32]:
X = pd.read_csv('X_resample_SMOTE.csv')
y = pd.read_csv('y_resample_SMOTE.csv')
test = pd.read_csv('test.csv')

In [33]:
def calculate_woe_iv(data, feature, target, bins=10):
    """
    Calculate Weight of Evidence (WoE) and Information Value (IV) for a given feature.

    Args:
    - data (pd.DataFrame): DataFrame containing the feature and target.
    - feature (str or pd.Series): The feature column name or Series object.
    - target (str): The target column name (binary: 0 or 1).
    - bins (int or list): Number of bins or specific bin edges.

    Returns:
    - woe_df (pd.DataFrame): DataFrame with WoE and IV for each bin.
    - iv (float): Information Value of the feature.
    """
    # Check if `feature` is a column name or Series
    if isinstance(feature, str):
        feature_series = data[feature]
    elif isinstance(feature, pd.Series):
        feature_series = feature
    else:
        raise ValueError("Feature must be either a column name (str) or a pandas Series.")
    
    # Bin the feature
    if isinstance(bins, int):
        data['bin'] = pd.qcut(feature_series, q=bins, duplicates='drop')
    else:
        data['bin'] = pd.cut(feature_series, bins=bins)
    
    # Group by bins and calculate statistics
    grouped = data.groupby('bin').agg(
        total_count=(target, 'count'),
        bad_count=(target, 'sum'),
        good_count=(target, lambda x: x.count() - x.sum())
    ).reset_index()
    
    # Add good and bad percentages
    grouped['bad_pct'] = grouped['bad_count'] / grouped['bad_count'].sum()
    grouped['good_pct'] = grouped['good_count'] / grouped['good_count'].sum()
    
    # Calculate WoE and handle division by zero
    grouped['woe'] = np.log((grouped['good_pct'] + 1e-10) / (grouped['bad_pct'] + 1e-10))
    
    # Calculate IV
    grouped['iv'] = (grouped['good_pct'] - grouped['bad_pct']) * grouped['woe']
    iv = grouped['iv'].sum()
    
    return grouped[['bin', 'woe', 'iv']], iv

In [34]:
features = X.columns

In [35]:
X['TARGET'] = y['TARGET'].values

In [36]:
t = 'TARGET'

In [37]:
feature_iv = {}
for feature in features:
    woe_df, iv = calculate_woe_iv(data=X, feature=feature, target=t, bins=5)
    feature_iv[feature] = iv
    print(f"Feature: {feature}")
    print(woe_df)
    print(f"IV: {iv}\n")

  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_CONTRACT_TYPE
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: CODE_GENDER
               bin       woe        iv
0  (-0.001, 0.425]  0.219703  0.028846
1     (0.425, 1.0] -0.331232  0.043489
IV: 0.07233475857210649



  grouped = data.groupby('bin').agg(


Feature: FLAG_OWN_CAR
               bin       woe        iv
0  (-0.001, 0.106]  0.187219  0.020969
1     (0.106, 1.0] -0.281864  0.031570
IV: 0.05253966513080642

Feature: FLAG_OWN_REALTY
               bin       woe        iv
0  (-0.001, 0.959] -0.486421  0.092819
1     (0.959, 1.0]  0.320758  0.061208
IV: 0.15402703220336475



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: CNT_CHILDREN
             bin       woe        iv
0  (-0.001, 1.0]  0.048362  0.002056
1    (1.0, 19.0] -0.355196  0.015098
IV: 0.017153656117695502

Feature: AMT_INCOME_TOTAL
                bin       woe        iv
0  (10.151, 11.537]  0.073529  0.001081
1  (11.537, 11.813] -0.010930  0.000033
2  (11.813, 11.967] -0.281267  0.009903
3   (11.967, 12.28] -0.188250  0.006912
4   (12.28, 13.086]  0.303662  0.018302
IV: 0.036230994018206716



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: AMT_CREDIT
                            bin       woe        iv
0  (10.713000000000001, 12.506]  0.438948  0.040191
1              (12.506, 12.945] -0.548566  0.055217
2              (12.945, 13.246] -0.299293  0.017823
3              (13.246, 13.603] -0.192613  0.007413
4              (13.603, 14.408]  0.551593  0.059093
IV: 0.17973662242094576

Feature: AMT_ANNUITY
                bin       woe        iv
0    (7.387, 9.674]  0.317015  0.019935
1    (9.674, 9.999] -0.217936  0.009462
2   (9.999, 10.222] -0.229032  0.010444
3  (10.222, 10.475] -0.289701  0.016670
4  (10.475, 11.166]  0.424676  0.035535
IV: 0.09204636739203008



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: AMT_GOODS_PRICE
                bin       woe        iv
0  (10.608, 12.352]  0.413468  0.033713
1  (12.352, 12.806] -0.466843  0.042869
2  (12.806, 13.076] -0.250300  0.012449
3  (13.076, 13.442] -0.175202  0.006201
4  (13.442, 14.316]  0.492909  0.047032
IV: 0.142262729978393

Feature: REGION_POPULATION_RELATIVE
                   bin       woe        iv
0  (-0.00071, 0.00963]  0.306237  0.018836
1    (0.00963, 0.0163] -0.438074  0.037323
2     (0.0163, 0.0206] -0.091803  0.001684
3     (0.0206, 0.0287] -0.196201  0.008274
4     (0.0287, 0.0725]  0.464905  0.039148
IV: 0.10526528756658698



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: OWN_CAR_AGE
                bin       woe        iv
0  (-1.001, -0.234]  0.187219  0.020969
1   (-0.234, 7.564] -0.591479  0.067998
2     (7.564, 91.0]  0.014792  0.000044
IV: 0.08901166616008162

Feature: FLAG_MOBIL
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: FLAG_EMP_PHONE
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: FLAG_WORK_PHONE
               bin       woe        iv
0  (-0.001, 0.602]  0.011282  0.000102
1     (0.602, 1.0] -0.045136  0.000407
IV: 0.0005092121507457285



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: FLAG_CONT_MOBILE
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: FLAG_PHONE
               bin       woe        iv
0  (-0.001, 0.882] -0.210289  0.035247
1     (0.882, 1.0]  0.893043  0.149686
IV: 0.18493347109178243



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: FLAG_EMAIL
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: CNT_FAM_MEMBERS
              bin       woe        iv
0  (0.999, 1.269]  0.192156  0.007362
1    (1.269, 2.0]  0.182206  0.015745
2      (2.0, 3.0] -0.429164  0.039050
3     (3.0, 20.0] -0.311556  0.010511
IV: 0.07266812798167531



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: REGION_RATING_CLIENT
            bin        woe        iv
0  (0.999, 2.0]   0.275930  0.056416
1  (2.0, 2.369] -20.806172  2.260466
2  (2.369, 3.0]  -0.488569  0.046812
IV: 2.3636952346867446

Feature: REGION_RATING_CLIENT_W_CITY
            bin        woe        iv
0  (0.999, 2.0]   0.260056  0.051362
1  (2.0, 2.258] -20.401948  1.479534
2  (2.258, 3.0]  -0.646540  0.080807
IV: 1.6117033060280073



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: HOUR_APPR_PROCESS_START
                bin       woe        iv
0   (-0.001, 9.466]  0.087836  0.001542
1     (9.466, 11.0]  0.151260  0.005185
2    (11.0, 12.784] -0.757770  0.094823
3  (12.784, 14.706] -0.088834  0.001577
4    (14.706, 23.0]  0.463405  0.042196
IV: 0.145323168659649

Feature: REG_REGION_NOT_LIVE_REGION
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: REG_REGION_NOT_WORK_REGION
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: LIVE_REGION_NOT_WORK_REGION
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: REG_CITY_NOT_LIVE_CITY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: REG_CITY_NOT_WORK_CITY
               bin       woe        iv
0  (-0.001, 0.832] -0.059457  0.002827
1     (0.832, 1.0]  0.238888  0.011360
IV: 0.014186824968872837



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: LIVE_CITY_NOT_WORK_CITY
               bin       woe        iv
0  (-0.001, 0.495]  0.061582  0.003033
1     (0.495, 1.0] -0.247505  0.012190
IV: 0.015222459830936019

Feature: EXT_SOURCE_1
                             bin       woe        iv
0  (0.013600000000000001, 0.407] -0.532548  0.055418
1                 (0.407, 0.502]  0.089077  0.004723
2                 (0.502, 0.505] -1.498212  0.008308
3                 (0.505, 0.952]  0.284845  0.016118
IV: 0.08456785251932157



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: EXT_SOURCE_2
                      bin       woe        iv
0  (-0.0009999183, 0.284] -0.621125  0.074771
1           (0.284, 0.44] -0.599684  0.069843
2           (0.44, 0.554] -0.228732  0.010418
3          (0.554, 0.646]  0.282022  0.015803
4          (0.646, 0.855]  1.282981  0.290419
IV: 0.4612540833605343

Feature: EXT_SOURCE_3
                  bin       woe        iv
0  (-0.000473, 0.314] -0.711270  0.097163
1      (0.314, 0.451] -0.571911  0.063662
2      (0.451, 0.511]  0.206368  0.010394
3      (0.511, 0.611] -0.059274  0.000546
4      (0.611, 0.896]  1.146273  0.237007
IV: 0.4087721569823295



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: APARTMENTS_AVG
                bin       woe        iv
0  (-0.001, 0.0794]  0.099738  0.001990
1   (0.0794, 0.115]  0.008046  0.000039
2    (0.115, 0.442] -0.129634  0.003221
IV: 0.005250016580753935

Feature: BASEMENTAREA_AVG
                bin       woe        iv
0  (-0.001, 0.0721] -0.045814  0.000420
1   (0.0721, 0.086]  0.067101  0.002769
2    (0.086, 0.336] -0.174147  0.005590
IV: 0.008778824050260084



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: YEARS_BEGINEXPLUATATION_AVG
                 bin        woe        iv
0  (-0.0001, 0.9777]   0.142698  0.012082
1   (0.9777, 0.9778] -18.539644  0.208817
2   (0.9778, 0.9829]  -0.760688  0.110454
3      (0.9829, 1.0]   0.362902  0.026054
IV: 0.35740740657287257

Feature: YEARS_BUILD_AVG
                 bin        woe        iv
0  (-0.0001, 0.7524]  -0.374081  0.027666
1   (0.7524, 0.7529]   0.205519  0.025232
2    (0.7529, 0.753] -16.152294  0.016714
3       (0.753, 1.0]  -0.240055  0.011470
IV: 0.08108229984483789



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: COMMONAREA_AVG
                bin       woe        iv
0  (-0.001, 0.0335] -0.095547  0.001824
1  (0.0335, 0.0413]  0.065434  0.002956
2   (0.0413, 0.271] -0.239502  0.006246
IV: 0.011025923006351294

Feature: ELEVATORS_AVG
                 bin       woe        iv
0  (-0.001, 0.00781]  0.840211  0.133433
1   (0.00781, 0.079] -0.245195  0.037456
2       (0.079, 1.0] -0.034798  0.000211
IV: 0.1710996902455106



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: ENTRANCES_AVG
               bin       woe        iv
0  (-0.001, 0.117]  0.046927  0.000440
1    (0.117, 0.15]  0.036421  0.000805
2      (0.15, 1.0] -0.163806  0.005160
IV: 0.0064055897979586815

Feature: FLOORSMAX_AVG
               bin       woe        iv
0  (-0.001, 0.167]  0.488095  0.058183
1   (0.167, 0.227] -0.192723  0.020539
2     (0.227, 1.0] -0.064380  0.000813
IV: 0.07953542751129883



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: FLOORSMIN_AVG
              bin       woe        iv
0  (-0.001, 0.21]  0.162071  0.005242
1   (0.21, 0.232] -0.020993  0.000301
2    (0.232, 1.0] -0.153430  0.002765
IV: 0.008307554688517935

Feature: LANDAREA_AVG
                bin       woe        iv
0  (-0.001, 0.0478]  0.030427  0.000185
1  (0.0478, 0.0635]  0.057176  0.002033
2    (0.0635, 0.31] -0.235020  0.009786
IV: 0.012004020036464927



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: LIVINGAPARTMENTS_AVG
                bin       woe        iv
0  (-0.001, 0.0867] -0.131838  0.003471
1  (0.0867, 0.0982]  0.078679  0.004149
2   (0.0982, 0.379] -0.204632  0.005402
IV: 0.013022377991660443

Feature: LIVINGAREA_AVG
                 bin       woe        iv
0   (-1e-05, 0.0662]  0.144890  0.004194
1  (0.0662, 0.10417] -0.864388  0.140679
2  (0.10417, 0.1042]  0.354004  0.052431
3  (0.1042, 0.43942] -0.080748  0.001155
IV: 0.1984587258123924



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NONLIVINGAPARTMENTS_AVG
                  bin       woe        iv
0   (-0.001, 0.00432]  0.328707  0.021417
1  (0.00432, 0.00888] -0.065629  0.003183
2      (0.00888, 1.0] -0.276111  0.004598
IV: 0.02919885405713027

Feature: NONLIVINGAREA_AVG
                 bin       woe        iv
0  (-0.001, 0.00677]  0.591737  0.068057
1  (0.00677, 0.0252] -0.125045  0.010382
2    (0.0252, 0.237] -0.237727  0.007605
IV: 0.08604329347936064



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: YEARS_BUILD_MODE
               bin       woe        iv
0  (-0.001, 0.759] -0.334934  0.022229
1    (0.759, 0.76]  0.192131  0.022195
2      (0.76, 1.0] -0.250952  0.012335
IV: 0.05675887748053579

Feature: EMERGENCYSTATE_MODE
               bin       woe        iv
0  (-0.001, 0.218]  0.642190  0.159519
1   (0.218, 1.951] -3.959434  1.524484
2     (1.951, 2.0]  0.344950  0.047130
IV: 1.731131894077212



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: OBS_30_CNT_SOCIAL_CIRCLE
              bin       woe        iv
0   (-0.001, 1.0]  0.186207  0.021914
1    (1.0, 2.795] -0.842056  0.111309
2  (2.795, 348.0]  0.072534  0.001052
IV: 0.13427542202953305

Feature: DEF_30_CNT_SOCIAL_CIRCLE
              bin  woe   iv
0  (-0.001, 34.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: OBS_60_CNT_SOCIAL_CIRCLE
              bin       woe        iv
0   (-0.001, 1.0]  0.182974  0.021277
1    (1.0, 2.737] -0.812521  0.101868
2  (2.737, 344.0]  0.045446  0.000413
IV: 0.1235585063598199

Feature: DEF_60_CNT_SOCIAL_CIRCLE
              bin  woe   iv
0  (-0.001, 24.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: AMT_REQ_CREDIT_BUREAU_HOUR
                 bin       woe        iv
0  (-0.001, 0.00123]  0.157661  0.019845
1     (0.00123, 3.0] -0.651444  0.081996
IV: 0.10184081507410482

Feature: AMT_REQ_CREDIT_BUREAU_DAY
                 bin       woe        iv
0  (-0.001, 0.00127]  0.159475  0.020303
1     (0.00127, 9.0] -0.659455  0.083955
IV: 0.1042579209194213



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: AMT_REQ_CREDIT_BUREAU_WEEK
               bin       woe        iv
0  (-0.001, 0.014]  0.101404  0.008219
1     (0.014, 8.0] -0.410959  0.033310
IV: 0.04152913524042577

Feature: AMT_REQ_CREDIT_BUREAU_MON
               bin       woe        iv
0  (-0.001, 0.267]  0.059207  0.002915
1    (0.267, 27.0] -0.295076  0.014530
IV: 0.017445212761538386



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: AMT_REQ_CREDIT_BUREAU_QRT
                bin       woe        iv
0  (-0.001, 0.0175]  0.341867  0.069449
1   (0.0175, 0.267] -0.751443  0.110862
2    (0.267, 261.0] -0.287881  0.016010
IV: 0.19632127008277717

Feature: AMT_REQ_CREDIT_BUREAU_YEAR
               bin       woe        iv
0  (-0.001, 0.647]  0.362327  0.025973
1    (0.647, 1.35]  0.074969  0.001124
2      (1.35, 2.0]  0.030355  0.000268
3       (2.0, 3.0] -0.469610  0.030851
4      (3.0, 25.0] -0.178837  0.005328
IV: 0.06354344252181815



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: SK_ID_CURR
                      bin       woe            iv
0     (-0.001, 61748.459]  0.017291  5.979469e-05
1   (61748.459, 122679.0] -0.020453  8.366459e-05
2  (122679.0, 184102.681] -0.001216  2.957798e-07
3  (184102.681, 245958.0]  0.009486  1.799538e-05
4    (245958.0, 307510.0] -0.005108  5.217566e-06
IV: 0.0001669679981035246

Feature: MONTH_DAYS_BIRTH
                  bin       woe        iv
0  (249.632, 389.533] -0.074192  0.001100
1  (389.533, 465.067] -0.399545  0.031512
2    (465.067, 543.2] -0.232677  0.010778
3      (543.2, 647.0] -0.061329  0.000752
4    (647.0, 840.967]  0.801875  0.122115
IV: 0.16625720445466322



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: MONTH_DAYS_EMPLOYED
                bin       woe        iv
0  (-0.001, 25.458] -0.025163  0.000127
1  (25.458, 46.968] -0.470920  0.043551
2  (46.968, 75.771] -0.568369  0.062924
3    (75.771, 96.0]  0.634922  0.078033
4   (96.0, 597.067]  0.433325  0.036972
IV: 0.22160602153188547

Feature: MONTH_DAYS_REGISTRATION
                  bin       woe        iv
0      (-0.001, 58.1]  0.228682  0.010414
1     (58.1, 115.685] -0.334195  0.022131
2    (115.685, 170.1] -0.244739  0.011921
3    (170.1, 249.133] -0.179140  0.006401
4  (249.133, 518.313]  0.538294  0.056588
IV: 0.10745493491372005



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: MONTH_DAYS_ID_PUBLISH
                  bin       woe        iv
0    (-0.001, 48.244]  0.058167  0.000677
1    (48.244, 83.361] -0.401100  0.031752
2   (83.361, 115.465] -0.363999  0.026210
3  (115.465, 141.767] -0.035574  0.000253
4    (141.767, 239.9]  0.769764  0.112891
IV: 0.17178290959707893

Feature: MONTH_DAYS_LAST_PHONE_CHANGE
                 bin       woe        iv
0    (-0.001, 7.212]  0.199165  0.007907
1      (7.212, 17.9] -0.303572  0.018298
2     (17.9, 31.233] -0.325956  0.021056
3   (31.233, 51.933] -0.176604  0.006227
4  (51.933, 114.853]  0.621971  0.074903
IV: 0.12839149319649779



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NUM_DOCUMENTS
             bin       woe        iv
0  (-0.001, 1.0]  0.001259  0.000002
1     (1.0, 3.0] -0.045214  0.000055
IV: 5.692652215394384e-05

Feature: DAY_APPR_PROCESS_START
               bin       woe        iv
0  (-0.001, 0.824] -0.372754  0.027472
1     (0.824, 1.0]  0.092190  0.006794
IV: 0.03426636962586617



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: CREDIT_BY_INCOME
               bin       woe        iv
0    (0.0924, 2.0]  0.438092  0.038504
1     (2.0, 2.866] -0.202843  0.008044
2   (2.866, 3.876] -0.287572  0.016426
3   (3.876, 5.513] -0.195528  0.007622
4  (5.513, 70.524]  0.240503  0.011513
IV: 0.08210997007830982

Feature: ANNUITY_BY_INCOME
                             bin       woe        iv
0  (0.007699999999999999, 0.112]  0.378433  0.028306
1                 (0.112, 0.149] -0.051331  0.000527
2                 (0.149, 0.188] -0.176437  0.006210
3                 (0.188, 0.243] -0.193520  0.007467
4                 (0.243, 1.876]  0.046242  0.000428
IV: 0.04293651786526858



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: GOODS_PRICE_BY_INCOME
               bin       woe        iv
0  (0.0924, 1.778]  0.341660  0.023153
1   (1.778, 2.522] -0.172256  0.005912
2   (2.522, 3.374] -0.258334  0.013274
3   (3.374, 4.911] -0.246248  0.012067
4  (4.911, 64.294]  0.337871  0.022617
IV: 0.07702145675665352

Feature: INCOME_PER_PERSON
                      bin       woe        iv
0     (2812.499, 45000.0] -0.086959  0.001928
1      (45000.0, 63000.0] -0.006946  0.000007
2      (63000.0, 90000.0] -0.007049  0.000013
3     (90000.0, 126000.0]  0.061436  0.000540
4  (126000.0, 482047.958]  0.081670  0.001324
IV: 0.003812334743975854



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: CNT_ADULTS
            bin  woe   iv
0  (0.999, 2.0]  0.0  0.0
IV: 0.0

Feature: CHILDREN_RATIO
               bin       woe        iv
0  (-0.001, 0.333]  0.038878  0.001283
1    (0.333, 0.95] -0.219805  0.007256
IV: 0.008539540038388086



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: ANNUITY_LENGTH
                           bin       woe        iv
0  (8.036000000000001, 14.966]  0.243011  0.011753
1             (14.966, 19.379] -0.616474  0.073688
2             (19.379, 21.306]  0.085732  0.001469
3             (21.306, 27.501] -0.217877  0.009457
4             (27.501, 45.305]  0.497164  0.048441
IV: 0.1448076437165166

Feature: EXT_SOURCE_MEAN
                   bin       woe        iv
0  (-0.0009885, 0.326] -1.044305  0.200241
1       (0.326, 0.421] -0.684057  0.090100
2       (0.421, 0.503] -0.172160  0.005913
3       (0.503, 0.597]  0.472696  0.043874
4       (0.597, 0.879]  1.594429  0.422508
IV: 0.762636343120367



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NUM_EXT_SOURCES
             bin       woe        iv
0  (-0.001, 2.0] -0.312466  0.073357
1     (2.0, 3.0]  1.056263  0.247978
IV: 0.32133501341695697

Feature: NAME_TYPE_SUITE_Children
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_TYPE_SUITE_Family
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_TYPE_SUITE_Other
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_TYPE_SUITE_Spouse, partner
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_TYPE_SUITE_Unaccompanied
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_INCOME_TYPE_Businessman
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_INCOME_TYPE_Commercial associate
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_INCOME_TYPE_Maternity leave
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_INCOME_TYPE_Pensioner
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_INCOME_TYPE_State servant
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_INCOME_TYPE_Student
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_INCOME_TYPE_Unemployed
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_INCOME_TYPE_Working
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_EDUCATION_TYPE_Academic degree
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_EDUCATION_TYPE_Higher education
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_EDUCATION_TYPE_Incomplete higher
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_EDUCATION_TYPE_Lower secondary
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_EDUCATION_TYPE_Secondary / secondary special
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_FAMILY_STATUS_Civil marriage
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_FAMILY_STATUS_Married
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_FAMILY_STATUS_Separated
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_FAMILY_STATUS_Single / not married
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_FAMILY_STATUS_Widow
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_HOUSING_TYPE_Co-op apartment
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_HOUSING_TYPE_House / apartment
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_HOUSING_TYPE_Municipal apartment
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_HOUSING_TYPE_Office apartment
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: NAME_HOUSING_TYPE_Rented apartment
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: NAME_HOUSING_TYPE_With parents
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: OCCUPATION_TYPE_OCCUPATION_TYPE_0
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: OCCUPATION_TYPE_OCCUPATION_TYPE_1
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: OCCUPATION_TYPE_OCCUPATION_TYPE_2
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: ORGANIZATION_TYPE_ORGANIZATION_TYPE_0
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: ORGANIZATION_TYPE_ORGANIZATION_TYPE_1
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: ORGANIZATION_TYPE_ORGANIZATION_TYPE_2
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: FONDKAPREMONT_MODE_not specified
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: FONDKAPREMONT_MODE_org spec account
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: FONDKAPREMONT_MODE_reg oper account
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: FONDKAPREMONT_MODE_reg oper spec account
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: HOUSETYPE_MODE_block of flats
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: HOUSETYPE_MODE_specific housing
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: HOUSETYPE_MODE_terraced house
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: WALLSMATERIAL_MODE_Block
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WALLSMATERIAL_MODE_Mixed
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: WALLSMATERIAL_MODE_Monolithic
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WALLSMATERIAL_MODE_Others
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: WALLSMATERIAL_MODE_Panel
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WALLSMATERIAL_MODE_Stone, brick
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: WALLSMATERIAL_MODE_Wooden
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WEEKDAY_APPR_PROCESS_START_FRIDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: WEEKDAY_APPR_PROCESS_START_MONDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WEEKDAY_APPR_PROCESS_START_SATURDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


Feature: WEEKDAY_APPR_PROCESS_START_SUNDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WEEKDAY_APPR_PROCESS_START_THURSDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WEEKDAY_APPR_PROCESS_START_TUESDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0

Feature: WEEKDAY_APPR_PROCESS_START_WEDNESDAY
             bin  woe   iv
0  (-0.001, 1.0]  0.0  0.0
IV: 0.0



  grouped = data.groupby('bin').agg(
  grouped = data.groupby('bin').agg(


In [38]:
sorted_iv = sorted(feature_iv.items(), key=lambda x: x[1], reverse=True)
print("Features sorted by IV:")
for feature, iv in sorted_iv:
    print(f"{feature}: IV = {iv}")

Features sorted by IV:
REGION_RATING_CLIENT: IV = 2.3636952346867446
EMERGENCYSTATE_MODE: IV = 1.731131894077212
REGION_RATING_CLIENT_W_CITY: IV = 1.6117033060280073
EXT_SOURCE_MEAN: IV = 0.762636343120367
EXT_SOURCE_2: IV = 0.4612540833605343
EXT_SOURCE_3: IV = 0.4087721569823295
YEARS_BEGINEXPLUATATION_AVG: IV = 0.35740740657287257
NUM_EXT_SOURCES: IV = 0.32133501341695697
MONTH_DAYS_EMPLOYED: IV = 0.22160602153188547
LIVINGAREA_AVG: IV = 0.1984587258123924
AMT_REQ_CREDIT_BUREAU_QRT: IV = 0.19632127008277717
FLAG_PHONE: IV = 0.18493347109178243
AMT_CREDIT: IV = 0.17973662242094576
MONTH_DAYS_ID_PUBLISH: IV = 0.17178290959707893
ELEVATORS_AVG: IV = 0.1710996902455106
MONTH_DAYS_BIRTH: IV = 0.16625720445466322
FLAG_OWN_REALTY: IV = 0.15402703220336475
HOUR_APPR_PROCESS_START: IV = 0.145323168659649
ANNUITY_LENGTH: IV = 0.1448076437165166
AMT_GOODS_PRICE: IV = 0.142262729978393
OBS_30_CNT_SOCIAL_CIRCLE: IV = 0.13427542202953305
MONTH_DAYS_LAST_PHONE_CHANGE: IV = 0.12839149319649779
OBS_

In [39]:
filtered_features = [feature for feature, iv in feature_iv.items() if 0.1 <= iv <= 0.5]

# Create a new DataFrame with only the filtered features
filtered_train = X[filtered_features]

In [40]:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state=42)
# filtered_train, y_res = smote.fit_resample(filtered_train, y_res)

In [41]:
filtered_test = test[filtered_features]

In [42]:
filtered_test.columns

Index(['FLAG_OWN_REALTY', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'FLAG_PHONE', 'HOUR_APPR_PROCESS_START',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'YEARS_BEGINEXPLUATATION_AVG',
       'ELEVATORS_AVG', 'LIVINGAREA_AVG', 'OBS_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'MONTH_DAYS_BIRTH', 'MONTH_DAYS_EMPLOYED', 'MONTH_DAYS_REGISTRATION',
       'MONTH_DAYS_ID_PUBLISH', 'MONTH_DAYS_LAST_PHONE_CHANGE',
       'ANNUITY_LENGTH', 'NUM_EXT_SOURCES'],
      dtype='object')

In [None]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.impute import SimpleImputer  # Thay thế Imputer bằng SimpleImputer

# Khởi tạo các preprocessing. Trong đó SimpleImputer theo mean, MinMaxScaler theo khoảng 0, 1 và PolynomialFeatures bậc 3.
imputer = SimpleImputer(strategy='mean')  # Sửa đổi từ Imputer thành SimpleImputer
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
poly_engineer = PolynomialFeatures(degree=3)

# Lấy các feature có tương quan lớn nhất đến biến mục tiêu từ app_train và app_test
TARGET = y['TARGET']
train_poly_fea = filtered_train
test_poly_fea = filtered_test

In [45]:
# MinMaxScaler dữ liệu
train_poly_fea = minmax_scaler.fit_transform(train_poly_fea)
test_poly_fea = minmax_scaler.transform(test_poly_fea)

In [46]:
# Tạo ra các Polynomial Features
train_poly_fea = poly_engineer.fit_transform(train_poly_fea)
test_poly_fea = poly_engineer.transform(test_poly_fea)

MemoryError: Unable to allocate 8.76 GiB for an array with shape (452266, 2600) and data type float64

In [None]:
# Tạo ra các Polynomial Features
train_poly_fea = poly_engineer.fit_transform(train_poly_fea)
test_poly_fea = poly_engineer.transform(test_poly_fea)

print('train_poly_fea shape: ', train_poly_fea.shape)
print('test_poly_fea shape: ', test_poly_fea.shape)

MemoryError: Unable to allocate 8.76 GiB for an array with shape (452266, 2600) and data type float64

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'classifier__penalty': ['l2'],
    'classifier__dual': [False],
    'classifier__tol': [1e-3, 1e-4, 1e-5],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__fit_intercept': [True],
    'classifier__intercept_scaling': [1],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag'],
    'classifier__max_iter': [100, 200, 300],
}

model_pipeline = Pipeline(steps=[
    ('classifier', LogisticRegression(random_state=42))
])

random_search = RandomizedSearchCV(
    estimator = model_pipeline,
    param_distributions = param_distributions,
    n_iter = 150,  
    cv = 5,
    scoring = 'roc_auc',
    random_state = 42,
    n_jobs = -1
)

random_search.fit(X_res, y_res)

best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(filtered_test)[:, 1]

In [None]:
test['TARGET'] = y_pred_proba
result = test[['SK_ID_CURR', 'TARGET']]

In [None]:
result = result.sort_values(['SK_ID_CURR'])
result['SK_ID_CURR'] = result['SK_ID_CURR'].astype(int)
result

In [None]:
result.to_csv('submit_7_nonmerge_lr_woe.csv', index = False)