Problem 1 Confirmation of competition contents

In [1]:
import gc
import joblib
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy import stats
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

gc.enable()

Problem 2 Learning and verification

In [4]:
DEBUG = True
REMOVE_OUTLIERS=True

if DEBUG:
    train_df = pd.read_csv('data/task15/application_train.csv', nrows=1000)
    test_df = pd.read_csv('data/task15/application_test.csv', nrows=1000)
else:
    train_df = pd.read_csv('data/task15/application_train.csv')
    test_df = pd.read_csv('data/task15/application_test.csv')

train_df['isTrain'] = 'Train'
test_df['isTrain'] = 'Test'

test_df['TARGET'] = np.nan

df = pd.concat([train_df, test_df[train_df.columns]], axis=0)
df = df.reset_index(drop=True)

del train_df, test_df
gc.collect()

0

In [5]:
df.shape

(2000, 123)

Problem 4 Feature engineering

In [6]:
num_columns = []
cat_columns = []
should_be_encode = []
not_useful_columns = ['SK_ID_CURR', 'TARGET', 'isTrain']

for col in df.columns:
    if col not in not_useful_columns:
        unique_len = len(df[col].unique())
        data_type = df[col].dtype

        if unique_len<=20 and data_type!="object":
            cat_columns.append(col)
        elif data_type=='object':
            should_be_encode.append(col)
        else:
            num_columns.append(col)

print('Number of cat columns:', len(cat_columns+should_be_encode))
print('Number of numerical columns:', len(num_columns))

Number of cat columns: 63
Number of numerical columns: 57


In [7]:
missing_values = {}
for col in num_columns:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values>0:
        df[col] = df[col].fillna(np.nanmean(df[col].values))
        missing_values[col] = np.nanmean(df[col].values)

In [8]:
missing_values

{'AMT_ANNUITY': 27896.891195597796,
 'AMT_GOODS_PRICE': 491480.8626813407,
 'OWN_CAR_AGE': 12.106569343065695,
 'EXT_SOURCE_1': 0.4980570277236582,
 'EXT_SOURCE_2': 0.5157714723627375,
 'EXT_SOURCE_3': 0.5034876519837745,
 'APARTMENTS_AVG': 0.114991351888668,
 'BASEMENTAREA_AVG': 0.0871649411764706,
 'YEARS_BEGINEXPLUATATION_AVG': 0.979848703170029,
 'YEARS_BUILD_AVG': 0.7499645112781954,
 'COMMONAREA_AVG': 0.042430677966101685,
 'ELEVATORS_AVG': 0.07589109947643981,
 'ENTRANCES_AVG': 0.14419236043095005,
 'FLOORSMAX_AVG': 0.22618407335907334,
 'FLOORSMIN_AVG': 0.23438777777777775,
 'LANDAREA_AVG': 0.06458865598027129,
 'LIVINGAPARTMENTS_AVG': 0.09552752,
 'LIVINGAREA_AVG': 0.10460177339901477,
 'NONLIVINGAPARTMENTS_AVG': 0.007794398682042832,
 'NONLIVINGAREA_AVG': 0.02911912568306011,
 'APARTMENTS_MODE': 0.11219662027833001,
 'BASEMENTAREA_MODE': 0.08608717647058822,
 'YEARS_BEGINEXPLUATATION_MODE': 0.9785262247838616,
 'YEARS_BUILD_MODE': 0.757468120300752,
 'COMMONAREA_MODE': 0.0405

In [9]:
for col in should_be_encode:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values > 0:
        df[col] = df[col].fillna(df[col].mode())
        missing_values[col] = (df[col].mode())

In [10]:
for col in cat_columns:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values > 0:
        df[col] = df[col].fillna(df[col].median())
        missing_values[col] = (df[col].median())

In [11]:
joblib.dump(missing_values, 'data/task15/missing_values.pkl')

['data/home_credit/missing_values.pkl']

In [12]:
encoders = {}
for col in should_be_encode:
    encoder = LabelEncoder().fit(df[col])
    encoders[col] = encoder
    df[col] = encoder.transform(df[col])

joblib.dump(encoders, "data/task15/encoders.pkl",)

['data/home_credit/encoders.pkl']

In [13]:
for col in cat_columns+should_be_encode+num_columns:
    if df[col].isnull().sum()>0:
        print(col, df[col].dtype, df[col].isnull().sum())
cat_columns = cat_columns + should_be_encode
del should_be_encode
gc.collect()

0

In [14]:
dummy_columns = []
map_dummy = {}
for col in tqdm(cat_columns, total=len(cat_columns)):
    for value in tqdm(df[col].unique()):
        df[f'dummy_{col}_{value}'] = 0
        df.loc[df[col]==value, f'dummy_{col}_{value}'] = 1
        dummy_columns.append(f'dummy_{col}_{value}')
        map_dummy[col] = f"dummy_{col}_{value}"

joblib.dump(map_dummy, 'data/task15/map_dummy.pkl')

del cat_columns
gc.collect()

  0%|                                                                                           | 0/63 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 584.03it/s][A

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 395.63it/s][A

100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 204.54it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 339.52it/s][A

100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s][A

100%|███████

9

In [15]:
df.shape

(2000, 425)

In [16]:
train_df = df.loc[df['isTrain']=='Train'].reset_index(drop=True)
test_df = df.loc[df['isTrain']=='Test'].reset_index(drop=True)
del df
gc.collect()

0

In [17]:
outlier_idx = []
for col in num_columns:
    z_score = stats.zscore(train_df[col])
    outlier_idx = outlier_idx + list(train_df.loc[(np.abs(z_score)>3)].index)

outlier_idx = list(set(outlier_idx))

if REMOVE_OUTLIERS:
    train_df = train_df.drop(index=outlier_idx,
                             axis=0).reset_index(drop=True)

joblib.dump(num_columns, 'data/task15/num_columns.pkl')
joblib.dump(dummy_columns, 'data/task15/dummy_columns.pkl')

['data/home_credit/dummy_columns.pkl']

In [18]:
print(f"Class ratio = {train_df['TARGET'].sum()/train_df.shape[0]}" )

Class ratio = 0.0703883495145631


In [19]:
skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=42)

models = []
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df,
                                                      train_df['TARGET'],
                                                      groups=train_df['TARGET'])):
    train_set = train_df.loc[train_idx]
    val_set = train_df.loc[val_idx]

    model = LogisticRegression().fit(train_set[num_columns+dummy_columns],
                                     train_set['TARGET'])
    models.append(model)

    y_pred = model.predict_proba(val_set[num_columns+dummy_columns])[:, 1]

    auc_score = roc_auc_score(val_set['TARGET'], y_pred)
    print(f"FOLD-{fold}: AUC score={np.round(auc_score, 3)}")

joblib.dump(models, 'data/task15/models.pkl')

FOLD-0: AUC score=0.615
FOLD-1: AUC score=0.677
FOLD-2: AUC score=0.565
FOLD-3: AUC score=0.506
FOLD-4: AUC score=0.552


['data/home_credit/models.pkl']

Problem 3 Estimation on test data

In [30]:
test_df = pd.read_csv('data/task15/application_test.csv')

In [31]:
missing_values = joblib.load('data/task15/missing_values.pkl')
for col in missing_values.keys():
    test_df.loc[test_df[col].isnull(), col] = missing_values[col]

encoders = joblib.load('data/task15/encoders.pkl')
for col in encoders.keys():
    test_df[col] = encoders[col].transform(test_df[col])
cat_cols = ['CNT_CHILDREN', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
for col in tqdm(cat_cols, total=len(cat_cols)):
    for value in tqdm(test_df[col].unique()):
        test_df[f'dummy_{col}_{value}'] = 0
        test_df.loc[test_df[col]==value, f'dummy_{col}_{value}'] = 1

  0%|                                                                                           | 0/58 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 653.74it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 398.40it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 434.80it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 649.42it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 501.80it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 502.01it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 502.22it/s][A

100%|███████

In [32]:
models = joblib.load('data/task15/models.pkl')
num_columns = joblib.load('data/task15/num_columns.pkl')
dummy_columns = joblib.load('data/task15/dummy_columns.pkl')

for col in dummy_columns:
    if col not in test_df.columns:
        test_df[col] = 0

y_pred = np.zeros(test_df.shape[0])
for model in models:
    y_pred += model.predict_proba(test_df[num_columns+dummy_columns])[:, 1]/len(models)

test_df['TARGET'] = y_pred
test_df[['SK_ID_CURR', 'TARGET']].to_csv('data/task15/submission.csv', index=False)