In [1]:
import gc
import warnings

import featuretools as ft
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split

# Silence every warning for the rest of the session. Note that this is a blanket
# filter: it is not limited to LightGBM (or to any other single library).
warnings.filterwarnings('ignore')

In [2]:
#!pip install --upgrade numpy pandas matplotlib seaborn woodwork featuretools scikit-learn pyarrow hyperopt dask

In [4]:
# Read the two application tables from the working directory.
train_control, test_control = (
    pd.read_csv(csv_path)
    for csv_path in ("./application_train.csv", "./application_test.csv")
)

In [5]:
# Stack the train and test applications into a single frame with a fresh
# 0..n-1 index; sort=True sorts the column axis when the two frames' columns
# are not already aligned.
# NOTE(review): presumably the test file has no TARGET column, so its rows end
# up with NaN there (the later split on TARGET null-ness relies on it) - confirm.
app_initial = pd.concat([train_control, test_control], ignore_index=True, sort=True)

In [6]:
# Read both engineered-feature tables, then join them on SK_ID_CURR
# (pandas' default inner join).
general_features, time_features = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/general_features.csv', './input/time_features.csv')
)
merged_df = pd.merge(general_features, time_features, on='SK_ID_CURR')

In [7]:
# Attach the raw application columns to the engineered features and order the
# rows by SK_ID_CURR.
merged_df = (
    merged_df
    .merge(app_initial, on='SK_ID_CURR')
    .sort_values(by='SK_ID_CURR')
)

In [8]:
# Everything needed from here on lives in merged_df, so drop the references to
# the source frames and ask the collector to reclaim their memory right away.
# (gc is imported in the notebook's import cell rather than mid-notebook.)
del general_features, time_features, train_control, test_control, app_initial

gc.collect()

0

In [9]:
#merged_df.columns = [col.replace('{', '/').replace('}', '/').replace('[', '/').replace(']', '/').replace(':', '-').replace(',', '') for col in merged_df.columns]

In [10]:
# Rows that carry a TARGET value become the training frame; rows whose TARGET
# is missing become the test frame.
app_train = merged_df.loc[merged_df["TARGET"].notna()]
app_test = merged_df.loc[merged_df["TARGET"].isnull()]

In [11]:
# merged_df has been split into app_train / app_test; drop the name and
# trigger a collection pass.
# (gc is imported in the notebook's import cell rather than re-imported here.)
del merged_df

gc.collect()

0

In [12]:
# Prune the training columns that featuretools flags as highly null (threshold
# 0.75), then keep only the columns both frames still share so that train and
# test stay in step.
app_train = ft.selection.remove_highly_null_features(app_train, pct_null_threshold=0.75)
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

In [13]:
# One-hot encode each frame independently. Because the two encodings are
# computed separately, the resulting dummy columns can differ between train and
# test; the inner align on the column axis a few cells below keeps only the
# columns the two frames have in common.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

In [14]:
#import dask.dataframe as dd

#dask_df = dd.from_pandas(sampled_df, npartitions=6) 
#corr_matrix_dask = dask_df.corr().abs().compute()
#corr_matrix_dask.to_csv("correlation_matrix.csv")

In [15]:
# Load the cached feature-correlation matrix and list every feature pair whose
# stored correlation is above the cut-off.
threshold = 0.9
corr_matrix = pd.read_csv("correlation_matrix.csv", index_col=0)

# Strict upper triangle (k=1): skips the diagonal and visits each pair once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# where() blanks every cell outside the mask; stack() reshapes the frame into a
# Series keyed by (row label, column label). The comparison below is False for
# blanked (NaN) cells, so only genuine upper-triangle pairs can be selected.
pairs = corr_matrix.where(mask).stack()
high_corr_pairs = pairs.loc[pairs.gt(threshold)]

In [16]:
# From each highly correlated (first, second) pair, mark the second member for
# removal; the set collapses columns that appear in several pairs.
cols_to_drop = {second for _first, second in high_corr_pairs.index}

In [17]:
# Remove the redundant member of every highly correlated pair from both frames.
#
# Two guards:
#   * SK_ID_CURR and TARGET are never dropped, even if the cached correlation
#     matrix happens to flag them, because later cells read both columns.
#   * errors='ignore' skips flagged columns that a frame does not contain. The
#     frames were one-hot encoded separately and are only re-aligned in the next
#     cell, so a flagged dummy column may exist in just one of them (and the
#     cached matrix may list columns that are no longer present at all).
app_train = app_train.drop(
    columns=list(cols_to_drop - {'SK_ID_CURR', 'TARGET'}), errors='ignore'
)
app_test = app_test.drop(
    columns=list(cols_to_drop - {'SK_ID_CURR', 'TARGET'}), errors='ignore'
)

In [18]:
# Keep only the columns present in both frames (inner join on the column axis).
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

In [19]:
# Sanity check after the align: both frames should report the same number of
# columns.
print('Train shape: ', app_train.shape)
print('Test shape: ', app_test.shape)

Train shape:  (307511, 1249)
Test shape:  (48744, 1249)


In [21]:
# Persist the cleaned frames to the working directory. to_csv is left at its
# default index=True, so each file's first column is the DataFrame index.
app_train.to_csv("cleaned_train.csv")
app_test.to_csv("cleaned_test.csv")

In [22]:
# Pull the labels out as a flat int32 array.
train_labels = np.array(app_train['TARGET'].astype(np.int32)).ravel()

# Keep the id columns so predictions can be matched back to SK_ID_CURR later.
train_ids, test_ids = app_train['SK_ID_CURR'], app_test['SK_ID_CURR']

# Leave only feature columns in both frames (no id, no label).
app_train = app_train.drop(['SK_ID_CURR', "TARGET"], axis=1)
app_test = app_test.drop(['SK_ID_CURR', "TARGET"], axis=1)