In [1]:
import numpy as np
import featuretools as ft
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv('input/train.csv', na_values=['NAN','NA','NaN','na','nan'])
test_df = pd.read_csv('input/test.csv', na_values=['NAN','NA','NaN','na','nan'])
X_all = pd.read_csv('input/X_all.csv', na_values=['NAN','NA','NaN','na','nan'])
X = pd.read_csv('input/X.csv', na_values=['NAN','NA','NaN','na','nan'])
X_test = pd.read_csv('input/X_test.csv', na_values=['NAN','NA','NaN','na','nan'])

In [13]:
cols = X.columns.values
cols_num = list(X_all.select_dtypes(include=['int64','float64']).columns)
cols_cat = list(X_all.select_dtypes(include=['object']).columns)

In [22]:
X.fillna(X.median(), inplace=True)
y.fillna(0, inplace=True)

In [4]:
train_df['OC'][train_df['OC'] == 'open'] = 0
train_df['OC'][train_df['OC'] == ' close'] = 1

In [5]:
train_df['OC'].value_counts()

0    286
1     15
Name: OC, dtype: int64

In [6]:
imbalanced_pos_ratio = train_df['OC'].value_counts()[1]/train_df['OC'].value_counts()[0]
imbalanced_pos_ratio

0.05244755244755245

In [7]:
y = train_df['OC']
X_id = train_df['inst_id']
X_test_id = test_df['inst_id']


In [8]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

In [9]:
from catboost import CatBoostClassifier

In [10]:
model = CatBoostClassifier(class_weights=[1,1/imbalanced_pos_ratio],
                           loss_function='Logloss', logging_level='Silent',
                           task_type='GPU', boosting_type='Plain', bootstrap_type='Bernoulli')

In [11]:
model.fit(X, y)
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)
preds_proba[:,1]

array([0.00408123, 0.33128521, 0.05913383, 0.00770652, 0.21381515,
       0.02458065, 0.05649086, 0.09580576, 0.1380055 , 0.06184026,
       0.0082946 , 0.03695833, 0.00787514, 0.00474842, 0.00863804,
       0.08912987, 0.09155763, 0.45556704, 0.06992758, 0.02963349,
       0.2044366 , 0.09464936, 0.7914983 , 0.03290212, 0.00337411,
       0.03024739, 0.00585179, 0.0322947 , 0.01323549, 0.08774752,
       0.02956977, 0.00336785, 0.01000765, 0.00811486, 0.0155715 ,
       0.00802359, 0.00839108, 0.00640518, 0.00327837, 0.00642084,
       0.07488033, 0.00894786, 0.00890864, 0.0169063 , 0.10201278,
       0.10726218, 0.85940602, 0.00518868, 0.01423769, 0.00468961,
       0.13650379, 0.03393426, 0.67684209, 0.120447  , 0.14326794,
       0.1110973 , 0.00940531, 0.016318  , 0.03597127, 0.01676867,
       0.03482678, 0.07428103, 0.02734732, 0.33015677, 0.08182354,
       0.01087594, 0.70630303, 0.05337437, 0.00762487, 0.05066731,
       0.01011525, 0.13324898, 0.00374164, 0.02045917, 0.00438

In [15]:
feature_importance = pd.DataFrame()
feature_importance['feature'] = cols
feature_importance['value'] = min_max_scaler.fit_transform(model.feature_importances_.reshape(-1, 1))

In [16]:
feature_importance.sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
933,employee1_enc,1.000000
935,ownerChange_enc,0.503489
934,employee2_enc,0.442972
113,DIFF(shortLoan1 by ownerChange),0.223277
154,DIFF(bedCount by instkind),0.169799
930,sido_enc,0.169014
152,DIFF(longLoan2 by ownerChange),0.097562
250,DIFF(liquidAsset2 by instkind),0.082055
206,DIFF(ctax2 by instkind),0.077936
184,DIFF(inventoryAsset2 by sido),0.066715


In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
clf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=0, n_jobs=8, class_weight='balanced')

In [25]:
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=20, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=8, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [26]:
feature_importance2 = pd.DataFrame()
feature_importance2['feature'] = cols
feature_importance2['value'] = min_max_scaler.fit_transform(clf.feature_importances_.reshape(-1, 1))
feature_importance2.sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
933,employee1_enc,1.000000
934,employee2_enc,0.903357
930,sido_enc,0.504630
113,DIFF(shortLoan1 by ownerChange),0.499055
935,ownerChange_enc,0.410934
149,DIFF(profit2 by ownerChange),0.201299
198,DIFF(OnonCAsset2 by ownerChange),0.200832
154,DIFF(bedCount by instkind),0.189897
66,salescost2,0.160925
186,DIFF(noe1 by ownerChange),0.158984


In [27]:
import lightgbm as lgb

In [29]:
clf = lgb.LGBMClassifier(
        device = "gpu",
        gpu_platform_id = 0,
        gpu_device_id = 1,
        n_estimators=500,
        learning_rate=0.01,
        is_unbalance=True,
        metric='auc',
        boosting_type='gbdt',
        num_leaves=14,
        colsample_bytree=0.757,
        feature_fraction=0.746,
        verbose=2,
    )

In [30]:
clf.fit(X,y.astype('int'),verbose=True)

LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.757, device='gpu', feature_fraction=0.746,
        gpu_device_id=1, gpu_platform_id=0, importance_type='split',
        is_unbalance=True, learning_rate=0.01, max_depth=-1, metric='auc',
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=-1, num_leaves=14, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose=2)

In [31]:
feature_importance3 = pd.DataFrame()
feature_importance3['feature'] = cols
feature_importance3['value'] = min_max_scaler.fit_transform(clf.feature_importances_.reshape(-1, 1))
feature_importance3.sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
930,sido_enc,1.000000
933,employee1_enc,0.888283
934,employee2_enc,0.773842
113,DIFF(shortLoan1 by ownerChange),0.555858
67,bedCount,0.354223
91,liquidLiabilities2,0.348774
186,DIFF(noe1 by ownerChange),0.324251
615,city.STD(clients.shortLoan1),0.302452
93,inventoryAsset2,0.294278
149,DIFF(profit2 by ownerChange),0.237057


In [32]:
from xgboost.sklearn import XGBClassifier

In [33]:
xgb = XGBClassifier(
         learning_rate= 0.1,
         objective= 'binary:logistic',
         tree_method='gpu_exact',
         predictor='gpu_predictor',
         scale_pos_weight=int(1/imbalanced_pos_ratio),
         seed=42)

In [34]:
xgb.fit(X,y.astype('int'),verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       predictor='gpu_predictor', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=19, seed=42, silent=True,
       subsample=1, tree_method='gpu_exact')

In [35]:
feature_importance4 = pd.DataFrame()
feature_importance4['feature'] = cols
feature_importance4['value'] = min_max_scaler.fit_transform(xgb.feature_importances_.reshape(-1, 1))

In [36]:
feature_importance4.sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
0,noi1,1.000000
933,employee1_enc,0.483516
935,ownerChange_enc,0.274725
113,DIFF(shortLoan1 by ownerChange),0.252747
934,employee2_enc,0.208791
84,OnonCAsset1,0.197802
131,DIFF(OnonCAsset2 by instkind),0.186813
201,DIFF(OnonCAsset1 by instkind),0.175824
229,DIFF(sgg by ownerChange),0.175824
930,sido_enc,0.164835


In [37]:
ft_importance_total = pd.DataFrame()
ft_importance_total['feature'] = cols
ft_importance_total['value'] = (feature_importance['value']+feature_importance2['value']+feature_importance3['value']+feature_importance4['value'])/4

In [38]:
ft_importance_total[ft_importance_total.value >= np.percentile(ft_importance_total.value,80)].sort_values(by='value', ascending=False)

Unnamed: 0,feature,value
933,employee1_enc,0.842950
934,employee2_enc,0.582240
930,sido_enc,0.459620
113,DIFF(shortLoan1 by ownerChange),0.382734
935,ownerChange_enc,0.355870
0,noi1,0.268441
149,DIFF(profit2 by ownerChange),0.148903
154,DIFF(bedCount by instkind),0.145827
186,DIFF(noe1 by ownerChange),0.139725
93,inventoryAsset2,0.136877


In [39]:
ft_importance_total.to_csv('input/ft_importance_total.csv')