In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import pickle
import itertools
from tqdm.auto import tqdm

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

# 環境設定

In [2]:
# Directory that train.csv is read from; a relative path, so it is resolved
# against the kernel's current working directory.
DATA_PATH = "../data"

# 分析

In [16]:
# Read the training table; the first CSV column is used as the row index.
train_df = pd.read_csv(Path(DATA_PATH) / 'train.csv', index_col=0)
print(train_df.shape)

(42307, 20)


In [17]:
# Raw numeric columns used as model inputs.
default_numerical_features = [
    'Term', 'NoEmp', 'CreateJob', 'RetainedJob',
    'DisbursementGross', 'GrAppv', 'SBA_Appv', 'ApprovalFY',
]
# Every raw categorical column in the table (not all of them end up in `features`).
default_categorical_features = [
    'NewExist', 'FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural',
    'State', 'BankState', 'City', 'Sector',
]
# Names of the count-encoded columns, built as '<column>_count_encoding'.
add_numerical_features = [
    f'{name}_count_encoding'
    for name in (
        'FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural',
        'State', 'BankState', 'City', 'Sector',
    )
]
numerical_features = [*add_numerical_features, *default_numerical_features]
# Categorical columns that are passed to the model directly.
# NOTE(review): 'NewExist' appears in neither list, so it is not in `features` — confirm this is intended.
categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector']
# Final model input columns: numerical first, then categorical.
features = [*numerical_features, *categorical_features]

In [18]:
def Preprocessing(input_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw loan table.

    Steps, in order:
      1. Fill missing values in 'RevLineCr', 'LowDoc', 'BankState' and
         'DisbursementDate' with the placeholder '[UNK]'.
      2. Convert the money columns 'DisbursementGross', 'GrAppv' and
         'SBA_Appv' from strings to float.
      3. Binarise 'NewExist': 1 where the raw value equals 1, otherwise 0.
      4. Run the feature-engineering hook (currently a pass-through).

    The input frame is never modified. The function can be applied to its own
    output: money columns that are already numeric are left as they are, so
    re-running a cell that does ``df = Preprocessing(df)`` does not fail.

    Args:
        input_df: Frame containing at least the columns named above.

    Returns:
        A new DataFrame with the same columns as ``input_df``.
    """

    def deal_missing(input_df: pd.DataFrame) -> pd.DataFrame:
        """Replace missing values in selected columns with '[UNK]'."""
        output_df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState', 'DisbursementDate']:
            output_df[col] = input_df[col].fillna('[UNK]')
        return output_df

    def clean_money(input_df: pd.DataFrame) -> pd.DataFrame:
        """Parse the money columns into floats."""
        output_df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            # Already parsed (e.g. the function is being re-applied): keep as is.
            if pd.api.types.is_numeric_dtype(input_df[col]):
                continue
            # Remove '$', thousands separators and spaces explicitly instead of
            # blindly dropping the first character, which would eat a digit
            # whenever a value has no leading '$'.
            output_df[col] = (
                input_df[col]
                .str.replace(r'[$, ]', '', regex=True)
                .astype(float)
            )
        return output_df

    def make_features(input_df: pd.DataFrame) -> pd.DataFrame:
        """Feature-engineering hook; currently returns an unchanged copy."""
        output_df = input_df.copy()
        # Add engineered features here.
        return output_df

    output_df = deal_missing(input_df)
    output_df = clean_money(output_df)
    output_df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    output_df = make_features(output_df)
    return output_df

In [19]:
# Run the cleaning function and rebind train_df to the frame it returns.
train_df = train_df.pipe(Preprocessing)
print(train_df.shape)

(42307, 20)


In [20]:
# Count encoding: for each listed column add '<col>_count_encoding', holding
# the number of rows in train_df that share that row's value.
# Missing values are not counted by value_counts(), so they map to NaN.
for col in ('FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector'):
    count_dict = train_df[col].value_counts().to_dict()
    train_df[f'{col}_count_encoding'] = train_df[col].map(count_dict)
print(train_df.shape)

(42307, 28)


In [21]:
# Overwrite each categorical feature column with the integer codes produced by
# a LabelEncoder fitted on that same column.
# `encoder` is rebound on every iteration, so only the last one remains afterwards.
for col in categorical_features:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col])
print(train_df.shape)

(42307, 28)


In [22]:
# Display every column currently present in train_df.
train_df.columns

Index(['Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
       'FranchiseCode', 'RevLineCr', 'LowDoc', 'DisbursementDate',
       'MIS_Status', 'Sector', 'ApprovalDate', 'ApprovalFY', 'City', 'State',
       'BankState', 'DisbursementGross', 'GrAppv', 'SBA_Appv', 'UrbanRural',
       'FranchiseCode_count_encoding', 'RevLineCr_count_encoding',
       'LowDoc_count_encoding', 'UrbanRural_count_encoding',
       'State_count_encoding', 'BankState_count_encoding',
       'City_count_encoding', 'Sector_count_encoding'],
      dtype='object')

In [24]:
# Columns of train_df that are not used as model features.
# Sorted so the displayed list is stable: iterating a set of strings gives an
# order that can differ from one kernel session to the next.
sorted(set(train_df.columns) - set(features))

['ApprovalDate',
 'MIS_Status',
 'DisbursementDate',
 'FranchiseCode',
 'NewExist',
 'BankState',
 'City']

In [25]:
# Number of model input columns (numerical plus categorical feature names).
len(features)

21