# LB 0.6581439

In [1]:
!pip install -r requirements.txt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import lightgbm as lgb

Collecting lightgbm (from -r requirements.txt (line 1))
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/ba/11/cb8b67f3cbdca05b59a032bb57963d4fe8c8d18c3870f30bed005b7f174d/lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl.metadata
  Using cached lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Using cached lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0


In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
new_index = range(42307, 42307 + len(test))
test.index = new_index

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42307 entries, 0 to 42306
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Term               42307 non-null  int64  
 1   NoEmp              42307 non-null  int64  
 2   NewExist           42307 non-null  float64
 3   CreateJob          42307 non-null  int64  
 4   RetainedJob        42307 non-null  int64  
 5   FranchiseCode      42307 non-null  int64  
 6   RevLineCr          41228 non-null  object 
 7   LowDoc             41776 non-null  object 
 8   DisbursementDate   42157 non-null  object 
 9   MIS_Status         42307 non-null  int64  
 10  Sector             42307 non-null  int64  
 11  ApprovalDate       42307 non-null  object 
 12  ApprovalFY         42307 non-null  int64  
 13  City               42307 non-null  object 
 14  State              42307 non-null  object 
 15  BankState          42296 non-null  object 
 16  DisbursementGross  423

In [4]:
def preprocess(df, replace_dict=None, ce_dict=None):
    # 貸借手の所在地系の変数
    # City: Cityは汎用性が低いと考えられるためDrop
    #df.drop("City", axis=1, inplace=True)

    # 借り手の会社に関する変数（Sector, FranchiseCode）
    # 31-33, 44-45, 48-49 は同じらしい => 32,33を31に, 45を44に, 49を48に変換
    code_dict = {
        32: 31,
        33: 31,
        45: 44,
        49: 48
    }
    df["Sector"] = df["Sector"].replace(code_dict)

    # 今回の借り入れに関する変数（RevLineCr, LowDoc）
    # 公式ページには値の候補が2つ（YesとNoのYN）と記載があるが、実際の値の種類は2より多い。YN以外はNaNへ置換
    revline_dict = {'0': np.nan, 'T': np.nan}
    df["RevLineCr"] = df["RevLineCr"].replace(revline_dict)

    lowdoc_dict = {'C': np.nan, '0': np.nan, 'S': np.nan, 'A': np.nan}
    df["LowDoc"] = df["LowDoc"].replace(lowdoc_dict)

    # 日付系の変数（DisbursementDate, ApprovalDate）
    # 日付型へ変更 → 年を抽出（借りた月や日にはあまり意味はないと思われるため）
    df['DisbursementDate'] = pd.to_datetime(df['DisbursementDate'], format='%d-%b-%y')
    df["DisbursementYear"] = df["DisbursementDate"].dt.year
    #df.drop(["DisbursementDate", "ApprovalDate"], axis=1, inplace=True)

    # 本来数値型のものを変換する
    cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
    df[cols] = df[cols].applymap(lambda x: x.strip().replace('$', '').replace(',', '')).astype(float).astype(int)

    # 特徴量エンジニアリング
    df["FY_Diff"] = df["ApprovalFY"] - df["DisbursementYear"]
    df["State_is_BankState"] = (df["State"] == df["BankState"])
    df["State_is_BankState"] = df["State_is_BankState"].replace({True: 1, False: 0})

    df['SBA_Portion'] = df['SBA_Appv'] / df['GrAppv']
    df["DisbursementGrossRatio"] = df["DisbursementGross"] / df["GrAppv"]
    df["MonthlyRepayment"] = df["GrAppv"] / df["Term"]
    df["NullCount"] = df.isnull().sum(axis=1)

    # カテゴリカル変数の設定
    df[cols_category] = df[cols_category].fillna(-1)

    # train
    if replace_dict is None:
        # countencode, labelencode
        # ce_dict: 列名を入れるとそのカテゴリのデータがどのくらいあるかを返してくれます
        # replace_dict: 列名を入れるとlabelencodeのための数字を返してくれます
        ce_dict = {}
        replace_dict = {}
        for col in cols_category:
            replace_dict[col] = {}
            vc = df[col].value_counts()
            ce_dict[col] = vc
            replace_dict_in_dict = {}
            for i, k in enumerate(vc.keys()):
                replace_dict_in_dict[k] = i
            replace_dict[col] = replace_dict_in_dict
            df[f"{col}_CountEncode"] = df[col].replace(vc).astype(int)
            df[col] = df[col].replace(replace_dict_in_dict).astype(int)
        return df, replace_dict, ce_dict

    # test
    else:
        for col in cols_category:
            # カウントエンコード
            test_vals_uniq = df[col].unique()
            ce_dict_in_dict = ce_dict[col]
            for test_val in test_vals_uniq:
                if test_val not in ce_dict_in_dict.keys():
                    ce_dict_in_dict[test_val] = -1
            df[f"{col}_CountEncode"] = df[col].replace(ce_dict_in_dict).astype(int)

            # LabelEncode
            test_vals_uniq = df[col].unique()
            replace_dict_in_dict = replace_dict[col]
            for test_val in test_vals_uniq:
                if test_val not in replace_dict_in_dict.keys():
                    replace_dict_in_dict[test_val] = -1
            df[col] = df[col].replace(replace_dict_in_dict).astype(int)
        return df

In [5]:
cols_category = []
for col in df.columns:
    if df[col].dtypes == 'object':
        cols_category.append(col)

In [6]:
df, replace_dict, ce_dict = preprocess(df)

test_df = preprocess(test, replace_dict, ce_dict)

In [7]:
X = df.drop('MIS_Status',axis=1)
y = df['MIS_Status']

In [8]:
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(zip(np.unique(y), weights))

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import statistics
from scipy.stats import mode

average_f1 = []
predictions = []
skf = StratifiedKFold(n_splits=5)
for train_idx, val_idx in skf.split(X,y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    params = {
        'objective': 'binary',
        'metric': 'f1',
        'class_weight': class_weights,
        "n_estimators": 3000,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,
        "subsample_freq": 1,
        "subsample": 0.8,
        "random_seed": 0,
        'verbose': -1,
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    y_pred = model.predict(X_val)
    mean_f1 = f1_score(y_val, y_pred, average='macro')
    average_f1.append(mean_f1)
    feature_gains = model.booster_.feature_importance(importance_type='gain')
    feature = pd.DataFrame({'feature': X.columns, 'importance': feature_gains})
    feature.sort_values(by='importance',ascending=False)
    print(feature)
    print("Mean F1 Score (Macro F1 Score):", mean_f1)

average_f1 = statistics.mean(average_f1)
print(' ')
print(f'average_f1:{average_f1}')

                          feature     importance
0                            Term   79171.405820
1                           NoEmp   91309.713784
2                        NewExist    6219.099139
3                       CreateJob   29779.766485
4                     RetainedJob   35527.159718
5                   FranchiseCode   20953.661209
6                       RevLineCr   16984.197553
7                          LowDoc   41247.381439
8                DisbursementDate   65284.307626
9                          Sector   33056.961220
10                   ApprovalDate   84301.015760
11                     ApprovalFY   36213.887647
12                           City   69761.493505
13                          State   35324.321803
14                      BankState   34186.859103
15              DisbursementGross   53620.676730
16                         GrAppv   42374.873174
17                       SBA_Appv   59151.750692
18                     UrbanRural  126512.161096
19               Dis

In [21]:
params = {
        'objective': 'binary',
        'metric': 'f1',
        'class_weight': class_weights,
        "n_estimators": 3000,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,
        "subsample_freq": 1,
        "subsample": 0.8,
        "random_seed": 0,
        'verbose': -1,
}

final_model = lgb.LGBMClassifier(**params)
final_model.fit(X_train, y_train)
pred=final_model.predict(test)

predictions = pd.DataFrame({'Id': test.index, 'predict': pred})
predictions.to_csv('predictions_file.csv',index=False,header=False)