In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from scipy import stats

# Apply seaborn's default theme to every figure drawn later in the notebook.
sns.set_theme()

# Record the library versions this notebook was run with.
print(f"Seaborn: {sns.__version__}")
print(f"NumPy: {np.__version__}")

Seaborn: 0.11.0
NumPy: 1.17.4


In [2]:
# Column groups reused throughout the notebook.

# Demographic columns.
info = ["SEX", "EDUCATION", "MARRIAGE", "AGE"]

# The three monthly groups share the same APR..SEP suffixes, so derive them
# from a single month list instead of spelling each name out.
months = ["APR", "MAY", "JUN", "JUL", "AUG", "SEP"]
delay_n = ["PAY_" + month for month in months]
bill_n = ["BILL_AMT_" + month for month in months]
pay_n = ["PAY_AMT_" + month for month in months]

In [3]:
# Load the train/test CSVs, rename columns to month-based names, drop the
# bookkeeping columns and cast the categorical/delay columns to integers.

# NOTE(review): absolute, machine-specific directory -- point this at your own
# copy of the data before running the notebook.
DATA_DIR = "C:/Users/chowonjae/Desktop/내부 프로젝트"

train = pd.read_csv(f"{DATA_DIR}/uci_creditcard-train-0.0-0.0 (1).csv")
test = pd.read_csv(f"{DATA_DIR}/uci_creditcard-test-0.0-0.0 (1).csv")

# Row count of the raw training frame.
length = len(train)

# Raw column name -> readable name. The numeric suffixes run backwards in
# time (6 -> APR ... 1 -> SEP); the delay columns use PAY_0 for SEP.
rename_dict = {
    "default payment next month": "default",
    "PAY_6": "PAY_APR",
    "PAY_5": "PAY_MAY",
    "PAY_4": "PAY_JUN",
    "PAY_3": "PAY_JUL",
    "PAY_2": "PAY_AUG",
    "PAY_0": "PAY_SEP",
    "PAY_AMT6": "PAY_AMT_APR",
    "PAY_AMT5": "PAY_AMT_MAY",
    "PAY_AMT4": "PAY_AMT_JUN",
    "PAY_AMT3": "PAY_AMT_JUL",
    "PAY_AMT2": "PAY_AMT_AUG",
    "PAY_AMT1": "PAY_AMT_SEP",
    "BILL_AMT6": "BILL_AMT_APR",
    "BILL_AMT5": "BILL_AMT_MAY",
    "BILL_AMT4": "BILL_AMT_JUN",
    "BILL_AMT3": "BILL_AMT_JUL",
    "BILL_AMT2": "BILL_AMT_AUG",
    "BILL_AMT1": "BILL_AMT_SEP",
}

train = train.rename(columns=rename_dict)
test = test.rename(columns=rename_dict)

train = train.drop(["ID", "sep_idx"], axis=1)
# Copy of the training frame without the demographic columns; taken before
# the dtype conversion below, so it keeps the dtypes read from the CSV.
train_drop_info = train.drop(info, axis=1)

test = test.drop(["ID", "sep_idx"], axis=1)

# Change Type
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use `int` directly (same resulting dtype).
int_cols = info + ["default"] + delay_n
for frame in (train, test):
    frame[int_cols] = frame[int_cols].astype(int)

In [4]:
# Separate the target column from the feature columns.
y_train = train["default"]
X_train = train.drop(columns = "default")

In [12]:
## Variance

# Row-wise standard deviation over the bill columns; ddof=0 matches the
# default that np.std applies.
train["BILL_AMT_SD"] = train[bill_n].std(axis = 1, ddof = 0)

# log(x + 1) transform, then standardise with the Series' own mean and std.
log_bill_amt_sd = np.log(1 + train["BILL_AMT_SD"])
log_std_sd = log_bill_amt_sd.sub(log_bill_amt_sd.mean()).div(log_bill_amt_sd.std())

In [96]:
# Default counts for rows whose standardised log bill SD is an outlier:
# either in (-3.664469, -2) or above 2.
# NOTE(review): -3.664469 looks like the score of the lowest group (possibly rows with zero SD) -- confirm.
train.loc[(log_std_sd.gt(-3.664469) & log_std_sd.lt(-2)) | log_std_sd.gt(2), "default"].value_counts()

0    29
1     5
Name: default, dtype: int64

In [94]:
# Default counts for rows at or below the -3.664469 threshold.
train.loc[log_std_sd.le(-3.664469), "default"].value_counts()

0    591
1    403
Name: default, dtype: int64

In [130]:
# Bill columns and target for rows with a standardised log bill SD above 2.
train.loc[log_std_sd.gt(2), bill_n + ["default"]]

Unnamed: 0,BILL_AMT_APR,BILL_AMT_MAY,BILL_AMT_JUN,BILL_AMT_JUL,BILL_AMT_AUG,BILL_AMT_SEP,default
15449,124542.0,632.0,632.0,855086.0,-215.0,47751.0,0
20119,377217.0,97115.0,121757.0,1664089.0,-18088.0,125.0,0


In [131]:
# Bill columns and target for rows with a standardised log bill SD strictly
# between -3.664469 and -2.
train.loc[log_std_sd.gt(-3.664469) & log_std_sd.lt(-2), bill_n + ["default"]]

Unnamed: 0,BILL_AMT_APR,BILL_AMT_MAY,BILL_AMT_JUN,BILL_AMT_JUL,BILL_AMT_AUG,BILL_AMT_SEP,default
2234,0.0,0.0,0.0,0.0,0.0,100.0,0
3011,324.0,326.0,326.0,326.0,326.0,326.0,0
3095,-31.0,-31.0,-31.0,-31.0,-31.0,-62.0,0
3759,100.0,0.0,0.0,0.0,0.0,0.0,0
4017,-5.0,-5.0,0.0,0.0,0.0,0.0,1
5068,187.0,188.0,176.0,187.0,191.0,194.0,0
6207,297.0,316.0,316.0,316.0,316.0,316.0,0
6935,-18.0,-18.0,-18.0,-18.0,-18.0,-36.0,0
8079,1925.0,2015.0,2015.0,2015.0,2015.0,2015.0,0
8144,0.0,0.0,0.0,0.0,0.0,101.0,1


- bill이 규칙적이지 않은 사람(표준화 값 > 2) 혹은 거의 항상 똑같은 사람(표준화 값 < -2).
- `BILL_AMT_SD`에 log(x + 1)을 취한 뒤 표준화한 값(`log_std_sd`) 중에서 2 초과 또는 -2 미만인 값들을 이상치로 분류하고, 나머지 값들을 분석함.

In [97]:
## Variance

train["PAY_AMT_SD"] = np.std(train[pay_n], axis = 1)

log_pay_amt_sd = np.log(train["PAY_AMT_SD"] + 1) 
log_pay_std_sd = (log_pay_amt_sd - log_pay_amt_sd.mean()) / log_pay_amt_sd.std()

In [117]:
# Default counts for rows with a standardised log payment SD above 2.
train.loc[log_pay_std_sd.gt(2), "default"].value_counts()

0    68
1    14
Name: default, dtype: int64

In [119]:
# Default counts for rows with a standardised log payment SD strictly
# between -3.183671 and -2.
# NOTE(review): -3.183671 looks like the score of the lowest group (possibly rows with zero SD) -- confirm.
train.loc[log_pay_std_sd.gt(-3.183671) & log_pay_std_sd.lt(-2), "default"].value_counts()

0    27
1     7
Name: default, dtype: int64

In [124]:
# Default counts for rows at or below the -3.183671 threshold.
# Uses <= (like the matching bill-amount cell) so that, together with the
# "(-3.183671, -2)" bucket, no row sitting exactly on the threshold is dropped.
train.loc[log_pay_std_sd <= -3.183671, "default"].value_counts()

0    856
1    497
Name: default, dtype: int64