In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data 전처리

In [2]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")
submission = pd.read_csv("dataset/sample_submission.csv")

In [3]:
print(train_df.shape, test_df.shape, submission.shape)

(26457, 17) (10000, 16) (10000, 4)


### FLAG_MOBIL 변수 삭제 

In [4]:
print(train_df['FLAG_MOBIL'].value_counts(), end = '\n\n') # 모두 1 -> 의미 x => drop
print(test_df['FLAG_MOBIL'].value_counts()) # 모두 1 -> 의미 x => drop
train_df.drop('FLAG_MOBIL', axis=True, inplace=True)
test_df.drop('FLAG_MOBIL', axis=True, inplace=True)
print(train_df.shape, test_df.shape)

1    26457
Name: FLAG_MOBIL, dtype: int64

1    10000
Name: FLAG_MOBIL, dtype: int64
(26457, 16) (10000, 15)


### index 변수 삭제

In [5]:
train_df.drop('index', axis=True, inplace=True) # 1,2,3 ... index -> 의미 x
test_df.drop('index', axis=True, inplace=True) # 1,2,3 ... index -> 의미 x
print(train_df.shape, test_df.shape)

(26457, 15) (10000, 14)


### binary 변수 처리

In [6]:
# binary variable (여성 - 0, 남성 - 1)
train_df['gender'] = train_df['gender'].replace({'F':0, 'M':1})
test_df['gender'] = test_df['gender'].replace({'F':0, 'M':1})

In [7]:
train_df.head()

Unnamed: 0,gender,Annual_income,income_type,Education,family_type,house_type,DAYS_BIRTH,working_day,work_phone,phone,email,occyp_type,begin_month,car_reality,credit
0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,0,0,0,,-6.0,0,1.0
1,0,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,0,0,1,Laborers,-5.0,1,1.0
2,1,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,0,1,0,Managers,-22.0,2,2.0
3,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,0,1,0,Sales staff,-37.0,1,0.0
4,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,0,0,0,Managers,-26.0,2,2.0


In [8]:
print(train_df.info(), end='\n\n')
print('--------------------구분---------------------', end='\n\n')
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  int64  
 1   Annual_income  26457 non-null  float64
 2   income_type    26457 non-null  object 
 3   Education      26457 non-null  object 
 4   family_type    26457 non-null  object 
 5   house_type     26457 non-null  object 
 6   DAYS_BIRTH     26457 non-null  int64  
 7   working_day    26457 non-null  int64  
 8   work_phone     26457 non-null  int64  
 9   phone          26457 non-null  int64  
 10  email          26457 non-null  int64  
 11  occyp_type     18286 non-null  object 
 12  begin_month    26457 non-null  float64
 13  car_reality    26457 non-null  int64  
 14  credit         26457 non-null  float64
dtypes: float64(3), int64(7), object(5)
memory usage: 3.0+ MB
None

--------------------구분---------------------

<class 'pandas.core.frame.Data

### 결측치 처리

In [9]:
print(train_df.isnull().sum(), end='\n\n')
print('--------------------구분---------------------', end='\n\n')
print(test_df.isnull().sum())

gender              0
Annual_income       0
income_type         0
Education           0
family_type         0
house_type          0
DAYS_BIRTH          0
working_day         0
work_phone          0
phone               0
email               0
occyp_type       8171
begin_month         0
car_reality         0
credit              0
dtype: int64

--------------------구분---------------------

gender              0
Annual_income       0
income_type         0
Education           0
family_type         0
house_type          0
DAYS_BIRTH          0
working_day         0
work_phone          0
phone               0
email               0
occyp_type       3152
begin_month         0
car_reality         0
dtype: int64


In [10]:
train_df['occyp_type'] = train_df['occyp_type'].fillna('Null')
test_df['occyp_type'] = test_df['occyp_type'].fillna('Null')
print(train_df.isnull().sum(), end='\n\n')
print('--------------------구분---------------------', end='\n\n')
print(test_df.isnull().sum())

gender           0
Annual_income    0
income_type      0
Education        0
family_type      0
house_type       0
DAYS_BIRTH       0
working_day      0
work_phone       0
phone            0
email            0
occyp_type       0
begin_month      0
car_reality      0
credit           0
dtype: int64

--------------------구분---------------------

gender           0
Annual_income    0
income_type      0
Education        0
family_type      0
house_type       0
DAYS_BIRTH       0
working_day      0
work_phone       0
phone            0
email            0
occyp_type       0
begin_month      0
car_reality      0
dtype: int64


### One-Hot Encoding

In [11]:
train_one_hot = pd.get_dummies(train_df)
test_one_hot = pd.get_dummies(test_df)
print(train_one_hot.shape, test_one_hot.shape)

(26457, 50) (10000, 49)


### 훈련 데이터(Train data)와 테스트 데이터(Test data)의 분리

In [12]:
train_x = train_one_hot.drop('credit', axis=1)
train_y = train_one_hot[['credit']]
test_x = test_one_hot
print(train_x.shape, train_y.shape, test_x.shape)

(26457, 49) (26457, 1) (10000, 49)


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y,stratify=train_y, 
                                                  test_size=0.2, random_state = 10086)
print(X_train.shape, y_train.shape,  X_val.shape, y_val.shape)

(21165, 49) (21165, 1) (5292, 49) (5292, 1)


### ouput 클래스 확인

In [14]:
train_df['credit'].unique()

array([1., 2., 0.])

In [15]:
train_df['credit'].value_counts()

2.0    16968
1.0     6267
0.0     3222
Name: credit, dtype: int64

## Oversampling (SMOTE)

In [16]:
# y_train.value_counts()

In [17]:
# # !pip install imblearn
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(k_neighbors = 3)
# X_train, y_train = smote.fit_resample(X_train, y_train)

In [18]:
# y_train.value_counts()

### 데이터 정규화 (MinMaxScaler)

In [19]:
from sklearn.preprocessing import MinMaxScaler as MMS
scaler = MMS().fit(X_train) # 최소 0 ~ 최대 1

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [20]:
# X_train = pd.DataFrame(X_train, columns = train_x.columns)
# X_train

In [21]:
# y_train

## RandomForestClassifier

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical

model_RF = RandomForestClassifier()
model_RF.fit(X_train, y_train)
y_pred = model_RF.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

  model_RF.fit(X_train, y_train)


log_loss: 0.9593378947449288


## LogisticRegression

In [23]:
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression(max_iter=1000)
model_LR.fit(X_train, y_train)
y_pred = model_LR.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

  y = column_or_1d(y, warn=True)


log_loss: 0.8651591570671058


## MLPClassifier

In [24]:
from sklearn.neural_network import MLPClassifier

model_MLP = MLPClassifier(hidden_layer_sizes = (100, 100), max_iter=200, activation='relu', alpha=1)
model_MLP.fit(X_train, y_train)
y_pred = model_MLP.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

  y = column_or_1d(y, warn=True)


log_loss: 0.8445388177922136


## DecisionTreeClassifier

In [25]:
from sklearn.tree import DecisionTreeClassifier

model_TREE = DecisionTreeClassifier(max_depth=2, random_state = 10086)
model_TREE.fit(X_train, y_train)
y_pred = model_TREE.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

log_loss: 0.8111744676790846


## KNeighborsClassifier

In [26]:
from sklearn.neighbors import KNeighborsClassifier

model_KNN = KNeighborsClassifier(n_neighbors=150)
model_KNN.fit(X_train, y_train)
y_pred = model_KNN.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

  return self._fit(X, y)


log_loss: 0.8739689902515003


## Ensemble (VotingClassifier)

In [27]:
from sklearn.ensemble import VotingClassifier

model_EnsembleH = VotingClassifier(estimators=[("LogisticRegression", model_LR), ("MLPClassifier", model_MLP),
                                               ("DecisionTreeClassifier", model_TREE), 
                                               ("RandomForestClassifier", model_RF)], voting = 'hard')

model_EnsembleH.fit(X_train, y_train)
y_pred = model_EnsembleH.predict(X_val)
# print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")
print("voting 분류기 정확도 {0:.4f}".format(accuracy_score(y_val, y_pred)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


voting 분류기 정확도 0.6909


In [28]:
model_EnsembleS = VotingClassifier(estimators=[("LogisticRegression", model_LR), ("MLPClassifier", model_MLP),
                                               ("DecisionTreeClassifier", model_TREE), 
                                               ("RandomForestClassifier", model_RF)], voting = 'soft')

model_EnsembleS.fit(X_train, y_train)
y_pred = model_EnsembleS.predict_proba(X_val)
print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


log_loss: 0.7668689694331253


In [30]:
# pred = model_EnsembleS.predict_proba(test_x)
# submission.loc[:,1:] = pred
# submission.to_csv('model_EnsembleS.csv',index=False)

  indexer = self._get_setitem_indexer(key)


## DNN