In [1]:
#필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

#랜덤 시드 고정
SEED=12
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
print("시드 고정:",SEED)

시드 고정: 12


In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

print(train.shape, test.shape, submission.shape)

(5497, 14) (1000, 13) (1000, 2)


In [3]:
train.head(2)

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red


In [6]:
submission.head()

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [11]:
train['type'].value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [13]:
train['type']=np.where(train['type']=='white',1,0).astype(int)
test['type']=np.where(test['type']=='white',1,0).astype(int)
train['type'].value_counts()

1    4159
0    1338
Name: type, dtype: int64

In [14]:
train['quality'].value_countscounts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [15]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(train.loc[:,'quality']-3)
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [19]:
#피처 선택
X_train = train.loc[:,'fixed acidity':]
X_test = test.loc[:,'fixed acidity':]

#피처 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

print(X_train_scaled.shape, y_train.shape)
print(X_test_scaled.shape)

(5497, 12) (5497, 7)
(1000, 12)


In [23]:
#심층 신경망 모델
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target):
    model =Sequential()
    model.add(Dense(128,activation='tanh', input_dim=train_data.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64,activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(32,activation='tanh'))
    model.add(Dense(train_target.shape[1],activation='softmax'))
    model.compile(optimizer='RMSProp', loss='categorical_crossentropy',
                 metrics=['acc','mae'])
    return model

model = build_model(X_train_scaled,y_train)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 128)               1664      
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 7)                 231       
                                                                 
Total params: 12,231
Trainable params: 12,231
Non-trai

In [24]:
#Early Stopping 기법
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled,y_train,test_size=0.15,
                                           shuffle=True,random_state=SEED)
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
history = model.fit(X_tr, y_tr, batch_size=64, epochs=200,
                   validation_data=(X_val,y_val),
                   callbacks=[early_stopping],
                   verbose=2)

Epoch 1/200
73/73 - 2s - loss: 1.2884 - acc: 0.4741 - mae: 0.1916 - val_loss: 1.1380 - val_acc: 0.5212 - val_mae: 0.1771 - 2s/epoch - 21ms/step
Epoch 2/200
73/73 - 0s - loss: 1.1663 - acc: 0.5060 - mae: 0.1760 - val_loss: 1.0988 - val_acc: 0.5345 - val_mae: 0.1728 - 221ms/epoch - 3ms/step
Epoch 3/200
73/73 - 0s - loss: 1.1407 - acc: 0.5203 - mae: 0.1731 - val_loss: 1.0754 - val_acc: 0.5430 - val_mae: 0.1675 - 236ms/epoch - 3ms/step
Epoch 4/200
73/73 - 0s - loss: 1.1281 - acc: 0.5242 - mae: 0.1715 - val_loss: 1.1087 - val_acc: 0.5127 - val_mae: 0.1706 - 233ms/epoch - 3ms/step
Epoch 5/200
73/73 - 0s - loss: 1.1157 - acc: 0.5265 - mae: 0.1709 - val_loss: 1.0622 - val_acc: 0.5600 - val_mae: 0.1672 - 227ms/epoch - 3ms/step
Epoch 6/200
73/73 - 0s - loss: 1.1103 - acc: 0.5218 - mae: 0.1710 - val_loss: 1.0571 - val_acc: 0.5588 - val_mae: 0.1664 - 233ms/epoch - 3ms/step
Epoch 7/200
73/73 - 0s - loss: 1.1034 - acc: 0.5242 - mae: 0.1706 - val_loss: 1.0550 - val_acc: 0.5527 - val_mae: 0.1658 - 226

Epoch 57/200
73/73 - 0s - loss: 1.0288 - acc: 0.5634 - mae: 0.1625 - val_loss: 1.0208 - val_acc: 0.5709 - val_mae: 0.1617 - 246ms/epoch - 3ms/step
Epoch 58/200
73/73 - 0s - loss: 1.0275 - acc: 0.5584 - mae: 0.1621 - val_loss: 1.0168 - val_acc: 0.5733 - val_mae: 0.1599 - 225ms/epoch - 3ms/step
Epoch 59/200
73/73 - 0s - loss: 1.0253 - acc: 0.5621 - mae: 0.1620 - val_loss: 1.0234 - val_acc: 0.5661 - val_mae: 0.1615 - 232ms/epoch - 3ms/step
Epoch 60/200
73/73 - 0s - loss: 1.0269 - acc: 0.5586 - mae: 0.1624 - val_loss: 1.0230 - val_acc: 0.5685 - val_mae: 0.1594 - 241ms/epoch - 3ms/step
Epoch 61/200
73/73 - 0s - loss: 1.0312 - acc: 0.5599 - mae: 0.1627 - val_loss: 1.0181 - val_acc: 0.5515 - val_mae: 0.1590 - 235ms/epoch - 3ms/step
Epoch 62/200
73/73 - 0s - loss: 1.0266 - acc: 0.5542 - mae: 0.1621 - val_loss: 1.0282 - val_acc: 0.5600 - val_mae: 0.1610 - 246ms/epoch - 3ms/step
Epoch 63/200
73/73 - 0s - loss: 1.0235 - acc: 0.5681 - mae: 0.1618 - val_loss: 1.0220 - val_acc: 0.5467 - val_mae: 0.1

In [25]:
model.evaluate(X_val,y_val)



[1.0165252685546875, 0.5684848427772522, 0.15824899077415466]

In [26]:
#test 데이터에 대한 예측값 정리
y_pred_proba = model.predict(X_test)
y_pred_proba[:5]



array([[0.5069378 , 0.00648227, 0.01839712, 0.02292223, 0.10277011,
        0.09292174, 0.24956875],
       [0.2590956 , 0.00084169, 0.00169764, 0.00629539, 0.05462304,
        0.02197545, 0.65547115],
       [0.5139195 , 0.00582294, 0.0127828 , 0.0138494 , 0.06186216,
        0.06758921, 0.32417405],
       [0.42302972, 0.01945269, 0.0370989 , 0.0694959 , 0.16308554,
        0.20327477, 0.08456258],
       [0.36491564, 0.00206718, 0.00441601, 0.00686246, 0.05680372,
        0.03112555, 0.5338094 ]], dtype=float32)

In [27]:
y_pred_label = np.argmax(y_pred_proba, axis=-1) + 3
y_pred_label[:5]

array([3, 9, 3, 3, 9], dtype=int64)

In [28]:
submission['quality'] = y_pred_label.astype(int)
submission.head()

Unnamed: 0,index,quality
0,0,3
1,1,9
2,2,3
3,3,3
4,4,9
