In [1]:
from IPython.display import display, HTML

# Cosmetic CSS overrides for the notebook page: widen the container to 85%,
# use Consolas 12pt for code input, render cell output in bold 12pt, and
# shrink rendered DataFrame tables to 12px. Purely visual.
notebook_css = """
<style>
div.container{width:85% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:12pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:12px;}
</style>
"""
display(HTML(notebook_css))

# red wine 품질 등급 예측

```
1. 데이터셋 확보 & 전처리
    독립변수와 타겟변수 분리 :
    -> 독립변수만 스케일 조정(StandardScaler) 
    -> 타겟변수 one-hot-encoding(get_dummies) 
    -> 훈련셋과 테스트셋 분리(train_test_split)
2. 모델 구성(입력(독립)변수 11, 출력(종속)변수 ? -> one-hot-encoding 필요)
3. 모델 학습 과정 설정(다중분류로 설정) -> 출력층의 활성화함수 : softmax
4. 모델 학습(callbacks 이용)
5. 모델 평가 - 그래프, 평가(테스트셋), 교차표/혼동행렬
6. 모델 저장 및 사용
```

In [2]:
import numpy as np
import pandas as pd # read_csv, get_dummies, crosstab
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU
from tensorflow.keras.metrics import Recall, Precision # compile 단계에서 사용
from tensorflow.keras.callbacks import EarlyStopping, Callback, ModelCheckpoint
import matplotlib.pyplot as plt

# 1. 데이터셋 확보 & 전처리
- 독립변수와 타겟변수 분리 :
    * 독립변수만 스케일 조정(StandardScaler) 
    * 타겟변수 one-hot-encoding(get_dummies) 
    * 훈련셋과 테스트셋 분리(train_test_split)

In [5]:
# Load the red-wine quality data from a semicolon-separated CSV.
# NumPy alternatives noted by the author (they skip the header row):
#   np.loadtxt(..., delimiter=';', skiprows=1)
#   np.genfromtxt(..., delimiter=';', skip_header=1)
#
# Predictors (11 columns):
#   fixed acidity, volatile acidity, citric acid, residual sugar, chlorides,
#   free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol
#
# Target:
#   quality (quality grade)
redwine = pd.read_csv('data/winequality-red.csv', sep=';')

# Preview the first rows.
redwine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
redwine.info() # no missing values: every column is fully non-null in the recorded output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [7]:
# Class balance of the target: each quality grade as a percentage of all rows.
quality_counts = redwine['quality'].value_counts()
quality_counts / len(redwine) * 100

5    42.589118
6    39.899937
7    12.445278
4     3.314572
8     1.125704
3     0.625391
Name: quality, dtype: float64

In [24]:
# Split predictors from the target.
# The target is selected by column name rather than by position, so the split
# does not silently pick the wrong column if the column order ever changes.
# Grades present in this dataset: 3, 4, 5, 6, 7, 8.
y_redwine = redwine['quality']
# Everything except the target becomes the predictor matrix; to_numpy() is the
# recommended replacement for .values and matches the usage further down.
X_redwine = redwine.drop(columns='quality').to_numpy()

In [25]:
# Sanity check: 11 predictor columns should remain once the target is removed.
X_redwine.shape

(1599, 11)

In [27]:
# Standardize the predictors (per-column zero mean / unit variance).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split planned in the outline, so test-set statistics leak into the
# scaling. Consider fitting on the training split only and calling transform()
# on the test split — confirm against the split cell before changing.
scaler = StandardScaler()
scaled_X_redwine = scaler.fit_transform(X_redwine)

# One-hot encode the target for multi-class classification and convert it to a
# NumPy array (one column per distinct quality grade).
# The dtype is pinned because the get_dummies default changed from uint8 to
# bool in pandas 2.0; pinning keeps the array numeric 0/1 on any version.
Y_redwine = pd.get_dummies(y_redwine, dtype=np.uint8).to_numpy()

In [30]:
# Shapes of the scaled predictors and the one-hot target; the row counts must
# match, and the target's second dimension is the number of one-hot classes.
scaled_X_redwine.shape, Y_redwine.shape

((1599, 11), (1599, 6))