# 와인 품질 예측 모델 만들기

# 데이터 프레임 구성

In [3]:
import pandas as pd
import numpy as np

# Red wine samples: comma-separated file whose first row (row 0) holds the column names.
redwine = pd.read_csv('winequality-red.csv', sep=',', header=0).assign(type='red')
redwine.head()

# White wine samples: semicolon-separated file, header likewise on row 0.
whitewine = pd.read_csv('winequality-white.csv', sep=';', header=0).assign(type='white')
whitewine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,white
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,white
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white


In [4]:
# Stack the red and white wine rows into a single DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and, like append, keeps each
# frame's original row index.
wine = pd.concat([redwine, whitewine])
wine.shape

  wine = redwine.append(whitewine)


(6497, 13)

In [5]:
# Column names containing a space are awkward to work with, so replace
# every ' ' with '_'.
wine.columns = [label.replace(' ', '_') for label in wine.columns]
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


# 모델 클래스와 파라미터 선택

1. 회귀
2. 분류

우리는 `회귀`를 선택하여 문제를 해결한다. <br>
사용할 모델은 `선형 회귀`이다.

In [6]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True) # fit_intercept (a hyperparameter): whether to fit the constant term,
                                             # i.e. the 'a' in y = a + bx


# 특징행렬과 대상벡터 추출

특징행렬(입력) <br>
대상벡터(출력 = 정답 = label)

In [7]:
# Feature matrix: every column except the wine-type tag and the quality label.
X = wine.drop(columns=['type', 'quality'])
X.shape

# m: number of samples -> 6497, n: number of variables -> 11 (an m * n matrix)

(6497, 11)

In [9]:
# Target vector: the quality column of the combined wine table.
y = wine['quality']
y.shape

# Result -> 6497 entries

(6497,)

# 훈련 데이터와 테스트 데이터 분리

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) # default split: training data (75%), test data (25%)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4872, 11), (1625, 11), (4872,), (1625,))

# 모델의 데이터 적합

 parameter : 학습 과정에서 자동으로 설정되는 매개변수 <br>
 hyper parameter : 개발자가 어떤 모델을 컨트롤하기 위해서 사용하는 값

In [12]:
# Fit the linear regression model on the training split.
model.fit(X_train, y_train)

In [13]:
# y = ax + b --> 'a'
model.coef_  # Multiple linear regression: there are n (11) features, so this holds one coefficient per variable.

array([ 5.55618691e-02, -1.29945358e+00, -1.08205046e-01,  4.52070539e-02,
       -3.95901596e-01,  5.76479819e-03, -2.47760359e-03, -5.30023471e+01,
        3.50283862e-01,  7.49149475e-01,  2.78530060e-01])

In [14]:
# y = ax + b --> 'b'
model.intercept_   # Value of the constant (intercept) term.

54.058003854665714

# 모델을 새로운 데이터에 적용

In [15]:
# One hand-entered sample, listed in the same order as the feature columns.
newdata = np.array([
    6.3,    # fixed_acidity
    0.3,    # volatile_acidity
    0.34,   # citric_acid
    1.6,    # residual_sugar
    0.049,  # chlorides
    14,     # free_sulfur_dioxide
    132,    # total_sulfur_dioxide
    0.994,  # density
    3.3,    # pH
    0.49,   # sulphates
    9.5,    # alcohol
])

In [16]:
# Present the single 11-value sample as a 1 x 11 (samples x features) row
# before handing it to the model.
model.predict(newdata.reshape(1, 11))



array([5.27273597])

In [17]:
# Predict quality for every row of the held-out test split.
y_pred = model.predict(X_test)
y_pred.shape

(1625,)