# Wine Quality Prediction

**데이터 프레임 생성 / 함수 생성 / 확인 등등**

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Read the CSV named by input_file and shuffle its rows once with a fixed
# seed so the later (unshuffled) K-fold splits see a reproducible row order.
input_file = "winequality-red.csv"
df = shuffle(pd.read_csv(input_file), random_state=42)

In [2]:
def display_scores(scores):
    """Print the per-fold scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of float
        Scores to summarize (e.g. per-fold RMSE values). Any sequence that
        ``np.asarray`` accepts works, not only NumPy arrays.
    """
    # Coerce once so plain lists/tuples also expose .mean() / .std().
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
def display_scores_mean(scores):
    """Print only the mean of the given scores.

    Parameters
    ----------
    scores : array-like of float
        Scores to average (e.g. per-fold RMSE values). Any sequence that
        ``np.asarray`` accepts works, not only NumPy arrays.
    """
    # Coerce so plain lists/tuples are accepted as well as ndarrays.
    print("Mean:", np.asarray(scores).mean())

In [3]:
# Summarize column names, dtypes and non-null counts of the shuffled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1599 entries, 803 to 1126
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 162.4 KB


**특성 목록**

0. 고정 산도
1. 휘발 산도
2. 시트르산
3. 잔류 설탕
4. 염화물
5. 자유 이산화황
6. 총 이산화황
7. 밀도
8. pH
9. 황산염
10. 알코올

**데이터 프레임을 훈련셋/라벨로 분리**

In [4]:
# Features: every column except the target. Label: an independent copy of the
# "quality" column so later edits to df cannot leak into y_train.
X_train = df.drop(columns="quality")
y_train = df.loc[:, "quality"].copy()

In [5]:
# Display the feature frame (all columns of df except "quality").
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
803,7.7,0.560,0.08,2.50,0.114,14.0,46.0,0.99710,3.24,0.66,9.6
124,7.8,0.500,0.17,1.60,0.082,21.0,102.0,0.99600,3.39,0.48,9.5
350,10.7,0.670,0.22,2.70,0.107,17.0,34.0,1.00040,3.28,0.98,9.9
682,8.5,0.460,0.31,2.25,0.078,32.0,58.0,0.99800,3.33,0.54,9.8
1326,6.7,0.460,0.24,1.70,0.077,18.0,34.0,0.99480,3.39,0.60,10.6
...,...,...,...,...,...,...,...,...,...,...,...
1130,9.1,0.600,0.00,1.90,0.058,5.0,10.0,0.99770,3.18,0.63,10.4
1294,8.2,0.635,0.10,2.10,0.073,25.0,60.0,0.99638,3.29,0.75,10.9
860,7.2,0.620,0.06,2.70,0.077,15.0,85.0,0.99746,3.51,0.54,9.5
1459,7.9,0.200,0.35,1.70,0.054,7.0,15.0,0.99458,3.32,0.80,11.9


In [6]:
# Display the target series (the "quality" column of df).
y_train

803     6
124     5
350     6
682     5
1326    6
       ..
1130    6
1294    6
860     5
1459    7
1126    6
Name: quality, Length: 1599, dtype: int64

**랜덤 포레스트보다 엑스트라 트리(ExtraTreesRegressor)가 더 좋은 결과(더 낮은 평균 RMSE)를 보였음**

In [8]:
from sklearn.ensemble import ExtraTreesRegressor

# Baseline: default ExtraTreesRegressor with a fixed seed, fitted on all base features.
ext_reg = ExtraTreesRegressor(random_state=42)
ext_reg.fit(X_train, y_train)

ExtraTreesRegressor(random_state=42)

In [9]:
from sklearn.model_selection import cross_val_score

# 10-fold CV of the baseline extra-trees model. The scorer returns negated
# MSE, so the sign is flipped before the square root to get per-fold RMSE.
ext_scores = cross_val_score(
    ext_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores(ext_rmse_scores)

Scores: [0.53636683 0.52877216 0.59122595 0.55056619 0.56376746 0.58689543
 0.54629777 0.5734806  0.50869195 0.52686745]
Mean: 0.5512931789128495
Standard deviation: 0.02583987619504102


In [10]:
from sklearn.ensemble import RandomForestRegressor

# Comparison model: default RandomForestRegressor with the same seed and the same features.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [11]:
# 10-fold CV of the random-forest model, reported as per-fold RMSE
# (square root of the sign-flipped negated-MSE scores).
rf_scores = cross_val_score(
    rf_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
rf_rmse_scores = np.sqrt(np.negative(rf_scores))
display_scores(rf_rmse_scores)

Scores: [0.56245555 0.5485572  0.59725832 0.54845009 0.57064437 0.6029008
 0.56574674 0.61837084 0.52749526 0.54332832]
Mean: 0.5685207495809206
Standard deviation: 0.027678481419400287


1. fixed acidity
2. volatile acidity	
3. citric acid	
4. residual sugar	
5. chlorides	
6. free sulfur dioxide	
7. total sulfur dioxide	
8. density	
9. pH	
10. sulphates	
11. alcohol

# 특성 공학

In [None]:
# Leave-one-feature-out training frames: each drops the target plus exactly
# one base feature, so the feature's contribution can be judged from the CV score.
X_train_fixed = df.drop(columns=["fixed acidity", "quality"])
X_train_volatile = df.drop(columns=["volatile acidity", "quality"])
X_train_citric = df.drop(columns=["citric acid", "quality"])
X_train_residual = df.drop(columns=["residual sugar", "quality"])
X_train_chlorides = df.drop(columns=["chlorides", "quality"])
X_train_free = df.drop(columns=["free sulfur dioxide", "quality"])
X_train_total = df.drop(columns=["total sulfur dioxide", "quality"])
X_train_density = df.drop(columns=["density", "quality"])
X_train_pH = df.drop(columns=["pH", "quality"])
X_train_sulphates = df.drop(columns=["sulphates", "quality"])
X_train_alcohol = df.drop(columns=["alcohol", "quality"])

## fixed

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "fixed acidity".
ext_reg_fixed = ExtraTreesRegressor(random_state=42)
ext_reg_fixed.fit(X_train_fixed, y_train)

In [None]:
# 10-fold CV without "fixed acidity"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_fixed = cross_val_score(
    ext_reg_fixed,
    X_train_fixed,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_fixed = np.sqrt(np.negative(ext_scores_fixed))
display_scores(ext_rmse_scores_fixed)

## volatile

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "volatile acidity".
ext_reg_volatile = ExtraTreesRegressor(random_state=42)
ext_reg_volatile.fit(X_train_volatile, y_train)

In [None]:
# 10-fold CV without "volatile acidity"; negated-MSE scores are sign-flipped
# and square-rooted into per-fold RMSE.
ext_scores_volatile = cross_val_score(
    ext_reg_volatile,
    X_train_volatile,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_volatile = np.sqrt(np.negative(ext_scores_volatile))
display_scores(ext_rmse_scores_volatile)

## citric

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "citric acid".
ext_reg_citric = ExtraTreesRegressor(random_state=42)
ext_reg_citric.fit(X_train_citric, y_train)

In [None]:
# 10-fold CV without "citric acid"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_citric = cross_val_score(
    ext_reg_citric,
    X_train_citric,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_citric = np.sqrt(np.negative(ext_scores_citric))
display_scores(ext_rmse_scores_citric)

## residual

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "residual sugar".
ext_reg_residual = ExtraTreesRegressor(random_state=42)
ext_reg_residual.fit(X_train_residual, y_train)

In [None]:
# 10-fold CV without "residual sugar"; negated-MSE scores are sign-flipped
# and square-rooted into per-fold RMSE.
ext_scores_residual = cross_val_score(
    ext_reg_residual,
    X_train_residual,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_residual = np.sqrt(np.negative(ext_scores_residual))
display_scores(ext_rmse_scores_residual)

## chlorides

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "chlorides".
ext_reg_chlorides = ExtraTreesRegressor(random_state=42)
ext_reg_chlorides.fit(X_train_chlorides, y_train)

In [None]:
# 10-fold CV without "chlorides"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_chlorides = cross_val_score(
    ext_reg_chlorides,
    X_train_chlorides,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_chlorides = np.sqrt(np.negative(ext_scores_chlorides))
display_scores(ext_rmse_scores_chlorides)

## free

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "free sulfur dioxide".
ext_reg_free = ExtraTreesRegressor(random_state=42)
ext_reg_free.fit(X_train_free, y_train)

In [None]:
# 10-fold CV without "free sulfur dioxide"; negated-MSE scores are
# sign-flipped and square-rooted into per-fold RMSE.
ext_scores_free = cross_val_score(
    ext_reg_free,
    X_train_free,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_free = np.sqrt(np.negative(ext_scores_free))
display_scores(ext_rmse_scores_free)

## total

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "total sulfur dioxide".
ext_reg_total = ExtraTreesRegressor(random_state=42)
ext_reg_total.fit(X_train_total, y_train)

In [None]:
# 10-fold CV without "total sulfur dioxide"; negated-MSE scores are
# sign-flipped and square-rooted into per-fold RMSE.
ext_scores_total = cross_val_score(
    ext_reg_total,
    X_train_total,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_total = np.sqrt(np.negative(ext_scores_total))
display_scores(ext_rmse_scores_total)

## density

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "density".
ext_reg_density = ExtraTreesRegressor(random_state=42)
ext_reg_density.fit(X_train_density, y_train)

In [None]:
# 10-fold CV without "density"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_density = cross_val_score(
    ext_reg_density,
    X_train_density,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_density = np.sqrt(np.negative(ext_scores_density))
display_scores(ext_rmse_scores_density)

## pH

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "pH".
ext_reg_pH = ExtraTreesRegressor(random_state=42)
ext_reg_pH.fit(X_train_pH, y_train)

In [None]:
# 10-fold CV without "pH"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_pH = cross_val_score(
    ext_reg_pH,
    X_train_pH,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_pH = np.sqrt(np.negative(ext_scores_pH))
display_scores(ext_rmse_scores_pH)

## sulphates

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "sulphates".
ext_reg_sulphates = ExtraTreesRegressor(random_state=42)
ext_reg_sulphates.fit(X_train_sulphates, y_train)

In [None]:
# 10-fold CV without "sulphates"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_sulphates = cross_val_score(
    ext_reg_sulphates,
    X_train_sulphates,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_sulphates = np.sqrt(np.negative(ext_scores_sulphates))
display_scores(ext_rmse_scores_sulphates)

## alcohol

In [None]:
# Default extra-trees model (seed 42) fitted on the frame without "alcohol".
ext_reg_alcohol = ExtraTreesRegressor(random_state=42)
ext_reg_alcohol.fit(X_train_alcohol, y_train)

In [None]:
# 10-fold CV without "alcohol"; negated-MSE scores are sign-flipped and
# square-rooted into per-fold RMSE.
ext_scores_alcohol = cross_val_score(
    ext_reg_alcohol,
    X_train_alcohol,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_alcohol = np.sqrt(np.negative(ext_scores_alcohol))
display_scores(ext_rmse_scores_alcohol)

## 특성 몇 개 지우고 테스트

In [None]:
# Trial feature set: remove "fixed acidity" and "density" (plus the target),
# then fit a default extra-trees model (seed 42) on what remains.
X_train_test = df.drop(columns=["fixed acidity", "density", "quality"])
ext_reg_test = ExtraTreesRegressor(random_state=42)
ext_reg_test.fit(X_train_test, y_train)

In [None]:
# 10-fold CV on the reduced feature set; negated-MSE scores are sign-flipped
# and square-rooted into per-fold RMSE.
ext_scores_test = cross_val_score(
    ext_reg_test,
    X_train_test,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores_test = np.sqrt(np.negative(ext_scores_test))
display_scores(ext_rmse_scores_test)

In [None]:
# Seed sweep on the reduced feature set: score a default ExtraTreesRegressor
# with 10-fold CV for each random_state and keep the per-fold RMSE array that
# has the lowest mean.
#
# - cross_val_score clones the estimator and fits the clone on every fold, so
#   no separate .fit() call is made before scoring (it would be wasted work).
# - best_random_state records which seed produced past_rmse_scores.
# - CAUTION: choosing random_state by CV score tunes the model to fold noise;
#   treat the winning seed as exploratory, not as a genuine improvement.
N_RANDOM_STATES = 10000  # seeds 0 .. N_RANDOM_STATES - 1 are evaluated

# Seed 0 is the initial baseline (evaluated silently, as before).
ext_reg_test = ExtraTreesRegressor(random_state=0, n_jobs=-1)
ext_scores_test = cross_val_score(ext_reg_test, X_train_test, y_train, scoring="neg_mean_squared_error", cv=10)
past_rmse_scores = np.sqrt(-ext_scores_test)
best_random_state = 0
best_rmse_mean = past_rmse_scores.mean()  # cached so it is not recomputed every iteration

for seed in range(1, N_RANDOM_STATES):
    ext_reg_test = ExtraTreesRegressor(random_state=seed, n_jobs=-1)
    ext_scores_test = cross_val_score(ext_reg_test, X_train_test, y_train, scoring="neg_mean_squared_error", cv=10)
    current_rmse_scores_test = np.sqrt(-ext_scores_test)
    current_rmse_mean = current_rmse_scores_test.mean()

    # Strict "<" keeps the earliest seed when two seeds tie.
    if current_rmse_mean < best_rmse_mean:
        past_rmse_scores = current_rmse_scores_test
        best_rmse_mean = current_rmse_mean
        best_random_state = seed
        print(f"{seed}번째 rmse값:{np.mean(past_rmse_scores)}")

display_scores_mean(past_rmse_scores)  # scores of the best random_state (see best_random_state)

**기본 random_state(rs)가 42일 때 평균 RMSE는 0.5493939765435303임**

2346번째 rmse값:0.5454708458211521...

끝까지 하기엔 시간 오래걸려서 인터럽트 걸었음. 위는 무시해도 좋은 코드.

# volatile acidity

In [None]:
# Eight independent copies of the base feature frame; the name suffix
# presumably identifies the feature to be paired with "volatile acidity".
(
    X_train_vol_cit,
    X_train_vol_res,
    X_train_vol_chl,
    X_train_vol_fre,
    X_train_vol_tot,
    X_train_vol_pH,
    X_train_vol_sul,
    X_train_vol_alc,
) = (X_train.copy() for _ in range(8))

# citric acid

In [None]:
# Seven independent copies of the base feature frame; the name suffix
# presumably identifies the feature to be paired with "citric acid".
(
    X_train_cit_res,
    X_train_cit_chl,
    X_train_cit_fre,
    X_train_cit_tot,
    X_train_cit_pH,
    X_train_cit_sul,
    X_train_cit_alc,
) = (X_train.copy() for _ in range(7))

# residual sugar

In [None]:
# Six independent copies of the base feature frame; the name suffix
# presumably identifies the feature to be paired with "residual sugar".
(
    X_train_res_chl,
    X_train_res_fre,
    X_train_res_tot,
    X_train_res_pH,
    X_train_res_sul,
    X_train_res_alc,
) = (X_train.copy() for _ in range(6))

# chlorides

In [None]:
# Five independent copies of the base feature frame; the name suffix
# presumably identifies the feature to be paired with "chlorides".
(
    X_train_chl_fre,
    X_train_chl_tot,
    X_train_chl_pH,
    X_train_chl_sul,
    X_train_chl_alc,
) = (X_train.copy() for _ in range(5))

# free sulfur dioxide

In [None]:
# Four independent copies of the base feature frame; the name suffix
# presumably identifies the feature to be paired with "free sulfur dioxide".
(
    X_train_fre_tot,
    X_train_fre_pH,
    X_train_fre_sul,
    X_train_fre_alc,
) = (X_train.copy() for _ in range(4))

# total sulfur dioxide

In [None]:
# Three independent copies of the base feature frame; the name suffix
# presumably identifies the feature to be paired with "total sulfur dioxide".
(
    X_train_tot_pH,
    X_train_tot_sul,
    X_train_tot_alc,
) = (X_train.copy() for _ in range(3))

# pH, sulphates, alc

In [None]:
# Three independent copies of the base feature frame for the remaining
# pairings among pH, sulphates and alcohol (per the variable names).
(
    X_train_pH_sul,
    X_train_pH_alc,
    X_train_sul_alc,
) = (X_train.copy() for _ in range(3))

In [None]:
# Candidate feature "sample1": volatile acidity divided by chlorides, added on
# its own to the base features and scored with 10-fold CV (mean RMSE printed).
X_train_vol_chl["sample1"] = X_train["volatile acidity"].div(X_train["chlorides"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_vol_chl, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_vol_chl,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample9": pH minus sulphates, added on its own to the
# base features and scored with 10-fold CV (mean RMSE printed).
X_train_pH_sul["sample9"] = X_train["pH"].sub(X_train["sulphates"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_pH_sul, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_pH_sul,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample7": chlorides divided by pH, added on its own to
# the base features and scored with 10-fold CV (mean RMSE printed).
X_train_chl_pH["sample7"] = X_train["chlorides"].div(X_train["pH"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_chl_pH, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_chl_pH,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample6": chlorides times total sulfur dioxide, added on
# its own to the base features and scored with 10-fold CV (mean RMSE printed).
X_train_chl_tot["sample6"] = X_train["chlorides"].mul(X_train["total sulfur dioxide"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_chl_tot, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_chl_tot,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample2": citric acid divided by chlorides, added on its
# own to the base features and scored with 10-fold CV (mean RMSE printed).
X_train_cit_chl["sample2"] = X_train["citric acid"].div(X_train["chlorides"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_cit_chl, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_cit_chl,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample3": citric acid times sulphates, added on its own
# to the base features and scored with 10-fold CV (mean RMSE printed).
X_train_cit_sul["sample3"] = X_train["citric acid"].mul(X_train["sulphates"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_cit_sul, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_cit_sul,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample8": total sulfur dioxide times pH, added on its own
# to the base features and scored with 10-fold CV (mean RMSE printed).
X_train_tot_pH["sample8"] = X_train["total sulfur dioxide"].mul(X_train["pH"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_tot_pH, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_tot_pH,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample4": residual sugar divided by chlorides, added on
# its own to the base features and scored with 10-fold CV (mean RMSE printed).
X_train_res_chl["sample4"] = X_train["residual sugar"].div(X_train["chlorides"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_res_chl, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_res_chl,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

In [None]:
# Candidate feature "sample5": citric acid plus pH, added on its own to the
# base features and scored with 10-fold CV (mean RMSE printed).
X_train_cit_pH["sample5"] = X_train["citric acid"].add(X_train["pH"])

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_cit_pH, y_train)

ext_scores = cross_val_score(
    ext_reg,
    X_train_cit_pH,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

#뭔가 애매함...

이 위쪽은 의미있는 값들임

## 특성 공학으로 만든 데이터를 훈련셋에 넣음

In [12]:
# Base features plus the nine engineered columns, appended in sample1..sample9
# order. The trailing "#n" on each line matches the position of that feature's
# single-feature experiment cell earlier in the notebook.
X_train_comb = X_train.assign(
    sample1=X_train["volatile acidity"].div(X_train["chlorides"]),  #1
    sample2=X_train["citric acid"].div(X_train["chlorides"]),  #5
    sample3=X_train["citric acid"].mul(X_train["sulphates"]),  #6
    sample4=X_train["residual sugar"].div(X_train["chlorides"]),  #8
    # Ambiguous gain, but removing it raised the error, so it stays in. #9
    sample5=X_train["citric acid"].add(X_train["pH"]),
    sample6=X_train["chlorides"].mul(X_train["total sulfur dioxide"]),  #4
    sample7=X_train["chlorides"].div(X_train["pH"]),  #3
    sample8=X_train["total sulfur dioxide"].mul(X_train["pH"]),  #7
    sample9=X_train["pH"].sub(X_train["sulphates"]),  #2
)

# Second-order combinations that were tried one at a time. The trailing number
# is presumably the mean CV RMSE with the leading "0." dropped (sample20's
# 54204 matches the 0.54204... printed by this cell). Only sample20 is kept.
# X_train_comb["sample11"] = X_train_comb["sample1"] - X_train_comb["sample6"] #54485
# X_train_comb["sample12"] = X_train_comb["sample1"] * X_train_comb["sample8"] #54312
# X_train_comb["sample13"] = X_train_comb["sample2"] * X_train_comb["sample4"] #54358
# X_train_comb["sample15"] = X_train_comb["sample2"] - X_train_comb["sample8"] #54454
# X_train_comb["sample16"] = X_train_comb["sample3"] + X_train_comb["sample8"] #54430
# X_train_comb["sample18"] = X_train_comb["sample4"] - X_train_comb["sample5"] #54434
X_train_comb["sample20"] = X_train_comb["sample4"].add(X_train_comb["sample7"])  #54204
# X_train_comb["sample21"] = X_train_comb["sample5"] + X_train_comb["sample8"] #54422
# X_train_comb["sample22"] = X_train_comb["sample6"] / X_train_comb["sample7"] #54330
# X_train_comb["sample23"] = X_train_comb["sample7"] + X_train_comb["sample8"] #54343
# X_train_comb["sample24"] = X_train_comb["sample8"] + X_train_comb["sample9"] #54259

ext_reg = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg.fit(X_train_comb, y_train)

# 10-fold CV on the combined frame; negated MSE -> per-fold RMSE -> mean.
ext_scores = cross_val_score(
    ext_reg,
    X_train_comb,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores = np.sqrt(np.negative(ext_scores))
display_scores_mean(ext_rmse_scores)

Mean: 0.5420422095160929


기본 특성 조합 값 : 0.5452912557773699

20번째 샘플에서 가장 잘 나옴

0.5420422095160929

In [None]:
# Seed sweep on the combined feature frame: score a default
# ExtraTreesRegressor with 10-fold CV for each random_state and keep the
# per-fold RMSE array that has the lowest mean.
#
# - cross_val_score clones the estimator and fits the clone on every fold, so
#   no separate .fit() call is made before scoring (it would be wasted work).
# - best_random_state records which seed produced past_rmse_scores.
# - CAUTION: choosing random_state by CV score tunes the model to fold noise;
#   treat the winning seed as exploratory, not as a genuine improvement.
N_RANDOM_STATES = 10000  # seeds 0 .. N_RANDOM_STATES - 1 are evaluated

# Seed 0 is the initial baseline (evaluated silently, as before).
ext_reg_test = ExtraTreesRegressor(random_state=0, n_jobs=-1)
ext_scores_test = cross_val_score(ext_reg_test, X_train_comb, y_train, scoring="neg_mean_squared_error", cv=10)
past_rmse_scores = np.sqrt(-ext_scores_test)
best_random_state = 0
best_rmse_mean = past_rmse_scores.mean()  # cached so it is not recomputed every iteration

for seed in range(1, N_RANDOM_STATES):
    ext_reg_test = ExtraTreesRegressor(random_state=seed, n_jobs=-1)
    ext_scores_test = cross_val_score(ext_reg_test, X_train_comb, y_train, scoring="neg_mean_squared_error", cv=10)
    current_rmse_scores_test = np.sqrt(-ext_scores_test)
    current_rmse_mean = current_rmse_scores_test.mean()

    # Strict "<" keeps the earliest seed when two seeds tie.
    if current_rmse_mean < best_rmse_mean:
        past_rmse_scores = current_rmse_scores_test
        best_rmse_mean = current_rmse_mean
        best_random_state = seed
        print(f"{seed}번째 rmse값:{np.mean(past_rmse_scores)}")

display_scores_mean(past_rmse_scores)  # scores of the best random_state (see best_random_state)

**특성 조합 이후 rs값 추가해서 최종 결론 내리기!**

In [13]:
# Ensemble member 1: extra-trees with random_state=610 on the combined
# features, plus its mean 10-fold CV RMSE.
ext_reg1 = ExtraTreesRegressor(random_state=610, n_jobs=-1)
ext_reg1.fit(X_train_comb, y_train)

ext_scores1 = cross_val_score(
    ext_reg1,
    X_train_comb,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores1 = np.sqrt(np.negative(ext_scores1))
display_scores_mean(ext_rmse_scores1)

Mean: 0.5416769289737556


In [14]:
# Ensemble member 2: extra-trees with random_state=2449 on the combined
# features, plus its mean 10-fold CV RMSE.
ext_reg2 = ExtraTreesRegressor(random_state=2449, n_jobs=-1)
ext_reg2.fit(X_train_comb, y_train)

ext_scores2 = cross_val_score(
    ext_reg2,
    X_train_comb,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores2 = np.sqrt(np.negative(ext_scores2))
display_scores_mean(ext_rmse_scores2)

Mean: 0.5402247641277131


In [15]:
# Ensemble member 3: extra-trees with random_state=42 on the combined
# features, plus its mean 10-fold CV RMSE.
ext_reg3 = ExtraTreesRegressor(random_state=42, n_jobs=-1)
ext_reg3.fit(X_train_comb, y_train)

ext_scores3 = cross_val_score(
    ext_reg3,
    X_train_comb,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
ext_rmse_scores3 = np.sqrt(np.negative(ext_scores3))
display_scores_mean(ext_rmse_scores3)

Mean: 0.5420422095160929


## Voting

In [16]:
from sklearn.ensemble import VotingRegressor

# Combine the three extra-trees models (which differ only in random_state)
# in a VotingRegressor, fit it on the combined features and report the mean
# 10-fold CV RMSE.
voting_reg = VotingRegressor(
    estimators=[
        ("ext1", ext_reg1),
        ("ext2", ext_reg2),
        ("ext3", ext_reg3),
    ]
)
voting_reg.fit(X_train_comb, y_train)

voting_scores = cross_val_score(
    voting_reg,
    X_train_comb,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
voting_rmse_scores = np.sqrt(np.negative(voting_scores))
display_scores_mean(voting_rmse_scores)

Mean: 0.5393481159091247


# Result

In [18]:
# Final reported metric: mean 10-fold CV RMSE of the voting ensemble on the
# combined feature frame.
voting_scores = cross_val_score(
    voting_reg,
    X_train_comb,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
voting_rmse_scores = np.sqrt(np.negative(voting_scores))
display_scores_mean(voting_rmse_scores)

Mean: 0.5393481159091247
