In [1]:
%load_ext watermark
%watermark  -d -u -a 'Shihao Yang, Yifan Zhang, Kexiao Zhu' -v -p numpy,scipy,matplotlib,sklearn

Author: Shihao Yang, Yifan Zhang, Kexiao Zhu

Last updated: 2021-12-06

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

numpy     : 1.20.1
scipy     : 1.6.2
matplotlib: 3.3.4
sklearn   : 0.24.1



In [2]:
import numpy as np
import pandas as pd

In [3]:
url = 'https://raw.githubusercontent.com/Hawk9808/STAT-451-project-group/main/winequalityN.csv'
df = pd.read_csv(url, error_bad_lines=False)
print(df.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    6487.000000       6489.000000  6494.000000     6495.000000   
mean        7.216579          0.339691     0.318722        5.444326   
std         1.296750          0.164649     0.145265        4.758125   
min         3.800000          0.080000     0.000000        0.600000   
25%         6.400000          0.230000     0.250000        1.800000   
50%         7.000000          0.290000     0.310000        3.000000   
75%         7.700000          0.400000     0.390000        8.100000   
max        15.900000          1.580000     1.660000       65.800000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  6495.000000          6497.000000           6497.000000  6497.000000   
mean      0.056042            30.525319            115.744574     0.994697   
std       0.035036            17.749400             56.521855     0.002999   
min       0.009000             1.000000         

In [4]:
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
label_list = []
labels_map = {
    "white": 0,
    "red": 1}
for i in range(len(df)):
    label_list.append(labels_map[df["type"][i]])
df["type"] = label_list

In [6]:
labels_map = {
    10: 'high',
    9: 'high',
    8: 'high',
    7: 'medium',
    6: 'medium',
    5: 'medium',
    4: 'medium',
    3: 'low',
    2: 'low',
    1: 'low'
}

In [7]:
df['quality'] = df['quality'].map(labels_map)

In [8]:
labels_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}

In [9]:
label_list2 = []
for i in range(len(df)):
    label_list2.append(labels_map[df["quality"][i]])
df["label"] = label_list2

df = df.drop(columns=['quality'])
# df = df.dropna()
df = df.fillna(df.mean())#fill the na value with the overall mean

In [10]:
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,label
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1
1,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,1
2,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,1
3,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1
4,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1


## Feature Selection

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from numpy import set_printoptions
from sklearn.ensemble import ExtraTreesClassifier
# load data
array = df.values
X = array[:,0:12]
Y = array[:,12]
# feature extraction
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
print(model.feature_importances_)

[0.00387471 0.0974098  0.08490494 0.08529245 0.09011189 0.08501937
 0.09456813 0.09088265 0.09097636 0.08357909 0.07963376 0.11374686]


As we can see from above feature importance, we can exclude the wine type since it is the least important feature among others.

In [12]:
df = df.drop(columns=['type'])

X = df[['fixed acidity',"volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide","density","pH","sulphates","alcohol"]].values
y = df['label'].values

## Train test split 

In [13]:
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
print('X_train.shape:', X_train.shape)
print('y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)

X_train.shape: (4547, 11)
y_train.shape: (4547,)
X_test.shape: (1950, 11)
y_test.shape: (1950,)


In [14]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,y_train)
lr=lr.score(X_test,y_test)

In [15]:
from sklearn.svm import SVC
svm=SVC()
svm.fit(X_train,y_train)
svm=svm.score(X_test,y_test)

In [16]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
rf=rf.score(X_test,y_test)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
knn=knn.score(X_test,y_test)

In [18]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
dt=dt.score(X_test,y_test)

In [19]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb.fit(X_train,y_train)
nb=nb.score(X_test,y_test)

In [20]:
models = pd.DataFrame({
    'Model': ['Linear Regression', 'KNN', 'SVM', 
              'Random Forest', 'Naive Bayes', 'Decision Tree'],
    'Score': [lr,knn,svm,rf,nb,dt]})

In [21]:
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,0.973333
2,SVM,0.965128
1,KNN,0.964615
5,Decision Tree,0.943077
4,Naive Bayes,0.886154
0,Linear Regression,0.033481


## import another wine quality dataset to test the model

In [22]:
url = 'https://raw.githubusercontent.com/Hawk9808/STAT-451-project-group/main/winequality-red.csv'
df_test = pd.read_csv(url, error_bad_lines=False)
# df_test.head()
# df_test.info()

In [23]:
labels_map = {
    10: 'high',
    9: 'high',
    8: 'high',
    7: 'medium',
    6: 'medium',
    5: 'medium',
    4: 'medium',
    3: 'low',
    2: 'low',
    1: 'low'
}
df_test['quality'] = df_test['quality'].map(labels_map)
labels_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}
label_list2 = []
for i in range(len(df_test)):
    label_list2.append(labels_map[df_test["quality"][i]])
df_test["label"] = label_list2

df_test = df_test.drop(columns=['quality'])
df_test = df_test.fillna(df.mean())#fill the na value with the overall mean
df_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1


We test on the randomforest model which is the highest score model we've got from model selection step

In [24]:
X = df_test[['fixed acidity',"volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide","density","pH","sulphates","alcohol"]].values
y = df_test['label'].values

rf=RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
rf=rf.score(X,y)

print(f"Test Accuracy: {100*rf:0.2f}%")

Test Accuracy: 99.69%


In [25]:
!git config --global user.name "Hawk9808"