In [1]:
import pandas as pd
import numpy as np

# Read the red-wine quality table (a ';'-separated CSV file).
# The location can be overridden with the WINEQUALITY_RED_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original hard-coded path is used, so existing setups keep working.
import os
REDWINE_CSV=os.environ.get('WINEQUALITY_RED_CSV',
                           r'C:\Users\Fu\Documents\GitHub\Test1\winequality-red.csv')
redwine=pd.read_csv(REDWINE_CSV,sep=';')
# Summary statistics, transposed so that each described column becomes a row.
redwine.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [2]:
# Predictors are the first 11 columns of the table; the target is a boolean
# that is True when the quality score is above 6.
X_red=redwine.iloc[:,:11]
Y_red=redwine['quality'].gt(6)

# Keep only three features for the methods compared below.
X=X_red[['alcohol','volatile acidity','density']]

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(
    X,Y_red,
    test_size=0.3,
    random_state=0,
)

In [3]:
# K-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# 10 neighbours, distance-weighted votes, Minkowski metric with p=2.
KNN=KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    metric='minkowski',
    p=2,
).fit(X_train,Y_train)
# Predictions on both splits; only the test predictions are scored.
Y_train_pred_KNN,Y_test_pred_KNN=KNN.predict(X_train),KNN.predict(X_test)
ac_KNN=accuracy_score(y_true=Y_test,y_pred=Y_test_pred_KNN)

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Entropy split criterion, depth capped at 3, fixed seed.
tr=DecisionTreeClassifier(
    random_state=0,
    max_depth=3,
    criterion='entropy',
).fit(X_train,Y_train)
# Predictions on both splits; only the test predictions are scored.
Y_train_pred_tr,Y_test_pred_tr=tr.predict(X_train),tr.predict(X_test)
ac_tr=accuracy_score(y_true=Y_test,y_pred=Y_test_pred_tr)

# Bagging: an ensemble of 10 copies of the decision tree `tr`, each fitted on a
# bootstrap sample of the training rows (all features are kept).
from sklearn.ensemble import BaggingClassifier
# The base tree is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, so the positional form is the one accepted by old and new releases.
# random_state is fixed, as for the other models here, so the bootstrap draws
# (and therefore the reported accuracy) are repeatable.
# NOTE(review): an integer max_samples draws exactly that many rows, i.e. each
# tree sees only 10 training samples - confirm this is intended (a float would
# be interpreted as a fraction of the training set).
bag=BaggingClassifier(tr,n_estimators=10,
                      max_samples=10,
                      bootstrap=True,bootstrap_features=False,
                      n_jobs=1,random_state=0)
bag.fit(X_train,Y_train)
# Predictions on both splits; only the test predictions are scored.
Y_train_pred_bag=bag.predict(X_train)
Y_test_pred_bag=bag.predict(X_test)
ac_bag=accuracy_score(Y_test,Y_test_pred_bag)

# AdaBoost: 10 boosting rounds over the decision tree `tr`, learning rate 0.5.
from sklearn.ensemble import AdaBoostClassifier
# The base tree is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, so the positional form is the one accepted by old and new releases.
ad=AdaBoostClassifier(tr,n_estimators=10,
                      learning_rate=0.5,random_state=0)
ad.fit(X_train,Y_train)
# Predictions on both splits; only the test predictions are scored.
Y_train_pred_ad=ad.predict(X_train)
Y_test_pred_ad=ad.predict(X_test)
ac_ad=accuracy_score(Y_test,Y_test_pred_ad)

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Two entropy-criterion trees, fixed seed.
RF=RandomForestClassifier(
    n_estimators=2,
    criterion='entropy',
    random_state=0,
).fit(X_train,Y_train)
# Predictions on both splits; only the test predictions are scored.
Y_train_pred_RF,Y_test_pred_RF=RF.predict(X_train),RF.predict(X_test)
ac_RF=accuracy_score(y_true=Y_test,y_pred=Y_test_pred_RF)

In [4]:
# Report the test-set accuracy of every method, one line per model,
# each value formatted to three decimals.
print('The test accuracy of KNN method is  %.3f' % ac_KNN,
      'The test accuracy of Decision Tree is  %.3f' % ac_tr,
      'The test accuracy of Bagging is  %.3f' % ac_bag,
      'The test accuracy of AdaBoost is  %.3f' % ac_ad,
      'The test accuracy of Random Forest is  %.3f' % ac_RF,
      sep='\n')

The test accuracy of KNN method is  0.873
The test accuracy of Decision Tree is  0.869
The test accuracy of Bagging is  0.890
The test accuracy of AdaBoost is  0.896
The test accuracy of Random Forest is  0.892


In [8]:
# Decision-tree visualisation uses two features instead of three, so that the
# decision regions can be drawn in a plane.
X_train2=X_train[['alcohol','volatile acidity']]
# NOTE(review): tr2 is fitted on the unscaled feature values, so any decision
# surface drawn with it has to be evaluated on unscaled coordinates as well.
tr2=DecisionTreeClassifier(
    random_state=0,
    max_depth=3,
    criterion='entropy',
).fit(X_train2,Y_train)

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plot
def plot_decision(X,Y,classifier,resolution=0.02):
    """Draw the decision regions of a fitted two-feature classifier.

    The plane spanned by the two columns of ``X`` (padded by 1 on every side)
    is sampled on a regular grid, ``classifier.predict`` is evaluated on every
    grid point and the result is drawn as a filled contour plot. The samples
    in ``X`` are then scattered on top, one marker/colour per class in ``Y``.
    Everything is drawn on the current pyplot figure; nothing is returned.

    Parameters
    ----------
    X : array-like with two columns
        Sample coordinates. Converted with ``np.asarray``, so arrays,
        DataFrames and nested lists are all accepted.
    Y : array-like
        Class label of every row of ``X``.
    classifier : object with a ``predict`` method
        Must accept an ``(n_points, 2)`` array expressed in the same
        coordinates (same scaling) as ``X``.
    resolution : float, default 0.02
        Grid step along both axes.

    Notes
    -----
    Only five markers/colours are defined, so at most five distinct classes
    can be drawn; a sixth class raises ``IndexError``.
    """
    # Plain arrays make the positional/boolean indexing below work regardless
    # of the container type (and pandas index) the caller passed in.
    X=np.asarray(X)
    Y=np.asarray(Y)
    classes=np.unique(Y)

    # setup marker generator and color map
    markers=('s','x','o','^','v')
    colors=('red','blue','lightgreen','gray','cyan')
    cmap=ListedColormap(colors[:len(classes)])

    # decision surface
    x1_min,x1_max=X[:,0].min()-1,X[:,0].max()+1
    x2_min,x2_max=X[:,1].min()-1,X[:,1].max()+1
    xx1,xx2=np.meshgrid(np.arange(x1_min,x1_max,resolution),
                        np.arange(x2_min,x2_max,resolution))
    grid=np.column_stack([xx1.ravel(),xx2.ravel()])
    Z=np.asarray(classifier.predict(grid)).reshape(xx1.shape)
    plot.contourf(xx1,xx2,Z,alpha=0.4,cmap=cmap)
    plot.xlim(xx1.min(),xx1.max())
    plot.ylim(xx2.min(),xx2.max())

    # plot class samples
    for i,cl in enumerate(classes):
        mask=(Y==cl)
        # `color=` (not `c=`): a single RGBA tuple given through `c` is
        # ambiguous and may be read as per-point values to colour-map when
        # the class happens to contain 3 or 4 points.
        plot.scatter(x=X[mask,0],y=X[mask,1],
                     alpha=0.8,color=cmap(i),
                     marker=markers[i],label=cl)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
# Standardize the two plotting features.
st=StandardScaler()
X_train2_std=st.fit_transform(X_train2)

# tr2 was fitted on the raw features, so its split thresholds are in raw units
# and evaluating it on a standardized grid would draw meaningless regions.
# Fit an unfitted copy with the same hyper-parameters on the standardized
# features instead, so that classifier and plot share one coordinate system.
tr2_std=clone(tr2).fit(X_train2_std,Y_train)

plot_decision(X_train2_std,Y_train,classifier=tr2_std)
# Column order of X_train2 is alcohol, then volatile acidity.
plot.xlabel('alcohol [standardized]')
plot.ylabel('volatile acidity [standardized]')
plot.legend(loc='upper left')
plot.show()

