# Import Library

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load Dataset

In [99]:
dataset = pd.read_csv('winequality-red.csv')
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [100]:
print(dataset)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [101]:
print(X)

[[ 7.4    0.7    0.    ...  3.51   0.56   9.4  ]
 [ 7.8    0.88   0.    ...  3.2    0.68   9.8  ]
 [ 7.8    0.76   0.04  ...  3.26   0.65   9.8  ]
 ...
 [ 6.3    0.51   0.13  ...  3.42   0.75  11.   ]
 [ 5.9    0.645  0.12  ...  3.57   0.71  10.2  ]
 [ 6.     0.31   0.47  ...  3.39   0.66  11.   ]]


In [102]:
print(y)

[5 5 5 ... 6 5 6]


# Preprocessing

In [103]:
#head of data
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [104]:
#type of column
dataset.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [105]:
# checking for missing value
dataset.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

#  Splitting the Dataset into the Training set and Test set

In [106]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling

In [107]:
from sklearn.preprocessing import StandardScaler
st_sc = StandardScaler()

X_train= st_sc.fit_transform(X_train)
X_test= st_sc.fit_transform(X_test)

#y_train =  st_sc.fit_transform(y_train.reshape(-1, 1))
#y_test =  st_sc.fit_transform(y_test.reshape(-1, 1))

In [108]:
print(y_train)

[5 5 5 ... 6 6 5]


# Applying LDA

In [109]:
#Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 2)
X_train = lda.fit_transform(X =X_train, y = y_train)
X_test = lda.transform(X = X_test)

# Logistic Regression

In [110]:
from sklearn.linear_model import LogisticRegression
log_obj = LogisticRegression()
log_obj.fit(X_train, y_train)

LogisticRegression()

In [111]:
log_pred = log_obj.predict(X_test)

In [112]:
#Calculate Accuercy rate
from sklearn.metrics import accuracy_score
log_acr = accuracy_score(y_test, log_pred)

In [113]:
print(log_acr)

0.6275


In [114]:
#Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score
accurecies = cross_val_score(estimator = log_obj, X = X_train, y = y_train )
print(accurecies.mean())

0.5871513249651324


# K-Nearest Neighbors

In [115]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [116]:
neigh_pred = neigh.predict(X_test)

In [117]:
#Calculate Accuercy rate
from sklearn.metrics import accuracy_score
neigh_acr = accuracy_score(y_test, neigh_pred)

In [118]:
print(neigh_acr)

0.5975


In [119]:
#Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score
accurecies = cross_val_score(estimator = neigh, X = X_train, y = y_train )
print(accurecies.mean())

0.5663145048814505


# Naive Bayes

In [120]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB()

In [121]:
gnb_pred = gnb.predict(X_test)

In [122]:
#Calculate Accuercy rate
from sklearn.metrics import accuracy_score
gnb_acr = accuracy_score(y_test, gnb_pred)

In [123]:
print(gnb_acr)

0.5775


In [124]:
#Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score
accurecies = cross_val_score(estimator = gnb, X = X_train, y = y_train )
print(accurecies.mean())

0.5829776847977686


# Decision Tree

In [125]:
from sklearn.tree import DecisionTreeClassifier
tree_cls = DecisionTreeClassifier()
tree_cls.fit(X_train, y_train)

DecisionTreeClassifier()

In [126]:
tree_pred = tree_cls.predict(X_test)

In [127]:
#Calculate Accuercy rate
from sklearn.metrics import accuracy_score
tree_acr = accuracy_score(y_test, tree_pred)

In [128]:
print(tree_acr)

0.49


In [129]:
#Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score
accurecies = cross_val_score(estimator = tree_cls, X = X_train, y = y_train )
print(accurecies.mean())

0.5546443514644352


# Support Vector Machines

In [130]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [131]:
svc_pred = svc.predict(X_test)

In [132]:
#Calculate Accuercy rate
from sklearn.metrics import accuracy_score
svc_acr = accuracy_score(y_test, svc_pred)

In [133]:
print(svc_acr)

0.62


In [134]:
#Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score
accurecies = cross_val_score(estimator = svc, X = X_train, y = y_train )
print(accurecies.mean())

0.5879916317991631


# Random Forest Classifier

In [135]:
from sklearn.ensemble import RandomForestClassifier
rand_cl = RandomForestClassifier(n_estimators = 85)
rand_cl.fit(X_train, y_train)

RandomForestClassifier(n_estimators=85)

In [136]:
rand_pred = rand_cl.predict(X_test)

In [137]:
#Calculate Accuercy rate
from sklearn.metrics import accuracy_score
rand_acr = accuracy_score(y_test, rand_pred)

In [138]:
print(rand_acr)

0.555


In [139]:
#Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score
accurecies = cross_val_score(estimator = rand_cl, X = X_train, y = y_train )
print(accurecies.mean())

0.5996687587168757
