In [1]:
# Data Loading
import pandas as pd
# Published Google Sheets CSV export that holds the wine dataset.
wine_data_path = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy/pub?gid=647503637&single=true&output=csv"

# Read the sheet into a DataFrame and preview only its first row.
wine = pd.read_csv(filepath_or_buffer=wine_data_path)
wine.head(n=1)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6


In [2]:
# Count the missing (NaN) entries in each column of the raw frame.
wine.isnull().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [3]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
wine.duplicated(keep="first").sum()

1168

In [4]:
# We need to remove duplicated rows and rows with missing values
import pandas as pd

# Start the cleaning pipeline from an independent copy of the frame already
# loaded into `wine`, instead of downloading the same CSV a second time.
# Copying keeps the raw frame untouched, and guarantees that the data being
# cleaned is exactly the data the NaN / duplicate checks were run on.
wine_data = wine.copy()
wine_data.shape

(6497, 13)

In [5]:
# Discard every row that has at least one missing value.
wine_data = wine_data.dropna(axis=0, how="any")

In [6]:
# Drop exact duplicate rows.
# The result is rebound rather than using inplace=True: at this point
# `wine_data` is the frame returned by dropna(), and mutating that frame in
# place is what makes pandas emit SettingWithCopyWarning.
wine_data = wine_data.drop_duplicates()
print('Wine Data Shape (After Dropping-) :',wine_data.shape)

Wine Data Shape (After Dropping-) : (5295, 13)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine_data.drop_duplicates(inplace=True)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [8]:
# Target: the wine `type` label.
# Features: every remaining column (the measured properties plus `quality`).
y = wine_data['type']
x = wine_data.drop(columns='type')

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardize the features. The scaler learns its statistics from the
# training rows only, then applies that same transformation to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [11]:
# Baseline model: 5-nearest-neighbours classifier fitted on all standardized
# features (no dimensionality reduction).
knn_no_pca = KNeighborsClassifier(n_neighbors=5)
knn_no_pca.fit(X=x_train_scaled, y=y_train)

In [12]:
# Predict the wine type for the held-out rows with the baseline (no-PCA) model.
y_pred_no_pca = knn_no_pca.predict(X=x_test_scaled)

In [13]:
# Score the baseline predictions against the true test labels, then report
# the number of input features alongside the accuracy as a percentage.
accuracy_no_pca = accuracy_score(y_true=y_test, y_pred=y_pred_no_pca)
print("KNN having features -", x_train_scaled.shape[1])
print(
    "Classification Accuracy without PCA: {:.2f}%".format(accuracy_no_pca * 100)
)

KNN having features - 12
Classification Accuracy without PCA: 99.34%


In [14]:
# Reduce the standardized features to 5 principal components. The projection
# is learned from the training split and then reused for the test split.
pca = PCA(n_components=5)
x_train_pca = pca.fit_transform(X=x_train_scaled)
x_test_pca = pca.transform(X=x_test_scaled)

In [15]:
# Same 5-nearest-neighbours classifier, this time fitted on the PCA-reduced
# training features.
knn_with_pca = KNeighborsClassifier(n_neighbors=5)
knn_with_pca.fit(X=x_train_pca, y=y_train)

In [17]:
# Predict the wine type for the held-out rows with the PCA-based model.
y_pred_pca = knn_with_pca.predict(X=x_test_pca)

In [18]:
# Score the PCA-based predictions against the true test labels.
accuracy_pca = accuracy_score(y_true=y_test, y_pred=y_pred_pca)

In [19]:
# Report the reduced feature count and the PCA model's accuracy as a percentage.
print("KNN having features -", x_train_pca.shape[1])
print(
    "Classification Accuracy with PCA: {:.2f}%".format(accuracy_pca * 100)
)

KNN having features - 5
Classification Accuracy with PCA: 98.87%
