In [51]:
import pandas as pd

In [52]:
# Read the white-wine quality data; fields in this file are separated by ';'
df = pd.read_csv('winequality-white.csv', sep=';')

In [53]:
# Create binary target variable to analyze wine quality:
#   y = 0 when quality <= 5, y = 1 otherwise.
# A vectorized comparison replaces the row-wise apply/lambda. The test is
# written as "not (quality <= 5)" rather than "quality > 5" so that a missing
# quality value still maps to 1, exactly as the original lambda did.
df['y'] = (~df['quality'].le(5)).astype('int64')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,y
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,1
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,1
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,1
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1


In [54]:
# Class imbalance ratio: share of rows falling in each target class
num_rows = len(df)
zero_count = df['y'].eq(0).sum()
one_count = df['y'].eq(1).sum()

proportion_zero, proportion_one = zero_count / num_rows, one_count / num_rows
proportion_zero, proportion_one

(0.33483054307880766, 0.6651694569211923)

In [55]:
# Split data
from sklearn.model_selection import train_test_split

# Target is the binary label; features are every column except the raw
# quality score and the label derived from it
y = df['y']
X = df.drop(columns=['quality', 'y'])

# Stratified three-way split: hold out 40% first, then halve that hold-out,
# giving train (60%), validation (20%) and test (20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [56]:
# Normalize data
# StandardScaler is imported here because no visible earlier cell brings it
# into scope; without the import this cell raises NameError on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [64]:
# kNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

neighbors = [1,3,5]
knn_metrics = []

for n in neighbors:
    # Fit a classifier with n neighbours on the scaled training split
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train_scaled, y_train)

    # Score its predictions on the validation split
    y_pred = knn.predict(X_val_scaled)
    accuracy, precision, recall, f1 = (
        accuracy_score(y_val, y_pred),
        precision_score(y_val, y_pred, zero_division=0),
        recall_score(y_val, y_pred),
        f1_score(y_val, y_pred),
    )
    knn_metrics.append([accuracy, precision, recall, f1])

# Transpose the per-model rows so each printed line holds one metric,
# with values in the same order as `neighbors`
for metric_name, per_model in zip(("accuracy", "precision", "recall", "f1"), zip(*knn_metrics)):
    print(f"{metric_name}: {per_model}")

accuracy: (0.773469387755102, 0.7510204081632653, 0.753061224489796)
precision: (0.8257575757575758, 0.7947976878612717, 0.8005865102639296)
recall: (0.8358895705521472, 0.843558282208589, 0.8374233128834356)
f1: (0.8307926829268293, 0.8184523809523809, 0.8185907046476761)


In [65]:
# SVM
from sklearn.svm import SVC

kernels = ["rbf","linear","poly"]
svm_metrics = []

for k in kernels:
    # Fit a support-vector classifier with kernel k on the scaled training split
    svm = SVC(kernel=k).fit(X_train_scaled, y_train)

    # Predict on the validation split
    y_pred = svm.predict(X_val_scaled)

    # Collect accuracy, precision, recall and F1 for this kernel
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, zero_division=0)
    recall, f1 = recall_score(y_val, y_pred), f1_score(y_val, y_pred)
    svm_metrics.append([accuracy, precision, recall, f1])

# One output line per metric; values follow the order of `kernels`
metric_names = ["accuracy", "precision", "recall", "f1"]
for metric_name, per_kernel in zip(metric_names, zip(*svm_metrics)):
    print(f"{metric_name}: {per_kernel}")

accuracy: (0.7612244897959184, 0.736734693877551, 0.726530612244898)
precision: (0.7960339943342776, 0.7585301837270341, 0.7335766423357665)
recall: (0.8619631901840491, 0.8865030674846626, 0.9248466257668712)
f1: (0.8276877761413843, 0.8175388967468176, 0.8181818181818182)


In [67]:
# Decision Tree
# Trains one tree per split criterion and scores each on the validation split.
from sklearn.tree import DecisionTreeClassifier as DTC

criterion = ["gini","entropy"]
# One row per criterion, each row being [accuracy, precision, recall, f1]
dtc_metrics = []

for c in criterion:
    # Train the model. random_state is fixed so the fitted tree, and therefore
    # the metrics printed below, are reproducible from run to run.
    dtc = DTC(criterion=c, random_state=42)
    dtc.fit(X_train_scaled,y_train)

    # Store predictions from validation set
    y_pred = dtc.predict(X_val_scaled)

    # Record model prediction metrics
    accuracy = accuracy_score(y_val,y_pred)
    # zero_division=0 reports precision as 0 instead of warning when no
    # positive predictions are made
    precision = precision_score(y_val,y_pred, zero_division=0)
    recall = recall_score(y_val,y_pred)
    f1 = f1_score(y_val,y_pred)
    dtc_metrics.append([accuracy,precision,recall,f1])

# Each line shows one metric as (gini, entropy)
print(f"accuracy: {dtc_metrics[0][0],dtc_metrics[1][0]}")
print(f"precision: {dtc_metrics[0][1],dtc_metrics[1][1]}")
print(f"recall: {dtc_metrics[0][2],dtc_metrics[1][2]}")
print(f"f1: {dtc_metrics[0][3],dtc_metrics[1][3]}")

accuracy: (0.7653061224489796, 0.7591836734693878)
precision: (0.8196969696969697, 0.8260188087774295)
recall: (0.8297546012269938, 0.808282208588957)
f1: (0.8246951219512195, 0.8170542635658915)


In [68]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

penalties = ["l1","l2"]
lReg_metrics = []

for p in penalties:
    # Fit a liblinear logistic regression with penalty p on the scaled training split
    lReg = LogisticRegression(solver='liblinear', penalty=p)
    lReg.fit(X_train_scaled, y_train)

    # Predict on the validation split and score the predictions
    y_pred = lReg.predict(X_val_scaled)
    accuracy, precision = accuracy_score(y_val, y_pred), precision_score(y_val, y_pred, zero_division=0)
    recall, f1 = recall_score(y_val, y_pred), f1_score(y_val, y_pred)
    lReg_metrics.append([accuracy, precision, recall, f1])

# Print each metric as a tuple ordered like `penalties`
for col, metric_name in enumerate(["accuracy", "precision", "recall", "f1"]):
    per_penalty = tuple(row[col] for row in lReg_metrics)
    print(f"{metric_name}: {per_penalty}")

accuracy: (0.7336734693877551, 0.7336734693877551)
precision: (0.7603195739014648, 0.7603195739014648)
recall: (0.8757668711656442, 0.8757668711656442)
f1: (0.8139700641482538, 0.8139700641482538)


kNN accuracy: 1 neighbor: 0.773469387755102 kNN accuracy: 3 neighbors: 0.7510204081632653
