In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier # Import the KNeighborsClassifier class

In [None]:
# Read the wine-quality CSV into a DataFrame and preview its first rows.
# NOTE(review): the path looks like a Colab working directory — confirm
# before running in another environment.
csv_path = "/content/WineQT.csv"
dataset = pd.read_csv(csv_path)

print(dataset.head())


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Id  
0      9.4        5   0  
1      9.8        5   1  
2      9

In [None]:
# Define features (X) and target (y).
# Besides the target, the CSV carries an "Id" column that appears to be a
# plain row identifier (0, 1, 2, ... in the head() preview). It is excluded
# from the features so the models cannot fit on row order instead of on the
# wine measurements. The membership check keeps the cell working on a copy
# of the data that has no "Id" column.
non_feature_cols = ["quality"] + [col for col in ("Id",) if col in dataset.columns]

X = dataset.drop(columns=non_feature_cols)
y = dataset["quality"]

In [None]:
# Convert quality into 3 categories: Low (0), Medium (1), High (2)
#   quality <= 5 -> 0, quality >= 7 -> 2, anything in between -> 1
#
# The labels are derived from the raw `dataset["quality"]` column rather than
# from `y` itself. With `y = y.apply(...)`, re-running this cell would bin the
# already-binned labels (0/1/2 are all <= 5) and silently collapse every
# sample into class 0.
y = dataset["quality"].apply(lambda q: 0 if q <= 5 else (2 if q >= 7 else 1))


# 70 % train, then the remaining 30 % is halved into validation and test
# (15 % each). Both splits are stratified so the class proportions are
# preserved, and seeded so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)


In [None]:
# Standardise the features.
# The scaler is fitted on the training split only; that same fitted scaler
# is then applied to the train, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Logistic Regression
# Exhaustive 5-fold grid search over the C values and solvers listed below,
# scored by accuracy on the scaled training split.
# NOTE(review): the target has three classes; newer scikit-learn releases
# reportedly deprecate 'liblinear' for multiclass problems — confirm against
# the installed version.
log_reg_params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
)

grid_log = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
print("Best Parameters (Logistic Regression):", grid_log.best_params_)
print("Best Accuracy (Logistic Regression):", grid_log.best_score_)

Best Parameters (Logistic Regression): {'C': 0.1, 'solver': 'lbfgs'}
Best Accuracy (Logistic Regression): 0.6250000000000001


In [None]:
# Random Forest
# Exhaustive 5-fold grid search over forest size, maximum tree depth and the
# minimum number of samples required to split a node, scored by accuracy.

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# random_state is pinned (same seed as the train/test splits) because the
# forest's bootstrap sampling and per-split feature sub-sampling are random.
# Without a seed, the selected parameters and the reported accuracies change
# from run to run, so the results cannot be reproduced.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train_scaled, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
print("Best Parameters (Random Forest):", grid_rf.best_params_)
print("Best Accuracy (Random Forest):", grid_rf.best_score_)

Best Parameters (Random Forest): {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Best Accuracy (Random Forest): 0.69375


In [None]:
# K-Nearest Neighbors (KNN)
# Randomised search: 10 parameter combinations are sampled (seeded) from the
# grid below and each is scored by 5-fold cross-validated accuracy.
knn_params = dict(
    n_neighbors=range(1, 20),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

random_knn = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=knn_params,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Best sampled combination and its mean cross-validated accuracy.
print("Best Parameters (KNN):", random_knn.best_params_)
print("Best Accuracy (KNN):", random_knn.best_score_)

Best Parameters (KNN): {'weights': 'distance', 'n_neighbors': 18, 'metric': 'euclidean'}
Best Accuracy (KNN): 0.6312500000000001


In [None]:
# Evaluate the tuned models on the held-out test split.
# NOTE(review): no untuned baseline is computed in this cell, and the
# validation split (X_val_scaled / y_val) is not used by the visible cells —
# confirm whether a baseline / validation comparison was intended.

# Refitted best estimator from each hyper-parameter search
best_log_reg, best_rf, best_knn = (
    search.best_estimator_ for search in (grid_log, grid_rf, random_knn)
)

# Test-set accuracy for each tuned model, in the same order as above
log_tuned_acc, rf_tuned_acc, knn_tuned_acc = (
    accuracy_score(y_test, fitted.predict(X_test_scaled))
    for fitted in (best_log_reg, best_rf, best_knn)
)

# Report
print("Tuned Accuracy (Logistic Regression):", log_tuned_acc)
print("Tuned Accuracy (Random Forest):", rf_tuned_acc)
print("Tuned Accuracy (KNN):", knn_tuned_acc)

Tuned Accuracy (Logistic Regression): 0.6337209302325582
Tuned Accuracy (Random Forest): 0.6802325581395349
Tuned Accuracy (KNN): 0.6802325581395349
