In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the wine dataset
wine_df = pd.read_csv("C:\\Users\\jessi\\OneDrive\\Documents\\Dorset\\Data Analysis\\winequality-red\\winequality-red.csv",sep=';')

In [6]:
# Categorize the quality column into low, medium, and high
wine_df['review'] = pd.cut(wine_df['quality'], bins=[0, 5, 6, 10], labels=['low', 'medium', 'high'])

# Drop the original quality column
wine_df.drop('quality', axis=1, inplace=True)

# Encode the review column
label_encoder = LabelEncoder()
wine_df['review'] = label_encoder.fit_transform(wine_df['review'])

# Train-test split and logistic regression

In [3]:
# Separate features (X) and target variable (y)
X = wine_df.drop('review', axis=1)
y = wine_df['review']



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply logistic regression
logistic_reg = LogisticRegression(max_iter=1000, random_state=42)
logistic_reg.fit(X_train_scaled, y_train)

# Predict the classes for test set
y_pred = logistic_reg.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.625

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.32      0.41        47
           1       0.68      0.79      0.73       141
           2       0.56      0.55      0.56       132

    accuracy                           0.62       320
   macro avg       0.61      0.56      0.57       320
weighted avg       0.62      0.62      0.61       320



In [4]:
# Define the parameter grid for grid search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Create a logistic regression model
logistic_reg = LogisticRegression(max_iter=1000, random_state=42)

# Perform grid search cross-validation
grid_search = GridSearchCV(logistic_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Predict the classes for test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

# Generate a classification report for the best model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the best model
import joblib
joblib.dump(best_model, "wine_quality_classification_model.pkl")

Best Parameters: {'C': 1, 'solver': 'newton-cg'}
Best Cross-Validation Score: 0.6357138480392157

Accuracy: 0.625

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.32      0.41        47
           1       0.68      0.79      0.73       141
           2       0.56      0.55      0.56       132

    accuracy                           0.62       320
   macro avg       0.61      0.56      0.57       320
weighted avg       0.62      0.62      0.61       320



['wine_quality_classification_model.pkl']