In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the CSV into a DataFrame.
data = pd.read_csv("winequality-red-cleaned.csv")

# Keep only the rows that carry a 'quality' label.
data_clean = data.dropna(subset=["quality"])

# 'quality' is the target; every other column is a feature.
y = data_clean["quality"]
X = data_clean.drop(columns="quality")

# Hold out a test_size=0.2 share of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded random forest; fit() returns the estimator, so it can be chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out predictions and report the accuracy.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7821


In [2]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fit on the training split only,
# then the same fitted scaler transforms both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a second seeded forest on the standardized training features.
rf_scaled = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict on the standardized test features and report the accuracy.
y_pred_scaled = rf_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
print(f"Accuracy with scaled features: {accuracy_scaled:.4f}")

Accuracy with scaled features: 0.7821


In [3]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features: fit on the training split only, then apply the
# same fitted scaler to both splits.
minmax_scaler = MinMaxScaler()
X_train_norm = minmax_scaler.fit(X_train).transform(X_train)
X_test_norm = minmax_scaler.transform(X_test)

# Train a seeded forest on the min-max scaled training features.
rf_norm = RandomForestClassifier(random_state=42).fit(X_train_norm, y_train)

# Predict on the scaled test features and report the accuracy.
y_pred_norm = rf_norm.predict(X_test_norm)
accuracy_norm = accuracy_score(y_test, y_pred_norm)
print(f"Accuracy with normalized features: {accuracy_norm:.4f}")

Accuracy with normalized features: 0.7821


In [4]:
import pickle
# Persist the trained model to disk. The context manager guarantees the file
# handle is flushed and closed as soon as the dump finishes (the original
# passed an unclosed open(...) handle straight into pickle.dump).
# NOTE: rf_scaled was fit on features transformed by `scaler`, so any code that
# loads this pickle must apply the same fitted scaler before calling predict().
with open("wine_model.pkl", "wb") as model_file:
    pickle.dump(rf_scaled, model_file)