In [16]:
# Regression problem 
import warnings
import argparse
import logging
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import joblib


In [17]:

# Root logger only surfaces warnings and more severe records.
logging.basicConfig(level=logging.WARNING)

# Module-scoped logger used by the cells below for error reporting.
logger = logging.getLogger(__name__)

# --- Function to evaluate model metrics ---
def eval_metrics(actual, pred):
    """Score regression predictions against the ground truth.

    Args:
        actual: True target values.
        pred: Predicted target values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R2 score, in that order.
    """
    # RMSE is the square root of the MSE, so take the MSE first.
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

# --- Main execution block ---
if __name__ == "__main__":
    # NOTE: this silences every warning category for the rest of the session,
    # including deprecation warnings from pandas / scikit-learn.
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # --- 1. Load and prepare the data ---
    try:
        data = pd.read_csv("red-wine-quality.csv")
    except FileNotFoundError:
        logger.error("Error: 'red-wine-quality.csv' not found. Please make sure the file is in the correct directory.")
        # Re-raise rather than calling exit(): exit() is the interactive-shell
        # helper, it reports success (status 0) although loading failed, and
        # in a notebook it asks the kernel to shut down, discarding all state.
        # Propagating the error stops this cell (or script) with a traceback.
        raise

    # Drop the stray index column that appears when a CSV was saved with its index.
    if "Unnamed: 0" in data.columns:
        data = data.drop("Unnamed: 0", axis=1)

    # Separate features (X) from the target (y) before splitting
    X = data.drop(["quality"], axis=1)
    y = data["quality"].values.ravel()  # 1-D array of targets for the regressor

    # Hold out 25% of the rows for the final evaluation; fixed seed for reproducibility.
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)

    # --- 2. Define the Model ---
    # We will use RandomForestRegressor directly. It does not require feature scaling.
    model = RandomForestRegressor(random_state=42)

    # --- 3. Hyperparameter Tuning using GridSearchCV ---
    # 3 x 3 x 2 = 18 candidate settings; with 5 folds that is 90 fits.
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5]
    }

    # 5-fold cross-validation on the training split only, scored by negative
    # MSE (higher is better), using all available cores.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_squared_error',
        verbose=2,
    )

    print("--- Starting hyperparameter tuning for Random Forest... ---")
    grid_search.fit(train_x, train_y)

    # Estimator refit with the best parameters found (GridSearchCV's default refit=True).
    best_model = grid_search.best_estimator_

    print("--- Hyperparameter tuning complete! ---")
    print(f"Best Parameters found: {grid_search.best_params_}")

    # --- 4. Evaluate the Best Model ---
    # The held-out test split was never seen during the grid search.
    predicted_qualities = best_model.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("\n--- Best Model Evaluation ---")
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # --- 5. Save the best model ---
    # model_filename is reused by the prediction cell that reloads the model.
    model_filename = "randomforest_model.joblib"
    joblib.dump(best_model, model_filename)
    print(f"\nModel training complete and saved to '{model_filename}'")



--- Starting hyperparameter tuning for Random Forest... ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
--- Hyperparameter tuning complete! ---
Best Parameters found: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}

--- Best Model Evaluation ---
  RMSE: 0.5578857409900346
  MAE: 0.422775
  R2: 0.49686446072240464

Model training complete and saved to 'randomforest_model.joblib'


In [18]:
# --- 6. Load the model and make a new prediction ---
print("\n--- Loading model for prediction ---")

# Restore the estimator that the training cell persisted under model_filename.
loaded_model = joblib.load(model_filename)
print("Model loaded successfully!")

# Hand-written samples, one inner list per wine. The values follow the order
# of feature_names, which must match the columns the model was fit on.
feature_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
wines = [
    [7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4],
    [7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 21.0, 0.9946, 3.39, 0.47, 10.0],
]

# Transpose the row-wise samples into the column -> values mapping pandas expects.
prediction_data = {
    name: list(column) for name, column in zip(feature_names, zip(*wines))
}
sample_data = pd.DataFrame(prediction_data)

# One predicted quality per sample row.
predictions = loaded_model.predict(sample_data)

print(
    "\n--- Making a prediction on new data ---",
    "Sample Input Data:",
    sample_data.to_string(),
    "\nPredicted Qualities:",
    sep="\n",
)
for position, quality in enumerate(predictions, start=1):
    print(f"  Data point {position}: {quality}")



--- Loading model for prediction ---
Model loaded successfully!

--- Making a prediction on new data ---
Sample Input Data:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol
0            7.4              0.70          0.0             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4
1            7.3              0.65          0.0             1.2      0.065                 15.0                  21.0   0.9946  3.39       0.47     10.0

Predicted Qualities:
  Data point 1: 5.01
  Data point 2: 6.02


In [21]:
# Reload the original dataset so this cell does not depend on the training cell's variables.
data = pd.read_csv('red-wine-quality.csv')

# Drop the unwanted index column if it exists (axis=1 drops a column, axis=0 drops a row).
if 'Unnamed: 0' in data.columns:
    data = data.drop('Unnamed: 0', axis=1)

# Drop the 'quality' target column to keep only the features.
X = data.drop(['quality'], axis=1)

# Select a single row by zero-based position: position 3 is the FOURTH row,
# not the first. The double brackets keep the result a one-row DataFrame
# (a bare X.iloc[3] would return a Series instead).
ROW_POSITION = 3
sample_data = X.iloc[[ROW_POSITION]]

# Load the trained model from the file written by the training cell.
loaded_model = joblib.load('randomforest_model.joblib')

# Predict the quality for the selected row.
# NOTE(review): this row comes from the same CSV the model was trained on and
# may have been part of the training split, so treat the result as a sanity
# check rather than an unbiased evaluation.
predicted_quality = loaded_model.predict(sample_data)


In [22]:

# Rich-display the one-row feature frame that was passed to the model.
sample_data


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8


In [23]:
# Label the result with the row's actual index label instead of claiming it is
# the first row; sample_data holds exactly one row, so index[0] is that label.
print(f"\nPredicted Quality for row {sample_data.index[0]}:", predicted_quality[0])


Predicted Quality for First Row: 5.79
