In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import os

# 1. Load the Wine Quality dataset (red wine)
PATH_DATA_FILE = 'Wine_Dataset'
if not os.path.exists(PATH_DATA_FILE):
    os.makedirs(PATH_DATA_FILE)
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    wine_data = pd.read_csv(url, delimiter=';')
    wine_data.to_csv(os.path.join(PATH_DATA_FILE, 'Dataset'), index=False)
else:
    wine_data = pd.read_csv(os.path.join(PATH_DATA_FILE, 'Dataset'))

print(wine_data.head)

<bound method NDFrame.head of       fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067  

2. Split the dataset into features (X) and target (y)

In [58]:
X = wine_data.drop('quality', axis=1)  # All features except 'quality'
y = wine_data['quality']  # Target: quality score

3. Split the data into training and testing sets (80% train, 20% test)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

4. Create and train the Random Forest Regressor

In [60]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
from sklearn.model_selection import GridSearchCV
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit the model with GridSearchCV
grid_search.fit(X_train, y_train)

model.fit(X_train, y_train)

5. Make predictions on the test set

In [61]:
y_pred = model.predict(X_test)

6. Evaluate the model

In [62]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.30123812499999997


Optionally, check feature importance

In [63]:
feature_importances = model.feature_importances_

for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance}")

fixed acidity: 0.053186369906438206
volatile acidity: 0.11154663936447985
citric acid: 0.050752294281887524
residual sugar: 0.05789180861290351
chlorides: 0.07113195656507873
free sulfur dioxide: 0.04719662555262754
total sulfur dioxide: 0.07678564827910653
density: 0.050816246759634046
pH: 0.06141843743303209
sulphates: 0.14840565890119353
alcohol: 0.2708683143436185
