In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the red-wine quality dataset (semicolon-separated CSV).
# The file location can be overridden with the WINE_QUALITY_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original absolute path is used.
DATA_PATH = Path(
    os.environ.get(
        'WINE_QUALITY_CSV',
        '/Users/jorgen/Documents/tastywine/wine+quality/winequality-red.csv',
    )
)
# Fail early with an actionable message instead of a bare path error.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}. "
        "Set the WINE_QUALITY_CSV environment variable to the location of winequality-red.csv."
    )
data = pd.read_csv(DATA_PATH, delimiter=';')

In [3]:
# Separate the label from the predictors: 'quality' is the target,
# every remaining column of `data` is used as a feature.
y = data['quality']
X = data.drop(columns='quality')

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Build a 100-tree random forest regressor with a fixed seed and fit it on the training split.
forest_options = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_options)
# Left as the last expression so the cell still displays the fitted estimator.
model.fit(X_train, y_train)

In [6]:
# Predict quality for the held-out rows using the fitted model.
predictions = model.predict(X=X_test)

In [7]:
# Evaluate the model on the held-out test set.
# mse:  mean squared error between true and predicted quality.
# rmse: square root of mse, i.e. the error expressed in the target's own units.
# r2:   coefficient of determination as computed by sklearn's r2_score.
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
r2 = r2_score(y_test, predictions)

# Show the scores so the evaluation is visible in the notebook output.
print("Test-set metrics:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")


In [8]:
# Rank features by the importance scores exposed by the fitted model.
# Each entry is an (importance, feature name) pair, largest importance first.
feature_names = X.columns
importances = model.feature_importances_
feature_importance = list(zip(importances, feature_names))
feature_importance.sort(reverse=True)


In [9]:
# Print the ranking as "<feature>: <importance>" with two decimals, one feature per line,
# preceded by a header line. Lines are collected first and emitted in a single print call.
report_lines = ["Feature importances:"]
for importance, name in feature_importance:
    report_lines.append(f"{name}: {importance:.2f}")
print("\n".join(report_lines))

Feature importances:
alcohol: 0.27
sulphates: 0.15
volatile acidity: 0.11
total sulfur dioxide: 0.08
chlorides: 0.07
pH: 0.06
residual sugar: 0.06
fixed acidity: 0.05
density: 0.05
citric acid: 0.05
free sulfur dioxide: 0.05
