# Assignment #4 (demo). Exploring OLS, Lasso and Random Forest in a regression task
## author: Yury Kashnitsky. All content is distributed under the Creative Commons CC BY-NC-SA 4.0 license.

In [62]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler

In [63]:
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"
data = pd.read_csv(DATA_PATH + "winequality-white.csv", sep=";")

In [64]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


## Separate the target feature, split data in 7:3 proportion (30% form a holdout set, use random_state=17), and preprocess data with StandardScaler.

In [66]:
y = data['quality']
X = data.drop(columns=[ 'quality'])

X_train, X_holdout, y_train, y_holdout = train_test_split(X, y,test_size=0.3,random_state=17)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

# Linear regression
## Train a simple linear regression model (Ordinary Least Squares).

In [67]:
linreg = LinearRegression().fit(X_train_scaled, y_train)
linreg

# Model evaluation
## We evaluate the model’s performance on the test set using the mean squared error and the coefficient of determination.

In [68]:
y_pred = linreg.predict(X_holdout_scaled)

In [69]:
y_pred

array([5.57815637, 6.13413553, 5.93310004, ..., 6.93033664, 5.50467546,
       5.77599116])

In [70]:
print(f"Mean squared error: {mean_squared_error(y_pred, y_holdout):.2f}")

Mean squared error: 0.58


## Sort features by their influence on the target feature (wine quality). Beware that both large positive and large negative coefficients mean large influence on target. It’s handy to use pandas.DataFrame here.

Question 2: Which feature this linear regression model treats as the most influential on wine quality?

In [71]:
coef = linreg.coef_
linreg_coef = pd.DataFrame(coef)
linreg_coef
linreg_coef.sort_values(by=[0], ascending=False)

Unnamed: 0,0
3,0.538164
8,0.150036
10,0.129533
0,0.097822
9,0.062053
5,0.04218
6,0.014304
4,0.008127
2,-0.000183
1,-0.19226


In [72]:
coef_abs = np.abs(linreg.coef_)
linreg_abs_coef = pd.DataFrame(coef_abs)
linreg_abs_coef
linreg_abs_coef.sort_values(by=[0], ascending=False)

Unnamed: 0,0
7,0.66572
3,0.538164
1,0.19226
8,0.150036
10,0.129533
0,0.097822
9,0.062053
5,0.04218
6,0.014304
4,0.008127


Feature 3 → Residual sugar: has the strongest positive impact.
Feature 7 → Density: has the strongest negative impact.

But if we look at the absolute strength, Density actually has the biggest overall impact.

# Lasso regression

In [73]:
from sklearn.linear_model import Lasso
lasso1 = Lasso(alpha=0.01, random_state=17)
lasso1.fit(X_train_scaled, y_train)

In [74]:
#Which feature is the least informative in predicting wine quality, according to this LASSO model?

coef = lasso1.coef_
lasso1_coef = pd.DataFrame(coef)
lasso1_coef
lasso1_coef.sort_values(by=[0], ascending=False)

Unnamed: 0,0
10,0.322425
3,0.256363
8,0.067277
5,0.043088
9,0.029722
0,-0.0
2,-0.0
6,-0.0
4,-0.002747
1,-0.188479


feature 10 --> alcohol  

In [75]:
#Train LassoCV with random_state=17 to choose the best value of alpha in 5-fold cross-validation.

alphas = np.logspace(-6, 2, 200)
lasso_cv = LassoCV(cv=5, random_state=17, alphas=alphas).fit(X_train_scaled, y_train)
lasso_cv.alpha_



0.0002833096101839324

Question 3: Which feature is the least informative in predicting wine quality, according to the tuned LASSO model?

In [76]:
coef = lasso_cv.coef_
lasso_cv_coef = pd.DataFrame(coef)
lasso_cv_coef
lasso_cv_coef.sort_values(by=[0], ascending=True)

Unnamed: 0,0
7,-0.648161
1,-0.192049
2,-0.0
4,0.006933
6,0.012969
5,0.042698
9,0.060939
0,0.093295
10,0.137115
8,0.146549


features 7 --> density 

## Question 4: What are mean squared errors of tuned LASSO predictions on train and holdout sets?

In [77]:
y_pred_test = lasso_cv.predict(X_holdout_scaled)
print(f"Mean squared error: {mean_squared_error(y_pred, y_holdout):.2f}")

Mean squared error: 0.58


In [78]:
y_pred_train = lasso_cv.predict(X_train_scaled)
print(f"Mean squared error: {mean_squared_error(y_pred_train, y_train):.2f}")

Mean squared error: 0.56


# Random Forest

In [79]:
forest = RandomForestRegressor(random_state = 17)
forest.fit (X_train_scaled, y_train)

## Question 5: What are mean squared errors of RF model on the training set, in cross-validation (cross_val_score with scoring=’neg_mean_squared_error’ and other arguments left with default values) and on holdout set?

In [80]:
# X_train, X_holdout, y_train, y_holdout

In [81]:
y_pred_train = forest.predict(X_train_scaled)
y_pred_cv = np.mean(np.abs(cross_val_score(forest, X_train_scaled,y_train,  scoring='neg_mean_squared_error', cv=5)))
y_pred_test = forest.predict(X_holdout_scaled)

In [82]:
# Train
print(f"Mean squared error: {mean_squared_error(y_pred_train, y_train):.2f}")

Mean squared error: 0.05


In [83]:
# Test
print(f"Mean squared error: {mean_squared_error(y_pred_test, y_holdout):.2f}")

Mean squared error: 0.37


In [84]:
print("Mean squared error (cv): %.3f" % np.mean(np.abs(cross_val_score(forest, X_train_scaled, y_train, scoring="neg_mean_squared_error"))))

Mean squared error (cv): 0.414


## Tune the max_features and max_depth hyperparameters with GridSearchCV and again check mean cross-validation MSE and MSE on holdout set.

In [85]:
forest_params = {"max_depth": list(range(10, 25)), "max_features": list(range(6, 12))}

locally_best_forest = GridSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=17),
    forest_params,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    verbose=True,
)
locally_best_forest.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


In [86]:
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 21, 'max_features': 6}, -0.39773288191505934)

## Question 6: What are mean squared errors of tuned RF model in cross-validation (cross_val_score with scoring=’neg_mean_squared_error’ and other arguments left with default values) and on holdout set?

In [87]:
print(
    "Mean squared error (cv): %.3f"
    % np.mean(
        np.abs(
            cross_val_score(
                locally_best_forest.best_estimator_,
                X_train_scaled,
                y_train,
                scoring="neg_mean_squared_error",
            )
        )
    )
)
print(
    "Mean squared error (test): %.3f"
    % mean_squared_error(y_holdout, locally_best_forest.predict(X_holdout_scaled))
)

Mean squared error (cv): 0.398
Mean squared error (test): 0.366


## Question 7: What is the most important feature, according to the Random Forest model?

In [88]:
rf_importance = pd.DataFrame(
    locally_best_forest.best_estimator_.feature_importances_,
    columns=["coef"],
    index=data.columns[:-1],
)
rf_importance.sort_values(by="coef", ascending=False)

Unnamed: 0,coef
alcohol,0.206056
volatile acidity,0.117578
free sulfur dioxide,0.111556
density,0.088549
pH,0.073659
total sulfur dioxide,0.07364
chlorides,0.073366
residual sugar,0.072072
citric acid,0.062601
fixed acidity,0.061813
