In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection  import (GridSearchCV, cross_val_score, train_test_split)
from sklearn.preprocessing  import StandardScaler

In [49]:
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"
data = pd.read_csv(DATA_PATH  + "winequality-white.csv", sep=";")

In [50]:

df = data
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [52]:
y = df["quality"]
df_features = df.drop(columns=['quality'])
df_features.head()
X_train, X_holdout, y_train, y_holdout = train_test_split(df_features, y, test_size = 0.3, random_state = 17)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

In [45]:
print(X_train.head)
print("\n")
print(X_train_scaled)

<bound method NDFrame.head of       fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
96              6.0              0.34         0.66            15.9      0.046   
2292            6.5              0.32         0.23             1.2      0.054   
1054            7.0              0.31         0.52             1.7      0.029   
3687            6.0              0.16         0.27            12.0      0.030   
596             6.9              0.41         0.33            10.1      0.043   
...             ...               ...          ...             ...        ...   
2800            6.6              0.25         0.51             8.0      0.047   
1337            6.7              0.18         0.30             6.4      0.048   
406             5.8              0.28         0.34             4.0      0.031   
2191            9.2              0.23         0.35            10.7      0.037   
2671            5.9              0.30         0.30             2.0      0.030  

In [7]:
#Train a simple model of Linear Regression
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

What are mean squared errors of model predictions on train and holdout sets?

In [8]:
y_pred = regressor.predict(X_holdout_scaled)
y_train_pred = regressor.predict(X_train_scaled)
res1 = mean_squared_error(y_holdout, y_pred)
res2 = mean_squared_error(y_train, y_train_pred)
print(res1)
print(res2)

0.5842473102404544
0.5580606489803572


Sort features by their influence on the target feature (wine quality).Beware that both large positive and large negative coefficients mean large influence on target. It's handy to use pd Dataframe here.

which feature this linear regression model trates as the modt influently on wine quality?

In [9]:
regressor.coef_

array([ 9.78219223e-02, -1.92259947e-01, -1.83224449e-04,  5.38164096e-01,
        8.12724353e-03,  4.21804406e-02,  1.43040227e-02, -6.65720472e-01,
        1.50036006e-01,  6.20533605e-02,  1.29533447e-01])

In [10]:
print(len(regressor.coef_))
print(regressor.coef_.min())

11
-0.6657204718353251


In [11]:
names = {}
for i in range(11):
    names[df.columns[i]] = (regressor.coef_[i])
data = {'Names':names}
coef_df = pd.DataFrame(data)

In [12]:
print(coef_df)

                         Names
alcohol               0.129533
chlorides             0.008127
citric acid          -0.000183
density              -0.665720
fixed acidity         0.097822
free sulfur dioxide   0.042180
pH                    0.150036
residual sugar        0.538164
sulphates             0.062053
total sulfur dioxide  0.014304
volatile acidity     -0.192260


**Answer:** Density

**Lasso regression**


In [13]:
lasso = Lasso(alpha=0.1, random_state=17)
lasso.fit(X_train_scaled, y_train)

In [14]:
names = {}
for i in range(11):
    names[df.columns[i]] = (lasso.coef_[i])
data = {'Names':names}
coef_lasso_df = pd.DataFrame(data)
print(coef_lasso_df.sort_values)

<bound method DataFrame.sort_values of                          Names
alcohol               0.295922
chlorides            -0.000000
citric acid          -0.000000
density              -0.000000
fixed acidity        -0.000000
free sulfur dioxide   0.000000
pH                    0.000000
residual sugar        0.000000
sulphates             0.000000
total sulfur dioxide -0.000000
volatile acidity     -0.093247>


In [15]:
alphas = np.logspace(-6, 2, 200)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=17)

In [16]:
lasso_cv.fit(X_train_scaled, y_train)

In [17]:
lasso_cv.alpha_

0.0002833096101839324

Which feature is the least informative in predicting wine quality, according to the tuned LASSO model?

In [18]:
lasso = Lasso(alpha=lasso_cv.alpha_, random_state=17)
lasso.fit(X_train_scaled, y_train)
names = {}
for i in range(11):
    names[df.columns[i]] = (lasso.coef_[i])
data = {'Names':names}
coef_lasso_df = pd.DataFrame(data)
print(coef_lasso_df.sort_values)

<bound method DataFrame.sort_values of                          Names
alcohol               0.137115
chlorides             0.006933
citric acid          -0.000000
density              -0.648161
fixed acidity         0.093295
free sulfur dioxide   0.042698
pH                    0.146549
residual sugar        0.526883
sulphates             0.060939
total sulfur dioxide  0.012969
volatile acidity     -0.192049>


**Answer:** citric acid


**Question4** What are mean squared errors of tuned LASSO predictions on train and holdout sets?

In [19]:
y_pred = lasso.predict(X_holdout_scaled)
y_train_pred = lasso.predict(X_train_scaled)
lasso_res1 = mean_squared_error(y_holdout, y_pred)
lasso_res2 = mean_squared_error(y_train, y_train_pred)
print("Mean squared error(train):", lasso_res2)
print("Mean squared error(pred):", lasso_res1)

Mean squared error(train): 0.558070014187378
Mean squared error(pred): 0.5832976077860635


**Answer** 

Mean squared error(train): 0.558070014187378

Mean squared error(pred): 0.5832976077860635

**Question5** What are mean squared errors of RF(Random Forest) model on the training set, in cross-validation and on holdout set?

In [20]:
rf = RandomForestRegressor(random_state=17)
rf.fit(X_train_scaled, y_train)

In [23]:
y_pred = rf.predict(X_holdout_scaled)
y_train_pred = rf.predict(X_train_scaled)
rf_res1 = mean_squared_error(y_holdout, y_pred)
rf_res2 = mean_squared_error(y_train, y_train_pred)
print("Mean squared error(train):", rf_res2)
print("Mean squared error(pred):", rf_res1)
print("Mean squared error cv: ", cross_val_score(rf,X_train_scaled, y_train, scoring="neg_mean_squared_error").mean())

forest_params = {'max_depth': list(range(10, 25)),
                'min_samples_leaf': list(range(1, 8)),
                'max_features': list(range(6, 12))}



Mean squared error(train): 0.05261155192532089
Mean squared error(pred): 0.37163775510204083
Mean squared error cv:  -0.4142003732204039


**Answer:**

Mean squared error(train): 0.05261155192532089

Mean squared error(pred): 0.37163775510204083

Mean squared error cv:  -0.4142003732204039

**Q_6** What are mean squared error of tuned RF model in cross-validation and on holdout set?

In [24]:
rf2 = RandomForestRegressor(n_jobs=-1, random_state=17)
locally_best_forest = GridSearchCV(rf2, param_grid=forest_params,scoring="roc_auc", cv=5)
locally_best_forest.fit(X_train_scaled, y_train)

In [25]:
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 10, 'max_features': 6, 'min_samples_leaf': 1}, nan)

({'max_depth': 21, 'max_features': 6, 'min_samples_leaf': 1},
 -0.39773288191505934)

In [56]:
rf3 = RandomForestRegressor(max_depth=21, max_features=6, min_samples_leaf=1, random_state=17)
rf3.fit(X_train_scaled, y_train)

In [57]:
y_pred = rf3.predict(X_holdout_scaled)
y_train_pred = rf3.predict(X_train_scaled)
rf_res1 = mean_squared_error(y_holdout, y_pred)
rf_res2 = mean_squared_error(y_train, y_train_pred)
print("Mean squared error(train):", rf_res2)
print("Mean squared error(pred):", rf_res1)
print("Mean squared error cv: ", cross_val_score(rf3,X_train_scaled, y_train, scoring="neg_mean_squared_error").mean())


Mean squared error(train): 0.05426728474544572
Mean squared error(pred): 0.36572455603132475
Mean squared error cv:  -0.39773288191505934


**Answer:**

Mean squared error(pred): 0.36572455603132475

Mean squared error cv:  -0.39773288191505934

**Q_7:** What is the most important feature, according to the Random Forest model?

In [30]:
names = {}
for i in range(11):
    names[df.columns[i]] = (rf3.feature_importances_[i])
data = {'Names':names}
coef_rf_df = pd.DataFrame(data)
print(coef_rf_df.sort_values)

<bound method DataFrame.sort_values of                          Names
alcohol               0.206056
chlorides             0.073366
citric acid           0.062601
density               0.088549
fixed acidity         0.061813
free sulfur dioxide   0.111556
pH                    0.073659
residual sugar        0.072072
sulphates             0.059111
total sulfur dioxide  0.073640
volatile acidity      0.117578>


**Answer:** Alcohol

In [33]:
%%time
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

parameters = {
    "max_features": [2, 3, 4],
    "max_samples": [0.5, 0.7, 0.9],
    "base_estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100],
}
model = BaggingClassifier(LogisticRegression(class_weight='balanced'), 
                n_estimators=100, 
                bootstrap = True, random_state = 42)
grid_r = RandomizedSearchCV(model,parameters,n_iter=20, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=5), scoring='roc_auc', random_state=1)
grid_r.fit(X_train, y_train)
print(grid_r.best_)

nan
CPU times: total: 8min 49s
Wall time: 8min 50s


In [53]:
X_train_scaled

array([[-0.99273   ,  0.58375489,  2.64116886, ..., -0.32100149,
         0.10300887, -1.41119853],
       [-0.41141461,  0.38747125, -0.84407222, ..., -0.05551066,
        -0.25313273, -0.51729161],
       [ 0.16990079,  0.28932942,  1.50643921, ..., -0.78561044,
        -0.52023894, -0.11097029],
       ...,
       [-1.22525616, -0.00509605,  0.04750108, ...,  1.33831618,
        -0.87638054,  1.83937207],
       [ 2.72768852, -0.49580516,  0.1285532 , ..., -0.65286502,
        -1.49962834, -0.67982014],
       [-1.10899308,  0.1911876 , -0.27670739, ...,  1.47106159,
        -0.69830974,  1.92063633]])

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)


rf = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, random_state=42
)

parameters = {
    "max_features": [1, 2, 4],
    "min_samples_leaf": [3, 5, 7, 9],
    "max_depth": [5, 10, 15],
}

rf = GridSearchCV(rf, param_grid=parameters, scoring="roc_auc", cv=skf.split(X_train_scaled, y_train))
rf.fit(X_train_scaled, y_train)
print(rf.best_params_, rf.best_score_)

{'max_depth': 5, 'max_features': 1, 'min_samples_leaf': 3} nan


In [55]:
rf.best_score_

nan

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

lr = LogisticRegression(random_state=17, class_weight="balanced").fit(X_train_scaled, y_train)
roc_auc_score(y_holdout, lr.predict_proba(X_holdout_scaled), multi_class='ovr')

0.6858985176488988

In [110]:
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
rf = RandomForestClassifier(n_estimators=100,max_depth=20, max_features=12, min_samples_leaf=1, random_state=17,n_jobs= -1).fit(X_train_scaled, y_train)
roc_auc_score(y_holdout, rf.predict_proba(X_holdout_scaled), multi_class='ovr')

0.7619690590610604

In [123]:
parameters = {
    "n_estimators": [1, 2, 10, 25, 50, 75, 100, 150, 200, 250],
    "max_features": [1, 2, 4],
    "min_samples_leaf": [3, 5, 7, 9],
    "max_depth": [5, 10, 15],
}
clf = GridSearchCV(rf, param_grid=parameters, scoring="neg_mean_squared_error", cv=skf.split(X_train_scaled, y_train))

In [127]:

print(clf.best_params_)

{'max_depth': 15, 'max_features': 4, 'min_samples_leaf': 3, 'n_estimators': 200}


In [126]:
print(clf.best_score_)

-0.5139992764571939


In [133]:
rf = RandomForestClassifier(n_estimators=200,max_depth=15, max_features=4, min_samples_leaf=3, random_state=42,n_jobs= -1).fit(X_train, y_train)
roc_auc_score(y_holdout, rf.predict_proba(X_holdout), multi_class='ovr')

0.7379145028947585

In [136]:
from sklearn.ensemble import BaggingClassifier
bgclf = BaggingClassifier(estimator=rf, n_estimators=150, random_state=17).fit(X_train, y_train)

KeyboardInterrupt: 

In [135]:
roc_auc_score(y_holdout, bgclf.predict_proba(X_holdout), multi_class='ovr')

0.8001635799874697