In [141]:
import pandas as pd
import numpy as np
import plotly.graph_objects
import plotly.express
import scipy.stats as st
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor as RFR, GradientBoostingRegressor as GBR
from sklearn.feature_selection import SequentialFeatureSelector as SFC
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm

In [4]:
wine_df = pd.read_csv('winequality-red.csv')

In [5]:
wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [6]:
X_train, x_test, Y_train, y_test = train_test_split(
    wine_df.loc[:, ~wine_df.columns.isin(['quality'])], 
    wine_df['quality'], 
    test_size=0.25
)

# **BEST MODEL SELECTION, HYPERPARAMETER SELECTION**

# **1. L1**

**CREATE COLUMN TRANSFORMER FOR PIPELINE**

In [7]:
col_trans_lin_l1 = ColumnTransformer([
    ('scaller', StandardScaler(), [col for col in wine_df.columns if col != "quality"])
], remainder='passthrough')

**CREATE PIPELINE FOR L1**

In [8]:
pipe_l1 = Pipeline([('col_trans_lin_l1', col_trans_lin_l1), ('poly', PolynomialFeatures()), ('L1', Lasso())])

**CREATE HYPERPARAMETERS GRID**

In [55]:
params_l1 = {
    'poly__degree': [1, 2, 3],
    'L1__alpha': np.linspace(0.001, 0.1, 50),
    'poly__include_bias': [False, ]
}

**CREATE GRID SEARCH CV JOB**

In [56]:
grid_search_L1 = GridSearchCV(pipe_l1, params_l1, scoring='neg_mean_absolute_error', n_jobs=4, verbose=1)

In [57]:
grid_search_L1.fit(X_train, Y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


**BEST SCORE AND PARAMETERS**

In [58]:
grid_search_L1.best_score_

-0.49988081439033544

In [59]:
grid_search_L1.best_params_

{'L1__alpha': 0.019183673469387756,
 'poly__degree': 2,
 'poly__include_bias': False}

# **2. L2**

**CREATE COLUMN TRANSFORMER FOR L2**

**2. DECISION TREE**

In [28]:
col_trans_lin_l2 = ColumnTransformer([
    ('scaller', StandardScaler(), [col for col in wine_df.columns if col != "quality"])
], remainder='passthrough')

**CREATE PIPELINE FOR L2**

In [29]:
pipe_l2 = Pipeline([('col_trans_lin_l2', col_trans_lin_l2), ('poly', PolynomialFeatures()), ('L2', Ridge())])

**CREATE HYPERPARAMETER GRID**

In [50]:
params_l2 = {
    'poly__degree': [1, 2, 3],
    'L2__alpha': np.linspace(1, 50, 150),
    'poly__include_bias': [False, ]
}

**CREATE GRID SEARCH FOR L2**

In [51]:
grid_search_L2 = GridSearchCV(pipe_l2, params_l2, scoring='neg_mean_absolute_error', n_jobs=4, verbose=1)

In [52]:
grid_search_L2.fit(X_train, Y_train)

Fitting 5 folds for each of 450 candidates, totalling 2250 fits


**BEST SCORE AND PARAMETERS**

In [53]:
grid_search_L2.best_score_

-0.5002491017707965

In [54]:
grid_search_L2.best_params_

{'L2__alpha': 10.536912751677853,
 'poly__degree': 1,
 'poly__include_bias': False}

# **BINARY TREE**

**WE DONT NEED COLUMN TRANSFORMER. PROCEED STRAIGHT TO GRID SEARCH**

In [60]:
dec_tree = DTR()

**NEED HYPERPARAMETERS GRID**

In [61]:
params_tree = {
    'max_depth': np.linspace(2, 20, 19).astype(int),
    'min_samples_leaf': np.linspace(2, 150, 146).astype(int)
}

In [62]:
grid_search_BT = GridSearchCV(dec_tree, params_tree, scoring='neg_mean_absolute_error', n_jobs=4, verbose=1)

In [63]:
grid_search_BT.fit(X_train, Y_train)

Fitting 5 folds for each of 2774 candidates, totalling 13870 fits


**BEST SCORE AND PARAMETERS**

In [64]:
grid_search_BT.best_score_

-0.4911516427551993

In [66]:
grid_search_BT.best_params_

{'max_depth': 14, 'min_samples_leaf': 2}

# **RANDOM FORREST**

**AGAIN WE DONT NEED ANY TRANSFORMERS, START WITH GRID SEARCH**

In [67]:
ran_for = RFR()

**SET HYPERPARAMETERS GRID**

In [83]:
params_ran_for = {
    'max_depth': [42, 43, 44, 45, 46, 47, 48],
    'n_estimators': [70, 75, 80, 85, 90],
    'max_features': [3, 4, 5, 6],
    'min_samples_leaf': [1, 2]
}

**ASSIGN GRID SEARCH JOB**

In [84]:
grid_search_RF = GridSearchCV(ran_for, params_ran_for, scoring='neg_mean_absolute_error', n_jobs=4, verbose=1)

In [85]:
grid_search_RF.fit(X_train, Y_train)

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


In [86]:
grid_search_RF.best_params_

{'max_depth': 46, 'max_features': 4, 'min_samples_leaf': 1, 'n_estimators': 75}

In [87]:
grid_search_RF.best_score_

-0.43721608554160857

# **GRADIENT BOOSTING**

**CREATE GRADIENT BOOST OBJ**

In [94]:
grad_boost = GBR(loss='absolute_error')

**BUILD HYPERPARAMETERS GRID**

In [128]:
params_grad_boost = {
    'learning_rate': np.linspace(0.01, 0.5, 15),
    'n_estimators': [100, 110, 120],
    'max_depth': [10, 20, 30],
    'max_features': [6, 7, 8],
    'min_samples_leaf': [1]
}

**ASSIGN AND START GRID SEARCH JOB**

In [129]:
grid_search_GB = GridSearchCV(grad_boost, params_grad_boost, scoring='neg_mean_absolute_error', n_jobs=4, verbose=1)

In [130]:
grid_search_GB.fit(X_train, Y_train)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


In [131]:
grid_search_GB.best_score_

-0.41777401550982846

In [132]:
grid_search_GB.best_params_

{'learning_rate': 0.07999999999999999,
 'max_depth': 20,
 'max_features': 7,
 'min_samples_leaf': 1,
 'n_estimators': 110}

# **FEATURE REDUCTION PCA**

**CREATE CYCLE FOR FINAL NUMB OF PARAMETERS**

In [139]:
param_nb = [2, 3, 4, 5, 6, 7, 8, 9, 10]

In [140]:
res = np.zeros((len(param_nb),))
for i in tqdm(range(len(param_nb))):
    pca = PCA(n_components=param_nb[i], svd_solver='full')
    new_X = pd.DataFrame(data=pca.fit_transform(wine_df.loc[:, ~wine_df.columns.isin(['quality'])]), columns=[f'x{i}' for i in range(param_nb[i])])
    new_X_train = new_X.loc[X_train.index]
    new_x_test = new_X.loc[x_test.index]
    grad_boost = GBR(
        loss='absolute_error', 
        learning_rate=0.07999999999999999, 
        n_estimators=110,
        max_depth=20,
        max_features=7,
        min_samples_leaf=1,
    )
    cv_results = cross_validate(grad_boost, new_X_train, Y_train, scoring='neg_mean_absolute_error', cv=5)
    res[i] = np.max(cv_results['test_score'])
res

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [02:49<00:00, 18.85s/it]


array([-0.55143081, -0.48736524, -0.45920173, -0.43144594, -0.39859479,
       -0.40505527, -0.39265642, -0.4055798 , -0.40196431])

# **FEATURE REDUCTION ADD-DEL**

# **ДЗ**

**1. отобрать лучшую модель для обучения для датасета с оригинальными фичами, подобрать гиперпараметры**
**2. применить PCA что б отобрать 2, 3 и 4 и сравнить с результатом пункта 1**
**3. Применить отбор признаков ADD-DEL**
**4. sne, t-sne, попробовать реализовать**