## Feature Selection
- Feature selection is done before modelling to keep only the most informative features instead of using every feature present in the dataset.
- It helps to achieve good model performance with fewer features and can reduce multicollinearity.

In [59]:
# Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

In [60]:
# Read the wine dataset from disk into a DataFrame and display it.
# The `quality` column is the target variable.
data = pd.read_csv(filepath_or_buffer = "wine.csv")
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [61]:
# Normalise the column labels: swap every space for an underscore so the
# names are easier to reference later on.
data.columns = [label.replace(" ", "_") for label in data.columns]
data.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'red_wine'],
      dtype='object')

In [62]:
# Count the missing values in each column (all zeros means nothing to impute)
data.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
red_wine                0
dtype: int64

In [63]:
# Absolute correlation of every column with the target (`quality`),
# ordered from the strongest relationship to the weakest
quality_corr = data.corr()["quality"]
quality_corr.abs().sort_values(ascending = False)

quality                 1.000000
alcohol                 0.444319
density                 0.305858
volatile_acidity        0.265699
chlorides               0.200666
red_wine                0.119323
citric_acid             0.085532
fixed_acidity           0.076743
free_sulfur_dioxide     0.055463
total_sulfur_dioxide    0.041385
sulphates               0.038485
residual_sugar          0.036980
pH                      0.019506
Name: quality, dtype: float64

In [64]:
# Split the frame into the target (`quality`) and the remaining feature columns
y = data["quality"]
X = data.drop(columns = "quality")

In [65]:
# Recursive feature elimination with a plain linear regression:
# keep 5 features, removing one feature per step.
# NOTE(review): X is not standardized before this fit; if RFE ranks features by
# coefficient magnitude, the ranking may partly reflect feature scale — TODO confirm.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select = 5, step = 1).fit(X, y)
rfe

In [66]:
# Column labels of the full frame, shown for reference.
# NOTE: `data` still contains the target `quality`, whereas RFE was fitted on X
# (data without `quality`), so the RFE mask/ranking arrays align with X.columns
# and are one entry shorter than this index.
data.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'red_wine'],
      dtype='object')

In [67]:
# Check support
# Boolean mask with one entry per column of X (the frame RFE was fitted on):
# True  -> the column was kept by RFE
# False -> the column was dropped
rfe.support_

array([False,  True, False, False,  True, False, False,  True, False,
        True,  True, False])

In [68]:
# Ranking of every column of X: rank 1 marks the selected features (the same
# positions that are True in rfe.support_); larger ranks are presumably
# features eliminated earlier — see the RFE documentation.
rfe.ranking_

array([5, 1, 2, 6, 1, 7, 8, 1, 4, 1, 1, 3])

In [69]:
# Names of the features RFE kept, i.e. the columns to use when building the
# reduced model. Stored in X_rfe so X can be subset with it afterwards.
X_rfe = rfe.get_feature_names_out()
X_rfe

array(['volatile_acidity', 'chlorides', 'density', 'sulphates', 'alcohol'],
      dtype=object)

In [70]:
# Reduced feature matrix: every row, only the RFE-selected columns
X_new = X.loc[:, X_rfe]
X_new

Unnamed: 0,volatile_acidity,chlorides,density,sulphates,alcohol
0,0.70,0.076,0.99780,0.56,9.4
1,0.88,0.098,0.99680,0.68,9.8
2,0.76,0.092,0.99700,0.65,9.8
3,0.28,0.075,0.99800,0.58,9.8
4,0.70,0.076,0.99780,0.56,9.4
...,...,...,...,...,...
6492,0.21,0.039,0.99114,0.50,11.2
6493,0.32,0.047,0.99490,0.46,9.6
6494,0.24,0.041,0.99254,0.46,9.4
6495,0.29,0.022,0.98869,0.38,12.8


In [71]:
# Fit the linear regression on the RFE-selected features and report its score
# on the same data it was trained on (in-sample, no hold-out set).
# The recorded score (~0.27) is low.
lr.fit(X_new, y).score(X_new, y)

0.2742377223532102

In [72]:
# Interactive IPython help lookup: shows the RFECV docstring.
# Exploration aid only — it produces no result used by later cells.
?RFECV

In [79]:
# Feature selection using Cross Validation
# An integer `cv` with a regressor means contiguous, unshuffled K-fold splits.
# The rows appear to be grouped by wine type (red_wine is 1 in the first rows
# and 0 in the last rows), so contiguous folds would not be representative of
# the whole dataset. Shuffle the folds, with a fixed seed so the selection is
# reproducible on re-run.
# NOTE: the selected features may differ from a run with unshuffled folds.
cv_folds = KFold(n_splits = 3, shuffle = True, random_state = 42)

# Keep at least 6 features, drop one feature per step and score each candidate
# subset by negative mean squared error.
lr2 = LinearRegression()
rfe2 = RFECV(estimator = lr2, min_features_to_select = 6, step = 1, cv = cv_folds,
             scoring = 'neg_mean_squared_error')
rfe2 = rfe2.fit(X, y)
rfe2

In [80]:
# Check support
# Boolean mask with one entry per column of X: True -> kept by RFECV,
# False -> dropped
rfe2.support_

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True])

In [81]:
# Names of the features that RFECV retained
rfe2.get_feature_names_out()

array(['fixed_acidity', 'volatile_acidity', 'citric_acid',
       'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'red_wine'], dtype=object)

In [85]:
# Names of all features RFECV saw during fit (the columns of X); compare with
# the retained names to see which features were dropped
rfe2.feature_names_in_

array(['fixed_acidity', 'volatile_acidity', 'citric_acid',
       'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
       'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'red_wine'], dtype=object)