In [47]:
import pandas as pd
import numpy as np
import plotly.graph_objects
import plotly.express
import scipy.stats as st
from sklearn.feature_selection import mutual_info_regression as mir

In [2]:
wine_df = pd.read_csv('winequality-red.csv')

In [3]:
wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# **1D SELECTION ALGORYTHMS**

**LINEAR CORRELATION**

In [6]:
features = [feat for feat in wine_df.columns if feat != 'quality']

In [10]:
lin_corr_array = np.zeros((len(features),))

In [14]:
def get_correlation(feature, response):
    return np.sum((feature - np.mean(feature)) * (response - np.mean(response))) / np.sqrt(np.sum((feature - np.mean(feature)) ** 2) * np.sum((response - np.mean(response)) ** 2))

In [15]:
for i in range(len(features)):
    lin_corr_array[i] = get_correlation(wine_df[features[i]], wine_df['quality'])

In [16]:
lin_corr_array

array([ 0.12405165, -0.39055778,  0.22637251,  0.01373164, -0.12890656,
       -0.05065606, -0.18510029, -0.17491923, -0.05773139,  0.25139708,
        0.47616632])

**MUTUAL INFORMATION TECHNIQUE**

In [55]:
mutual_info_array = np.zeros((len(features),))
mutual_info_array_mine = np.zeros((len(features),))

In [56]:
def get_mut_info(feature, response):
    unique_feature_val = np.unique(feature)
    unique_response_val = np.unique(response)
    MI = 0
    for feat_val in unique_feature_val:
        for resp_val in unique_response_val:
            mut_probability = feature[(feature == feat_val) & (response == resp_val)].shape[0]
            if mut_probability == 0:
                continue
            feat_probability = feature[feature == feat_val].shape[0]
            resp_probability = response[response == resp_val].shape[0]
            MI += mut_probability * np.log2(mut_probability / (feat_probability * resp_probability))
    return MI

In [60]:
for i in range(len(features)):
    mutual_info_array_mine[i] = get_mut_info(wine_df[features[i]], wine_df['quality'])
mutual_info_array_mine

array([-16629.60664821, -16391.85666733, -16641.2566096 , -16706.9401968 ,
       -16547.45999419, -16796.3235191 , -16496.03994059, -15812.03177048,
       -16695.54060945, -16506.63415496, -16437.68298264])

In [59]:
mutual_info_array = mir(wine_df[features], wine_df['quality'], n_neighbors=2)
mutual_info_array

array([0.0313994 , 0.14013838, 0.03025256, 0.0360638 , 0.01256444,
       0.        , 0.06812449, 0.10641965, 0.01824006, 0.09054734,
       0.16550922])

# **DIMENSION REGRESSION** 

**RANDOM PROJECTION**