## Imports

In [2]:
import scipy.io
import numpy as np
from scipy.stats import linregress
from statsmodels.sandbox.stats.multicomp import multipletests 

# seaborn can be used to "prettify" default matplotlib plots by importing and setting as default
import seaborn as sns
sns.set_theme() # Set searborn as default

## Load dataset

In [3]:
mat = scipy.io.loadmat('sand.mat')
X = mat['X']
y = mat['Y'].ravel()

[n, p] = X.shape

### 3 Perform univariate feature selection for the sand data using:

> (a) Bonferroni correction to control the family-wise error rate(FWER). Use FWER = 0.05.

In [20]:

# Calculate the pvalue for each feature one at the time because OLS breaks down with this many features
# Use the stats models linear regression, since p value already is included
# Otherwise check https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression
# Which explains how to expand the class in sklearn to calculate it

p_values = np.zeros((p,))
for i in range(p):
    slope, intercept, r_value, p_value, std_err = linregress(X[:,i], y)
    ## print(f'Feature {i} has p-value: {p_value}')
    p_values[i] = p_value

# Sort p-values in acending order
p_values = np.sort(p_values)
assert p_values[0] < p_values[-1]

# include all features with p values lower than p / features
p_values_bonferroni = p_values[(p_values < 0.05/p)]
p_values_bonferroni.shape

(72,)

> (b) Benjamini-Hochbergâ€™s algorithm for FDR. Use an acceptable fraction of mistakes,
q = 0.15.

In [None]:
q = 0.15
# Use multipletests to get the FDR corrected p values

p_values = np.zeros((p,))
for i in range(p):
    slope, intercept, r_value, p_value, std_err = linregress(X[:,i], y)
    ## print(f'Feature {i} has p-value: {p_value}')
    p_values[i] = p_value

# Sort p-values in acending order

p_values = np.sort(p_values)
assert p_values[0] < p_values[-1]

# include all features with p values lower  than q
for i in range(p):
    if p_values[i] > (i+1)/p * q:
        break

p_values[i:].shape

721


(1295,)

Compare the solutions in terms of number of selected features and selected features.