<a href="https://colab.research.google.com/github/Laura-Neff/UnivariateFeatureSelection/blob/main/UnivariateFeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
diabetes_data = pd.read_csv('diabetes.csv')

diabetes_data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [None]:
#Helper function:
#for each column of the new DataFrame, find the columns of the original DataFrame that hold the same values, print their names, and collect those names in a list we can use to label the new DataFrame


def _columns_match(candidate, original):
    """Return True when two Series hold the same values in the same row order."""
    # Fast path: Series.equals succeeds when values, dtype and index all agree.
    if candidate.equals(original):
        return True

    # Fallback for numeric data only: compare by position and ignore dtype and
    # index, so an int64 column still matches its float64 copy coming out of a
    # selector, and a re-indexed frame still matches a freshly built one.
    if len(candidate) != len(original):
        return False
    if not (pd.api.types.is_numeric_dtype(candidate.dtype)
            and pd.api.types.is_numeric_dtype(original.dtype)):
        return False
    try:
        left = candidate.to_numpy(dtype=float)
        right = original.to_numpy(dtype=float)
    except (TypeError, ValueError):
        # Values that cannot be represented as floats: keep the strict verdict.
        return False

    # Treat NaN in the same position as equal, like Series.equals does.
    both_nan = np.isnan(left) & np.isnan(right)
    return bool(np.all((left == right) | both_nan))


def get_selected_features(X, X_new):
    """Recover the names of the columns of ``X`` that appear in ``X_new``.

    Each column of ``X_new`` is compared, by value, against every column of
    ``X``. Every match is printed and appended to the result, so the returned
    names follow the column order of ``X_new``. If ``X`` contains several
    identical columns, each of them is reported.

    Parameters
    ----------
    X : pandas.DataFrame
        The original feature table, with meaningful column names.
    X_new : pandas.DataFrame or array-like
        The reduced feature table. A 2-D array (what a selector's
        ``transform`` returns by default) is wrapped in a DataFrame.

    Returns
    -------
    list
        Column labels of ``X`` whose values equal a column of ``X_new``.
    """
    # Accept the raw array from transform() so callers need not wrap it first.
    if not isinstance(X_new, pd.DataFrame):
        X_new = pd.DataFrame(X_new)

    selected_features = []

    for i in range(X_new.shape[1]):
        candidate = X_new.iloc[:, i]
        for j, name in enumerate(X.columns):
            if _columns_match(candidate, X.iloc[:, j]):
                print(name)
                selected_features.append(name)

    return selected_features

In [None]:
#Perform chi^2 univariate feature selection first.
#The chi-square test between each feature and the target variable measures how dependent the two variables are.
#We use it to figure out the most relevant features for a classification model.
#It acts as a goodness-of-fit measure: it quantifies how well the observed distribution of a variable fits the distribution we would expect if the two variables were independent.

#scikit-learn offers the SelectKBest estimator object to help select relevant features based on the scoring method you want to perform

from sklearn.feature_selection import chi2

from sklearn.feature_selection import SelectKBest

In [None]:
X = diabetes_data.drop('Outcome', axis=1)

Y = diabetes_data['Outcome']

In [None]:
#8 columns = 8 features
X.shape

(768, 8)

In [None]:
X = X.astype(np.float64)

In [None]:
#select the best 4 features, A.K.A the features with the highest chi^2 scores in this case
test = SelectKBest(score_func=chi2, k=4)

fit = test.fit(X, Y)

In [None]:
fit.scores_

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [None]:
#Pair every score with the name of the column it was computed for, so we can tell which score belongs to which feature.
#The table is built in one constructor call: growing a DataFrame with pd.concat inside a loop re-copies the frame on every iteration,
#and concatenating onto an empty DataFrame raises a deprecation warning in recent pandas versions.
feature_score = pd.DataFrame({'Features': X.columns,
                              'Score': fit.scores_})

In [None]:
feature_score

Unnamed: 0,Features,Score
0,Pregnancies,111.519691
1,Glucose,1411.887041
2,BloodPressure,17.605373
3,SkinThickness,53.10804
4,Insulin,2175.565273
5,BMI,127.669343
6,DiabetesPedigreeFunction,5.392682
7,Age,181.303689


In [None]:
#Now actually extract our k best features: transform X with the selector we already fitted
X_new = fit.transform(X)

In [None]:
X_new = pd.DataFrame(X_new)

X_new.head()

Unnamed: 0,0,1,2,3
0,148.0,0.0,33.6,50.0
1,85.0,0.0,26.6,31.0
2,183.0,0.0,23.3,32.0
3,89.0,94.0,28.1,21.0
4,137.0,168.0,43.1,33.0


In [None]:
#Which features were selected? What do these columns correspond to? 

selected_features = get_selected_features(X, X_new)

selected_features

Glucose
Insulin
BMI
Age


['Glucose', 'Insulin', 'BMI', 'Age']

In [None]:
#Now we will see which columns are which
X[selected_features].head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148.0,0.0,33.6,50.0
1,85.0,0.0,26.6,31.0
2,183.0,0.0,23.3,32.0
3,89.0,94.0,28.1,21.0
4,137.0,168.0,43.1,33.0


In [None]:
chi2_best_features = X[selected_features]

In [None]:
#Use the ANOVA F-value to measure the dependency between each feature and the target, and use that relationship to determine the most relevant features for classification
#SelectPercentile(percentile=80) keeps the highest-scoring 80% of the features (drops the bottom 20% -- the least relevant)

from sklearn.feature_selection import f_classif, SelectPercentile

test = SelectPercentile(f_classif, percentile=80)

fit = test.fit(X, Y)

In [None]:
fit.scores_

array([ 39.67022739, 213.16175218,   3.2569504 ,   4.30438091,
        13.28110753,  71.7720721 ,  23.8713002 ,  46.14061124])

In [None]:
X_new = fit.transform(X)

In [None]:
X_new = pd.DataFrame(X_new)

X_new.head()

Unnamed: 0,0,1,2,3,4,5
0,6.0,148.0,0.0,33.6,0.627,50.0
1,1.0,85.0,0.0,26.6,0.351,31.0
2,8.0,183.0,0.0,23.3,0.672,32.0
3,1.0,89.0,94.0,28.1,0.167,21.0
4,0.0,137.0,168.0,43.1,2.288,33.0


In [None]:
selected_features = get_selected_features(X, X_new)

selected_features

Pregnancies
Glucose
Insulin
BMI
DiabetesPedigreeFunction
Age


['Pregnancies', 'Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

In [None]:
X[selected_features].head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,0.0,33.6,0.627,50.0
1,1.0,85.0,0.0,26.6,0.351,31.0
2,8.0,183.0,0.0,23.3,0.672,32.0
3,1.0,89.0,94.0,28.1,0.167,21.0
4,0.0,137.0,168.0,43.1,2.288,33.0


In [None]:
f_classif_best_features = X[selected_features]

In [None]:
#Now that we have relevant features selected using 2 different statistical techniques
#let's train a LogisticRegression classifier model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
#this method will let us see how successful using LogisticRegression to predict the outcome is after using feature selection via our different techniques
def build_model(X, Y, test_frac, random_state=42):
    """Train a logistic-regression classifier and report its hold-out accuracy.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and ``LogisticRegression.fit``.
    Y : target labels aligned with the rows of ``X``.
    test_frac : forwarded to ``train_test_split`` as ``test_size``.
    random_state : seed forwarded to ``train_test_split``. The default is a
        fixed value so that repeated calls are reproducible and feature sets
        with the same number of rows are scored on the same held-out rows.
        Pass ``None`` to get a fresh random split on every call.

    Returns
    -------
    The accuracy on the test split (the same value that is printed).
    """
    # Seeding the split is what makes scores from different feature subsets
    # comparable: otherwise each call is evaluated on a different test set.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state
    )

    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred = model.predict(x_test)

    test_score = accuracy_score(y_test, y_pred)
    print("Test_score : ", test_score)

    return test_score

In [None]:
build_model(X, Y, 0.2)

Test_score :  0.7597402597402597


In [None]:
build_model(chi2_best_features, Y, 0.2)

Test_score :  0.8116883116883117


In [None]:
build_model(f_classif_best_features, Y, 0.2)

Test_score :  0.8051948051948052
