# Feature Selection

In [334]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the data set from the Excel sheet in the working directory.
data = pd.read_excel("data.xlsx")

# Select every target by column NAME rather than by position, so the targets
# stay correct if the sheet's column order or column count ever changes and
# stay consistent with the name-based drop used to build X below.
# Double brackets keep each target a single-column DataFrame, the same shape
# the positional slices (e.g. iloc[:, 30:31]) produced.
y_G1 = data[['G1']]    #target column G1
y_G2 = data[['G2']]    #target column G2
y_G3 = data[['G3']]    #target column G3
X = data.drop(['G1', 'G2', 'G3'], axis = 1) #feature columns: everything except the three targets
y = [y_G1,y_G2,y_G3]   #targets in order, so y[g] is the target for grade G(g+1)

#Replace string variables with numbers
#One mapping per categorical column; each mapping is named after the column it encodes.
school = {'GP': 0, 'MS': 1}
sex = {'M': 0, 'F': 1}
address = {'U': 0, 'R': 1}
famsize = {'LE3': 0, 'GT3': 1}
Pstatus = {'T': 0, 'A': 1}
Mjob = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}
Fjob = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}
reason = {'home': 0, 'reputation': 1, 'course': 2, 'other': 3}
guardian = {'mother': 0, 'father': 1, 'other': 2}
schoolsup = {'no': 0, 'yes': 1}
famsup = {'no': 0, 'yes': 1}
paid = {'no': 0, 'yes': 1}
activities = {'no': 0, 'yes': 1}
nursery = {'no': 0, 'yes': 1}
higher = {'no': 0, 'yes': 1}
internet = {'no': 0, 'yes': 1}
romantic = {'no': 0, 'yes': 1}

#Column name -> mapping used to encode that column
_encodings = {
    'school': school,
    'sex': sex,
    'address': address,
    'famsize': famsize,
    'Pstatus': Pstatus,
    'Mjob': Mjob,
    'Fjob': Fjob,
    'reason': reason,
    'guardian': guardian,
    'schoolsup': schoolsup,
    'famsup': famsup,
    'paid': paid,
    'activities': activities,
    'nursery': nursery,
    'higher': higher,
    'internet': internet,
    'romantic': romantic,
}

for _column, _codes in _encodings.items():
    #Plain dict lookup on purpose: a value missing from the mapping raises KeyError
    #instead of being silently turned into a missing value.
    X[_column] = [_codes[_value] for _value in X[_column]]

# For every target in y: rank the features with a univariate chi2 test, keep
# the top N_FEATURES, and estimate the test error of a linear regression on
# those features by averaging the MSE over N_SPLITS random 70/30 splits.
N_FEATURES = 10   #number of top-scoring features kept for each target
N_SPLITS = 100    #number of random train/test splits averaged for each target

total_error = 0
for g, target in enumerate(y):
    #Score every feature against the current target (G1, G2 or G3) with Univariate Selection
    # NOTE(review): chi2 is documented for non-negative features and a
    # classification target; here each distinct grade value acts as a class
    # label - confirm this is the intended scoring function.
    # NOTE(review): the selector is fitted on ALL rows, including rows that
    # later land in the test splits, so the reported MSE may be optimistic.
    selector = SelectKBest(score_func=chi2, k=N_FEATURES).fit(X, target)

    #Table pairing every feature name with its score
    featureScores = pd.DataFrame({'Features': X.columns, 'Score': selector.scores_})
    print('The', N_FEATURES, 'features with the biggest impact on G', g + 1, 'and there scores calculated with Univariate Selection')
    bestFeatures = featureScores.nlargest(N_FEATURES, 'Score') #store the best features
    print(bestFeatures)

    selection = bestFeatures['Features'].to_numpy() #names of the selected features

    #Keep only the selected columns, in their original order. The mask builds a
    #new frame in one step, so X itself is never modified.
    Xfiltered = X.loc[:, X.columns.isin(selection)]

    #Accumulate the test MSE over repeated random splits. The splits are not
    #seeded, so the reported numbers differ slightly from run to run.
    errors = 0
    for _ in range(N_SPLITS):
        X_train, X_test, y_train, y_test = train_test_split(Xfiltered, target, test_size=0.3)
        linear_regression = LinearRegression().fit(X_train, y_train)
        predictions = linear_regression.predict(X_test)
        errors = errors + mean_squared_error(y_test, predictions)
    total_error = total_error + errors
    print('Mean Squared Error for predicting G', g + 1, 'using Feature Selection with', N_FEATURES, 'features is', errors / N_SPLITS)
    print()

#Average over every split of every target; derived from the actual counts so it
#stays correct if the number of targets or splits changes.
print('Mean Squared Error for predicting grades using Feature Selection with', N_FEATURES, 'features is', total_error / (len(y) * N_SPLITS))

The 10 features with the biggest impact on G 1 and there scores calculated with Univariate Selection
     Features       Score
14   failures  131.735043
29   absences  107.102433
15  schoolsup   35.432187
11   guardian   17.506273
8        Mjob   16.248542
17       paid   13.802533
5     Pstatus   13.685845
7        Fedu   13.416273
27       Walc   13.394719
25      goout   13.334754
Mean Squared Error for predicting G 1 using Feature Selection with 10 features is 9.133678355347808

The 10 features with the biggest impact on G 2 and there scores calculated with Univariate Selection
     Features       Score
29   absences  291.458449
14   failures  117.433679
15  schoolsup   28.847409
8        Mjob   23.342715
17       paid   17.080698
27       Walc   16.634579
0      school   15.359515
5     Pstatus   15.068327
11   guardian   14.631769
3     address   14.535013
Mean Squared Error for predicting G 2 using Feature Selection with 10 features is 12.370981983283443

The 10 features with th

# Feature Extraction