# Feature Selection

In [91]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the raw data set from the Excel workbook.
data = pd.read_excel("data.xlsx")

# Target columns, selected by name (like the drop below) instead of by
# position, so a change in the workbook's column layout cannot silently pick
# the wrong column. The double brackets keep each target a one-column
# DataFrame, the same type the former positional slices produced.
grade1 = data[['G1']]    #target column G1
grade2 = data[['G2']]    #target column G2
grade3 = data[['G3']]    #target column G3

# Feature matrix: every column except the three grade columns.
X = data.drop(['G1', 'G2', 'G3'], axis = 1) #feature columns

#Replace string variables with numbers
# One lookup table per categorical column: raw label -> integer code.
# NOTE(review): the codes for Mjob, Fjob, reason and guardian are arbitrary
# integers assigned to unordered categories -- confirm that the downstream
# scoring/regression steps are meant to treat them as numeric values.
school = {'GP': 0, 'MS': 1}
sex = {'M': 0, 'F': 1}
address = {'U': 0, 'R': 1}
famsize = {'LE3': 0, 'GT3': 1}
Pstatus = {'T': 0, 'A': 1}
Mjob = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}
Fjob = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}
reason = {'home': 0, 'reputation': 1, 'course': 2, 'other': 3}
guardian = {'mother': 0, 'father': 1, 'other': 2}
schoolsup = {'no': 0, 'yes': 1}
famsup = {'no': 0, 'yes': 1}
paid = {'no': 0, 'yes': 1}
activities = {'no': 0, 'yes': 1}
nursery = {'no': 0, 'yes': 1}
higher = {'no': 0, 'yes': 1}
internet = {'no': 0, 'yes': 1}
romantic = {'no': 0, 'yes': 1}

# Column name -> lookup table, listed in the order the columns are re-coded.
encodings = {
    'school': school,
    'sex': sex,
    'address': address,
    'famsize': famsize,
    'Pstatus': Pstatus,
    'Mjob': Mjob,
    'Fjob': Fjob,
    'reason': reason,
    'guardian': guardian,
    'schoolsup': schoolsup,
    'famsup': famsup,
    'paid': paid,
    'activities': activities,
    'nursery': nursery,
    'higher': higher,
    'internet': internet,
    'romantic': romantic,
}

# Re-code each column in turn. Plain dict indexing is used on purpose: a label
# without a code raises KeyError instead of being turned into a missing value.
for column_name, codes in encodings.items():
    X[column_name] = [codes[label] for label in X[column_name]]

#Univariate Selection: score every feature in X against G1 with the chi-squared statistic
features = SelectKBest(k=10, score_func=chi2)
fit = features.fit(X, grade1)

#Build a two-column table (feature name, score) by placing the names and the
#fitted scores side by side; both frames share the same default row index
columns = pd.DataFrame(X.columns)
scores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([columns, scores], axis=1)
featureScores.columns = ['Features', 'Score']  #naming the dataframe columns

#Keep the 10 rows with the highest score, then report them
bestFeatures = featureScores.nlargest(10, 'Score') #store the 10 best features
print('The 10 features with the biggest impact on G1 and there scores calculated with Univariate Selection')
print(bestFeatures)

selection = np.asarray(bestFeatures)[:,0] #array containing the names of the 10 selected features
allFeatures = np.asarray(X.columns) #array containing the names of all features

# Keep only the selected columns, in the order they appear in X. A single
# boolean column mask replaces the former nested loop that dropped the
# unselected columns one at a time. The explicit .copy() guarantees that
# Xfiltered is an independent frame, so later changes to it never touch X
# (a bare `Xfiltered = X` would only create a second name for the same object).
Xfiltered = X.loc[:, X.columns.isin(selection)].copy()

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split -- and therefore the score and predictions printed below -- identical
# on every run instead of changing each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    Xfiltered, grade1, test_size=0.2, random_state=42
)

# Plain arrays of the held-out rows, used only for printing below.
xtest = np.asarray(X_test)
ytest = np.asarray(y_test)

linear_regression = LinearRegression().fit(X_train, y_train)
# NOTE: for a regressor, score() returns the coefficient of determination
# (R^2), not a classification accuracy. The variable name is kept so that any
# code referring to `accuracy` keeps working.
accuracy = linear_regression.score(X_test,y_test)
print(accuracy)

# One line per held-out row: predicted G1, the feature values, the actual G1.
predictions = linear_regression.predict(X_test)
for predicted, feature_row, actual in zip(predictions, xtest, ytest):
    print(predicted, feature_row, actual)



The 10 features with the biggest impact on G1 and there scores calculated with Univariate Selection
     Features       Score
14   failures  131.735043
29   absences  107.102433
15  schoolsup   35.432187
11   guardian   17.506273
8        Mjob   16.248542
17       paid   13.802533
5     Pstatus   13.685845
7        Fedu   13.416273
27       Walc   13.394719
25      goout   13.334754
0.13637564642448707
[11.63076526] [0 3 2 0 0 0 1 3 2 2] [12]
[9.49965065] [0 3 0 0 0 1 1 4 4 2] [10]
[11.32385318] [0 2 3 0 0 0 0 4 2 4] [8]
[11.04173258] [0 2 2 1 0 1 0 1 1 0] [8]
[7.95546699] [ 0  3  4  0  0  1  1  3  4 54] [11]
[10.58171571] [0 2 4 0 0 0 1 5 1 0] [6]
[12.6977685] [0 4 0 0 0 0 1 2 2 0] [13]
[9.05680688] [ 0  4  3  0  1  0  1  3  4 28] [10]
[11.8384117] [0 2 2 0 0 0 1 2 2 3] [11]
[12.3481389] [0 4 2 0 0 0 0 3 1 0] [11]
[10.28431657] [ 0  3  4  2  1  0  0  4  1 20] [15]
[9.85783622] [ 0  4  0  0  0  1  0  5  3 14] [11]
[11.08145184] [0 3 1 0 0 0 0 5 5 2] [16]
[11.44657391] [ 0  2  1  1  0  

# Commented-out Feature Selection code (kept in a separate cell because Python doesn't support multiline comments)

In [None]:
#Extract the 10 features with the biggest impact on G2 and then on G3 by applying Univariate Selection
#Each entry pairs a target column with the headline printed above its ranking
targets_and_headlines = (
    (grade2, 'The 10 features with the biggest impact on G2 and there scores calculated with Univariate Selection'),
    (grade3, 'The 10 features with the biggest impact on G3 and there scores calculated with Univariate Selection'),
)

for target, headline in targets_and_headlines:
    #Score every feature in X against the current target with the chi-squared statistic
    features = SelectKBest(k=10, score_func=chi2)
    fit = features.fit(X, target)
    #Place the feature names and their scores side by side in one table
    columns = pd.DataFrame(X.columns)
    scores = pd.DataFrame(fit.scores_)
    featureScores = pd.concat([columns, scores], axis=1)
    featureScores.columns = ['Features', 'Score']  #naming the dataframe columns
    print(headline)
    print(featureScores.nlargest(10, 'Score'))  #print 10 best features

# Feature Extraction