# Feature Selection 

In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load both Excel sheets and stack their rows into a single frame.
data_mat = pd.read_excel("data_mat.xlsx")
data_por = pd.read_excel("data_por.xlsx")
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# (row labels of both inputs are kept as-is, exactly like append did).
data = pd.concat([data_mat, data_por])

# Targets are selected by name (as one-column DataFrames) so they stay in sync
# with the name-based drop below instead of relying on column positions 30-32.
y_G1 = data[['G1']]    #target column G1
y_G2 = data[['G2']]    #target column G2
y_G3 = data[['G3']]    #target column G3
X = data.drop(['G1', 'G2', 'G3'], axis = 1) #feature columns
y = [y_G1,y_G2,y_G3]   #targets in period order, indexed by the loops further down

#Replace string variables with numbers
# One lookup table per categorical column: column name -> {label: integer code}.
# A label that is missing from its table raises KeyError.
label_codes = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'M': 0, 'F': 1},
    'address': {'U': 0, 'R': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'Pstatus': {'T': 0, 'A': 1},
    'Mjob': {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4},
    'Fjob': {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4},
    'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'schoolsup': {'no': 0, 'yes': 1},
    'famsup': {'no': 0, 'yes': 1},
    'paid': {'no': 0, 'yes': 1},
    'activities': {'no': 0, 'yes': 1},
    'nursery': {'no': 0, 'yes': 1},
    'higher': {'no': 0, 'yes': 1},
    'internet': {'no': 0, 'yes': 1},
    'romantic': {'no': 0, 'yes': 1},
}
# Overwrite each listed column of X with its integer codes, in table order.
for column_name, codes in label_codes.items():
    X[column_name] = [codes[label] for label in X[column_name]]

# For each grade period: rank all features with univariate selection, keep the
# 10 best, then estimate the mean squared error of a linear regression on those
# features by averaging over N_RUNS random train/test splits.
N_RUNS = 100 #number of random train/test splits averaged per grade
total_error = 0 #sum of the mean squared errors of all runs of all grades (must start at 0, not 0.3)
for g in range(len(y)): #loops over all 3 periods
    #Score every feature against the current grade by applying Univariate Selection
    # NOTE(review): the selector is fitted on ALL rows, before any train/test
    # split, so the test rows influence which features are chosen — consider
    # fitting it on the training split only.
    # NOTE(review): chi2 is documented by scikit-learn as a classification score
    # function; f_regression may suit numeric grades better — confirm intent.
    features = SelectKBest(score_func=chi2, k=10)
    fit = features.fit(X,y[g])
    #Pair every feature name with its score (row i corresponds to column i of X)
    featureScores = pd.DataFrame({'Features': list(X.columns), 'Score': fit.scores_})
    print('The 10 features with the biggest impact on G',g+1 , 'and there scores calculated with Univariate Selection')
    bestFeatures = featureScores.nlargest(10,'Score') #store the 10 best features
    print(bestFeatures)
    print()

    #Keep only the selected columns; X itself is untouched and the columns keep X's order
    selected_names = bestFeatures['Features'].tolist()
    Xfiltered = X.loc[:, X.columns.isin(selected_names)]

    errors = 0 #sum of the mean squared errors of the predicted grades for the current period
    for i in range(N_RUNS): #repeat split/fit/predict to average out the randomness of the split
        #random_state=i gives every run its own split while keeping the results reproducible
        X_train, X_test, y_train, y_test = train_test_split(Xfiltered, y[g], test_size=0.3, random_state=i)
        linear_regression = LinearRegression().fit(X_train, y_train) #fit a linear regression on the training split
        predictions = linear_regression.predict(X_test) #predict the grades of the test split
        errors = errors + mean_squared_error(y_test, predictions) #accumulate the mean squared error of this run
    total_error = total_error + errors #add error to total error
    print('Mean Squared Error for predicting G', g+1, 'using Feature Selection with 10 features is', errors/N_RUNS) #print the mean error of all runs
    print()
print('Mean Squared Error for predicting grades using Feature Selection with 10 features is', total_error/(N_RUNS*len(y))) #print the mean error of all runs of all grades

The 10 features with the biggest impact on G 1 and there scores calculated with Univariate Selection
     Features       Score
14   failures  338.204484
29   absences  164.040058
6        Medu   37.555212
15  schoolsup   37.158663
0      school   35.794843
8        Mjob   32.364700
7        Fedu   28.700149
27       Walc   25.999364
13  studytime   25.425352
26       Dalc   24.534230

Mean Squared Error for predicting G 1 using Feature Selection with 10 features is 6.983089078386679

The 10 features with the biggest impact on G 2 and there scores calculated with Univariate Selection
     Features       Score
29   absences  395.609383
14   failures  333.055850
6        Medu   40.843899
15  schoolsup   34.047563
0      school   31.761423
7        Fedu   31.086546
8        Mjob   29.590094
27       Walc   26.748477
3     address   25.822547
26       Dalc   21.123821

Mean Squared Error for predicting G 2 using Feature Selection with 10 features is 8.74762468807451

The 10 features with th

# Feature Extraction

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Load both Excel sheets and stack their rows into a single frame.
data_mat = pd.read_excel("data_mat.xlsx")
data_por = pd.read_excel("data_por.xlsx")
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# (row labels of both inputs are kept as-is, exactly like append did).
data = pd.concat([data_mat, data_por])

# Targets are selected by name (as one-column DataFrames) so they stay in sync
# with the name-based drop below instead of relying on column positions 30-32.
y_G1 = data[['G1']]    #target column G1
y_G2 = data[['G2']]    #target column G2
y_G3 = data[['G3']]    #target column G3
X = data.drop(['G1','G2','G3'], axis = 1) #feature columns
y = [y_G1,y_G2,y_G3]   #targets in period order, indexed by the loop further down


#Replace string variables with numbers
# One lookup table per categorical column: column name -> {label: integer code}.
# A label that is missing from its table raises KeyError.
label_codes = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'M': 0, 'F': 1},
    'address': {'U': 0, 'R': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'Pstatus': {'T': 0, 'A': 1},
    'Mjob': {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4},
    'Fjob': {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4},
    'reason': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'schoolsup': {'no': 0, 'yes': 1},
    'famsup': {'no': 0, 'yes': 1},
    'paid': {'no': 0, 'yes': 1},
    'activities': {'no': 0, 'yes': 1},
    'nursery': {'no': 0, 'yes': 1},
    'higher': {'no': 0, 'yes': 1},
    'internet': {'no': 0, 'yes': 1},
    'romantic': {'no': 0, 'yes': 1},
}
# Overwrite each listed column of X with its integer codes, in table order.
for column_name, codes in label_codes.items():
    X[column_name] = [codes[label] for label in X[column_name]]

# For each grade period: estimate the mean squared error of a linear regression
# trained on 10 PCA components, averaged over N_RUNS random train/test splits.
N_RUNS = 100 #number of random train/test splits averaged per grade
totalError = 0 #sum of the mean squared errors of all runs of all grades
for g in range(len(y)):
    error = 0 #sum of the mean squared errors for the current grade
    for i in range(N_RUNS):
        #Split the data into test and train data.
        #random_state=i gives every run its own split while keeping the results reproducible.
        # NOTE(review): the Feature Selection cell uses test_size=0.3; with
        # different split sizes the two reported errors are not directly
        # comparable — confirm this is intended.
        X_train, X_test, y_train, y_test = train_test_split(X, y[g], test_size=0.2, random_state=i)
        #Create a PCA so that the number of features are reduced to a certain amount.
        pca = PCA(n_components = 10)
        #Learn the projection on the training split only, then apply it to both splits.
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        #Fit a linear regression on the projected training data
        linear_regression = LinearRegression().fit(X_train, y_train)
        #make the predictions
        predictions = linear_regression.predict(X_test)
        error += mean_squared_error(y_test, predictions)
    print("Error G",g+1, "is",(error/N_RUNS),"without modifing the data")
    totalError += error
#Average over every run of every grade
print ("Total Error:",totalError/(N_RUNS*len(y)))

Error G 1 is 8.000756276465772 without modifing the data
Error G 2 is 9.572744712904935 without modifing the data
Error G 3 is 13.764134869106808 without modifing the data
Total Error: 10.445878619492506
