In [1]:
# Importing the necessary packages
import numpy as np                                  # "Scientific computing"
import scipy.stats as stats                         # Statistical tests

import pandas as pd                                 # Data Frame

import matplotlib.pyplot as plt                     # Basic visualisation

from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

The data provided contains information about individual study results for mathematics, together with demographic information about the student.
Solve each subquest in one cell. Show, if applicable, at the end of each cell first 5 lines of your last result (s).

In [2]:
demographic = pd.read_csv('https://raw.githubusercontent.com/HOGENT-ML/course/main/datasets/demographic.csv')
demographic.head()

Unnamed: 0,StudentID,gender,race/ethnicity,parental level of education
0,0,female,group B,bachelor's degree
1,1,female,group C,some college
2,2,female,group B,master's degree
3,3,male,group A,associate's degree
4,4,male,group C,some college


Show some general info about the dataset

In [3]:
demographic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   StudentID                    1000 non-null   int64 
 1   gender                       1000 non-null   object
 2   race/ethnicity               1000 non-null   object
 3   parental level of education  1000 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB


Read in the dataset studentscore.csv

In [4]:
student = pd.read_csv('https://raw.githubusercontent.com/HOGENT-ML/course/main/datasets/studentscore.csv')
student.head()

Unnamed: 0,StudentID,lunch,test preparation course,math score
0,0,standard,none,72
1,1,standard,completed,69
2,2,standard,none,90
3,3,free/reduced,none,47
4,4,standard,none,76


Show some general info about the dataset

In [5]:
student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   StudentID                1000 non-null   int64 
 1   lunch                    1000 non-null   object
 2   test preparation course  1000 non-null   object
 3   math score               1000 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 31.4+ KB


Merge both data frames so that you have one line left per student.  
We are doing an **inner** join on student and demographic, using the column StudentID.

In [6]:
student = pd.merge(student,demographic, on='StudentID', how='inner')
student.head()

Unnamed: 0,StudentID,lunch,test preparation course,math score,gender,race/ethnicity,parental level of education
0,0,standard,none,72,female,group B,bachelor's degree
1,1,standard,completed,69,female,group C,some college
2,2,standard,none,90,female,group B,master's degree
3,3,free/reduced,none,47,male,group A,associate's degree
4,4,standard,none,76,male,group C,some college


Remove all lines that contain empty fields in some columns.

In [7]:
student = student.dropna()
student.head(5)

Unnamed: 0,StudentID,lunch,test preparation course,math score,gender,race/ethnicity,parental level of education
0,0,standard,none,72,female,group B,bachelor's degree
1,1,standard,completed,69,female,group C,some college
2,2,standard,none,90,female,group B,master's degree
3,3,free/reduced,none,47,male,group A,associate's degree
4,4,standard,none,76,male,group C,some college


What are the unique values of luch, test preparation course, gender, race/ethnicity and parental level of education?

In [8]:
student['lunch'].value_counts()

standard        645
free/reduced    355
Name: lunch, dtype: int64

In [9]:
student['test preparation course'].value_counts()

none         642
completed    358
Name: test preparation course, dtype: int64

In [10]:
student['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [11]:
student['race/ethnicity'].value_counts()

group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64

In [12]:
student['parental level of education'].value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64

Replace the textual values of 'lunch', 'test preparation course' and 'gender' by numeric values 0 and 1.

In [13]:
student['lunch'] = np.where(student['lunch'] == 'standard', 0, 1)
student['test preparation course'] = np.where(student['test preparation course'] == 'none', 0, 1)
student['gender'] = np.where(student['gender'] == 'male', 0, 1)
student.head()

Unnamed: 0,StudentID,lunch,test preparation course,math score,gender,race/ethnicity,parental level of education
0,0,0,0,72,1,group B,bachelor's degree
1,1,0,1,69,1,group C,some college
2,2,0,0,90,1,group B,master's degree
3,3,1,0,47,0,group A,associate's degree
4,4,0,0,76,0,group C,some college



Later on we will make a prediction model for the 'math score'. Remove the column or columns that are not relevant for this purpose.


In [14]:
student = student.drop(['StudentID'], axis=1)
student.head()

Unnamed: 0,lunch,test preparation course,math score,gender,race/ethnicity,parental level of education
0,0,0,72,1,group B,bachelor's degree
1,0,1,69,1,group C,some college
2,0,0,90,1,group B,master's degree
3,1,0,47,0,group A,associate's degree
4,0,0,76,0,group C,some college


What is X and what is y?

In [15]:
X = student.drop(['math score'], axis = 1)
y = student['math score']   

What is X_train, y_train, X_test, y_test?

In [16]:
from sklearn.model_selection import train_test_split    
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)    

What is the shape of X_train, y_train, X_test and y_test?

In [17]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(750, 5) (250, 5) (750,) (250,)


What are the columns containing numbers?

In [18]:
numerical_ix = X.select_dtypes(include=[np.number]).columns
print(numerical_ix)


Index(['lunch', 'test preparation course', 'gender'], dtype='object')


What are the columns containing text?

In [19]:
categorical_ix = X.select_dtypes(include=['object']).columns
print(categorical_ix)

Index(['race/ethnicity', 'parental level of education'], dtype='object')


Define the ColumnTransformer for applying OneHotEncoder on both categorical columns.  
Why is one-hot-encoding preferable over numeric values in this case?

In [20]:
# define the data preparation for the columns
# Setting remainder='passthrough' will mean that all columns not specified in the list of "transformers"
# will be passed through without transformation, instead of being dropped.
col_transform = ColumnTransformer(transformers=[('cat', OneHotEncoder(), categorical_ix)], 
                                  remainder='passthrough')

The dictionary below consists of the model Linear SVM Regression.  
Add a model for SVM Regression, using a 2nd-degree polynomial kernel.

In [21]:
from sklearn.svm import LinearSVR
from sklearn.svm import SVR

regressors = {
    "svm_reg": LinearSVR(epsilon=1.5, dual="auto", random_state=42),   # dual=True is the default but avoids FutureWarning
    "svm_poly_reg": SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
}

For each of both models: 
* Define the data preparation and modeling pipeline
* Train the model
* What is the accuracy of the model, expressed as MAE? Use K-fold cross-validation with k = 3.

svm_reg --> mae = 11.043956032438077  
svm_poly_reg --> mae = 11.406481743196977

In [22]:
for key, regressor in regressors.items():
  pipeline = Pipeline([('prep',col_transform), (key, regressor)])
  pipeline.fit(X_train, y_train)
  mae = np.mean(cross_val_score(pipeline, X_train, y_train, 
                                scoring='neg_mean_absolute_error', cv=3))
  print(f"{key} --> mae = {-mae}")

svm_reg --> mae = 11.043956032438077
svm_poly_reg --> mae = 11.406481743196977


Find the best value for parameter epsilon for the linear model using GridSearchCV with three folds. 

In [23]:
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV

svr_params = {'epsilon': [0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5]}
grid_svc = GridSearchCV(LinearSVR(dual=True,random_state=42), svr_params, 
                        scoring='neg_mean_absolute_error', cv=3)
pipeline = Pipeline([('prep',col_transform), ('grid_svc', grid_svc)])
pipeline.fit(X_train, y_train)

# best estimator
print(f"best estimator = {grid_svc.best_estimator_}")
# best score
print(f"best score = {-grid_svc.best_score_}")


best estimator = LinearSVR(dual=True, epsilon=3.0, random_state=42)
best score = 11.015022781135555


Use GridSearchCV to find the best parameters for the non-linear SVM regression task with little regularization (C is a large value)
* epsilon = 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5 
* kernel' = 'rbf', 'poly', 'sigmoid', 'linear'
* degree = [1, 2, 3]




In [24]:
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV

svr_params = {'epsilon': [0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5], 
              'kernel':['rbf', 'poly', 'sigmoid', 'linear'], 'degree':[1, 2, 3]}
grid_svc = GridSearchCV(SVR(C=100), svr_params, 
                        scoring='neg_mean_absolute_error', cv=3, verbose=1,n_jobs=-1)  # verbose=1 or 2 shows the progress, n_jobs=-1 uses all processors
pipeline = Pipeline([('prep',col_transform), ('grid_svc', grid_svc)])
pipeline.fit(X_train, y_train)
results = grid_svc.cv_results_

Fitting 3 folds for each of 96 candidates, totalling 288 fits


In [25]:
# best estimator
print(f"best estimator = {grid_svc.best_estimator_}")
# best score
print(f"best score = {-grid_svc.best_score_}")

best estimator = SVR(C=100, degree=1, epsilon=1.0, kernel='poly')
best score = 10.6970230922105


Is it worth using non-linear SVM iso linear SVM? 

Yes, the score is (slightly) better. 