In [21]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from sklearn import datasets, neighbors
from mlxtend.plotting import plot_decision_regions

# Read student data
student_data = pd.read_csv("student-data.csv")

In [22]:
display(student_data.head())

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


## Data Exploration

In [23]:
# number of students
n_students = len(student_data)

# number of features
n_features = len(student_data.keys())-1

# number of passing students
n_passed = len(student_data[student_data['passed']=='yes'])

# number of failing students
n_failed = len(student_data[student_data['passed']=='no'])

# graduation rate
grad_rate = n_passed/n_students*100

# Print the results
print("Total number of students: {}".format(n_students))
print("Number of features: {}".format(n_features))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students: 395
Number of features: 30
Number of students who passed: 265
Number of students who failed: 130
Graduation rate of the class: 67.09%


## Data Preparation

In [24]:
# feature variable names
feature_cols = student_data.columns.to_list()

# target variable 'passed'
target_col = student_data.columns[-1] 

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = student_data[feature_cols]
y_all = student_data[target_col]

# Show the feature information 
print("\nFeatures:")
display(X_all.head())


Features:


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


## Data Pre-processing

In [25]:
def preprocess_features(X):
    ''' Preprocesses the student data and converts non-numeric binary variables into
        binary (0/1) variables. Converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():
        
        # replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # convert categorical data to dummy variables
        if col_data.dtype == object:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)

## Split the Data

In [26]:
# Use sklearn's train_test_split to split the data
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train ,y_test = train_test_split(X_all, y_all, 
                                                    test_size = 0.2, 
                                                    train_size = 0.8, 
                                                    random_state=42)

# Inspect split data
print("Training set:  {}".format(X_train.shape[0]))
print("Testing set: {}".format(X_test.shape[0]))

Training set:  316
Testing set: 79


## Setup Helper Functions

In [54]:
def train_classifier(model, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    model.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(model, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = model.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


def train_predict(model, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(model.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(model, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print("F1 score for training set: {:.4f}.".format(predict_labels(model, X_train, y_train)))
    print("F1 score for test set: {:.4f}. \n".format(predict_labels(model, X_test, y_test)))
    
    
def knn_comparison(data, k):
    from sklearn.decomposition import PCA
    
    x = X_train[:200]
    y = y_train[:200].replace({'yes':1, 'no':0})
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    pca = PCA(n_components = 2)
    x2 = pca.fit_transform(x)
    clf.fit(x2, y)
    
    # Plotting decision region
    plt.figure(figsize=(10,8))
    plot_decision_regions(x2, y.to_numpy(), clf=clf, legend=2)
    # Adding axes annotations
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Knn with K='+ str(k))
    plt.show()

## Build Supervised ML Models

In [48]:
# Import supervised learning models from sklearn (Here we us SVM an )
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import BaggingClassifier as BC
from sklearn.ensemble import RandomForestClassifier as RFC

# Initialize the models
model_a = BC(KNN(),random_state=1)
model_b = SVC(random_state=1)

# Set up the training set sizes
size = [100,200,300]
clf_set = {'K-Nearest Neighbors':model_a, 'Support Vector Machine':model_b}

for classifier in clf_set:
    print('Classifier: %s' %classifier)
    print(clf_set[classifier],'\n')
    for s in size:
        X = X_train[:s]
        y = y_train[:s]
        train_predict(clf_set[classifier], X, y, X_test, y_test)
    print('--------------- \n')

Classifier: K-Nearest Neighbors
BaggingClassifier(base_estimator=KNeighborsClassifier(), random_state=1) 

Training a BaggingClassifier using a training set size of 100. . .
Trained model in 0.0343 seconds
Made predictions in 0.0258 seconds.
F1 score for training set: 0.8671.
Made predictions in 0.0227 seconds.
F1 score for test set: 0.7903. 

Training a BaggingClassifier using a training set size of 200. . .
Trained model in 0.0283 seconds
Made predictions in 0.0502 seconds.
F1 score for training set: 0.8789.
Made predictions in 0.0277 seconds.
F1 score for test set: 0.8264. 

Training a BaggingClassifier using a training set size of 300. . .
Trained model in 0.0406 seconds
Made predictions in 0.1023 seconds.
F1 score for training set: 0.8849.
Made predictions in 0.0356 seconds.
F1 score for test set: 0.8235. 

--------------- 

Classifier: Support Vector Machine
SVC(random_state=1) 

Training a SVC using a training set size of 100. . .
Trained model in 0.0075 seconds
Made predictions

## Model Selection

Here, we try to slect the better model of the two to use with the student-data. I will then perform a grid search optimization for the model over the entire training set (`X_train` and `y_train`) by tuning at least one parameter to improve upon the untuned model’s F1 score.

The better performing model out of these, on average, in terms of $F1$ score is Bagging with `K-Nearest Neighbors` (`Train`: 87.7%, `Test`: 81.34%) which is significantly better than `SVM` (`Train`: 82.22, `Test`: 79.59%). 

Hence, we choose the **`K-Nearest Neighbors`** model for this dataset.