### Building a Student Intervention System

In [1]:
# Import libraries
import numpy as np
import pandas as pd

In [2]:
# Read student data
# Note: The last column 'passed' is the target/label, all other are feature columns
student_data = pd.read_csv("student-data.csv")
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3
print("Student data read successfully!")

Student data read successfully!


Now, can you find out the following facts about the dataset?
- Total number of students
- Number of students who passed
- Number of students who failed
- Graduation rate of the class (%)
- Number of features

_Use the code block below to compute these values. Instructions/steps are marked using **TODO**s._

In [3]:
# Preview the first rows of the loaded table to check that the columns were read as expected
student_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [3]:
# Summarise the data set: size, class balance and number of feature columns.
n_students = len(student_data.index)
n_features = len(student_data.columns) - 1  # every column except the 'passed' target
n_passed = int((student_data['passed'] == 'yes').sum())
n_failed = int((student_data['passed'] == 'no').sum())
# true_divide keeps this a float division on Python 2 as well
grad_rate = np.true_divide(n_passed, n_students) * 100

# Parenthesised single-argument print runs unchanged on Python 2 and Python 3
print("Total number of students: {}".format(n_students))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))
print("Number of features: {}".format(n_features))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students: 395
Number of students who passed: 265
Number of students who failed: 130
Number of features: 30
Graduation rate of the class: 67.09%


In [5]:
# Extract feature (X) and target (y) columns
feature_cols = student_data.columns[:-1].tolist()  # every column except the last one
target_col = student_data.columns[-1]  # the last column holds the label
print("Feature column(s):-\n{}".format(feature_cols))
print("Target column: {}".format(target_col))

# Split the table into the feature matrix and the label vector
X_all = student_data[feature_cols]
y_all = student_data[target_col]
print("\nFeature values:-")
print(X_all.head())  # first 5 rows only

Feature column(s):-
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']
Target column: passed

Feature values:-
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...    

In [7]:
# Preprocess feature columns
def preprocess_features(X):
    """Return a numeric version of the feature table ``X``.

    Columns that are not of object dtype are copied through unchanged.
    For object columns, 'yes'/'no' values are replaced by 1/0; a column that
    consists only of those values becomes a single integer column. Any column
    that is still of object dtype afterwards is expanded with
    ``pd.get_dummies`` into one column per distinct value, named
    ``<column>_<value>`` (e.g. 'school' => 'school_GP', 'school_MS').

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns. Column labels are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``X``; ``X`` itself is not modified.
    """
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Iterate by label instead of DataFrame.iteritems(), which newer pandas
    # releases no longer provide.
    for col in X.columns:
        col_data = X[col]

        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
            # Older pandas downcasts the result of replace() to an integer
            # dtype on its own; newer releases leave it as object. Convert
            # explicitly so a pure yes/no column is numeric either way.
            if (col_data.dtype == object and len(col_data) > 0
                    and col_data.isin([0, 1]).all()):
                col_data = col_data.astype('int64')

        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)

        outX = outX.join(col_data)  # collect column(s) in output dataframe

    return outX

# Replace the raw feature table by its numeric encoding and list the resulting columns
X_all = preprocess_features(X_all)
print("Processed feature columns ({}):-\n{}".format(X_all.shape[1], X_all.columns.tolist()))

Processed feature columns (48):-
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


In [8]:
# decide how many training vs test samples
num_all = student_data.shape[0]  # same as len(student_data)
num_train = 300  # about 75% of the data
num_test = num_all - num_train

# select features (X) and corresponding labels (y) for the training and test sets
# train_test_split is provided by sklearn.model_selection in current scikit-learn;
# the older sklearn.cross_validation module is only used as a fallback for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Shuffle and split the data
def shuffle_split_data(train_set_size, test_set_size=None, random_state=1):
    """Shuffle ``X_all``/``y_all`` and split them into training and test sets.

    Parameters
    ----------
    train_set_size : int
        Number of samples to put in the training set.
    test_set_size : int, optional
        Number of samples to put in the test set. Defaults to the
        module-level ``num_test``.
    random_state : int, optional
        Seed handed to ``train_test_split``; defaults to 1.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` -- note that this order differs
        from the order in which ``train_test_split`` returns its results.
    """
    if test_set_size is None:
        test_set_size = num_test
    # Hand over the test size as an absolute sample count instead of
    # converting it to a fraction first, so no float rounding is involved.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all,
        train_size=train_set_size,
        test_size=test_set_size,
        random_state=random_state)
    return X_train, y_train, X_test, y_test

# Initial split with 300 training samples; report the resulting set sizes
X_train, y_train, X_test, y_test = shuffle_split_data(300)
print("Training set: {} samples".format(X_train.shape[0]))
print("Test set: {} samples".format(X_test.shape[0]))

Training set: 300 samples
Test set: 95 samples


In [18]:
# Choose a model, import it and instantiate an object
from sklearn.ensemble import RandomForestClassifier
import time

# Seed the forest so that the scores it produces can be reproduced when the
# notebook is re-run (an unseeded forest gives different trees on every fit).
clf = RandomForestClassifier(n_estimators=10, random_state=0)
# Train a model
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given training data and print the wall-clock time taken.

    Parameters
    ----------
    clf : object
        Estimator exposing a ``fit(X, y)`` method; it is fitted in place.
    X_train, y_train
        Training features and labels, passed to ``clf.fit`` unchanged.

    Returns
    -------
    None
    """
    # Parenthesised single-argument print runs unchanged on Python 2 and Python 3
    print("Training {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    elapsed = time.time() - start
    print("Done!\nTraining time (secs): {:.3f}".format(elapsed))


# Fit model to training data (uses whatever X_train/y_train split is current)
train_classifier(clf, X_train, y_train)


Training RandomForestClassifier...
Done!
Training time (secs): 0.016


In [19]:
# Predict on training set and compute F1 score
from sklearn.metrics import f1_score

def predict_labels(clf, features, target):
    """Predict labels for ``features`` with ``clf`` and return the F1 score.

    The prediction time is printed. The score is computed against
    ``target.values`` with 'yes' as the positive label.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.
    features
        Feature rows passed to ``clf.predict``.
    target
        True labels; must expose a ``.values`` attribute (e.g. a pandas Series).

    Returns
    -------
    The value returned by ``f1_score``.
    """
    # Parenthesised single-argument print runs unchanged on Python 2 and Python 3
    print("Predicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    elapsed = time.time() - start
    print("Done!\nPrediction time (secs): {:.3f}".format(elapsed))
    return f1_score(target.values, y_pred, pos_label='yes')

# Score the fitted model on the data it was trained on
train_f1_score = predict_labels(clf, X_train, y_train)
print("F1 score for training set: {}".format(train_f1_score))

Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.003
F1 score for training set: 0.989847715736


In [20]:
# Predict on test data
test_f1_score = predict_labels(clf, X_test, y_test)
print("F1 score for test set: {}".format(test_f1_score))

Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.002
F1 score for test set: 0.728682170543


### Model 1 Random Forest:
- General Applications:
    Random forests is an ensemble learning method for classification, regression and some other tasks.
- Strengths:
    1. It runs efficiently on large databases.
    2. It almost always has a lower classification error and better F-scores than decision trees.
    3. It deals really well with uneven data sets that have missing variables.
    4. It generally trains faster than SVMs.
    5. It handles high-dimensional spaces very well.
- Weaknesses:
    1. Large number of trees may make the algorithm slow for real-time prediction.
- Reason:
    Random Forest is a general-purpose algorithm to consider when dealing with a classification problem; also, in this case, we have to deal with high-dimensional features.

In [21]:
# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and print its F1 score on both splits.

    Prints a separator and the training-set size, fits the estimator through
    ``train_classifier``, then prints the F1 score returned by
    ``predict_labels`` for the training data and for the test data.

    Returns
    -------
    None
    """
    # Parenthesised single-argument print runs unchanged on Python 2 and Python 3
    print("------------------------------------------")
    print("Training set size: {}".format(len(X_train)))
    train_classifier(clf, X_train, y_train)
    train_score = predict_labels(clf, X_train, y_train)
    print("F1 score for training set: {}".format(train_score))
    test_score = predict_labels(clf, X_test, y_test)
    print("F1 score for test set: {}".format(test_score))

# Repeat the experiment with 100, 200 and 300 training samples
# Note: Keep the test set constant
for train_size in (100, 200, 300):
    X_train, y_train, X_test, y_test = shuffle_split_data(train_size)
    train_predict(clf, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size: 100
Training RandomForestClassifier...
Done!
Training time (secs): 0.008
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.002
F1 score for training set: 0.96062992126
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.002
F1 score for test set: 0.728682170543
------------------------------------------
Training set size: 200
Training RandomForestClassifier...
Done!
Training time (secs): 0.012
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.002
F1 score for training set: 0.988764044944
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.706766917293
------------------------------------------
Training set size: 300
Training RandomForestClassifier...
Done!
Training time (secs): 0.012
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 0.003
F1 sco

In [46]:
# Summary table for the Random Forest runs, one column per training-set size.
# NOTE(review): the figures are hard-coded (they appear to be copied from the
# printed output above, cut to 3 decimals), so they go stale on re-run.
items = [
    "Training Time (secs)",
    "Prediction Time for Training Set (secs)",
    "Prediction Time for Testing Set (secs)",
    "F1 Score for Training Set",
    "F1 Score for Testing Set",
]
results = dict([
    ('100', [0.008, 0.002, 0.002, 0.960, 0.728]),
    ('200', [0.012, 0.002, 0.001, 0.988, 0.706]),
    ('300', [0.012, 0.003, 0.002, 0.990, 0.800]),
])

pd.DataFrame(results, index=items, columns=['100', '200', '300'])

Unnamed: 0,100,200,300
Training Time (secs),0.008,0.012,0.012
Prediction Time for Training Set (secs),0.002,0.002,0.003
Prediction Time for Testing Set (secs),0.002,0.001,0.002
F1 Score for Training Set,0.96,0.988,0.99
F1 Score for Testing Set,0.728,0.706,0.8


### Model 2 Support Vector Machine:
- General Applications:
    Support vector machines are supervised learning models which are used for classification and regression analysis.
- Strengths:
    1. Optimizes the classification error rather than the likelihood.
    2. They work really well in complicated domains where there is a clear margin of separation.
- Weaknesses:
    1. They don't perform so well on very large data sets, because the training time happens to be cubic in the size of the data set.
    2. They don't work well with lots of noise: they might be prone to overfitting to some of the noise. So when the classes overlap heavily and you have to count independent evidence, that's where a naive Bayes classifier would be better.
- Reason:
    In this case, we have a large number of features and the data set is not very large, so we choose an SVM, which can perform well in complicated domains.

In [22]:
# Second model: Support Vector Machine with its default settings
from sklearn.svm import SVC
clf = SVC()

# Same experiment as before: 100, 200 and 300 training samples
for train_size in (100, 200, 300):
    X_train, y_train, X_test, y_test = shuffle_split_data(train_size)
    train_predict(clf, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size: 100
Training SVC...
Done!
Training time (secs): 0.002
Predicting labels using SVC...
Done!
Prediction time (secs): 0.001
F1 score for training set: 0.859060402685
Predicting labels using SVC...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.833333333333
------------------------------------------
Training set size: 200
Training SVC...
Done!
Training time (secs): 0.005
Predicting labels using SVC...
Done!
Prediction time (secs): 0.003
F1 score for training set: 0.858064516129
Predicting labels using SVC...
Done!
Prediction time (secs): 0.002
F1 score for test set: 0.84076433121
------------------------------------------
Training set size: 300
Training SVC...
Done!
Training time (secs): 0.012
Predicting labels using SVC...
Done!
Prediction time (secs): 0.008
F1 score for training set: 0.858387799564
Predicting labels using SVC...
Done!
Prediction time (secs): 0.003
F1 score for test set: 0.846153846154


In [47]:
# Summary table for the SVC runs, one column per training-set size.
# NOTE(review): the figures are hard-coded (they appear to be copied from the
# printed output above, cut to 3 decimals), so they go stale on re-run.
items = [
    "Training Time (secs)",
    "Prediction Time for Training Set (secs)",
    "Prediction Time for Testing Set (secs)",
    "F1 Score for Training Set",
    "F1 Score for Testing Set",
]
results = dict([
    ('100', [0.002, 0.001, 0.001, 0.859, 0.833]),
    ('200', [0.005, 0.003, 0.002, 0.858, 0.840]),
    ('300', [0.012, 0.008, 0.003, 0.858, 0.846]),
])

pd.DataFrame(results, index=items, columns=['100', '200', '300'])

Unnamed: 0,100,200,300
Training Time (secs),0.002,0.005,0.012
Prediction Time for Training Set (secs),0.001,0.003,0.008
Prediction Time for Testing Set (secs),0.001,0.002,0.003
F1 Score for Training Set,0.859,0.858,0.858
F1 Score for Testing Set,0.833,0.84,0.846


### Model 3 Gradient Boosting:
- General Applications:
    Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models.
- Strengths:
    1. Can handle categorical features very well.
    2. Can handle high-dimensional spaces as well as a large number of training examples very well.
    3. Gradient Boosting will usually perform better compared to Random Forest.
- Weaknesses:
    1. Harder to get right compared to Random Forest, since Gradient Boosting has more hyper-parameters to tune and is more prone to overfitting.
- Reason:
    These are categorical features and this is a classification problem, and we have a large number of features.

In [23]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Only configure the estimator here. It used to be fitted right away on
# whatever X_train/y_train an earlier cell had left behind; that fit was
# redundant because train_predict() fits the model again for every size below.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0)

# Same experiment as before: 100, 200 and 300 training samples
for train_size in (100, 200, 300):
    X_train, y_train, X_test, y_test = shuffle_split_data(train_size)
    train_predict(clf, X_train, y_train, X_test, y_test)

------------------------------------------
Training set size: 100
Training GradientBoostingClassifier...
Done!
Training time (secs): 0.027
Predicting labels using GradientBoostingClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 0.984848484848
Predicting labels using GradientBoostingClassifier...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.65
------------------------------------------
Training set size: 200
Training GradientBoostingClassifier...
Done!
Training time (secs): 0.037
Predicting labels using GradientBoostingClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 0.901098901099
Predicting labels using GradientBoostingClassifier...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.772727272727
------------------------------------------
Training set size: 300
Training GradientBoostingClassifier...
Done!
Training time (secs): 0.037
Predicting labels using GradientBoostingClassifier...
Done!
Prediction ti

In [48]:
# Summary table for the Gradient Boosting runs, one column per training-set size.
# NOTE(review): the figures are hard-coded (they appear to be copied from the
# printed output above, cut to 3 decimals), so they go stale on re-run.
items = [
    "Training Time (secs)",
    "Prediction Time for Training Set (secs)",
    "Prediction Time for Testing Set (secs)",
    "F1 Score for Training Set",
    "F1 Score for Testing Set",
]
results = dict([
    ('100', [0.027, 0.000, 0.000, 0.984, 0.650]),
    ('200', [0.037, 0.000, 0.000, 0.901, 0.772]),
    ('300', [0.037, 0.001, 0.000, 0.875, 0.776]),
])

pd.DataFrame(results, index=items, columns=['100', '200', '300'])

Unnamed: 0,100,200,300
Training Time (secs),0.027,0.037,0.037
Prediction Time for Training Set (secs),0.0,0.0,0.001
Prediction Time for Testing Set (secs),0.0,0.0,0.0
F1 Score for Training Set,0.984,0.901,0.875
F1 Score for Testing Set,0.65,0.772,0.776


#### Best Model
    
The best model in this example is the Support Vector Machine. Comparing training times, Gradient Boosting takes longer than Random Forest and the Support Vector Machine. The training-set F1 scores of Random Forest and Gradient Boosting are higher than that of the Support Vector Machine; however, the Support Vector Machine has the best F1 score on the test set, so it overfits less than the other two models.

So we chose the SVM as the best model, since its training time is almost the same as that of Random Forest, but it performs best on the test set.

#### How the model chosen is supposed to work:

This project is a classification problem: we are provided with lots of features of students, such as age, weekly study time, and number of past class failures, and we want to predict whether or not a student will pass the final exam based on these features.

The best model we chose here is the SVM. What an SVM does is find a separating line between the data of two classes. Suppose we have some data of two different classes: an SVM is an algorithm that takes this data as input and outputs a line that separates those classes. The SVM chooses the line that maximizes the distance to the nearest points of either of the two classes. So with an SVM, we are trying to find the separating line which classifies the two classes correctly and, subject to that constraint, maximizes the distance to the nearest points from the two classes.

In this example, we have lots of information about students, and we can classify them into two classes based on whether or not they pass the final exam. We consider all students as two types of points in a plane. In the training process, we try to find lines that correctly separate the two classes of students; among all of these lines, we pick the one which maximizes the distance to the nearest points from the two classes, and then training is finished. In the prediction process, for example, if we know all the features of a new student, we can locate this student as a point in the plane; then all we need to do is check which side of the separating line this student lies on, and we predict that the student belongs to the same class as the other points on that side.

In [84]:
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn import svm, datasets
# GridSearchCV is provided by sklearn.model_selection in current scikit-learn.
# Bind that module to the name `grid_search` so code written against the old
# sklearn.grid_search module keeps working; fall back to the old module on
# installations that predate model_selection.
try:
    from sklearn import model_selection as grid_search
except ImportError:
    from sklearn import grid_search

# Tune on the 300-sample training split
X_train, y_train, X_test, y_test = shuffle_split_data(300)
def f1_metrics(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true`` with 'yes' as the positive label."""
    return f1_score(y_true, y_pred, pos_label='yes')

# grid search
def grid_search_SVM(X, y):
    """Run a grid search over SVC hyper-parameters, scored by F1.

    Candidates are every combination of the ``C``, ``kernel``, ``tol`` and
    ``probability`` values listed below; each one is scored with
    ``f1_metrics`` wrapped by ``make_scorer``.

    Parameters
    ----------
    X, y
        Training features and labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    tuple
        ``(grid, best_estimator)``: the fitted ``GridSearchCV`` object and its
        ``best_estimator_`` attribute. Note that the second element is an
        estimator, not a parameter dictionary.
    """
    # Use the SVC class from the `svm` module imported in this cell rather
    # than relying on a name imported by an earlier cell.
    clf = svm.SVC()
    # NOTE(review): `probability` presumably only affects predict_proba, not
    # the predicted labels, so searching over it may just double the run time
    # -- confirm before removing it from the grid.
    parameters = {'C': (0.001, 0.01, 1, 10, 100),
                  'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
                  'tol': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1),
                  'probability': (True, False)
                  }
    scoring_function = make_scorer(f1_metrics)
    # Pass the scorer by keyword: recent scikit-learn releases only accept
    # `scoring` as a keyword argument.
    grid = grid_search.GridSearchCV(clf, parameters, scoring=scoring_function)
    grid.fit(X, y)
    return grid, grid.best_estimator_

# `model` is the fitted grid search; `parameters` receives the best estimator found
model, parameters = grid_search_SVM(X_train, y_train)
print("Parameters of the grid search: ")
print(parameters)

Parameters of the grid search: 
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=1e-05, verbose=False)


In [88]:
# Score the grid-search result on the complete data set.
# NOTE(review): X_all/y_all contain the rows the search was fitted on as well as
# the held-out rows, so this F1 is not comparable to the test-set scores above.
predict_labels(model, X_all, y_all)

Predicting labels using GridSearchCV...
Done!
Prediction time (secs): 0.010


0.85528455284552862

#### The tuned parameters:
- kernel = 'rbf'
- C = 1
- tol = 1e-5
- probability = True

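F1 score of the tuned model on the entire data set (training and test rows together):
F1 score = 0.855, which is higher than the test-set F1 score (0.846) before tuning the parameters. Note that the two scores are measured on different rows, so they are not directly comparable.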