In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from time import time
from sklearn.metrics import f1_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the student dataset CSV from the Kaggle input directory into a DataFrame.
student_data=pd.read_csv('/kaggle/input/student-intervension/student-data.csv')

In [7]:
# Display the column labels of the loaded DataFrame.
student_data.columns

In [11]:
# Basic dataset statistics: row count, column count minus one, and how many
# rows have 'passed' equal to 'yes' versus everything else.
n_students = student_data.shape[0]
# NOTE(review): the name is misspelled ("fetaures"); it is kept unchanged here
# because cells outside this one may reference it.
n_fetaures = student_data.shape[1] - 1
n_passed = int((student_data['passed'] == 'yes').sum())
n_failed = n_students - n_passed
# Share of 'yes' rows, as a percentage of all rows.
grad_rate = 100.0 * n_passed / n_students

In [13]:
# Total number of missing values in the whole frame:
# per-column null counts first, then the sum of those counts.
student_data.isnull().sum().sum()

In [32]:
# Treat the last column as the prediction target and every column before it
# as an input feature.
all_columns = student_data.columns
target_col = all_columns[-1]
feature_cols = list(all_columns[:-1])

# Feature frame and target series used by the rest of the notebook.
X_all = student_data.loc[:, feature_cols]
y_all = student_data.loc[:, target_col]

In [33]:
# Show the first entry of the target series.
y_all.head(1)

In [34]:
# Show the first two rows of the feature frame.
X_all.head(2)

In [35]:
def _is_text_column(series):
    """Return True when ``series`` stores text: object dtype or a pandas string dtype."""
    return series.dtype == object or isinstance(series.dtype, pd.StringDtype)


def preprocess_features(X):
    """Turn a mixed-type feature frame into an all-numeric one.

    Each column of ``X`` is handled independently:

    * text columns have the values ``'yes'`` / ``'no'`` replaced by ``1`` / ``0``;
    * text columns that still hold text after that replacement are expanded
      into indicator (dummy) columns named ``<column>_<value>``;
    * every other column is copied through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame to convert. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame sharing ``X``'s index, with the converted columns in the
        original column order.
    """
    output = pd.DataFrame(index=X.index)

    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent spelling and is also available in older releases.
    for col, col_data in X.items():
        if _is_text_column(col_data):
            # Work on an object copy so the replacement values (ints) are
            # accepted regardless of the column's string dtype, then ask for
            # dtype inference explicitly instead of relying on replace() to
            # downcast implicitly (that implicit downcast is deprecated).
            col_data = (
                col_data.astype(object)
                .replace(['yes', 'no'], [1, 0])
                .infer_objects()
            )

        # Still text after the yes/no replacement: treat the column as
        # categorical and one-hot encode it.
        if _is_text_column(col_data):
            col_data = pd.get_dummies(col_data, prefix=col)

        output = output.join(col_data)
    return output


In [37]:
# Replace the feature frame with its preprocessed version.
# NOTE: X_all is reassigned to the result, so the unprocessed feature frame is
# no longer reachable under this name once the cell has run.
X_all=preprocess_features(X_all)
# Display the column labels after preprocessing.
X_all.columns

In [40]:
from sklearn.model_selection import train_test_split
# Hold out everything beyond the first 300 samples' worth of data for testing.
num_train = 300
num_test = len(X_all) - num_train

# Stratify on the target so both splits keep the same label proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    train_size=num_train,
    test_size=num_test,
    stratify=y_all,
    random_state=20,
)



In [42]:
# Display the dimensions of the held-out test features.
X_test.shape

In [48]:
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given training data and print how long fitting took.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)``
        Model to train; it is fitted in place.
    X_train, y_train
        Training features and labels, forwarded to ``clf.fit`` unchanged.

    Returns
    -------
    None
    """
    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    print("Trained model in {:.4f} seconds".format(elapsed))

def predict_labels(clf, features, target):
    """Predict with ``clf``, print the prediction time, and return the F1 score.

    Parameters
    ----------
    clf : object exposing ``predict(X)``
        Already-fitted model.
    features
        Inputs forwarded to ``clf.predict``.
    target
        True labels; must expose ``.values`` (e.g. a pandas Series).

    Returns
    -------
    The value of ``f1_score`` computed with ``'yes'`` as the positive label.
    """
    started_at = time()
    y_pred = clf.predict(features)
    elapsed = time() - started_at

    print("Made predictions in {:.4f} seconds".format(elapsed))
    true_labels = target.values
    return f1_score(true_labels, y_pred, pos_label='yes')

def train_predict(clf, X_train, y_train, X_test, y_test):
    """Train ``clf`` and report its F1 score on the training and test data.

    The model is fitted via ``train_classifier``; ``predict_labels`` is then
    called once for the training data and once for the test data, and each
    score is printed right after it has been computed.

    Returns
    -------
    None
    """
    train_classifier(clf, X_train, y_train)

    # Score and report each split in turn so the timing line printed by
    # predict_labels stays directly above the score it belongs to.
    for message, data, labels in (
        ("F1 score for training set: {:.4f}", X_train, y_train),
        ("F1 score for test set: {:.4f}", X_test, y_test),
    ):
        print(message.format(predict_labels(clf, data, labels)))

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [51]:
# One untuned instance of each candidate model, each with a fixed random_state.
clf_A = DecisionTreeClassifier(random_state=0)
clf_B = SVC(random_state=0)
clf_C = AdaBoostClassifier(random_state=0)

# For each training-set size, fit and score the three models in A, B, C order
# on the first `size` training rows, always evaluating on the full test split.
for size in [100, 200, 300]:
    for model in (clf_A, clf_B, clf_C):
        train_predict(model, X_train[:size], y_train[:size], X_test, y_test)

**Classifier 1 - Decision Trees**  

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 100               |        0.0042           |        0.0020          |       1.00       |      0.6769     |
| 200               |        0.0038           |        0.0020          |       1.00       |      0.6720     |
| 300               |        0.0017           |        0.0015          |       1.00       |      0.6667     |

**Classifier 2 - SVM**  

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 100               |      0.0037             |       0.0029           |      0.7927      |     0.8050      |
| 200               |      0.0044             |       0.0040           |      0.7964      |     0.8050      |
| 300               |      0.0067             |       0.0033           |      0.8040      |     0.8050      |

**Classifier 3 - AdaBoost**  

| Training Set Size | Training Time | Prediction Time (test) | F1 Score (train) | F1 Score (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 100               |      0.0699             |        0.0093          |      0.9403      |     0.6957      |
| 200               |      0.0781             |        0.0103          |      0.8623      |     0.7231      |
| 300               |      0.0847             |        0.0116          |      0.8505      |     0.7626      |


In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import Normalizer

In [59]:
# Rescale the feature matrices before tuning: the normalizer is fitted on the
# training split and then applied as-is to the test split.
# NOTE(review): scikit-learn's Normalizer rescales each sample (row) to unit
# norm rather than scaling each feature (column) — confirm this is the intended
# preprocessing.
# NOTE: X_train / X_test are overwritten, so re-running this cell feeds the
# already-transformed arrays back through the normalizer.
normer = Normalizer()
X_train = normer.fit_transform(X_train)
X_test = normer.transform(X_test)

# Search grid: 11 log-spaced values each for C (1e-2 .. 1e8) and gamma (1e-7 .. 1e3).
c_range = np.logspace(start=-2, stop=8, num=11)
gamma_range = np.logspace(start=-7, stop=3, num=11)
parameters = {'gamma': gamma_range, 'C': c_range}

clf = SVC(random_state=0)

# Score candidates by F1 with 'yes' as the positive label, the same metric
# predict_labels reports.
f1_scorer = make_scorer(f1_score, pos_label='yes')

# Exhaustive search over the grid with 5-fold cross-validation, single process.
grid_obj = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring=f1_scorer,
    n_jobs=1,
    cv=5,
)

grid_obj = grid_obj.fit(X_train, y_train)

# From here on `clf` refers to the best estimator found by the search.
clf = grid_obj.best_estimator_

print("Tuned model has a train F1 Score of {:.4f}".format(predict_labels(clf, X_train, y_train)))
print("Tuned model has a test F1 Score of {:.4f}".format(predict_labels(clf, X_test, y_test)))