In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", palette="muted", color_codes=True)
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import BernoulliRBM
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import scale
import pydot
import sqlite3
import time
import pickle

In [2]:
con = sqlite3.connect('database.sqlite')

In [3]:
sample = pd.read_sql_query("""
SELECT Year, ADM_RATE_ALL, COSTT4_A, UGDS_BLACK, INC_PCT_LO, PAR_ED_PCT_1STGEN, 
       PCTFLOAN, C150_4
FROM Scorecard
WHERE  COSTT4_A != 'PrivacySuppressed' AND COSTT4_A IS NOT NULL
    AND ADM_RATE_ALL != 'PrivacySuppressed' AND ADM_RATE_ALL IS NOT NULL
    AND UGDS_BLACK != 'PrivacySuppressed' AND UGDS_BLACK IS NOT NULL
    AND INC_PCT_LO != 'PrivacySuppressed' AND INC_PCT_LO IS NOT NULL
    AND PAR_ED_PCT_1STGEN != 'PrivacySuppressed' AND PAR_ED_PCT_1STGEN IS NOT NULL
    AND PCTFLOAN != 'PrivacySuppressed' AND PCTFLOAN IS NOT NULL
    AND C150_4 != 'PrivacySuppressed' AND C150_4 IS NOT NULL
""",con)

In [4]:
data = pd.DataFrame(sample)

In [5]:
data.to_csv('data_2.csv')

In [6]:
data['completion_class'] = 'temp'

In [7]:
data['completion_class'][(data['C150_4'] < data['C150_4'].mean())] = 'Below Average'
data['completion_class'][(data['C150_4'] >= data['C150_4'].mean())] = 'Above Average'

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [8]:
# data['completion_class'][(data['C150_4'] >= data['C150_4'].quantile(.75))] = 'Above 75th %'
# data['completion_class'][(data['C150_4'] < data['C150_4'].quantile(.75))] = 'Between 50th and 75th %'
# data['completion_class'][(data['C150_4'] < data['C150_4'].quantile(.5))] = 'Between 25th and 50th %'
# data['completion_class'][(data['C150_4'] < data['C150_4'].quantile(.25))] = 'Below 25th %'

In [9]:
y = data.completion_class
X = data.drop(['C150_4', 'completion_class', 'Year'], axis=1).values
y_train, y_test, X_train, X_test = train_test_split(y, X, 
                                                    test_size=.3, 
                                                    random_state=515)

In [10]:
y.value_counts()

Below Average    2547
Above Average    2398
dtype: int64

#### Decision Tree

In [11]:
modDT = DecisionTreeClassifier(max_depth=6)
modDT.fit(X_train, y_train)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=6, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [12]:
print metrics.classification_report(y_test,modDT.predict(X_test))

             precision    recall  f1-score   support

Above Average       0.80      0.76      0.78       744
Below Average       0.77      0.81      0.79       740

avg / total       0.79      0.79      0.79      1484



#### Random Forest

In [13]:
modRF = RandomForestClassifier(n_estimators=1000)
modRF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [14]:
print metrics.classification_report(y_test,modRF.predict(X_test))

             precision    recall  f1-score   support

Above Average       0.84      0.78      0.81       744
Below Average       0.80      0.85      0.82       740

avg / total       0.82      0.82      0.82      1484



In [15]:
importance = modRF.feature_importances_

In [16]:
features = data.drop(['C150_4', 'completion_class', 'Year'], axis=1).columns

In [17]:
zip(features, importance)

[('ADM_RATE_ALL', 0.10038769259817625),
 ('COSTT4_A', 0.18776895579133862),
 ('UGDS_BLACK', 0.11709780614321259),
 ('INC_PCT_LO', 0.27427346401531699),
 ('PAR_ED_PCT_1STGEN', 0.21354658196228204),
 ('PCTFLOAN', 0.10692549948967285)]

#### Miltinomial Naive Bayes

In [18]:
modNB = MultinomialNB(alpha=1)
modNB.fit(X_train,y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [19]:
print metrics.classification_report(y_test,modNB.predict(X_test))

             precision    recall  f1-score   support

Above Average       0.78      0.69      0.73       744
Below Average       0.72      0.81      0.76       740

avg / total       0.75      0.75      0.75      1484



#### Logistic Regression

In [20]:
modLog = LogisticRegression()
modLog.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [21]:
print metrics.classification_report(y_test,modLog.predict(X_test))

             precision    recall  f1-score   support

Above Average       0.74      0.69      0.72       744
Below Average       0.71      0.75      0.73       740

avg / total       0.72      0.72      0.72      1484



#### Gradient Boosting Machines

In [22]:
modGBM = GradientBoostingClassifier(learning_rate = .05, subsample = .9, verbose = 1)
modGBM.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3458           0.0438            2.55s
         2           1.3091           0.0352            1.52s
         3           1.2770           0.0315            1.19s
         4           1.2458           0.0280            1.03s
         5           1.2207           0.0288            0.91s
         6           1.1947           0.0236            0.83s
         7           1.1717           0.0206            0.80s
         8           1.1495           0.0211            0.76s
         9           1.1301           0.0208            0.72s
        10           1.1096           0.0166            0.70s
        20           0.9745           0.0070            0.55s
        30           0.9008           0.0047            0.45s
        40           0.8522           0.0021            0.36s
        50           0.8245           0.0003            0.29s
        60           0.7860          -0.0005            0.23s
       

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.9, verbose=1,
              warm_start=False)

In [23]:
print metrics.classification_report(y_test,modGBM.predict(X_test))

             precision    recall  f1-score   support

Above Average       0.83      0.76      0.80       744
Below Average       0.78      0.85      0.81       740

avg / total       0.81      0.80      0.80      1484



#### Restricted Boltzman Machine for Feature Selection Into Random Forest

In [24]:
RBM = BernoulliRBM(learning_rate=.1, n_iter = 20, n_components = 50)
RF = RandomForestClassifier(n_estimators=1000)
classifier = Pipeline([("rbm", RBM), ("randomforest", RF)])

In [25]:
classifier.fit(X_train, y_train)

Pipeline(steps=[('rbm', BernoulliRBM(batch_size=10, learning_rate=0.1, n_components=50, n_iter=20,
       random_state=None, verbose=0)), ('randomforest', RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0))])

In [26]:
print metrics.classification_report(y_test,classifier.predict(X_test))

             precision    recall  f1-score   support

Above Average       0.00      0.00      0.00       744
Below Average       0.50      1.00      0.67       740

avg / total       0.25      0.50      0.33      1484



  'precision', 'predicted', average, warn_for)


##### Scale Data

In [27]:
X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

#### SVM

In [28]:
modSVM = SVC()
modSVM.fit(X_train_scaled,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
print metrics.classification_report(y_test,modSVM.predict(X_test_scaled))

             precision    recall  f1-score   support

Above Average       0.85      0.73      0.78       744
Below Average       0.76      0.87      0.81       740

avg / total       0.81      0.80      0.80      1484



#### KNN

In [30]:
modKNN = KNeighborsClassifier(n_neighbors=10)
modKNN.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')

In [31]:
print metrics.classification_report(y_test,modKNN.predict(X_test_scaled))

             precision    recall  f1-score   support

Above Average       0.80      0.78      0.79       744
Below Average       0.79      0.81      0.80       740

avg / total       0.79      0.79      0.79      1484



### Two Fields

In [32]:
ADM_RATE_ALL_train = X_train[:,0]
COSTT4_A_train = X_train[:,1]
UGDS_BLACK_train = X_train[:,2]
INC_PCT_LO_train = X_train[:,3]
PAR_ED_PCT_1STGEN_train = X_train[:,4]
PCTFLOAN_train = X_train[:,5]

In [33]:
ADM_RATE_ALL_test = X_test[:,0]
COSTT4_A_test = X_test[:,1]
UGDS_BLACK_test = X_test[:,2]
INC_PCT_LO_test = X_test[:,3]
PAR_ED_PCT_1STGEN_test = X_test[:,4]
PCTFLOAN_test = X_test[:,5]

In [34]:
ADM_RATE_ALL_scaled_train = X_train_scaled[:,0]
COSTT4_A_scaled_train = X_train_scaled[:,1]
UGDS_BLACK_scaled_train = X_train_scaled[:,2]
INC_PCT_LO_scaled_train = X_train_scaled[:,3]
PAR_ED_PCT_1STGEN_scaled_train = X_train_scaled[:,4]
PCTFLOAN_scaled_train = X_train_scaled[:,5]

In [35]:
ADM_RATE_ALL_scaled_test = X_test_scaled[:,0]
COSTT4_A_scaled_test = X_test_scaled[:,1]
UGDS_BLACK_scaled_test = X_test_scaled[:,2]
INC_PCT_LO_scaled_test = X_test_scaled[:,3]
PAR_ED_PCT_1STGEN_scaled_test = X_test_scaled[:,4]
PCTFLOAN_scaled_test = X_test_scaled[:,5]

##### Cost and % Low Income Students

In [36]:
X_Cost_PctLo_train = np.column_stack((COSTT4_A_train, INC_PCT_LO_train))
X_Cost_PctLo_test = np.column_stack((COSTT4_A_test, INC_PCT_LO_test))

##### Decision Tree

In [37]:
modDT = DecisionTreeClassifier(max_depth=6)
modDT.fit(X_Cost_PctLo_train, y_train)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=6, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [38]:
print metrics.classification_report(y_test,modDT.predict(X_Cost_PctLo_test))

             precision    recall  f1-score   support

Above Average       0.79      0.78      0.79       744
Below Average       0.78      0.79      0.79       740

avg / total       0.79      0.79      0.79      1484



##### Random Forest

In [39]:
modRF = RandomForestClassifier(n_estimators=1000)
modRF.fit(X_Cost_PctLo_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [40]:
print metrics.classification_report(y_test,modRF.predict(X_Cost_PctLo_test))

             precision    recall  f1-score   support

Above Average       0.79      0.75      0.77       744
Below Average       0.76      0.80      0.78       740

avg / total       0.77      0.77      0.77      1484



In [41]:
modRF.feature_importances_

array([ 0.43816578,  0.56183422])

##### Multinomial Naive Bayes

In [42]:
modNB = MultinomialNB(alpha=1)
modNB.fit(X_Cost_PctLo_train, y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [43]:
print metrics.classification_report(y_test,modNB.predict(X_Cost_PctLo_test))

             precision    recall  f1-score   support

Above Average       0.88      0.61      0.72       744
Below Average       0.70      0.91      0.79       740

avg / total       0.79      0.76      0.76      1484



##### Logistic Regressions

In [44]:
modLog = LogisticRegression()
modLog.fit(X_Cost_PctLo_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [45]:
print metrics.classification_report(y_test,modLog.predict(X_Cost_PctLo_test))

             precision    recall  f1-score   support

Above Average       0.66      0.77      0.71       744
Below Average       0.72      0.60      0.66       740

avg / total       0.69      0.69      0.69      1484



##### Gradient Boosting Machine

In [46]:
modGBM = GradientBoostingClassifier(learning_rate = .05, subsample = .9, verbose = 1)
modGBM.fit(X_Cost_PctLo_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3450           0.0361            0.39s
         2           1.3093           0.0316            0.33s
         3           1.2790           0.0320            0.31s
         4           1.2493           0.0299            0.30s
         5           1.2191           0.0236            0.29s
         6           1.1978           0.0243            0.29s
         7           1.1737           0.0211            0.28s
         8           1.1505           0.0166            0.28s
         9           1.1327           0.0179            0.27s
        10           1.1138           0.0147            0.27s
        20           0.9968           0.0086            0.25s
        30           0.9430           0.0038            0.24s
        40           0.8965           0.0008            0.20s
        50           0.8839           0.0006            0.16s
        60           0.8689           0.0007            0.12s
       

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.9, verbose=1,
              warm_start=False)

In [47]:
print metrics.classification_report(y_test,modGBM.predict(X_Cost_PctLo_test))

             precision    recall  f1-score   support

Above Average       0.82      0.77      0.79       744
Below Average       0.78      0.83      0.80       740

avg / total       0.80      0.80      0.80      1484



##### SVM

In [48]:
X_Cost_PctLo_train_scaled = np.column_stack((COSTT4_A_scaled_train, INC_PCT_LO_scaled_train))
X_Cost_PctLo_test_scaled = np.column_stack((COSTT4_A_scaled_test, INC_PCT_LO_scaled_test))

In [49]:
modSVM = SVC()
modSVM.fit(X_Cost_PctLo_train_scaled,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [50]:
print metrics.classification_report(y_test,modSVM.predict(X_Cost_PctLo_test_scaled))

             precision    recall  f1-score   support

Above Average       0.83      0.72      0.78       744
Below Average       0.76      0.86      0.80       740

avg / total       0.79      0.79      0.79      1484



##### KNN

In [51]:
modKNN = KNeighborsClassifier(n_neighbors = 15)
modKNN.fit(X_Cost_PctLo_train_scaled,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=15, p=2, weights='uniform')

In [52]:
print metrics.classification_report(y_test,modKNN.predict(X_Cost_PctLo_test_scaled))

             precision    recall  f1-score   support

Above Average       0.82      0.73      0.77       744
Below Average       0.76      0.84      0.79       740

avg / total       0.79      0.78      0.78      1484



##### Cost and 1st Gen Students

In [53]:
X_Cost_1stGen_train = np.column_stack((COSTT4_A_train, PAR_ED_PCT_1STGEN_train))
X_Cost_1stGen_test = np.column_stack((COSTT4_A_test, PAR_ED_PCT_1STGEN_test))

##### Decision Tree

In [54]:
modDT = DecisionTreeClassifier(max_depth=6)
modDT.fit(X_Cost_1stGen_train, y_train)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=6, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [55]:
print metrics.classification_report(y_test,modDT.predict(X_Cost_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.80      0.70      0.75       744
Below Average       0.73      0.83      0.78       740

avg / total       0.77      0.76      0.76      1484



##### Random Forest

In [56]:
modRF = RandomForestClassifier(n_estimators=1000)
modRF.fit(X_Cost_1stGen_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [57]:
print metrics.classification_report(y_test,modRF.predict(X_Cost_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.76      0.73      0.74       744
Below Average       0.74      0.76      0.75       740

avg / total       0.75      0.75      0.75      1484



In [58]:
modRF.feature_importances_

array([ 0.47304865,  0.52695135])

##### Multinomial Naive Bayes

In [59]:
modNB = MultinomialNB(alpha=1)
modNB.fit(X_Cost_1stGen_train, y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [60]:
print metrics.classification_report(y_test,modNB.predict(X_Cost_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.90      0.49      0.63       744
Below Average       0.65      0.94      0.77       740

avg / total       0.77      0.71      0.70      1484



##### Logistic Regressions

In [61]:
modLog = LogisticRegression()
modLog.fit(X_Cost_1stGen_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [62]:
print metrics.classification_report(y_test,modLog.predict(X_Cost_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.53      0.99      0.69       744
Below Average       0.95      0.11      0.19       740

avg / total       0.74      0.55      0.44      1484



##### Gradient Boosting Machine

In [63]:
modGBM = GradientBoostingClassifier(learning_rate = .05, subsample = .9, verbose = 1)
modGBM.fit(X_Cost_1stGen_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3497           0.0330            0.40s
         2           1.3190           0.0299            0.34s
         3           1.2886           0.0266            0.32s
         4           1.2622           0.0212            0.32s
         5           1.2404           0.0233            0.31s
         6           1.2163           0.0189            0.32s
         7           1.1958           0.0174            0.31s
         8           1.1787           0.0150            0.30s
         9           1.1623           0.0171            0.29s
        10           1.1459           0.0151            0.29s
        20           1.0415           0.0079            0.24s
        30           0.9868           0.0025            0.22s
        40           0.9594           0.0015            0.18s
        50           0.9290           0.0003            0.16s
        60           0.9171          -0.0007            0.12s
       

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.9, verbose=1,
              warm_start=False)

In [64]:
print metrics.classification_report(y_test,modGBM.predict(X_Cost_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.81      0.72      0.76       744
Below Average       0.75      0.83      0.79       740

avg / total       0.78      0.77      0.77      1484



##### SVM

In [65]:
X_Cost_1stGen_train_scaled = np.column_stack((COSTT4_A_scaled_train, PAR_ED_PCT_1STGEN_scaled_train))
X_Cost_1stGen_test_scaled = np.column_stack((COSTT4_A_scaled_test, PAR_ED_PCT_1STGEN_scaled_test))

In [66]:
modSVM = SVC()
modSVM.fit(X_Cost_1stGen_train_scaled,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [67]:
print metrics.classification_report(y_test,modSVM.predict(X_Cost_1stGen_test_scaled))

             precision    recall  f1-score   support

Above Average       0.84      0.67      0.74       744
Below Average       0.72      0.87      0.79       740

avg / total       0.78      0.77      0.77      1484



##### KNN

In [68]:
modKNN = KNeighborsClassifier(n_neighbors = 10)
modKNN.fit(X_Cost_1stGen_train_scaled,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')

In [69]:
print metrics.classification_report(y_test,modKNN.predict(X_Cost_1stGen_test_scaled))

             precision    recall  f1-score   support

Above Average       0.77      0.75      0.76       744
Below Average       0.76      0.77      0.76       740

avg / total       0.76      0.76      0.76      1484



##### % Lo Income and 1st Gen Students

In [70]:
X_PctLo_1stGen_train = np.column_stack((INC_PCT_LO_train, PAR_ED_PCT_1STGEN_train))
X_PctLo_1stGen_test = np.column_stack((INC_PCT_LO_test, PAR_ED_PCT_1STGEN_test))

##### Decision Tree

In [71]:
modDT = DecisionTreeClassifier(max_depth=6)
modDT.fit(X_PctLo_1stGen_train, y_train)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=6, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [72]:
print metrics.classification_report(y_test,modDT.predict(X_PctLo_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.79      0.75      0.77       744
Below Average       0.76      0.79      0.78       740

avg / total       0.77      0.77      0.77      1484



##### Random Forest

In [73]:
modRF = RandomForestClassifier(n_estimators=1000)
modRF.fit(X_PctLo_1stGen_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [74]:
print metrics.classification_report(y_test,modRF.predict(X_PctLo_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.79      0.72      0.75       744
Below Average       0.74      0.80      0.77       740

avg / total       0.76      0.76      0.76      1484



In [75]:
modRF.feature_importances_

array([ 0.52690631,  0.47309369])

##### Multinomial Naive Bayes

In [76]:
modNB = MultinomialNB(alpha=1)
modNB.fit(X_PctLo_1stGen_train, y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [77]:
print metrics.classification_report(y_test,modNB.predict(X_PctLo_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.00      0.00      0.00       744
Below Average       0.50      1.00      0.67       740

avg / total       0.25      0.50      0.33      1484



##### Logistic Regressions

In [78]:
modLog = LogisticRegression()
modLog.fit(X_PctLo_1stGen_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [79]:
print metrics.classification_report(y_test,modLog.predict(X_PctLo_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.78      0.77      0.78       744
Below Average       0.77      0.79      0.78       740

avg / total       0.78      0.78      0.78      1484



##### Gradient Boosting Machine

In [80]:
modGBM = GradientBoostingClassifier(learning_rate = .05, subsample = .9, verbose = 1)
modGBM.fit(X_PctLo_1stGen_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3457           0.0415            0.38s
         2           1.3105           0.0339            0.39s
         3           1.2782           0.0283            0.37s
         4           1.2514           0.0278            0.34s
         5           1.2258           0.0270            0.32s
         6           1.1997           0.0230            0.31s
         7           1.1791           0.0228            0.30s
         8           1.1585           0.0189            0.29s
         9           1.1405           0.0184            0.28s
        10           1.1214           0.0159            0.28s
        20           1.0082           0.0072            0.24s
        30           0.9495           0.0027            0.21s
        40           0.9165           0.0009            0.18s
        50           0.9104           0.0001            0.15s
        60           0.9018           0.0006            0.12s
       

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=0.9, verbose=1,
              warm_start=False)

In [81]:
print metrics.classification_report(y_test,modGBM.predict(X_PctLo_1stGen_test))

             precision    recall  f1-score   support

Above Average       0.80      0.74      0.77       744
Below Average       0.76      0.81      0.79       740

avg / total       0.78      0.78      0.78      1484



##### SVM

In [82]:
X_PctLo_1stGen_train_scaled = np.column_stack((INC_PCT_LO_scaled_train, PAR_ED_PCT_1STGEN_scaled_train))
X_PctLo_1stGen_test_scaled = np.column_stack((INC_PCT_LO_scaled_test, PAR_ED_PCT_1STGEN_scaled_test))

In [83]:
modSVM = SVC()
modSVM.fit(X_PctLo_1stGen_train_scaled,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [84]:
print metrics.classification_report(y_test,modSVM.predict(X_PctLo_1stGen_test_scaled))

             precision    recall  f1-score   support

Above Average       0.82      0.71      0.76       744
Below Average       0.74      0.85      0.79       740

avg / total       0.78      0.78      0.78      1484



##### KNN

In [85]:
modKNN = KNeighborsClassifier(n_neighbors = 10)
modKNN.fit(X_PctLo_1stGen_train_scaled,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')

In [86]:
print metrics.classification_report(y_test,modKNN.predict(X_PctLo_1stGen_test_scaled))

             precision    recall  f1-score   support

Above Average       0.79      0.75      0.77       744
Below Average       0.76      0.80      0.78       740

avg / total       0.77      0.77      0.77      1484

