In [101]:
import pandas as pd
import time
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

def scaleContinuous(X):
    cont_df = X[(X.isin([0, 1])).all().
                index[(X.isin([0, 1])).all() == False]]
    df = pd.DataFrame(preprocessing.scale(cont_df), columns = cont_df.columns)
    
    bin_cols = (X.isin([0, 1])).all().index[(X.isin([0, 1])).all() == True]
    df = pd.concat([df, pd.DataFrame(columns = bin_cols)])
    df[bin_cols] = X[(X.isin([0, 1])).all().
                     index[(X.isin([0, 1])).all() == True]].values
    return df

data = pd.read_csv(r'Rebound_Features.csv')

X = data.drop('REBOUNDER', axis=1)
y = data['REBOUNDER'].astype(int)

t0 = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train2 = scaleContinuous(X_train)
X_test2 = scaleContinuous(X_test)

classifiers = [
    LogisticRegression(),
    SVC(kernel="rbf", C=1, gamma = .001),
    MLPClassifier(alpha=1),
    KNN(n_neighbors=5),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                    max_depth=1, random_state=0)
]

for model in classifiers:
    model.fit(X_train2, y_train)
    y_pred = model.predict(X_test2)
    model.fit(X_train, y_train)
    y_pred2 = model.predict(X_test)
    scores = cross_val_score(model, X_train2, y_train, scoring='recall', cv=10)
    print(model)
    print('CV Precision: {}'.format(scores.mean()))
    print(metrics.confusion_matrix(y_test, y_pred))
    print('Scaled Accuracy: {}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('Unscaled Accuracy: {}'.format(metrics.accuracy_score(y_test, y_pred2)))
    print('\n')

t1 = time.time()
print(t1-t0)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
CV Precision: 0.4474456707247696
[[3349  127]
 [ 439  328]]
Scaled Accuracy: 0.8666038180532643
Unscaled Accuracy: 0.8670751826537827


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
CV Precision: 0.06618784844692228
[[3456   20]
 [ 699   68]]
Scaled Accuracy: 0.8305444261135989
Unscaled Accuracy: 0.8159321234975253


MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, mo