In [13]:
from sklearn.neighbors import KNeighborsRegressor


def knnpredictSoft(Xtr, Ytr, Xte, n_neighbors):

    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(Xtr, Ytr)
    
    return knn.predict(Xte)


In [3]:
import numpy as np
import mltools as ml
from sklearn.ensemble import RandomForestRegressor

def dtpredictSoft(Xtr, Ytr, Xte, maxDepth, minLeaf, nFeatures, nTrees):

    # Set up storage for trees
    trees = [None] * nTrees

    # Make trees
    for i in range(nTrees):
        M = Xtr.shape[0]
        Xi, Yi = ml.bootstrapData(Xtr, Ytr, M)
        trees[i] = RandomForestRegressor(n_estimators=nFeatures, max_depth=maxDepth, min_samples_leaf= minLeaf)
        trees[i] = trees[i].fit(Xi, Yi)

    predictXte = np.zeros((Xte.shape[0], nTrees))

    for i in range(nTrees):
        predictXte[:, i] = trees[i].predict(Xte)

    return np.mean(predictXte, axis=1)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


def lrpredictSoft(Xtr, Ytr, Xte, deg):

    model = make_pipeline(PolynomialFeatures(deg), LinearRegression())
    model.fit(Xtr, Ytr)

    return model.predict(Xte)


In [5]:
from sklearn import svm


def svmpredictSoft(Xtr, Ytr, Xte):
    model = svm.SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
    model.fit(Xtr, Ytr)
    return model.predict(Xte)

In [6]:
from sklearn import ensemble

def gbpredictSoft(Xtr, Ytr, Xte, n_estimators, max_depth, min_samples_split, learning_rate):
    params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split,
              'learning_rate': learning_rate}
    model = ensemble.GradientBoostingRegressor(**params)
    model.fit(Xtr, Ytr)
    return model.predict(Xte)

In [15]:
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
import time

from sklearn.metrics import mean_squared_error


# General Settings
generateKaggleFile = 0

if (generateKaggleFile):
    sampleSize = 100000  # Don't change this
    split = 1  # Don't change this
else:
    sampleSize = 10000
    split = 0.75

# Using
usingDtree = 0
usingKnn = 0
usingLinear = 0
usingGradientBoosting = 0
usingSVM = 1

# Dtree Settings
maxDepth = 19
minLeaf = 64
nFeatures = 12
nTrees = 150

# Knn Settings
n_neighbors = 39

# Linear Settings
deg = 1

# Gradient Boosting Settings
n_estimators = 3000
max_depth = 500
min_samples_split = 100
learning_rate = 1


# Read in and set up data
X = np.genfromtxt('X_train.txt', delimiter=None, max_rows=sampleSize)
Y = np.genfromtxt('Y_train.txt', delimiter=None, max_rows=sampleSize)
Xte = np.genfromtxt('X_test.txt', delimiter=None, max_rows=sampleSize)

# Split data for training and validation
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, split)


# Get average soft prediction from all learners
def predictSoft(Xte):
    Yhat = [None] * len(Xte)

    if (usingDtree):
        Yhat = np.column_stack((Yhat, dtpredictSoft(Xtr, Ytr, Xte, maxDepth, minLeaf, nFeatures, nTrees)))

    if (usingKnn):
        Yhat = np.column_stack((Yhat, knnpredictSoft(Xtr, Ytr, Xte, n_neighbors)))

    if (usingLinear):
        Yhat = np.column_stack((Yhat, lrpredictSoft(Xtr, Ytr, Xte, deg)))

    if (usingGradientBoosting):
        Yhat = np.column_stack((Yhat, gbpredictSoft(Xtr, Ytr, Xte, n_estimators, max_depth, min_samples_split, learning_rate)))

    if (usingSVM):
        Yhat = np.column_stack((Yhat, svmpredictSoft(Xtr, Ytr, Xte)))

    return np.mean(Yhat[:, 1:], axis=1)


def mse(Yhat, Y):
    mean = mean_squared_error(Yhat, Y)
    print('MSE: %.4f' % mean)
    return mean

def auc(soft, Y):
    """Manual AUC function for applying to soft prediction vectors"""
    indices = np.argsort(soft)  
    Y = Y[indices]
    sorted_soft = soft[indices]

    dif = np.hstack(([True], np.diff(sorted_soft) != 0, [True]))
    r1 = np.argwhere(dif).flatten()
    r2 = r1[0:-1] + 0.5 * (r1[1:] - r1[0:-1]) + 0.5
    rnk = r2[np.cumsum(dif[:-1]) - 1]

    n0, n1 = sum(Y == 0), sum(Y == 1)

    result = (np.sum(rnk[Y == 1]) - n1 * (n1 + 1.0) / 2.0) / n1 / n0
    print('AUC: %.4f' % result)
    return result

def toKaggle():
    Yhat = predictSoft(Xte)
    np.savetxt('Yhat_knn.txt', np.vstack((np.arange(len(Yhat)), Yhat)).T, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')


start = time.time()

if generateKaggleFile:
    toKaggle()
else:
    #global nFeatures
    #aucs = [0] * 10
    #for i in range(5, 16):
    #    print i
    #    nFeatures = i
    #    aucs[i-5] = auc(predictSoft(Xva), Yva)

    #plt.plot(aucs)
    #plt.show()

    Yhat = predictSoft(Xtr)
    mse(Yhat, Ytr)
    auc(Yhat, Ytr)

end = time.time()

print('Seconds elapsed: %.4f' % (end - start))

MSE: 0.0174
AUC: 0.9987
Seconds elapsed: 3.2917
