# Project

In [None]:
from __future__ import division 
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
np.random.seed(0)
%matplotlib inline

In [None]:
X = np.genfromtxt('data/X_train.txt', delimiter=',') 
Y = np.genfromtxt('data/Y_train.txt', delimiter=',') 
X,Y = ml.shuffleData(X,Y)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y,0.75)

# K-Nearest Neighbors

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import  metrics
K=[1,2,5,10,50,100,200]
for i in K:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(Xtr,Ytr)
    y_pred = knn.predict(Xva)
    print("K=",i," AUC=",metrics.roc_auc_score(Yva, y_pred))

K= 1  AUC= 0.5205701512468429
K= 2  AUC= 0.5132214706650797
K= 5  AUC= 0.5329607803292014
K= 10  AUC= 0.5440990507155921
K= 50  AUC= 0.5545789183383169
K= 100  AUC= 0.550181438151363
K= 200  AUC= 0.5480053415391761


# Linear models

In [5]:
from sklearn.linear_model import LinearRegression
degrees = np.array([1,3,5,7,10,15,18])
nFolds = 5

def function(nFolds,degree):
    J= np.zeros(nFolds)
    for iFold in range(nFolds):
        Xti,Xvi,Yti,Yvi = ml.crossValidate(Xtr,Ytr,nFolds,iFold)
        XtiP = ml.transforms.fpoly(Xti, degree, bias=False)
        XtiP,params = ml.transforms.rescale(XtiP)
        learner = ml.linear.linearRegress(XtiP,Yti)
        XviP,_ = ml.transforms.rescale(ml.transforms.fpoly(Xvi,degree,False), params)
        XtiP,_ = ml.transforms.rescale( ml.transforms.fpoly(Xti,degree,False), params)
        J[iFold] = learner.mse(XviP,Yvi)
    return np.mean(J)

for degree in degrees:
    r=function(nFolds,degree)
    print("degree=",degree)
    print(" trainning error=",r)

degree= 1
 trainning error= 0.2501490609874931
degree= 3
 trainning error= 0.2501490609874931
degree= 5
 trainning error= 0.2501490609874931
degree= 7
 trainning error= 0.2501490609874931
degree= 10
 trainning error= 0.2501490609874931
degree= 15
 trainning error= 0.2501490609874931
degree= 18
 trainning error= 0.2501490609874931


# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
for i in range(1000,1003):
    ada = AdaBoostClassifier(n_estimators=i,learning_rate=0.8)
    Xi, Yi = ml.bootstrapData(Xtr, Ytr)
    ada.fit(Xi,Yi)
    y_pred = ada.predict(Xva)
    print(i,accuracy_score(Ytr, ada.predict(Xtr)),accuracy_score(Yva, y_pred))

In [7]:
import pandas
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
seed = 7
kfold = model_selection.KFold(n_splits=5, random_state=seed)
model = AdaBoostClassifier(n_estimators=2000, random_state=seed)
Xi, Yi = ml.bootstrapData(Xtr, Ytr )
results = model_selection.cross_val_score(model, Xi, Yi, cv=kfold)
print(results.mean())

0.7531845772420278


# Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
def gbc(depth):
    n_bags = 25
    bags = []
    for l in range(n_bags):# Gradient boostingags):
        Xi, Yi = ml.bootstrapData(Xtr, Ytr)
        tree = GradientBoostingClassifier(max_depth=depth,n_estimators=100,loss='deviance')
        tree.fit(Xi,Yi)
        bags.append(tree)
    Yvahat=np.zeros((Xva.shape[0],n_bags))
    for i in range(n_bags):
        Yvahat[:,i]=bags[i].predict(Xva)
    predYte=np.mean(Yvahat, axis=1)
    print(depth,metrics.roc_auc_score(Yva, predYte))
gbc(5)

In [147]:
from sklearn import  metrics 
from sklearn.ensemble import GradientBoostingClassifier
np.random.seed(0) 
n_bags = 25
bags = []
for l in range(n_bags):
    Xi, Yi = ml.bootstrapData(X, Y)
    tree = GradientBoostingClassifier(max_depth=5)
    tree.fit(Xi,Yi)
    bags.append(tree)

In [148]:
Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
Ytehat=np.zeros((Xte.shape[0],n_bags))
for i in range(n_bags):
    Ytehat[:,i]=bags[i].predict(Xte)
predYte=np.mean(Ytehat, axis=1)
Yte = np.vstack((np.arange(Xte.shape[0]), predYte)).T 
np.savetxt('problem3_3.txt',Yte,'%d, %.2f', header='Id,Predicted',comments='',delimiter=',')

# Random forests

In [142]:
from sklearn import  metrics 
np.random.seed(0) 
n_bags = 25
bags = []
for l in range(n_bags):
    Xi, Yi = ml.bootstrapData(Xtr, Ytr)
    tree = ml.dtree.treeClassify(Xi, Yi, minLeaf=8,maxDepth=50,nFeatures =50,minParent=1024)
    bags.append(tree)
Yvahat=np.zeros((Xva.shape[0],n_bags))
for i in range(n_bags):
    Yvahat[:,i]=bags[i].predict(Xva)
predYte=np.mean(Yvahat, axis=1)
print(metrics.roc_auc_score(Yva, predYte))

0.7300563073290346
