In [None]:
# train_c_1500_d1000
# Generated 1000 train examples from a boolean formula with 1500 clauses.

# valid_c_1500_d1000
# Validation set for the corresponding train set.

# test_c_1500_1000
# Test set for the corresponding test set.

# Train the classifier on the train set.
# Tune the parameters using validation set.
# Combine train and validation set into train&valid and 
    # train the classifier using the parameter settings
# Test the classifer on test set and gather 1. Accuracy and 2. F1 Score

In [1]:
!git clone https://github.com/Harsha0723/ClassifierAccuracy.git

Cloning into 'ClassifierAccuracy'...
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
ls ClassifierAccuracy/Datasets

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import f1_score, accuracy_score

In [None]:
def prepData(data):
    X = data.drop(data.shape[1]-1, axis=1)
    y = data[data.shape[1]-1]
    return X,y

In [None]:
import pandas as pd
import numpy as np

# TODO: read test and validation, given the datafolder name.
def importData(datasetName):
  trainData = pd.read_csv('ML-Homeworks/HW3Data/train_{0}.csv'.format(datasetName), header = None) 
  train_X, train_y = prepData(trainData)
  validData = pd.read_csv('ML-Homeworks/HW3Data/valid_{0}.csv'.format(datasetName), header = None) 
  valid_X, valid_y = prepData(validData)
  testData = pd.read_csv('ML-Homeworks/HW3Data/test_{0}.csv'.format(datasetName), header = None) 
  test_X, test_y = prepData(testData)
  return train_X, train_y, valid_X, valid_y, test_X, test_y

In [None]:
def tuneAndTest(model, gridParams, datasetName):
# Performing CV on the train and validation sets as well.
# Choose k fold cross-validation by combining both train and validation
# Choosing only elements in validation set to be in test using PredefinedSplit
  train_X, train_y, valid_X, valid_y, test_X, test_y = importData(datasetName)

  X = pd.concat([train_X, valid_X])
  test_fold = [-1 for _ in range(train_X.shape[0])] + [0 for _ in range(valid_X.shape[0])]
  y = pd.concat([train_y, valid_y])

  split = PredefinedSplit(test_fold)

  #bestParamsSearch = RandomizedSearchCV(estimator = model, param_distributions = gridParams, cv = split, scoring = "accuracy", n_jobs = -1)  
  bestParamsSearch = GridSearchCV(estimator = model, param_grid = gridParams, cv = split, scoring = "accuracy", n_jobs = -1)  
  bestParamsSearch.fit(X, y)

  bestParamEstimate=bestParamsSearch.best_estimator_
  yPred=bestParamEstimate.predict(test_X)
  accuracy=accuracy_score(test_y, yPred)
  f1Score=f1_score(test_y, yPred)

  return bestParamEstimate, accuracy, f1Score

In [None]:
# 1.a
# DECISION TREE
# Note: Choosing equally weighted samples i.e sample_weight is not specified.
# Note: Choosing class_weight = balanced (adjust weights of y inversely proportional to frequency in sample). 
# 1. criterion: log-loss, gini or entropy. Default: gini. (impurtiy measurement for a division)
# 2. splitter: best, random. (splitting at each node).
# 3. min_samples_split: int or float(fraction = min_samples_split * #samples), default: 2.
    # minimum samples needed to split a node
# 4. max_depth: if None, then nodes are expanded until all leaves are pure 
    # until all leaves contain less than min_samples_split sample 
# 5. min_samples_leaves: int or float 
    # number of samples to be there on left and right half after split
# 6. max_features: int, float, srqt, log2, none
    # number of features to consider when looking for best split(choosing a node at position).
# 7. min_impurity_decrease: A node will be split, if it reduces impurity by >= value

# 1.a
def runDecisionTreeModel(datasetName):

  decisionTreeParams = {'criterion' : ['gini', 'entropy'],
                      'splitter' : ['best'],
                      'min_samples_split' : [2, 5, 10, 50, 0.3, 0.1, 100],
                      'max_depth' : [10, 15, 20, 40, 80, 160],
                      'min_samples_leaf' : [1, 10, 15, 20, 50],
                      'max_features' : ['sqrt', 'log2','auto']
                      }
  decisionTree = DecisionTreeClassifier(random_state = 0)
  paramSetting, accuracy, f1Score = tuneAndTest(decisionTree, decisionTreeParams, datasetName)

  print("*********************************")
  print("Dataset:", datasetName)
  print("Params:", paramSetting)
  print("Accuracy:", accuracy)
  print("F1Score:", f1Score)
  print("**********************************")




In [None]:
runDecisionTreeModel("c300_d100")
runDecisionTreeModel("c300_d1000")
runDecisionTreeModel("c300_d5000")

runDecisionTreeModel("c500_d100")
runDecisionTreeModel("c500_d1000")
runDecisionTreeModel("c500_d5000")

runDecisionTreeModel("c1000_d100")
runDecisionTreeModel("c1000_d1000")
runDecisionTreeModel("c1000_d5000")

runDecisionTreeModel("c1500_d100")
runDecisionTreeModel("c1500_d1000")
runDecisionTreeModel("c1500_d5000")

runDecisionTreeModel("c1800_d100")
runDecisionTreeModel("c1800_d1000")
runDecisionTreeModel("c1800_d5000")

*********************************
Dataset: c300_d100
Params: DecisionTreeClassifier(max_depth=10, max_features='log2', min_samples_leaf=15,
                       random_state=0)
Accuracy: 0.585
F1Score: 0.5990338164251208
**********************************
*********************************
Dataset: c300_d1000
Params: DecisionTreeClassifier(max_depth=10, max_features='sqrt', min_samples_split=100,
                       random_state=0)
Accuracy: 0.612
F1Score: 0.5987590486039296
**********************************
*********************************
Dataset: c300_d5000
Params: DecisionTreeClassifier(max_depth=10, max_features='sqrt', min_samples_split=50,
                       random_state=0)
Accuracy: 0.6559
F1Score: 0.6636692405434463
**********************************
*********************************
Dataset: c500_d100
Params: DecisionTreeClassifier(max_depth=10, max_features='log2', min_samples_split=0.1,
                       random_state=0)
Accuracy: 0.62
F1Score: 0.6082474226804

In [None]:
# 1.2
# BAGGING CLASSIFIER - Ensemble model

# 1. base_estimator: classifier that is used, default decision tree
# use stable - KNN, SVM use unstable - Decision tree, NN.

# 2. n_estimators {2,8,10,25,50,100}

# 3. max_samples: number of bootstrap samples N to draw each of size n(size of training set)
# {1, 0.5, 0.75, 1}

# 4. max_features int or float {0.1, 0.25, 0.5, 0.75 ,1.0}: only a fraction of features

# 5. bootstrapbool: {True, False} : samples drawn with replacement or not.

# 6. bootstrap_featuresbool, {True, False} : features are drawn with replacement.
# 7. n_jobs = -1 use all processors

from sklearn.svm import SVC

def runBaggingEnsemble(datasetName):
  baggingParams = { 'n_estimators' : [10,25,50,100],
                    'max_samples' : [5, 10, 0.5, 1.0],
                    'max_features' : [1, 2, 10, 0.5],
                    'bootstrap' : [True, False],
                    'bootstrap_features' : [True, False],                 
                  }
  baggingEnsemble1 = BaggingClassifier(base_estimator = DecisionTreeClassifier(), random_state = 0, n_jobs = 5)
  paramSetting, accuracy, f1Score = tuneAndTest(baggingEnsemble1, baggingParams, datasetName)

  print("*********************************")
  print("Dataset with DecisionTree(unstable):", datasetName)
  print("Params:", paramSetting)
  print("Accuracy:", accuracy)
  print("F1Score:", f1Score)
  print("**********************************")
  # default: DecisionTreeClassifier()
  #baggingEnsemble2 = BaggingClassifier(random_state = 0)
  #paramSetting, accuracy, f1Score = tuneAndTest(baggingEnsemble2, baggingParams, datasetName)
 
  #print("*********************************")
  #print("Dataset woth Decision Tree(unstable):", datasetName)
  #print("Params:", paramSetting)
  #print("Accuracy:", accuracy)
  #print("F1Score:", f1Score)
  #print("**********************************")


In [None]:
runBaggingEnsemble("c300_d100")
runBaggingEnsemble("c300_d1000")
runBaggingEnsemble("c300_d5000")

runBaggingEnsemble("c500_d100")
runBaggingEnsemble("c500_d1000")
runBaggingEnsemble("c500_d5000")

runBaggingEnsemble("c1000_d100")
runBaggingEnsemble("c1000_d1000")
runBaggingEnsemble("c1000_d5000")

runBaggingEnsemble("c1500_d100")
runBaggingEnsemble("c1500_d1000")
runBaggingEnsemble("c1500_d5000")

runBaggingEnsemble("c1800_d100")
runBaggingEnsemble("c1800_d1000")
runBaggingEnsemble("c1800_d5000")


*********************************
Dataset with DecisionTree(unstable): c300_d100
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=0.5, n_estimators=100,
                  n_jobs=5, random_state=0)
Accuracy: 0.805
F1Score: 0.8040201005025126
**********************************
*********************************
Dataset with DecisionTree(unstable): c300_d1000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5,
                  n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.873
F1Score: 0.8721047331319234
**********************************




*********************************
Dataset with DecisionTree(unstable): c300_d5000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=0.5, n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.9571
F1Score: 0.9576881349245489
**********************************
*********************************
Dataset with DecisionTree(unstable): c500_d100
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.5,
                  n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.865
F1Score: 0.8656716417910448
**********************************
*********************************
Dataset with DecisionTree(unstable): c500_d1000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  bootstrap_features=True, max_features=0.5, n_estimators=100,
                  n_jobs=5, random_state=0)
Accuracy: 0.932
F1Score: 0.9323383084577115
**********************************



*********************************
Dataset with DecisionTree(unstable): c500_d5000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=0.5, n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.9665
F1Score: 0.9666234930756201
**********************************
*********************************
Dataset with DecisionTree(unstable): c1000_d100
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  bootstrap_features=True, max_features=0.5, n_estimators=100,
                  n_jobs=5, random_state=0)
Accuracy: 0.925
F1Score: 0.9230769230769231
**********************************
*********************************
Dataset with DecisionTree(unstable): c1000_d1000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=0.5, n_estimators=100,
                  n_jobs=5, random_state=0)
Accuracy: 0.982
F1Score: 0.982017



*********************************
Dataset with DecisionTree(unstable): c1000_d5000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=10, n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.9962
F1Score: 0.9962045545345585
**********************************
*********************************
Dataset with DecisionTree(unstable): c1500_d100
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=2, max_samples=0.5,
                  n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.99
F1Score: 0.99
**********************************
*********************************
Dataset with DecisionTree(unstable): c1500_d1000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=10, n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 1.0
F1Score: 1.0
**********************************




*********************************
Dataset with DecisionTree(unstable): c1500_d5000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=10,
                  max_samples=0.5, n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 1.0
F1Score: 1.0
**********************************
*********************************
Dataset with DecisionTree(unstable): c1800_d100
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=10, n_estimators=50,
                  n_jobs=5, random_state=0)
Accuracy: 0.995
F1Score: 0.9949748743718593
**********************************
*********************************
Dataset with DecisionTree(unstable): c1800_d1000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=2, max_samples=0.5,
                  n_estimators=100, n_jobs=5, random_state=0)
Accuracy: 0.9985
F1Score: 0.9984992496248123
**********



*********************************
Dataset with DecisionTree(unstable): c1800_d5000
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=10, max_samples=0.5,
                  n_estimators=50, n_jobs=5, random_state=0)
Accuracy: 1.0
F1Score: 1.0
**********************************


In [None]:
# 1.3
# RANDOM FOREST 
# 1. n_estimatorsint : [100, 200, 500, 1000]
# 2. criterion : measuring impurity [“gini”, “entropy”, “log_loss”]
# 3. max_depth: [10, 20, 50, 100, 200]
# 4. min_samples_split : [2, 10, 0.25, 0.5, 0.75, 1]
# 5. min_samples_leaf : [1, 0.25, 0.5, 0.75, 1]
# 6. max_features{“sqrt”, “log2”, None}, 
# 7. bootstrap [True, False]
# 8. n_jobs = -1
def runRandomForestClassifier(datasetName):  
  randomForestParams = { 'n_estimators' : [10,50,100, 200],
                         'criterion' : ["gini", "entropy"],
                         'max_depth' : [2, 5, 10, 20, 50],
                         'min_samples_leaf' : [1, 2, 5, 10],
                         'max_features' : [0.1, 'sqrt', 'log2'],
                         'bootstrap' : [True, False]               
                        }
  randomForest = RandomForestClassifier(random_state = 0, n_jobs = 5)
  paramSetting, accuracy, f1Score = tuneAndTest(randomForest, randomForestParams, datasetName)

  print("*********************************")
  print("Dataset:", datasetName)
  print("Params:", paramSetting)
  print("Accuracy:", accuracy)
  print("F1Score:", f1Score)
  print("**********************************")

In [None]:
runRandomForestClassifier("c300_d100")
runRandomForestClassifier("c300_d1000")
runRandomForestClassifier("c300_d5000")

runRandomForestClassifier("c500_d100")
runRandomForestClassifier("c500_d1000")
runRandomForestClassifier("c500_d5000")

runRandomForestClassifier("c1000_d100")
runRandomForestClassifier("c1000_d1000")
runRandomForestClassifier("c1000_d5000")

runRandomForestClassifier("c1500_d100")
runRandomForestClassifier("c1500_d1000")
runRandomForestClassifier("c1500_d5000")

runRandomForestClassifier("c1800_d100")
runRandomForestClassifier("c1800_d1000")
runRandomForestClassifier("c1800_d5000")

*********************************
Dataset: c300_d100
Params: RandomForestClassifier(criterion='entropy', max_depth=10, max_features='sqrt',
                       min_samples_leaf=2, n_estimators=200, n_jobs=5,
                       random_state=0)
Accuracy: 0.81
F1Score: 0.8155339805825242
**********************************
*********************************
Dataset: c300_d1000
Params: RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10,
                       max_features='sqrt', min_samples_leaf=5,
                       n_estimators=200, n_jobs=5, random_state=0)
Accuracy: 0.8985
F1Score: 0.8994551758296188
**********************************
*********************************
Dataset: c300_d5000
Params: RandomForestClassifier(bootstrap=False, max_depth=10, max_features='sqrt',
                       min_samples_leaf=2, n_estimators=200, n_jobs=5,
                       random_state=0)
Accuracy: 0.9236
F1Score: 0.9252738654147106
********************************

In [None]:
# 1. loss : {‘log_loss’, ‘deviance’, ‘exponential’}, default=’log_loss’
# 2. n_estimators : [100, 500, 1000, 10000]
# large numbers give better results.
# 3. criterion{‘friedman_mse’, ‘squared_error’, ‘mse’}, default=’friedman_mse’
# 4. min_samples_split: samples required to split an internal node: [2, 0.2, 0.5, 1.0]
# 5. subsample :[0.8, 0.9,1.0] # if <1, increase bias and decrease variance - overfitting avoidance
# 6. min_samples_leaf: [0.2, 0.5, 1.0]
# 7. max_depth: maximum depth of the individual regression estimators. [1, 10, 20, 40, 80, 160, 250, 500] 
# 8. max_features: {‘auto’, ‘sqrt’, ‘log2’}
# Choosing max_features < n_features leads to a reduction of variance and an increase in bias.
# 9. tol [1e-5, 1e-3, 1e-1, 0.1, 1, 10]

# 1.4 
# GRADIENT BOOSTING
def runGradientBoostingClassifier(datasetName):  
  gradientBoostingParams = { 'loss' : ['deviance', 'exponential'],
                             'n_estimators' : [10, 50, 100],
                             'criterion' : ['friedman_mse', 'squared_error'],
                             'min_samples_leaf' : [1, 2, 5, 10],
                             'max_depth' : [1, 2, 5, 10],
                             'max_features' : ['sqrt', 'log2']
                            }
  gradientBooster = GradientBoostingClassifier(random_state = 0)
  paramSetting, accuracy, f1Score = tuneAndTest(gradientBooster, gradientBoostingParams, datasetName)

  print("*********************************")
  print("Dataset:", datasetName)
  print("Params:", paramSetting)
  print("Accuracy:", accuracy)
  print("F1Score:", f1Score)
  print("**********************************")
 # 'min_samples_split' : [0.2, 0.5],
                             #'subsample' : [0.5, 0.8],
                             

In [None]:
runGradientBoostingClassifier("c300_d100")
runGradientBoostingClassifier("c300_d1000")
runGradientBoostingClassifier("c300_d5000")

runGradientBoostingClassifier("c500_d100")
runGradientBoostingClassifier("c500_d1000")
runGradientBoostingClassifier("c500_d5000")

runGradientBoostingClassifier("c1000_d100")
runGradientBoostingClassifier("c1000_d1000")
runGradientBoostingClassifier("c1000_d5000")

runGradientBoostingClassifier("c1500_d100")
runGradientBoostingClassifier("c1500_d1000")
runGradientBoostingClassifier("c1500_d5000")

runGradientBoostingClassifier("c1800_d100")
runGradientBoostingClassifier("c1800_d1000")
runGradientBoostingClassifier("c1800_d5000")

*********************************
Dataset: c300_d100
Params: GradientBoostingClassifier(loss='exponential', max_depth=5, max_features='log2',
                           min_samples_leaf=5, random_state=0)
Accuracy: 0.79
F1Score: 0.7857142857142857
**********************************
*********************************
Dataset: c300_d1000
Params: GradientBoostingClassifier(max_depth=5, max_features='sqrt', random_state=0)
Accuracy: 0.909
F1Score: 0.9088176352705412
**********************************
*********************************
Dataset: c300_d5000
Params: GradientBoostingClassifier(loss='exponential', max_depth=10,
                           max_features='sqrt', min_samples_leaf=10,
                           random_state=0)
Accuracy: 0.9431
F1Score: 0.9440896138351184
**********************************
*********************************
Dataset: c500_d100
Params: GradientBoostingClassifier(loss='exponential', max_depth=10,
                           max_features='sqrt', min_samples_le

In [None]:
# 1.6
# Import MNIST dataset
# Tune params on train usin CV
# Run on test
from sklearn.datasets import fetch_openml
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, cache=True, return_X_y=True)
X = X / 255.


In [None]:
# rescale the data, use the traditional train/test split

# (10K: Train) and (1K: Test)
XTrainMNIST, XTestMNIST = X[:10000], X[69000:]
yTrainMNIST, yTestMNIST = y[:10000], y[69000:]

In [None]:
def tuneAndTestMNIST(model, gridParams):
# Performing CV on the train and validation sets as well.
# Choose k fold cross-validation by combining both train and validation
# Choosing only elements in validation set to be in test using PredefinedSplit
  bestParamsSearch = RandomizedSearchCV(estimator = model, param_distributions = gridParams, cv = 5, scoring = "accuracy", n_jobs = -1, verbose = 1, n_iter = 10)  
  bestParamsSearch.fit(XTrainMNIST, yTrainMNIST)

  bestParamEstimate=bestParamsSearch.best_estimator_
  yPred=bestParamEstimate.predict(XTestMNIST)
  accuracy=accuracy_score(yTestMNIST, yPred)
  
  return bestParamEstimate, accuracy

In [None]:
decisionTreeParams = {'criterion' : ['gini', 'entropy'],
                      'splitter' : ['best'],
                      'min_samples_split' : [2, 3, 7, 7, 9],
                      'max_depth' : [10,30,50,60,80,100],
                      'max_features' : ['sqrt', 'log2','auto']
                      }

decisionTree = DecisionTreeClassifier()
paramSetting, accuracy = tuneAndTestMNIST(decisionTree, decisionTreeParams)

print("*********************************")
print("Dataset: MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
*********************************
Dataset: MNIST
Params: DecisionTreeClassifier(criterion='entropy', max_depth=30, max_features='sqrt')
Accuracy: 0.772
**********************************


In [None]:
baggingParams = { 'n_estimators' : [10,50,100],
                    'max_samples' : [5, 10, 25, 50],
                    'max_features' : [1, 2, 10, 0.5],
                    'bootstrap' : [True, False],
                    'bootstrap_features' : [True, False],                 
                  }

baggingEnsemble1 = BaggingClassifier(base_estimator = DecisionTreeClassifier(), random_state = 0)
paramSetting, accuracy = tuneAndTestMNIST(baggingEnsemble1, baggingParams)

print("*********************************")
print("Dataset with Decision Tree(unstable): MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
*********************************
Dataset with Decision Tree(unstable): MNIST
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=10, max_samples=25, n_estimators=100,
                  random_state=0)
Accuracy: 0.624
**********************************


In [None]:
randomForestParams = {
                'bootstrap': [True, False],    
                'max_features' : ['sqrt', 'log2'],
                'criterion': ["gini", "entropy"],
                'max_depth': [2, 10, 20],
                'min_samples_leaf': [2,5,10],
                'n_estimators': [100, 250, 500],
            }
       
randomForest = RandomForestClassifier(random_state = 0, n_jobs = -1)
paramSetting, accuracy = tuneAndTestMNIST(randomForest, randomForestParams)

print("*********************************")
print("Dataset: MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
*********************************
Dataset: MNIST
Params: RandomForestClassifier(bootstrap=False, max_depth=20, max_features='sqrt',
                       min_samples_leaf=2, n_estimators=500, n_jobs=-1,
                       random_state=0)
Accuracy: 0.963
**********************************


In [None]:
gradientBoostingParams = { 'loss' : ['deviance', 'exponential'],
                             'n_estimators' : [10, 25, 50],
                             'criterion' : ['mse', 'friedman_mse', 'squared_error'],
                             'min_samples_leaf' : [1, 2, 5, 10],
                             'max_depth' : [2, 5, 10],
                             'max_features' : ['sqrt', 'log2']
                            }
gradientBooster = GradientBoostingClassifier(random_state = 0)
paramSetting, accuracy = tuneAndTestMNIST(gradientBooster, gradientBoostingParams)

print("*********************************")
print("Dataset: MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 310, in _check_params
    self.loss_ = loss_class(self.n_classes_)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb_losses.py", line 892, in __init__
    self.__class__.__name__, n_classes
ValueError: Exponen

*********************************
Dataset: MNIST
Params: GradientBoostingClassifier(criterion='mse', max_depth=10, max_features='log2',
                           n_estimators=50, random_state=0)
Accuracy: 0.944
**********************************


In [None]:
# Preparing the data
# Reading from csv, getting X and y
# train examples are double, clauses are 500 no matter what for c300
import pandas as pd

file_data = pd.read_csv('train_c300_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c500_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c1000_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c1500_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c1800_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)