In [None]:
# 1.6
# Import MNIST dataset
# Tune params on train usin CV
# Run on test
from sklearn.datasets import fetch_openml
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, cache=True, return_X_y=True)
X = X / 255.


In [None]:
# rescale the data, use the traditional train/test split

# (10K: Train) and (1K: Test)
XTrainMNIST, XTestMNIST = X[:10000], X[69000:]
yTrainMNIST, yTestMNIST = y[:10000], y[69000:]

In [None]:
def tuneAndTestMNIST(model, gridParams):
# Performing CV on the train and validation sets as well.
# Choose k fold cross-validation by combining both train and validation
# Choosing only elements in validation set to be in test using PredefinedSplit
  bestParamsSearch = RandomizedSearchCV(estimator = model, param_distributions = gridParams, cv = 5, scoring = "accuracy", n_jobs = -1, verbose = 1, n_iter = 10)  
  bestParamsSearch.fit(XTrainMNIST, yTrainMNIST)

  bestParamEstimate=bestParamsSearch.best_estimator_
  yPred=bestParamEstimate.predict(XTestMNIST)
  accuracy=accuracy_score(yTestMNIST, yPred)
  
  return bestParamEstimate, accuracy

In [None]:
decisionTreeParams = {'criterion' : ['gini', 'entropy'],
                      'splitter' : ['best'],
                      'min_samples_split' : [2, 3, 7, 7, 9],
                      'max_depth' : [10,30,50,60,80,100],
                      'max_features' : ['sqrt', 'log2','auto']
                      }

decisionTree = DecisionTreeClassifier()
paramSetting, accuracy = tuneAndTestMNIST(decisionTree, decisionTreeParams)

print("*********************************")
print("Dataset: MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
*********************************
Dataset: MNIST
Params: DecisionTreeClassifier(criterion='entropy', max_depth=30, max_features='sqrt')
Accuracy: 0.772
**********************************


In [None]:
baggingParams = { 'n_estimators' : [10,50,100],
                    'max_samples' : [5, 10, 25, 50],
                    'max_features' : [1, 2, 10, 0.5],
                    'bootstrap' : [True, False],
                    'bootstrap_features' : [True, False],                 
                  }

baggingEnsemble1 = BaggingClassifier(base_estimator = DecisionTreeClassifier(), random_state = 0)
paramSetting, accuracy = tuneAndTestMNIST(baggingEnsemble1, baggingParams)

print("*********************************")
print("Dataset with Decision Tree(unstable): MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
*********************************
Dataset with Decision Tree(unstable): MNIST
Params: BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=10, max_samples=25, n_estimators=100,
                  random_state=0)
Accuracy: 0.624
**********************************


In [None]:
randomForestParams = {
                'bootstrap': [True, False],    
                'max_features' : ['sqrt', 'log2'],
                'criterion': ["gini", "entropy"],
                'max_depth': [2, 10, 20],
                'min_samples_leaf': [2,5,10],
                'n_estimators': [100, 250, 500],
            }
       
randomForest = RandomForestClassifier(random_state = 0, n_jobs = -1)
paramSetting, accuracy = tuneAndTestMNIST(randomForest, randomForestParams)

print("*********************************")
print("Dataset: MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
*********************************
Dataset: MNIST
Params: RandomForestClassifier(bootstrap=False, max_depth=20, max_features='sqrt',
                       min_samples_leaf=2, n_estimators=500, n_jobs=-1,
                       random_state=0)
Accuracy: 0.963
**********************************


In [None]:
gradientBoostingParams = { 'loss' : ['deviance', 'exponential'],
                             'n_estimators' : [10, 25, 50],
                             'criterion' : ['mse', 'friedman_mse', 'squared_error'],
                             'min_samples_leaf' : [1, 2, 5, 10],
                             'max_depth' : [2, 5, 10],
                             'max_features' : ['sqrt', 'log2']
                            }
gradientBooster = GradientBoostingClassifier(random_state = 0)
paramSetting, accuracy = tuneAndTestMNIST(gradientBooster, gradientBoostingParams)

print("*********************************")
print("Dataset: MNIST")
print("Params:", paramSetting)
print("Accuracy:", accuracy)
#print("F1Score:", f1Score)
print("**********************************")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 310, in _check_params
    self.loss_ = loss_class(self.n_classes_)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb_losses.py", line 892, in __init__
    self.__class__.__name__, n_classes
ValueError: Exponen

*********************************
Dataset: MNIST
Params: GradientBoostingClassifier(criterion='mse', max_depth=10, max_features='log2',
                           n_estimators=50, random_state=0)
Accuracy: 0.944
**********************************


In [None]:
# Preparing the data
# Reading from csv, getting X and y
# train examples are double, clauses are 500 no matter what for c300
import pandas as pd

file_data = pd.read_csv('train_c300_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c500_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c1000_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c1500_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)

file_data = pd.read_csv('train_c1800_d1000.csv', header = None)
#print(file_data) 
print(file_data.shape)