In [6]:
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# read data
# NOTE(review): this assumes the files are header-less MNIST CSV exports with
# the digit label in the first column followed by the pixel values -- confirm
# against the dataset.  The previous version let pandas infer a header, which
# turned the first sample of each file into column names (that is why the
# label column was called "5" in the train file and "7" in the test file) and
# silently dropped that sample from the data.  Reading with header=None keeps
# every row and lets the label be selected by position instead of by an
# accidental column name.
LABEL_COL = 0    # positional column holding the class label
N_TRAIN = 5000   # number of leading training rows used for fitting

df_train = pd.read_csv("../datasets/mnist_train.csv", header=None)
df_test = pd.read_csv("../datasets/mnist_test.csv", header=None)

# Only the first N_TRAIN rows of the training file are used.
train_subset = df_train.iloc[:N_TRAIN]
X_train = train_subset.drop(columns=LABEL_COL).values
y_train = train_subset[LABEL_COL].values  # Series.values is already 1-D

X_test = df_test.drop(columns=LABEL_COL).values
y_test = df_test[LABEL_COL].values

# Disabled hyper-parameter search for the random forest (grid over
# n_estimators and min_samples_leaf, plotted as a filled contour).  It is kept
# as an inert string literal, so none of it is executed.
"""
param_grid = [
    {"n_estimators": range(2,200,20), "min_samples_leaf": range(2,40,4)}
]
forest_clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring="accuracy")
forest_clf.fit(X_train, y_train)
scores = np.array(forest_clf.cv_results_["mean_test_score"])
X1, X2 = np.meshgrid(np.arange(2,200,20), np.arange(2,40,4))
plt.figure()
plt.contourf(X1, X2, scores.reshape(X1.shape), cmap=plt.get_cmap("jet"), )
plt.xlabel("n_estimators")
plt.ylabel("min_samples_leaf")
plt.colorbar()
plt.savefig("../plots/ex_7_03.pdf")
"""

# Every model below is scored the same way: 3-fold cross-validated accuracy
# on the training subset.
cv_options = {"cv": 3, "scoring": "accuracy"}

# --- individual classifiers -------------------------------------------------
forest_clf = RandomForestClassifier(n_estimators=100)
extree_clf = ExtraTreesClassifier(n_estimators=100)
# The SVM is fed standardized features; probability=True makes predict_proba
# available, which the soft-voting ensemble below requests via voting="soft".
svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(probability=True)),
])

# --- soft-voting ensemble built from the three classifiers above -------------
voting_clf = VotingClassifier(
    estimators=[
        ("forest", forest_clf),
        ("extree", extree_clf),
        ("svm", svm_clf),
    ],
    voting="soft",
)

# Score in the same order as before: forest, extra-trees, SVM, ensemble.
forest_scores = cross_val_score(forest_clf, X_train, y_train, **cv_options)
extree_scores = cross_val_score(extree_clf, X_train, y_train, **cv_options)
svm_scores = cross_val_score(svm_clf, X_train, y_train, **cv_options)
voting_scores = cross_val_score(voting_clf, X_train, y_train, **cv_options)

# Report the mean accuracy of each model on one line.
all_scores = (forest_scores, extree_scores, svm_scores, voting_scores)
print(*(fold_scores.mean() for fold_scores in all_scores))

0.924403964919 0.93600240009 0.902403597546 0.939608651161
