In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os


# Seed NumPy's global random generator so anything drawing from np.random is
# repeatable across runs. The scikit-learn calls further down pass their own
# random_state=42 explicitly rather than relying on this seed.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 

In [2]:
# Read both CSV files in one pass: train.csv is used to fit the model,
# test.csv holds the rows that get predicted for the Kaggle submission.
data, test = map(pd.read_csv, ('digit-recognizer/train.csv',
                               'digit-recognizer/test.csv'))

In [3]:
# Separate the training frame by position: column 0 becomes the target (y),
# every remaining column becomes a feature (x).
x, y = data.iloc[:, 1:], data.iloc[:, 0]

In [4]:
# Hold out 20% of the rows (test_size=0.2); the other 80% are used for fitting.
# random_state pins the shuffle so the same rows land in each part on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [5]:
# Configure the forest; nothing is trained until .fit() is called.
randforest = RandomForestClassifier(
    n_estimators=1000,     # number of trees in the forest
    criterion="entropy",   # split quality measured with entropy instead of gini
    max_depth=6,           # no tree may grow deeper than 6 levels
    min_samples_split=4,   # a node needs at least 4 samples before it may split
    random_state=42,       # fixed seed so the fitted forest is reproducible
    n_jobs=-1,             # use all processors for fitting and predicting
)

In [6]:
# Train the forest on the training part of the split. fit() returns the
# estimator itself, which is why the cell echoes its parameters below.
randforest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [7]:
# x_test / y_test are the 20% of rows held out by train_test_split. Score the
# fitted forest on them (mean accuracy) so the submission comes with a measured
# estimate of how well the model generalises.
print("hold-out accuracy: {:.4f}".format(randforest.score(x_test, y_test)))

# Predict a label for every row of the unlabelled Kaggle test set.
pred = randforest.predict(test)

In [8]:
# Wrap the predictions in a Series named "Label" -- this becomes the label
# column of the csv file submitted to kaggle.
output = pd.Series(data=pred, name="Label")

In [9]:
# Pair each prediction with a 1-based ImageId. The id range is derived from the
# number of predictions instead of a hard-coded 28000, so the two columns always
# have the same length and concat cannot introduce NaN rows through misalignment.
output2 = pd.concat(
    [pd.Series(range(1, len(output) + 1), name="ImageId"), output],
    axis=1,
)

In [10]:
# to_csv does not create missing directories, so make sure results/ exists
# before writing (isdir + makedirs works on both python 2 and python 3).
if not os.path.isdir("results"):
    os.makedirs("results")

# index=False keeps the DataFrame index out of the file; only the ImageId and
# Label columns are written.
output2.to_csv("results/results.csv", index=False)