In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.neural_network import MLPClassifier

from copy import deepcopy


%matplotlib inline

In [2]:
train_df = pd.read_csv('data/train.csv', index_col='Id')
train_df.columns

Index([u'Elevation', u'Aspect', u'Slope', u'Horizontal_Distance_To_Hydrology',
       u'Vertical_Distance_To_Hydrology', u'Horizontal_Distance_To_Roadways',
       u'Hillshade_9am', u'Hillshade_Noon', u'Hillshade_3pm',
       u'Horizontal_Distance_To_Fire_Points', u'Wilderness_Area1',
       u'Wilderness_Area2', u'Wilderness_Area3', u'Wilderness_Area4',
       u'Soil_Type1', u'Soil_Type2', u'Soil_Type3', u'Soil_Type4',
       u'Soil_Type5', u'Soil_Type6', u'Soil_Type7', u'Soil_Type8',
       u'Soil_Type9', u'Soil_Type10', u'Soil_Type11', u'Soil_Type12',
       u'Soil_Type13', u'Soil_Type14', u'Soil_Type15', u'Soil_Type16',
       u'Soil_Type17', u'Soil_Type18', u'Soil_Type19', u'Soil_Type20',
       u'Soil_Type21', u'Soil_Type22', u'Soil_Type23', u'Soil_Type24',
       u'Soil_Type25', u'Soil_Type26', u'Soil_Type27', u'Soil_Type28',
       u'Soil_Type29', u'Soil_Type30', u'Soil_Type31', u'Soil_Type32',
       u'Soil_Type33', u'Soil_Type34', u'Soil_Type35', u'Soil_Type36',
       u'Soil_

In [3]:
CATEGORICAL_VARIABLES = [u'Elevation', u'Aspect', u'Slope', u'Horizontal_Distance_To_Hydrology',
       u'Vertical_Distance_To_Hydrology', u'Horizontal_Distance_To_Roadways',
       u'Hillshade_9am', u'Hillshade_Noon', u'Hillshade_3pm',
       u'Horizontal_Distance_To_Fire_Points']
CONTINUOUS_VARIABLES = [u'Wilderness_Area1',
       u'Wilderness_Area2', u'Wilderness_Area3', u'Wilderness_Area4',
       u'Soil_Type1', u'Soil_Type2', u'Soil_Type3', u'Soil_Type4',
       u'Soil_Type5', u'Soil_Type6', u'Soil_Type7', u'Soil_Type8',
       u'Soil_Type9', u'Soil_Type10', u'Soil_Type11', u'Soil_Type12',
       u'Soil_Type13', u'Soil_Type14', u'Soil_Type15', u'Soil_Type16',
       u'Soil_Type17', u'Soil_Type18', u'Soil_Type19', u'Soil_Type20',
       u'Soil_Type21', u'Soil_Type22', u'Soil_Type23', u'Soil_Type24',
       u'Soil_Type25', u'Soil_Type26', u'Soil_Type27', u'Soil_Type28',
       u'Soil_Type29', u'Soil_Type30', u'Soil_Type31', u'Soil_Type32',
       u'Soil_Type33', u'Soil_Type34', u'Soil_Type35', u'Soil_Type36',
       u'Soil_Type37', u'Soil_Type38', u'Soil_Type39', u'Soil_Type40',]

# Neural Networks

In [39]:
X_standardized = deepcopy(train_df)
X_standardized[CATEGORICAL_VARIABLES] = StandardScaler().fit_transform(X_standardized[CATEGORICAL_VARIABLES])

y = X_standardized.Cover_Type
X = X_standardized.drop("Cover_Type", 1)

solvers = ['lbfgs', 'sgd', 'adam']
alphas = [0.0001, 0.001, 0.01, 0.1]
learning_rates = ['constant', 'invscaling', 'adaptive']
hidden_layer_sizes = [(len(X.columns),)]
param_grid = {'solver': solvers, 'alpha': alphas, 'learning_rate': learning_rates, 'hidden_layer_sizes': hidden_layer_sizes}

mpl_grid_search = RandomizedSearchCV(MLPClassifier(), param_grid, cv=5, verbose=3, n_jobs=2,  n_iter=10)
mpl_grid_search.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,) 
[CV] alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,) 
[CV]  alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,), score=0.677579, total=  24.3s
[CV] alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,) 
[CV]  alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,), score=0.687831, total=  24.5s
[CV] alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,) 
[CV]  alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,), score=0.740741, total=  23.5s
[CV] alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,) 
[CV]  alpha=0.0001, learning_rate=invscaling, solver=adam, hidden_layer_sizes=(54,), score=0.720569, total=  24.0s
[CV] alpha=0.01, learning_rate=invscaling, solver=lbfgs, hi

[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  3.8min


[CV]  alpha=0.0001, learning_rate=adaptive, solver=sgd, hidden_layer_sizes=(54,), score=0.690476, total=  22.7s
[CV] alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,) 
[CV]  alpha=0.0001, learning_rate=adaptive, solver=sgd, hidden_layer_sizes=(54,), score=0.729167, total=  20.9s
[CV] alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,) 
[CV]  alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,), score=0.613426, total=  22.8s
[CV] alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,) 
[CV]  alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,), score=0.617063, total=  23.3s
[CV] alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,) 
[CV]  alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,), score=0.691468, total=  22.4s
[CV] alpha=0.001, learning_rate=constant, solver=sgd, hidden_layer_sizes=(54,) 
[CV]  alpha=0.001, learning_rate=constant, 

[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:  7.1min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'alpha': [0.0001, 0.001, 0.01, 0.1], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'hidden_layer_sizes': [(54,)], 'solver': ['lbfgs', 'sgd', 'adam']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=3)

In [None]:
"""
Best params:
{'alpha': 0.001,
 'hidden_layer_sizes': (54,),
 'learning_rate': 'invscaling',
 'solver': 'lbfgs'}
Best score:
0.73412698412698407
"""

# Random Forest

In [47]:
y = train_df.Cover_Type
X = train_df.drop("Cover_Type", 1)

n_estimators = [10,20,50,100,500]
criterions = ['gini', 'entropy']
max_features = ['sqrt', 'log2']
min_samples_leaf = [1, 20, 50]

param_grid = {'criterion': criterions, 'n_estimators': n_estimators, 'max_features': max_features,
              'min_samples_leaf': min_samples_leaf}

rf_grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, verbose=3, n_jobs=2)
rf_grid_search.fit(X, y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV] max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.698743, total=   0.2s
[CV] max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.707672, total=   0.2s
[CV] max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.748677, total=   0.2s
[CV] max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.756614, total=   0.2s
[CV] max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=10, cr

[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   30.9s


[CV]  max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20, score=0.639881, total=   0.3s
[CV] max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20 
[CV]  max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20, score=0.672288, total=   0.3s
[CV] max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20 
[CV]  max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20, score=0.702381, total=   0.3s
[CV] max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20 
[CV]  max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20, score=0.729167, total=   0.3s
[CV] max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20 
[CV]  max_features=sqrt, n_estimators=20, criterion=gini, min_samples_leaf=20, score=0.771495, total=   0.3s
[CV] max_features=sqrt, n_estimators=50, criterion=gini, min_samples_leaf=20 
[CV]  max_features=sqrt, n_estimators=50, criterion=gini, min_sam

[CV]  max_features=sqrt, n_estimators=500, criterion=gini, min_samples_leaf=50, score=0.679894, total=   5.2s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.707672, total=   0.2s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.708995, total=   0.2s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.719907, total=   0.2s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1, score=0.759590, total=   0.2s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=1 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_lea

[CV]  max_features=log2, n_estimators=100, criterion=gini, min_samples_leaf=20, score=0.687500, total=   0.9s
[CV] max_features=log2, n_estimators=100, criterion=gini, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=100, criterion=gini, min_samples_leaf=20, score=0.744048, total=   1.0s
[CV] max_features=log2, n_estimators=500, criterion=gini, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=100, criterion=gini, min_samples_leaf=20, score=0.770833, total=   1.0s
[CV] max_features=log2, n_estimators=500, criterion=gini, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=500, criterion=gini, min_samples_leaf=20, score=0.632606, total=   4.9s
[CV] max_features=log2, n_estimators=500, criterion=gini, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=500, criterion=gini, min_samples_leaf=20, score=0.671958, total=   4.9s
[CV] max_features=log2, n_estimators=500, criterion=gini, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=500, criterion=gi

[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  2.1min


[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50, score=0.622685, total=   0.1s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50, score=0.646495, total=   0.1s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50, score=0.675265, total=   0.1s
[CV] max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=10, criterion=gini, min_samples_leaf=50, score=0.716931, total=   0.1s
[CV] max_features=log2, n_estimators=20, criterion=gini, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=20, criterion=gini, min_samples_leaf=50, score=0.636243, total=   0.2s
[CV] max_features=log2, n_estimators=20, criterion=gini, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=20, criterion=gini, min_sam

[CV] max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=100, criterion=entropy, min_samples_leaf=1, score=0.846230, total=   2.8s
[CV] max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1, score=0.750331, total=  14.3s
[CV] max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1, score=0.743717, total=  14.4s
[CV] max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1, score=0.767196, total=  14.5s
[CV] max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1 
[CV]  max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=1, score=0.807209, total=  14.4s
[CV] max_features=sqrt, n_estimator

[CV]  max_features=sqrt, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.677579, total=   0.7s
[CV] max_features=sqrt, n_estimators=50, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=sqrt, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.728836, total=   0.7s
[CV] max_features=sqrt, n_estimators=50, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=sqrt, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.734788, total=   0.7s
[CV] max_features=sqrt, n_estimators=100, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=sqrt, n_estimators=100, criterion=entropy, min_samples_leaf=50, score=0.620040, total=   1.4s
[CV] max_features=sqrt, n_estimators=100, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=sqrt, n_estimators=500, criterion=entropy, min_samples_leaf=20, score=0.774802, total=   8.7s
[CV] max_features=sqrt, n_estimators=100, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=sqrt, n_est

[CV] max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20, score=0.637566, total=   0.3s
[CV] max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20, score=0.657738, total=   0.3s
[CV] max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20, score=0.671296, total=   0.3s
[CV] max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20, score=0.716601, total=   0.3s
[CV] max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20 
[CV]  max_features=log2, n_estimators=20, criterion=entropy, min_samples_leaf=20, score=0.751984, total=   0.4s
[CV] max_features=log2, n_estimator

[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  5.6min


[CV]  max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.598545, total=   0.5s
[CV] max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.644180, total=   0.5s
[CV] max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.654762, total=   0.5s
[CV] max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.706019, total=   0.5s
[CV] max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=log2, n_estimators=50, criterion=entropy, min_samples_leaf=50, score=0.725198, total=   0.5s
[CV] max_features=log2, n_estimators=100, criterion=entropy, min_samples_leaf=50 
[CV]  max_features=log2, n_estimat

[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed:  6.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'n_estimators': [10, 20, 50, 100, 500], 'max_features': ['sqrt', 'log2'], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 20, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [48]:
print rf_grid_search.best_params_
print rf_grid_search.best_score_

{'max_features': 'sqrt', 'n_estimators': 500, 'criterion': 'gini', 'min_samples_leaf': 1}
0.78373015873


In [49]:
"""
Best params:
{'max_features': 'sqrt', 'n_estimators': 500, 'criterion': 'gini', 'min_samples_leaf': 1}
Best score:
0.78373015873
"""

"\nBest params:\n{'max_features': 'sqrt', 'n_estimators': 500, 'criterion': 'gini', 'min_samples_leaf': 1}\nBest score:\n0.78373015873\n"

In [50]:
y = train_df.Cover_Type
X = train_df.drop("Cover_Type", 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestClassifier(criterion='gini', n_estimators=500, max_features='sqrt', min_samples_leaf=1)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)

print sklearn.metrics.accuracy_score(y_test, preds)

0.863756613757
