## On EDA, it was demonstrated that the variables are strong predictors, but only Tree-like models where succesful in separating boundaries. Also, there is a strong tendency on overfitting. So, I will try to tune decision trees, random forests or boosting methods

In [1]:
DATA_DIR = '/kaggle/input/learn-together'
DATA_DIR = 'data'

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline



In [3]:
train_df=pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df=pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [4]:
train_df = train_df.drop(["Id"], axis = 1)
test_ids = test_df["Id"]
test_df = test_df.drop(["Id"], axis = 1)

In [5]:
tree = DecisionTreeClassifier(max_depth=10,
    min_samples_split=3,
    min_samples_leaf=2)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(['Cover_Type'], axis=1), 
    train_df['Cover_Type'], test_size=0.2)

In [7]:
tree.fit(X_train, y_train)
print(tree.score(X_train, y_train))
print(tree.score(X_val, y_val))


0.8220899470899471
0.7609126984126984


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time
import numpy as np


def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
param_dist = {"max_depth": [5, 10, 15, 25, 30],
              "min_samples_split": [2, 4, 5, 10, 20],
              "min_samples_leaf": [1, 2, 8, 12],
              'n_estimators': [20, 50, 100, 200, 600, 800]
             }
n_iter_search = 50
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, n_jobs=-1)

start = time()
X = train_df.drop(['Cover_Type'], axis=1)
y = train_df['Cover_Type']
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)


RandomizedSearchCV took 183.04 seconds for 50 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.785 (std: 0.039)
Parameters: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1}

Model with rank: 2
Mean validation score: 0.784 (std: 0.040)
Parameters: {'n_estimators': 200, 'max_depth': 25, 'min_samples_split': 2, 'min_samples_leaf': 1}

Model with rank: 3
Mean validation score: 0.779 (std: 0.040)
Parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1}



In [9]:
random_search.cv_results_

{'mean_fit_time': array([ 1.18533459, 33.58849812, 10.88979273,  5.26937127,  2.6921257 ,
         1.50752525,  6.56830783,  1.30274925,  1.94394288, 11.5065176 ,
         1.85571275,  1.60289407, 12.71372609,  0.19909635,  7.43920059,
         5.58980207,  2.14702444,  5.21998568,  0.31689425, 13.34811411,
         2.2984489 ,  3.68835821,  0.3142498 ,  5.79304585,  1.79579253,
         0.79636264,  1.02063723,  8.773948  ,  0.7035584 ,  0.85085769,
         0.86640158,  0.90798936,  1.31530461, 11.48803506,  1.41058469,
         0.45129771,  4.37738585,  0.45260181,  0.75193539,  3.0165204 ,
         1.69929485,  3.40667858,  0.86844754,  3.64628572,  3.51938667,
        14.08520508,  3.42876348,  3.1802618 ,  1.88650255,  1.50515556]),
 'mean_score_time': array([0.01157031, 0.410429  , 1.11904416, 0.19562593, 0.09109006,
        0.0500042 , 0.31445675, 0.04397974, 0.08248296, 0.35467329,
        0.07533908, 0.05282669, 0.38071566, 0.01024451, 0.30679455,
        0.2282784 , 0.065685

In [11]:
clf = RandomForestClassifier(
    **{'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1}
)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_val, y_val))

1.0
0.8756613756613757


In [18]:
clf2 = RandomForestClassifier( 
    **{'min_samples_leaf': 1, 'n_estimators': 200,
       'max_depth': 15, 'min_samples_split': 5}
)
clf2.fit(X_train, y_train)
print(clf2.score(X_train, y_train))
print(clf2.score(X_val, y_val))

0.9446097883597884
0.8369708994708994


In [19]:
clf3 = RandomForestClassifier( 
    **{'min_samples_leaf': 1, 'n_estimators': 50,
       'max_depth': 20, 'min_samples_split': 4}
)
clf3.fit(X_train, y_train)
print(clf3.score(X_train, y_train))
print(clf3.score(X_val, y_val))

0.986276455026455
0.8521825396825397


In [12]:
sorted([ (X_train.columns[i], importance) for i, importance in enumerate(clf.feature_importances_)],
       key=lambda x: x[1], reverse=True)[:15]

[('Elevation', 0.2208982858180783),
 ('Horizontal_Distance_To_Roadways', 0.09123035651336923),
 ('Horizontal_Distance_To_Fire_Points', 0.07261279603037486),
 ('Horizontal_Distance_To_Hydrology', 0.06227608596753147),
 ('Vertical_Distance_To_Hydrology', 0.0549933731184667),
 ('Hillshade_9am', 0.05332026663814828),
 ('Aspect', 0.05000827528091991),
 ('Hillshade_3pm', 0.046886696828998195),
 ('Hillshade_Noon', 0.04594084169198874),
 ('Wilderness_Area4', 0.04436854477936237),
 ('Slope', 0.03721242617018737),
 ('Soil_Type10', 0.022708289018002474),
 ('Soil_Type38', 0.02067407012542399),
 ('Soil_Type39', 0.01877014107499059),
 ('Wilderness_Area3', 0.017532243227467843)]

In [14]:
sorted([ (X_train.columns[i], importance) for i, importance in enumerate(clf2.feature_importances_)],
       key=lambda x: x[1], reverse=True)[:15]

[('Elevation', 0.2519943672972272),
 ('Horizontal_Distance_To_Roadways', 0.08979504313208746),
 ('Horizontal_Distance_To_Fire_Points', 0.05821370440938506),
 ('Horizontal_Distance_To_Hydrology', 0.056034059683657454),
 ('Wilderness_Area4', 0.0543076761858151),
 ('Vertical_Distance_To_Hydrology', 0.044928966883990985),
 ('Hillshade_9am', 0.04481570616286216),
 ('Aspect', 0.038264038488216655),
 ('Hillshade_3pm', 0.03524069999133791),
 ('Hillshade_Noon', 0.0342725303795335),
 ('Soil_Type10', 0.029273998670529225),
 ('Slope', 0.02644249186019339),
 ('Soil_Type39', 0.026402940311175935),
 ('Wilderness_Area3', 0.024301981047279122),
 ('Soil_Type38', 0.023743785560694834)]

In [13]:
test_pred = clf.predict(test_df)

In [14]:
# Save test predictions to file
output = pd.DataFrame({'ID': test_ids,
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)