## On EDA, it was demonstrated that the variables are strong predictors, but only Tree-like models where succesful in separating boundaries. Also, there is a strong tendency on overfitting. So, I will try to tune decision trees, random forests or boosting methods

In [1]:
DATA_DIR = '/kaggle/input/learn-together'
DATA_DIR = 'data'

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline



In [3]:
train_df=pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df=pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [4]:
train_df = train_df.drop(["Id"], axis = 1)

test_ids = test_df["Id"]
test_df = test_df.drop(["Id"], axis = 1)

In [5]:
tree = DecisionTreeClassifier(max_depth=10,
    min_samples_split=3,
    min_samples_leaf=2)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(['Cover_Type'], axis=1), 
    train_df['Cover_Type'], test_size=0.2)

In [7]:
tree.fit(X_train, y_train)
print(tree.score(X_train, y_train))
print(tree.score(X_val, y_val))


0.8209325396825397
0.7622354497354498


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time
import numpy as np


def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
param_dist = {"max_depth": [5, 10, 15],
              "min_samples_split": [2, 4, 5, 10, 20],
              "min_samples_leaf": [1, 2, 8, 12],
              'n_estimators': [20, 50, 100, 200]
             }
n_iter_search = 50
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, n_jobs=-1)

start = time()
X = train_df.drop(['Cover_Type'], axis=1)
y = train_df['Cover_Type']
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)


RandomizedSearchCV took 49.68 seconds for 50 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.759 (std: 0.042)
Parameters: {'n_estimators': 100, 'min_samples_split': 4, 'max_depth': 15, 'min_samples_leaf': 1}

Model with rank: 2
Mean validation score: 0.756 (std: 0.042)
Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 15, 'min_samples_leaf': 2}

Model with rank: 3
Mean validation score: 0.753 (std: 0.044)
Parameters: {'n_estimators': 100, 'min_samples_split': 4, 'max_depth': 15, 'min_samples_leaf': 2}



In [9]:
random_search.cv_results_

{'mean_fit_time': array([1.72555676, 1.6905839 , 0.1908814 , 0.42978954, 1.44035807,
        0.6842382 , 0.43824568, 0.8055429 , 0.42888093, 2.87830215,
        0.28425136, 3.44411702, 0.43288503, 1.49958563, 0.75603456,
        1.90399652, 1.56797314, 0.32701874, 2.66304584, 0.17607832,
        2.29604225, 4.68493156, 0.64002256, 2.000809  , 1.69564767,
        1.45738664, 0.86702924, 0.74803438, 1.6100678 , 0.37964702,
        1.94475527, 1.60979762, 0.31251602, 0.88510661, 3.63661146,
        3.15717421, 0.80380917, 0.78505068, 0.41866765, 3.64625239,
        0.37678142, 0.19523802, 1.51241369, 3.46337352, 0.19343591,
        0.94702735, 0.94732966, 4.00528879, 0.30708857, 1.63619213]),
 'mean_score_time': array([0.05588579, 0.05441675, 0.01013947, 0.01959128, 0.04955139,
        0.02549367, 0.02145381, 0.02618151, 0.0188518 , 0.08970776,
        0.01119876, 0.10484753, 0.01910305, 0.04622817, 0.02501302,
        0.06507339, 0.05428443, 0.0120575 , 0.10887618, 0.00921617,
        0.

In [17]:
clf = RandomForestClassifier(
    **{'min_samples_leaf': 1, 'n_estimators': 100,
     'max_depth': 15, 'min_samples_split': 4}
)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_val, y_val))

0.9518849206349206
0.8386243386243386


In [18]:
clf2 = RandomForestClassifier( 
    **{'min_samples_leaf': 1, 'n_estimators': 200,
       'max_depth': 15, 'min_samples_split': 5}
)
clf2.fit(X_train, y_train)
print(clf2.score(X_train, y_train))
print(clf2.score(X_val, y_val))

0.9446097883597884
0.8369708994708994


In [19]:
clf3 = RandomForestClassifier( 
    **{'min_samples_leaf': 1, 'n_estimators': 50,
       'max_depth': 20, 'min_samples_split': 4}
)
clf3.fit(X_train, y_train)
print(clf3.score(X_train, y_train))
print(clf3.score(X_val, y_val))

0.986276455026455
0.8521825396825397


In [13]:
sorted([ (X_train.columns[i], importance) for i, importance in enumerate(clf.feature_importances_)],
       key=lambda x: x[1], reverse=True)[:15]

[('Elevation', 0.22314167935032403),
 ('Horizontal_Distance_To_Roadways', 0.09068754899445769),
 ('Horizontal_Distance_To_Fire_Points', 0.07125150960291289),
 ('Horizontal_Distance_To_Hydrology', 0.05962509990906601),
 ('Wilderness_Area4', 0.05422376513042041),
 ('Vertical_Distance_To_Hydrology', 0.051708937715740234),
 ('Hillshade_9am', 0.05121416829745952),
 ('Hillshade_3pm', 0.04569945389541346),
 ('Aspect', 0.04525196530554043),
 ('Hillshade_Noon', 0.044179011111912546),
 ('Slope', 0.034492901916539344),
 ('Soil_Type10', 0.0234833693893333),
 ('Soil_Type38', 0.02329923493726738),
 ('Soil_Type39', 0.02084158537906254),
 ('Wilderness_Area3', 0.01925427195278042)]

In [14]:
sorted([ (X_train.columns[i], importance) for i, importance in enumerate(clf2.feature_importances_)],
       key=lambda x: x[1], reverse=True)[:15]

[('Elevation', 0.2519943672972272),
 ('Horizontal_Distance_To_Roadways', 0.08979504313208746),
 ('Horizontal_Distance_To_Fire_Points', 0.05821370440938506),
 ('Horizontal_Distance_To_Hydrology', 0.056034059683657454),
 ('Wilderness_Area4', 0.0543076761858151),
 ('Vertical_Distance_To_Hydrology', 0.044928966883990985),
 ('Hillshade_9am', 0.04481570616286216),
 ('Aspect', 0.038264038488216655),
 ('Hillshade_3pm', 0.03524069999133791),
 ('Hillshade_Noon', 0.0342725303795335),
 ('Soil_Type10', 0.029273998670529225),
 ('Slope', 0.02644249186019339),
 ('Soil_Type39', 0.026402940311175935),
 ('Wilderness_Area3', 0.024301981047279122),
 ('Soil_Type38', 0.023743785560694834)]

In [20]:
test_pred = clf3.predict(test_df)

In [21]:
# Save test predictions to file
output = pd.DataFrame({'ID': test_ids,
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)