## The challenge:

In this competition you’ll predict what types of trees there are in an area based on various geographic features.

The competition dataset comes from a study conducted in four wilderness areas within the beautiful Roosevelt National Forest of northern Colorado. These areas represent forests with very little human disturbance – the existing forest cover types there are more a result of ecological processes than of forest management practices.

The data is in raw form and contains categorical data such as wilderness areas and soil type.

## Import Packages

In [1]:
# Location of the competition CSV files.
# Prefer the Kaggle input directory when it exists (i.e. when running on
# Kaggle) and fall back to a local ``data`` folder otherwise. Previously the
# Kaggle path was assigned and then unconditionally overwritten, so it was
# never actually used.
import os

KAGGLE_DATA_DIR = '/kaggle/input/learn-together'
LOCAL_DATA_DIR = 'data'
DATA_DIR = KAGGLE_DATA_DIR if os.path.isdir(KAGGLE_DATA_DIR) else LOCAL_DATA_DIR

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

import os
# Print the path of every file found anywhere below DATA_DIR.
for file_path in (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(DATA_DIR)
    for name in names
):
    print(file_path)

data/train.csv
data/test.csv
data/sample_submission.csv
data/sample_submission.csv.zip
data/input
data/test.csv.zip
data/train.csv.zip


## Load Dataset

In [3]:
# Read the training frame (it carries the Cover_Type target) and the test
# frame (no target column) from DATA_DIR.
train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Preview the first rows of the training frame to check how the columns were parsed.
train_df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [5]:
# Preview the first rows of the test frame (same feature columns, no Cover_Type).
test_df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,...,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,...,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,...,0,0,0,0,0,0,0,0,0,0
3,15124,2709,24,17,0,0,2950,208,201,125,...,0,0,0,0,0,0,0,0,0,0
4,15125,2706,29,19,0,0,2920,210,195,115,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Report the (rows, columns) shape of each frame.
print("shape training csv: %s" % (train_df.shape,))
print("shape test csv: %s" % (test_df.shape,))

shape training csv: (15120, 56)
shape test csv: (565892, 55)


## Delete Ids
**Let's delete the Id column in the training set but store it for the test set before deleting**

In [7]:
# Remove the row identifier from both frames; the test ids are kept in
# ``test_ids`` because the submission file needs them.
# The membership guard and ``errors="ignore"`` make this cell safe to re-run:
# once "Id" has been dropped, a second execution is a no-op instead of a
# KeyError, and the ``test_ids`` captured on the first run is left untouched.
if "Id" in test_df.columns:
    test_ids = test_df["Id"]

train_df = train_df.drop(columns=["Id"], errors="ignore")
test_df = test_df.drop(columns=["Id"], errors="ignore")

# Feature engineering

In [8]:

def new_features(df):
    """Return a copy of ``df`` with nine engineered columns appended.

    The input frame is not modified. Columns are added in this order:

    * ``Hydro_Elevation_diff``: ``Vertical_Distance_To_Hydrology - Elevation``
    * ``Hydro_Euclidean``: straight-line distance to hydrology, from the
      horizontal and vertical distances
    * ``Hydro_Fire_sum`` / ``Hydro_Fire_diff``,
      ``Hydro_Road_sum`` / ``Hydro_Road_diff``,
      ``Road_Fire_sum`` / ``Road_Fire_diff``: sum and absolute difference of
      each pair of horizontal distances
    * ``Stoneyness``: a 0-4 stoniness score looked up from the soil type

    ``df`` must contain ``Elevation``, the hydrology / roadways / fire-point
    distance columns and ``Soil_Type1`` .. ``Soil_Type40``. The soil lookup
    assumes at most one ``Soil_Type*`` column is set per row.
    """
    out = df.copy()

    hydro_h = 'Horizontal_Distance_To_Hydrology'
    hydro_v = 'Vertical_Distance_To_Hydrology'
    roads = 'Horizontal_Distance_To_Roadways'
    fire = 'Horizontal_Distance_To_Fire_Points'

    def pair_diff(first, second):
        # Column-wise diff of the two columns, i.e. ``second - first``.
        return out[[first, second]].diff(axis='columns')[second]

    out['Hydro_Elevation_diff'] = pair_diff('Elevation', hydro_v)
    out['Hydro_Euclidean'] = np.sqrt(out[hydro_h] ** 2 + out[hydro_v] ** 2)

    # Sum and absolute difference for every pair of horizontal distances.
    for prefix, first, second in (
        ('Hydro_Fire', hydro_h, fire),
        ('Hydro_Road', hydro_h, roads),
        ('Road_Fire', roads, fire),
    ):
        out[prefix + '_sum'] = out[[first, second]].sum(axis='columns')
        out[prefix + '_diff'] = pair_diff(first, second).abs()

    # Collapse the Soil_Type indicator columns into a single soil number
    # (the index of the column that is set; 0 when none is set).
    soil_numbers = range(1, 41)
    out['Stoneyness'] = sum(n * out['Soil_Type{}'.format(n)] for n in soil_numbers)

    # For all 40 Soil_Types, 1=rubbly, 2=stony, 3=very stony, 4=extremely stony, 0=?
    stoneyness = [4, 3, 1, 1, 1, 2, 0, 0, 3, 1,
                  1, 2, 1, 0, 0, 0, 0, 3, 0, 0,
                  0, 4, 0, 4, 4, 3, 4, 4, 4, 4,
                  4, 4, 4, 4, 1, 4, 4, 4, 4, 4]

    # Swap each soil number for its "stoneyness" score.
    out['Stoneyness'] = out['Stoneyness'].replace(soil_numbers, stoneyness)
    return out
    
def drop_features(df, min_frequency=0.04):
    """Return a copy of ``df`` without the rarely-set soil columns.

    A column is a candidate when its name contains ``'Soil'``. Its frequency
    is the column sum divided by the number of rows (the share of rows where
    the indicator is set, for 0/1 columns). Candidates whose frequency is
    below ``min_frequency`` are dropped; every other column is kept in its
    original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    min_frequency : float, default 0.04
        Minimum frequency a soil column needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rare soil columns. An empty frame is
        returned as an unchanged copy.
    """
    n_rows = len(df)
    if n_rows == 0:
        # No rows means no meaningful frequency (and a division by zero).
        return df.copy()

    # Only the soil columns are summed, so unrelated non-numeric columns
    # (ids, labels, text) can no longer break the frequency computation.
    soil_columns = [column for column in df.columns if 'Soil' in str(column)]
    frequencies = df[soil_columns].sum() / n_rows
    rare_columns = frequencies.index[frequencies < min_frequency]
    return df.drop(columns=rare_columns)
    

In [33]:
# Engineer features on the training frame and prune its rare soil columns.
X = drop_features(new_features(train_df))
# The test frame gets the same engineered features and is then restricted to
# exactly the columns that survived on the training side (minus the target).
X_test = new_features(test_df).loc[:, X.columns.drop(['Cover_Type'])]

In [34]:
# Preview the engineered training frame (original columns, Cover_Type, new features).
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Cover_Type,Hydro_Elevation_diff,Hydro_Euclidean,Hydro_Fire_sum,Hydro_Fire_diff,Hydro_Road_sum,Hydro_Road_diff,Road_Fire_sum,Road_Fire_diff,Stoneyness
0,2596,51,3,258,0,510,221,232,148,6279,...,5,-2596.0,258.0,6537,6021.0,768,252.0,6789,5769.0,4
1,2590,56,2,212,-6,390,220,235,151,6225,...,5,-2596.0,212.084889,6437,6013.0,602,178.0,6615,5835.0,4
2,2804,139,9,268,65,3180,234,238,135,6121,...,2,-2739.0,275.769832,6389,5853.0,3448,2912.0,9301,2941.0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,2,-2667.0,269.235956,6453,5969.0,3332,2848.0,9301,3121.0,4
4,2595,45,2,153,-1,391,220,234,150,6172,...,5,-2596.0,153.003268,6325,6019.0,544,238.0,6563,5781.0,4


In [35]:
# Summary statistics per column, used to eyeball the ranges of the new features.
X.describe()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Cover_Type,Hydro_Elevation_diff,Hydro_Euclidean,Hydro_Fire_sum,Hydro_Fire_diff,Hydro_Road_sum,Hydro_Road_diff,Road_Fire_sum,Road_Fire_diff,Stoneyness
count,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,...,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0
mean,2749.322553,156.676653,16.501587,227.195701,51.076521,1714.023214,212.704299,218.965608,135.091997,1511.147288,...,4.0,-2698.246032,235.948778,1738.342989,1291.834788,1941.218915,1492.094577,3225.170503,897.908333,2.333003
std,417.678187,110.085801,8.453927,210.075296,61.239406,1325.066358,30.561287,22.801966,45.895189,1099.936493,...,2.000066,414.680029,215.491695,1152.123001,1077.167415,1383.173588,1292.673425,2093.672478,884.718318,1.580487
min,1863.0,0.0,0.0,0.0,-146.0,0.0,0.0,99.0,0.0,0.0,...,1.0,-3776.0,0.0,30.0,0.0,30.0,0.0,95.0,0.0,0.0
25%,2376.0,65.0,10.0,67.0,5.0,764.0,196.0,207.0,106.0,730.0,...,2.0,-3068.0,67.186308,903.0,532.0,967.0,534.0,1806.0,291.0,1.0
50%,2752.0,126.0,15.0,180.0,32.0,1316.0,220.0,223.0,138.0,1256.0,...,4.0,-2696.0,188.480768,1462.0,1023.5,1501.0,1148.0,2520.5,632.0,2.0
75%,3104.0,261.0,22.0,330.0,79.0,2270.0,235.0,235.0,167.0,1988.25,...,6.0,-2323.0,342.069071,2302.0,1726.0,2554.0,2042.0,4232.25,1220.0,4.0
max,3849.0,360.0,52.0,1343.0,554.0,6890.0,254.0,254.0,248.0,6993.0,...,7.0,-1842.0,1356.939571,7167.0,6898.0,7306.0,6860.0,12504.0,6012.0,4.0


## Cluster features

In [26]:
# TODO: add cluster features

# Brute-force parameter search

In [13]:
# Quick sanity check of the current feature set: fit one bootstrapped
# ExtraTrees model on all training rows and look at its out-of-bag accuracy.
X_train = X.drop(columns=['Cover_Type'])
y_train = train_df['Cover_Type']

etc = ExtraTreesClassifier(
    n_estimators=200,
    max_depth=60,
    max_features=0.8,
    bootstrap=True,
    oob_score=True,
)
etc.fit(X_train, y_train)
etc.oob_score_

0.8976851851851851

In [50]:
from time import time
from sklearn.model_selection import RandomizedSearchCV



In [38]:
# Candidate values for each ExtraTrees hyper-parameter explored by the search.
param_dist = dict(
    max_depth=[5, 10, 15, 25, 40, 80],
    max_features=[0.2, 0.4, 0.6, 0.8],
    n_estimators=[20, 50, 100, 200, 600, 1200, 2000, 2400],
)


def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    ``results`` is a mapping laid out like a search's ``cv_results_``: it must
    provide ``rank_test_score``, ``mean_test_score``, ``std_test_score`` and
    ``params``, all indexed by candidate. For every rank from 1 to ``n_top``
    each candidate holding that rank is printed (ties print several entries)
    with its mean / std validation score and its parameters.

    Note: a later cell defines another ``report(y_true, y_pred)``, which
    replaces this function once that cell has been executed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
            
# Number of parameter combinations sampled from ``param_dist``.
n_iter_search = 5

# The former ``iid=False`` argument is gone: ``iid`` was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# From 0.22 on, the mean test score is the plain average over the folds, which
# is what ``iid=False`` asked for.
random_search = RandomizedSearchCV(
    ExtraTreesClassifier(bootstrap=False),
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5,
)

# Time the whole search, then print the top-ranked candidates.
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 313.60 seconds for 5 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.801 (std: 0.037)
Parameters: {'max_features': 0.6, 'n_estimators': 2000, 'max_depth': 25}

Model with rank: 2
Mean validation score: 0.786 (std: 0.039)
Parameters: {'max_features': 0.2, 'n_estimators': 2400, 'max_depth': 80}

Model with rank: 3
Mean validation score: 0.783 (std: 0.039)
Parameters: {'max_features': 0.2, 'n_estimators': 1200, 'max_depth': 25}



In [44]:
# ``X_e`` / ``y_e`` are used from this cell on but are never assigned anywhere
# in the notebook, so a fresh "Restart & Run All" stopped here with a
# NameError.
# NOTE(review): they are assumed to be the full engineered training features
# and their Cover_Type labels (later cells filter ``y_e`` by class value and
# fit on the pair) - confirm against the cell that originally created them.
X_e, y_e = X.drop(['Cover_Type'], axis=1), train_df['Cover_Type']

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X_e, y_e, test_size=0.2)

In [46]:
# Bootstrapped ExtraTrees with classes 1 and 2 weighted 1000x relative to the
# other five classes; the out-of-bag score is displayed as the cell result.
etc = ExtraTreesClassifier(
    n_estimators=2200,
    max_depth=32,
    max_features=0.7,
    bootstrap=True,
    oob_score=True,
    class_weight={label: (1000 if label < 3 else 1) for label in range(1, 8)},
)
etc.fit(X_train, y_train)
etc.oob_score_

0.8716931216931217

In [47]:
# Evaluate the weighted model on the held-out split.
# The metrics are printed directly instead of going through ``report``: in a
# top-to-bottom run ``report`` is still the search-results printer with the
# signature (results, n_top) at this point, and the (y_true, y_pred) variant
# is only defined in a later cell, so the old call failed on a fresh kernel.
from sklearn.metrics import confusion_matrix

y_pred = etc.predict(X_val)
print('Accuracy: %s' % accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Accuracy: 0.8796296296296297
              precision    recall  f1-score   support

           1       0.84      0.75      0.79       218
           2       0.78      0.70      0.74       193
           3       0.87      0.83      0.85       194
           4       0.95      0.98      0.96       236
           5       0.90      0.97      0.93       235
           6       0.88      0.91      0.89       220
           7       0.91      0.98      0.95       216

    accuracy                           0.88      1512
   macro avg       0.87      0.87      0.87      1512
weighted avg       0.88      0.88      0.88      1512

[[163  32   0   0   5   0  18]
 [ 28 135   2   0  19   7   2]
 [  0   3 161  10   1  19   0]
 [  0   0   4 231   0   1   0]
 [  0   3   3   0 228   1   0]
 [  0   1  15   3   1 200   0]
 [  4   0   0   0   0   0 212]]


In [48]:
# Train with all data, maximizing trees variety (bootstrap False).
# Classes 1 and 2 are weighted 10000x relative to classes 3-7; the displayed
# value is the accuracy on the very data the model was fitted on.
etc = ExtraTreesClassifier(
    n_estimators=2200,
    max_depth=32,
    max_features=0.7,
    bootstrap=False,
    class_weight={label: (10000 if label < 3 else 1) for label in range(1, 8)},
)
etc.fit(X_e, y_e)
etc.score(X_e, y_e)

0.9986772486772487

In [24]:
from sklearn.metrics import classification_report, confusion_matrix
def report(y_true, y_pred):
    """Print accuracy, the per-class report and the confusion matrix.

    ``y_true`` are the reference labels and ``y_pred`` the predicted ones;
    both are passed unchanged to the scikit-learn metric functions.

    Note: this reuses the name of the earlier ``report(results, n_top)``
    helper, so that one is no longer reachable after this cell runs.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy: %s' % accuracy)

    per_class = classification_report(y_true, y_pred)
    print(per_class)

    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

In [39]:
# Sub-problem restricted to the rows whose label is below 3 (classes 1 and 2
# in the report that follows): split them 80/20 and fit a dedicated model.
X_1_2, y_1_2 = X_e[y_e < 3], y_e[y_e < 3]
X_train, X_val, y_train, y_val = train_test_split(X_1_2, y_1_2, test_size=0.2)

etc = ExtraTreesClassifier(
    n_estimators=3000,
    max_depth=20,
    max_features=0.6,
    bootstrap=False,
)
etc.fit(X_train, y_train)

y_pred = etc.predict(X_val)

In [40]:
# Accuracy on the rows the model was fitted on, followed by the hold-out metrics.
print(etc.score(X_train, y_train))
report(y_val, y_pred)

0.9994212962962963
Accuracy: 0.8171296296296297
              precision    recall  f1-score   support

           1       0.81      0.82      0.81       421
           2       0.83      0.81      0.82       443

    accuracy                           0.82       864
   macro avg       0.82      0.82      0.82       864
weighted avg       0.82      0.82      0.82       864

[[346  75]
 [ 83 360]]


# Build the ensemble

In [15]:
# Rebuild the full training matrix first. Earlier cells rebind
# X_train / y_train to an 80% split and then to a subset holding only the
# labels below 3, so on a top-to-bottom run the ensemble - and the later
# "Fit with all data" cell - would otherwise be trained on that subset only.
X_train, y_train = X.drop(['Cover_Type'], axis=1), train_df['Cover_Type']

# Fixed seed so the hold-out split, and the scores printed by the following
# cells, are repeatable between runs.
X_, X_val, y_, y_val = train_test_split(X_train, y_train, test_size=0.2,
                                        random_state=60)

# 1-nearest-neighbour model on standardized, PCA-reduced (20 components) features.
ss = StandardScaler()
pca = PCA(random_state=60, n_components=20)
knn = KNeighborsClassifier(algorithm='ball_tree', n_jobs=-1, n_neighbors=1)

pipe = Pipeline(steps=[
        ('scale', ss),
        ('pca', pca),
        ('pred', knn)
]).fit(X_, y_)
# Accuracy on the fitting rows, then on the hold-out rows.
print(pipe.score(X_, y_))
print(pipe.score(X_val, y_val))

1.0
0.8234126984126984


In [16]:
# Random-forest baseline on the same split: accuracy on the fitting rows,
# then on the hold-out rows.
rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=40,
    max_features=0.8,
).fit(X_, y_)
print(rf.score(X_, y_))
print(rf.score(X_val, y_val))

1.0
0.8697089947089947


In [24]:
# ExtraTrees member of the ensemble; only classes 1 and 2 are given an
# explicit weight in ``class_weight``.
etc = ExtraTreesClassifier(
    n_estimators=400,
    max_depth=40,
    max_features=0.8,
    bootstrap=False,
    class_weight={1: 10000, 2: 10000},
).fit(X_, y_)
print(etc.score(X_, y_))
print(etc.score(X_val, y_val))

1.0
0.894510582010582


In [18]:
# Second ExtraTrees member: unweighted, shallower, with more trees.
etc2 = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=25,
    max_features=0.7,
    bootstrap=False,
)
etc2.fit(X_, y_)
print(etc2.score(X_, y_))
print(etc2.score(X_val, y_val))

1.0
0.9011243386243386


In [25]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble of the KNN pipeline and the two ExtraTrees models
# (the random forest fitted above is not part of it).
model = VotingClassifier(
    voting='hard',
    estimators=[
        ('knn', pipe),
        ('etc', etc),
        ('etc2', etc2),
    ],
).fit(X_, y_)
print(model.score(X_, y_))
print(model.score(X_val, y_val))

1.0
0.8968253968253969


In [37]:
# Fit with all data: refit the voting ensemble without a hold-out before
# predicting the test set. X_train is expected to hold every training row
# here (the shape check further down recorded (15120, 36)).

model.fit(X_train, y_train)


VotingClassifier(estimators=[('knn',
                              Pipeline(memory=None,
                                       steps=[('scale',
                                               StandardScaler(copy=True,
                                                              with_mean=True,
                                                              with_std=True)),
                                              ('pca',
                                               PCA(copy=True,
                                                   iterated_power='auto',
                                                   n_components=20,
                                                   random_state=60,
                                                   svd_solver='auto', tol=0.0,
                                                   whiten=False)),
                                              ('pred',
                                               KNeighborsClassifier(algorithm='ball_tree',
       

## Predictions

In [36]:
# Sanity check before predicting: both matrices must have the same number of columns.
print(X_test.shape)
print(X_train.shape)

(565892, 36)
(15120, 36)


In [38]:
# Predict the cover type of every test row with the fitted ensemble.
test_pred = model.predict(X_test)

In [39]:
# Save test predictions to file.
# The identifier column is written as 'Id', spelled exactly like the column
# that was read from train.csv / test.csv; the previous 'ID' header did not
# match the dataset's own column name.
# NOTE(review): confirm against the header of sample_submission.csv.
output = pd.DataFrame({'Id': test_ids,
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)