## The challenge:

In this competition you’ll predict what types of trees there are in an area based on various geographic features.

The competition dataset comes from a study conducted in four wilderness areas within the beautiful Roosevelt National Forest of northern Colorado. These areas represent forests with very little human disturbance – the existing forest cover types there are more a result of ecological processes than of forest management practices.

The data is in raw form and contains categorical data such as wilderness areas and soil type.

## Import Packages

In [1]:
import os

# Use the Kaggle competition mount when it is present, otherwise fall back to
# a local ./data copy. Previously the Kaggle path was assigned and then
# immediately overwritten, so it never took effect.
KAGGLE_DATA_DIR = '/kaggle/input/learn-together'
LOCAL_DATA_DIR = 'data'
DATA_DIR = KAGGLE_DATA_DIR if os.path.isdir(KAGGLE_DATA_DIR) else LOCAL_DATA_DIR

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, \
    ReduceLROnPlateau, TensorBoard

# Print the path of every file found anywhere below DATA_DIR.
for dirname, _, filenames in os.walk(DATA_DIR):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


data/train.csv
data/test.csv
data/sample_submission.csv
data/sample_submission.csv.zip
data/input
data/test.csv.zip
data/train.csv.zip


In [3]:
def report(y_true, y_pred):
    """Print accuracy, the per-class classification report and the confusion matrix.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy: %s' % accuracy)
    for summarize in (classification_report, confusion_matrix):
        print(summarize(y_true, y_pred))

## Load Dataset

In [4]:
# Read the training and test tables from DATA_DIR.
train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Preview the first rows of the training table (features plus the Cover_Type target).
train_df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [6]:
# Preview the first rows of the test table (same features, no Cover_Type column).
test_df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,...,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,...,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,...,0,0,0,0,0,0,0,0,0,0
3,15124,2709,24,17,0,0,2950,208,201,125,...,0,0,0,0,0,0,0,0,0,0
4,15125,2706,29,19,0,0,2920,210,195,115,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Report (rows, columns) for both tables.
print("shape training csv: %s" % (train_df.shape,))
print("shape test csv: %s" % (test_df.shape,))

shape training csv: (15120, 56)
shape test csv: (565892, 55)


## Delete Ids
**Let's delete the Id column in the training set but store it for the test set before deleting**

In [8]:
# Keep the test identifiers for the submission file before the column is removed.
test_ids = test_df["Id"]

# "Id" is a row identifier, not a feature, so remove it from both tables.
train_df = train_df.drop(columns=["Id"])
test_df = test_df.drop(columns=["Id"])

In [9]:
# List the remaining training columns now that "Id" has been dropped.
train_df.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40', 'Cover_Type'],
      dtype='object')

In [10]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; the Wilderness_*/Soil_* indicator columns are
# handled separately by categorical_features.
numeric_columns = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# Fit the scaler on the training rows and keep it (`ss`) for later reuse.
ss = StandardScaler()
X = ss.fit_transform(train_df[numeric_columns])

In [11]:
def categorical_features(df):
    """Select the Soil/Wilderness columns that are set often enough.

    A column is kept when its name contains 'Soil' or 'Wilderness' and the
    column sum divided by the number of rows is at least 0.04. The kept
    column names are printed.

    Args:
        df: DataFrame holding the indicator columns.

    Returns:
        A NumPy array with one column per kept indicator, in ``df`` column order.
    """
    n_rows = len(df)

    def is_frequent_indicator(name):
        # Only Soil_*/Wilderness_* columns are candidates at all.
        if 'Soil' not in name and 'Wilderness' not in name:
            return False
        return df[name].sum() / n_rows >= 0.04

    columns_to_keep = [name for name in df.columns if is_frequent_indicator(name)]
    print('Columns keeped %s' % columns_to_keep)
    return np.array(df[columns_to_keep])

In [12]:
# Build the indicator feature matrix from the training table (also prints the kept column names).
categorical = categorical_features(train_df)

Columns keeped ['Wilderness_Area1', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type6', 'Soil_Type10', 'Soil_Type17', 'Soil_Type23', 'Soil_Type29', 'Soil_Type30', 'Soil_Type32', 'Soil_Type33', 'Soil_Type38', 'Soil_Type39']


In [13]:
# Show the first ten rows of the indicator matrix.
categorical[:10]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [14]:
# Append the indicator columns to the scaled numeric columns.
# X is first trimmed to the scaler's feature count, so re-running this cell
# does not stack the indicator columns a second time.
n_numeric = ss.mean_.shape[0]
X = np.hstack((X[:, :n_numeric], categorical))

In [15]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the target; the fitted binarizer is kept so predictions can
# be mapped back to class labels with inverse_transform.
binarizer = LabelBinarizer()
cover_type = train_df['Cover_Type']
y = binarizer.fit(cover_type).transform(cover_type)

In [16]:
# Hold out 10% of the rows for validation. The seed is fixed so the split,
# and therefore every validation score below, is the same on each run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [17]:
def model1():
    """Return an uncompiled 128-64-64 ReLU network with 0.5 dropout after each hidden layer.

    The output layer is a 7-unit softmax.
    """
    return tf.keras.Sequential([
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(7, activation='softmax'),
    ])


In [18]:
def model2():
    """Return an uncompiled 64-64 ReLU network with 0.5 dropout after each hidden layer.

    The output layer is a 7-unit softmax.
    """
    return tf.keras.Sequential([
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(7, activation='softmax'),
    ])


In [19]:
def model3():
    """Return an uncompiled 32-32 ReLU network with 0.5 dropout after each hidden layer.

    The output layer is a 7-unit softmax.
    """
    return tf.keras.Sequential([
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(7, activation='softmax'),
    ])


In [20]:
def model4():
    """Return an uncompiled 40-40 ReLU network with 0.5 dropout after each hidden layer.

    The output layer is a 7-unit softmax.
    """
    return tf.keras.Sequential([
        layers.Dense(40, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(40, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(7, activation='softmax'),
    ])


In [21]:
def compile_and_train(model, epochs=20):
    """Compile ``model`` with Adam, fit it silently and print its scores.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and prints the
    metric names followed by the evaluation on the training split and then on
    the ``X_val``/``y_val`` split.

    Args:
        model: An uncompiled Keras model.
        epochs: Number of training epochs.
    """
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X_train, y_train, verbose=0, epochs=epochs)
    print(model.metrics_names)
    for features, targets in ((X_train, y_train), (X_val, y_val)):
        print(model.evaluate(features, targets))

In [22]:
# Build the four candidate architectures, then train and score each with the
# default number of epochs.
candidates = [build() for build in (model1, model2, model3, model4)]
for model in candidates:
    compile_and_train(model)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
['loss', 'acc']
[0.5652701917604304, 0.76748973]
[0.6191048879156668, 0.7242063]
['loss', 'acc']
[0.5956167677512665, 0.75073487]
[0.6424913898346916, 0.712963]
['loss', 'acc']
[0.7149608557817166, 0.70965606]
[0.7489979560413058, 0.6693122]
['loss', 'acc']
[0.6513574599939399, 0.72832155]
[0.6825983054423458, 0.6924603]


In [23]:
# Retrain a fresh model2 for 100 epochs instead of the default 20.
model = model2()
compile_and_train(model, epochs=100)

['loss', 'acc']
[0.5126627201125735, 0.7841711]
[0.579335238094683, 0.74007934]


In [24]:
# Retrain a fresh model1 for 100 epochs instead of the default 20.
model = model1()
compile_and_train(model, epochs=100)

['loss', 'acc']
[0.45864952025800365, 0.8180482]
[0.5328434024538312, 0.7718254]


In [25]:
# Retrain a fresh model4 for 100 epochs instead of the default 20.
model = model4()
compile_and_train(model, epochs=100)

['loss', 'acc']
[0.5919530842319648, 0.74698704]
[0.6463253315163668, 0.7037037]


## Final training

### Callbacks

In [57]:
# Folder for the saved checkpoints; created up front so that saving cannot
# fail because the directory is missing.
CHECKPOINT_DIR = './keras_models'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Per-epoch TensorBoard logging (histograms, graph and weight images).
tensorboard_logs = TensorBoard(log_dir='./logs', histogram_freq=1,
                               write_graph=True, write_images=True,
                               update_freq='epoch')

# Keep only checkpoints that improve on the best validation loss so far.
# The previous settings (monitor='val_acc' with mode='min') treated a LOWER
# accuracy as an improvement, so the worst epochs were the ones saved.
# Monitoring val_loss also matches the file name pattern and the other callbacks.
mcp_save = ModelCheckpoint(os.path.join(CHECKPOINT_DIR,
                                        'modelweights.{epoch:02d}-{val_loss:.2f}.hdf5'),
                           save_best_only=True, monitor='val_loss', mode='min')

# Stop once validation loss has not improved for 40 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=40, verbose=0, mode='min')

# Multiply the learning rate by 0.8 when validation loss has not improved by
# at least 1e-2 for 10 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=10,
                              verbose=1, min_delta=1e-2, mode='min')

### Model

In [53]:
def model_final():
    """Return an uncompiled 128-64-64 ReLU network with L2-regularised kernels.

    Dropout is 0.5 after the first hidden layer and 0.4 after the other two;
    the output layer is a 7-unit softmax.

    Note: a later cell defines ``model_final`` again without regularisation.
    When the notebook is run top to bottom, that later definition is the one
    in effect.
    """
    l2 = tf.keras.regularizers.l2
    return tf.keras.Sequential([
        layers.Dense(128, activation='relu', kernel_regularizer=l2(0.0001)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu', kernel_regularizer=l2(0.0002)),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu', kernel_regularizer=l2(0.0002)),
        layers.Dropout(0.4),
        layers.Dense(7, activation='softmax'),
    ])



In [56]:
def model_final():
    """Return an uncompiled 128-64-64 ReLU network with 0.5 dropout after each hidden layer.

    The output layer is a 7-unit softmax. This definition replaces any
    ``model_final`` defined in an earlier cell.
    """
    return tf.keras.Sequential([
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(7, activation='softmax'),
    ])


### Training

In [None]:
# Train the final architecture with RMSprop for up to 500 epochs. The
# callbacks handle early stopping, learning-rate decay, checkpointing and
# TensorBoard logging.
model = model_final()
optimizer = tf.keras.optimizers.RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
training_callbacks = [early_stop, reduce_lr, mcp_save, tensorboard_logs]
model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          epochs=500,
          callbacks=training_callbacks)


Train on 13608 samples, validate on 1512 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500

In [111]:
# Record the TensorFlow version used for this run.
tf.__version__

'1.11.0'

In [70]:
# Score the final model on the validation split. Both the one-hot targets and
# the predicted probabilities are mapped back to class labels first.
y_pred = model.predict(X_val)
val_labels = binarizer.inverse_transform(y_val)
val_predicted = binarizer.inverse_transform(y_pred)
report(val_labels, val_predicted)

Accuracy: 0.8373015873015873
              precision    recall  f1-score   support

           1       0.73      0.75      0.74       220
           2       0.77      0.70      0.73       227
           3       0.80      0.83      0.81       211
           4       0.93      0.94      0.93       216
           5       0.91      0.90      0.90       211
           6       0.80      0.78      0.79       206
           7       0.92      0.96      0.94       221

    accuracy                           0.84      1512
   macro avg       0.84      0.84      0.84      1512
weighted avg       0.84      0.84      0.84      1512

[[166  31   2   0   1   1  19]
 [ 52 159   1   0  10   5   0]
 [  0   1 176  11   3  20   0]
 [  0   0   8 203   0   5   0]
 [  0  11   2   0 189   9   0]
 [  0   3  32   5   5 161   0]
 [  8   1   0   0   0   0 212]]


## Predictions

In [49]:
# The model was trained on scaled numeric columns plus the frequent indicator
# columns, so the raw test table must go through the same preparation before
# it is passed to predict.
numeric_columns = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                   'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                   'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                   'Horizontal_Distance_To_Fire_Points']

# Re-derive the kept indicator columns from the TRAINING table using the same
# rule as categorical_features. Selecting them from the test table could pick
# a different set of columns.
indicator_columns = [
    column for column in train_df.columns
    if ('Soil' in column or 'Wilderness' in column)
    and train_df[column].sum() / len(train_df) >= 0.04
]

# Reuse the scaler fitted on the training data (transform only, no refit).
X_test = np.hstack((ss.transform(test_df[numeric_columns]),
                    test_df[indicator_columns].values))

# One row of class probabilities per test sample.
test_pred = model.predict(X_test)

In [51]:
# Save test predictions to file.
# model.predict returns one probability row per sample, so map each row back
# to its Cover_Type label first. A 2-D array cannot be used as a single
# DataFrame column.
test_labels = binarizer.inverse_transform(test_pred)

# The identifier column is named 'Id', matching the column it was read from.
output = pd.DataFrame({'Id': test_ids,
                       'Cover_Type': test_labels})
output.to_csv('submission.csv', index=False)