## Preprocess

In [1]:
import pandas as pd
train = pd.read_csv('/data/covertype2/train.csv')

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def convert_categorical_features(raw_data):
    categorical_features = ['Wild_Type', 'Soil_Type']
    encoders = {}
    processed = raw_data.copy()

    for feature in categorical_features:
        enc_label = LabelEncoder()
        enc_label.fit(raw_data[feature])
        processed[feature] = enc_label.transform(raw_data[feature])
        
        enc_onehot = OneHotEncoder()
        enc_onehot.fit(processed[feature].reshape((-1, 1)))
        onehot_vectors = enc_onehot.transform(processed[feature].reshape((-1, 1))).tocsc()
        for col in enc_onehot.active_features_:
            label = enc_label.classes_[col]
            new_label = '{}_{}'.format(feature, label)
            vec = onehot_vectors.getcol(col).toarray().flatten()
            processed[new_label] = vec

        encoders[feature] = (enc_label, enc_onehot)

    return processed.drop(categorical_features, axis=1), encoders

In [3]:
processed, encoders = convert_categorical_features(train)
processed.head()

  
  from ipykernel import kernelapp as app


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,"Soil_Type_Rogert family, very stony.",Soil_Type_Supervisor - Limber families complex.,"Soil_Type_Troutville family, very stony.",Soil_Type_Typic Cryaquepts - Typic Cryaquolls complex.,Soil_Type_Typic Cryaquolis - Borohemists complex.,"Soil_Type_Typic Cryaquolls - Leighcan family, till substratum complex.","Soil_Type_Vanet - Ratake families complex, very stony.","Soil_Type_Vanet - Wetmore families - Rock outcrop complex, stony.","Soil_Type_Vanet family - Rock outcrop complex complex, rubbly.",Soil_Type_unspecified in the USFS Soil and ELU Survey.
0,3122,266,10,433,75,3069,195,245,188,451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3018,308,15,60,14,5359,177,229,192,4546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3146,151,12,541,-2,5887,236,240,132,1371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2980,163,6,553,21,3538,226,242,149,1087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2972,187,16,255,109,6390,220,250,158,4119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
y = processed['Target']
X = processed.drop('Target', axis=1)

In [5]:
y.shape, X.shape

((464200,), (464200, 54))

In [6]:
y.value_counts()

2    226335
1    169268
3     28509
7     16398
6     13866
5      7593
4      2231
Name: Target, dtype: int64

## Training

In [7]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 3
max_depth = None

random_forest = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', max_features='sqrt', max_depth=max_depth, n_jobs=-1, verbose=1)

In [8]:
random_forest.fit(X, y)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.3s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=3, n_jobs=-1, oob_score=False, random_state=None,
            verbose=1, warm_start=False)

In [9]:
random_forest.n_classes_

7

## Evaluation

In [10]:
test = pd.read_csv('/data/covertype2/test.csv')

In [11]:
test_procesed, _ = convert_categorical_features(test)

  
  from ipykernel import kernelapp as app


In [12]:
test_procesed.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,"Soil_Type_Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.","Soil_Type_Rogert family, very stony.",Soil_Type_Supervisor - Limber families complex.,"Soil_Type_Troutville family, very stony.",Soil_Type_Typic Cryaquepts - Typic Cryaquolls complex.,Soil_Type_Typic Cryaquolis - Borohemists complex.,"Soil_Type_Typic Cryaquolls - Leighcan family, till substratum complex.","Soil_Type_Vanet - Ratake families complex, very stony.","Soil_Type_Vanet - Wetmore families - Rock outcrop complex, stony.","Soil_Type_Vanet family - Rock outcrop complex complex, rubbly."
0,2025,44,40,95,75,124,197,125,30,216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3092,52,18,182,39,4425,226,197,98,2408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3403,346,16,698,111,2505,188,213,163,1254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2783,186,9,60,5,1834,222,247,159,2318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3081,184,12,503,22,360,222,248,157,1276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
y_test = test_procesed['Target']
X_test = test_procesed.drop('Target', axis=1)

In [14]:
for column in X.columns:
    if not column in X_test.columns:
        X_test[column] = 0

In [15]:
preds = random_forest.predict(X_test)

[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.1s finished


In [16]:
(y_test == preds).value_counts().loc[True] / len(y_test)

0.9122103771791904

## Hyper parameter tuning

In [17]:
import numpy as np
trained_estimators = {}
for n_estimators in [1, 10, 30, 50]:
    random_forest = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', max_features='sqrt', max_depth=None, n_jobs=-1, verbose=1)
    random_forest.fit(X, y)
    preds = random_forest.predict(X_test)
    accuracy = np.sum(y_test == preds) / len(y_test)
    print('n_estimators: {}, accuracy: {}'.format(n_estimators, accuracy))
    trained_estimators[n_estimators]= random_forest

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


n_estimators: 1, accuracy: 0.8754254712580127


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    8.2s remaining:    5.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.3s finished
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.2s finished


n_estimators: 10, accuracy: 0.9431380341068127


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   31.0s finished
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed:    0.5s finished


n_estimators: 30, accuracy: 0.9531765640928175


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   53.2s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.8s finished


n_estimators: 50, accuracy: 0.9535221245054166
