## Import Packages

In [1]:
DATA_DIR = '/kaggle/input/learn-together'
DATA_DIR = 'data'

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, calinski_harabasz_score, silhouette_score



import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data/train.csv
data/test.csv
data/sample_submission.csv
data/sample_submission.csv.zip
data/input
data/test.csv.zip
data/train.csv.zip


## Load Dataset

In [3]:
train_df=pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

test_df=pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

## Delete Ids
**Let's delete the Id column in the training set but store it for the test set before deleting**

In [4]:
train_df = train_df.drop(["Id"], axis = 1)

test_ids = test_df["Id"]
test_df = test_df.drop(["Id"], axis = 1)

In [5]:
y = train_df['Cover_Type']
X = train_df.drop(['Cover_Type'], axis=1)

# Feature engineering

In [6]:
train_df[['Elevation', 'Aspect', 'Slope']].head(10)

Unnamed: 0,Elevation,Aspect,Slope
0,2596,51,3
1,2590,56,2
2,2804,139,9
3,2785,155,18
4,2595,45,2
5,2579,132,6
6,2606,45,7
7,2605,49,4
8,2617,45,9
9,2612,59,10


In [7]:
train_df[['Elevation', 'Aspect', 'Slope']].describe()

Unnamed: 0,Elevation,Aspect,Slope
count,15120.0,15120.0,15120.0
mean,2749.322553,156.676653,16.501587
std,417.678187,110.085801,8.453927
min,1863.0,0.0,0.0
25%,2376.0,65.0,10.0
50%,2752.0,126.0,15.0
75%,3104.0,261.0,22.0
max,3849.0,360.0,52.0


In [8]:
train_df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']].head()

Unnamed: 0,Hillshade_9am,Hillshade_Noon,Hillshade_3pm
0,221,232,148
1,220,235,151
2,234,238,135
3,238,238,122
4,220,234,150


In [9]:
train_df[['Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways']].head()

Unnamed: 0,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways
0,258,0,510
1,212,-6,390
2,268,65,3180
3,242,118,3090
4,153,-1,391


In [10]:
train_df.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

In [11]:
X[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].describe()

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
count,15120.0,15120.0,15120.0,15120.0
mean,0.237897,0.033003,0.419907,0.309193
std,0.42581,0.178649,0.49356,0.462176
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0


In [12]:
test_df[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].describe()

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
count,565892.0,565892.0,565892.0,565892.0
mean,0.454502,0.051927,0.436506,0.057066
std,0.497926,0.22188,0.495953,0.231968
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0


In [13]:

def new_features(df):
    df = df.copy()
    df['Hydro_Elevation_diff'] = df[['Elevation',
                                   'Vertical_Distance_To_Hydrology']
                                  ].diff(axis='columns').iloc[:, [1]]

    df['Hydro_Euclidean'] = np.sqrt(df['Horizontal_Distance_To_Hydrology']**2 +
                                   df['Vertical_Distance_To_Hydrology']**2)

    df['Hydro_Fire_sum'] = df[['Horizontal_Distance_To_Hydrology',
                             'Horizontal_Distance_To_Fire_Points']
                            ].sum(axis='columns')

    df['Hydro_Fire_diff'] = df[['Horizontal_Distance_To_Hydrology',
                              'Horizontal_Distance_To_Fire_Points']
                             ].diff(axis='columns').iloc[:, [1]].abs()

    df['Hydro_Road_sum'] = df[['Horizontal_Distance_To_Hydrology',
                             'Horizontal_Distance_To_Roadways']
                            ].sum(axis='columns')

    df['Hydro_Road_diff'] = df[['Horizontal_Distance_To_Hydrology',
                              'Horizontal_Distance_To_Roadways']
                             ].diff(axis='columns').iloc[:, [1]].abs()

    df['Road_Fire_sum'] = df[['Horizontal_Distance_To_Roadways',
                            'Horizontal_Distance_To_Fire_Points']
                           ].sum(axis='columns')

    df['Road_Fire_diff'] = df[['Horizontal_Distance_To_Roadways',
                             'Horizontal_Distance_To_Fire_Points']
                            ].diff(axis='columns').iloc[:, [1]].abs()
    
    df['log_Elevation'] = np.log(df['Elevation'] + 1)

    df['log_Hillshade'] = np.log(df['Hillshade_9am'] + 1)
    df['log_Hillshade_afternoon'] = np.log(df['Hillshade_3pm'] + 1)
    
    df.Aspect = df.Aspect.apply(lambda x: np.cos(2*np.pi*x/360.0))

    df['Stoneyness'] = sum(i * df['Soil_Type{}'.format(i)] for i in range(1, 41))

    # For all 40 Soil_Types, 1=rubbly, 2=stony, 3=very stony, 4=extremely stony, 0=?
    stoneyness = [4, 3, 1, 1, 1, 2, 0, 0, 3, 1, 
                  1, 2, 1, 0, 0, 0, 0, 3, 0, 0, 
                  0, 4, 0, 4, 4, 3, 4, 4, 4, 4, 
                  4, 4, 4, 4, 1, 4, 4, 4, 4, 4]

    # Replace Soil_Type number with "stoneyness" value
    df['Stoneyness'] = df['Stoneyness'].replace(range(1, 41), stoneyness)

    return df
    
def drop_features(df):
    df = df.copy()
    columns_to_drop = []
    size = len(df)
    for column in df.columns:
        frequency = df[column].sum() / size
        if 'Soil' in column and frequency < 0.04:
            columns_to_drop.append(column)
        columns_to_drop.append('Wilderness_Area4')  # Frequency between test and train too diferent
    df = df.drop(columns_to_drop, axis='columns')
    return df
    

In [14]:
X = new_features(train_df.drop(['Cover_Type'], axis=1))
X = drop_features(X)
X_test = new_features(test_df)
X_test = X_test[X.columns]

In [15]:
X.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type6', 'Soil_Type10', 'Soil_Type17', 'Soil_Type23',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type32', 'Soil_Type33',
       'Soil_Type38', 'Soil_Type39', 'Hydro_Elevation_diff', 'Hydro_Euclidean',
       'Hydro_Fire_sum', 'Hydro_Fire_diff', 'Hydro_Road_sum',
       'Hydro_Road_diff', 'Road_Fire_sum', 'Road_Fire_diff', 'log_Elevation',
       'log_Hillshade', 'log_Hillshade_afternoon', 'Stoneyness'],
      dtype='object')

In [16]:
X[['log_Elevation', 'Aspect', 'log_Hillshade', 'log_Hillshade_afternoon']].describe()

Unnamed: 0,log_Elevation,Aspect,log_Hillshade,log_Hillshade_afternoon
count,15120.0,15120.0,15120.0,15120.0
mean,7.907773,0.146622,5.352427,4.818777
std,0.153662,0.685404,0.166721,0.56014
min,7.53048,-1.0,0.0,0.0
25%,7.773594,-0.5,5.283204,4.672829
50%,7.920447,0.275637,5.398163,4.934474
75%,8.040769,0.809017,5.463832,5.123964
max,8.255828,1.0,5.541264,5.517453


In [17]:
X[['Hydro_Elevation_diff', 'Hydro_Euclidean', 'Hydro_Fire_sum',
       'Hydro_Fire_diff', 'Hydro_Road_sum', 'Hydro_Road_diff', 'Road_Fire_sum',
       'Road_Fire_diff', 'Stoneyness']].describe()

Unnamed: 0,Hydro_Elevation_diff,Hydro_Euclidean,Hydro_Fire_sum,Hydro_Fire_diff,Hydro_Road_sum,Hydro_Road_diff,Road_Fire_sum,Road_Fire_diff,Stoneyness
count,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0
mean,-2698.246032,235.948778,1738.342989,1291.834788,1941.218915,1492.094577,3225.170503,897.908333,2.333003
std,414.680029,215.491695,1152.123001,1077.167415,1383.173588,1292.673425,2093.672478,884.718318,1.580487
min,-3776.0,0.0,30.0,0.0,30.0,0.0,95.0,0.0,0.0
25%,-3068.0,67.186308,903.0,532.0,967.0,534.0,1806.0,291.0,1.0
50%,-2696.0,188.480768,1462.0,1023.5,1501.0,1148.0,2520.5,632.0,2.0
75%,-2323.0,342.069071,2302.0,1726.0,2554.0,2042.0,4232.25,1220.0,4.0
max,-1842.0,1356.939571,7167.0,6898.0,7306.0,6860.0,12504.0,6012.0,4.0


In [18]:
X[['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points']].describe()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points
count,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0
mean,2749.322553,0.146622,16.501587,227.195701,51.076521,1714.023214,212.704299,218.965608,135.091997,1511.147288
std,417.678187,0.685404,8.453927,210.075296,61.239406,1325.066358,30.561287,22.801966,45.895189,1099.936493
min,1863.0,-1.0,0.0,0.0,-146.0,0.0,0.0,99.0,0.0,0.0
25%,2376.0,-0.5,10.0,67.0,5.0,764.0,196.0,207.0,106.0,730.0
50%,2752.0,0.275637,15.0,180.0,32.0,1316.0,220.0,223.0,138.0,1256.0
75%,3104.0,0.809017,22.0,330.0,79.0,2270.0,235.0,235.0,167.0,1988.25
max,3849.0,1.0,52.0,1343.0,554.0,6890.0,254.0,254.0,248.0,6993.0


In [19]:
X[['Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type6', 'Soil_Type10', 'Soil_Type17', 'Soil_Type23',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type32', 'Soil_Type33',
       'Soil_Type38', 'Soil_Type39']].describe()

Unnamed: 0,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type6,Soil_Type10,Soil_Type17,Soil_Type23,Soil_Type29,Soil_Type30,Soil_Type32,Soil_Type33,Soil_Type38,Soil_Type39
count,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0
mean,0.041204,0.063624,0.055754,0.042989,0.141667,0.040476,0.050066,0.085384,0.04795,0.045635,0.040741,0.048148,0.043452
std,0.198768,0.244091,0.229454,0.20284,0.348719,0.19708,0.218089,0.279461,0.213667,0.208699,0.197696,0.214086,0.20388
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
X['Stoneyness'].value_counts()

4    6546
1    5096
0    1854
2     877
3     747
Name: Stoneyness, dtype: int64

# Cluster features

In [21]:
ss = StandardScaler()
X_all = np.vstack([X, X_test])
ss.fit(X_all)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [22]:
cluster_data = ss.transform(X)
cluster_data_test = ss.transform(X_test)
cluster_data_all = np.vstack([cluster_data, cluster_data_test])

In [23]:
km = KMeans(9)
km.fit(cluster_data_all)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [24]:
labels = km.predict(cluster_data)
print(completeness_score(y, labels))
print(silhouette_score(cluster_data, labels))
print(calinski_harabasz_score(cluster_data, labels))

0.3183895504258875
0.14869280122108627
1422.7538855162177


In [25]:
X.shape

(15120, 38)

In [26]:
X['cluster'] = km.predict(cluster_data)
X_test['cluster'] = km.predict(cluster_data_test)

In [27]:
gmm = GaussianMixture(11)
gmm.fit(cluster_data)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=11, n_init=1,
                precisions_init=None, random_state=None, reg_covar=1e-06,
                tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
                weights_init=None)

In [28]:
labels = gmm.predict(cluster_data)
print(completeness_score(y, labels))
print(silhouette_score(cluster_data, labels))
print(calinski_harabasz_score(cluster_data, labels))

0.28426810911408545
0.21669623152010659
2105.9880001761044


In [29]:
X['cluster2'] = labels
X_test['cluster2'] = gmm.predict(cluster_data_test)

# TRAIN FINAL MODEL

In [30]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

X_, X_val, y_, y_val = train_test_split(X, y, test_size=0.1)
X_filtered = X_[y < 3]
y_filtered = y_[y < 3]

In [33]:
etc12 = ExtraTreesClassifier(
    bootstrap=False,
    **{'max_depth': 42, 'n_estimators': 600, 'max_features': 0.6}
    )
etc12.fit(X_filtered, y_filtered)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=42, max_features=0.6, max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=600,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [34]:
y_pred = etc12.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

0.22685185185185186
              precision    recall  f1-score   support

           1       0.40      0.82      0.54       208
           2       0.16      0.83      0.27       206
           3       0.00      0.00      0.00       220
           4       0.00      0.00      0.00       223
           5       0.00      0.00      0.00       234
           6       0.00      0.00      0.00       212
           7       0.00      0.00      0.00       209

    accuracy                           0.23      1512
   macro avg       0.08      0.24      0.12      1512
weighted avg       0.08      0.23      0.11      1512

[[171  37   0   0   0   0   0]
 [ 34 172   0   0   0   0   0]
 [  3 217   0   0   0   0   0]
 [  0 223   0   0   0   0   0]
 [ 11 223   0   0   0   0   0]
 [  1 211   0   0   0   0   0]
 [204   5   0   0   0   0   0]]


# Mount the ensemble

In [35]:
X_, X_val, y_, y_val = train_test_split(X, y, test_size=0.2)

### Trees Classifiers and ensemble

In [36]:
rf = RandomForestClassifier(n_estimators=200,
                            bootstrap=False)
# rf.fit(X_, y_)

In [37]:
etc = ExtraTreesClassifier(
    bootstrap=False,
    **{'max_depth': 32, 'n_estimators': 500, 'max_features': 0.4}
    )
# etc.fit(X_, y_)

In [38]:
lgb2 = LGBMClassifier(n_estimators=400,
                      num_leaves=100)
# lgb2.fit(X_, y_)

In [39]:
from mlxtend.classifier import StackingCVClassifier

ensemble = [('etc', etc),
            ('rf', rf),
            ('lgb', lgb2)]

stack = StackingCVClassifier(classifiers=[clf for label, clf in ensemble],
                             meta_classifier=etc,
                             cv=5,
                             use_probas=True,
                             use_features_in_secondary=True,
                             verbose=1,
                             random_state=0)

In [40]:
stack.fit(X_, y_)

Fitting 3 classifiers...
Fitting classifier1: extratreesclassifier (1/3)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   31.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier2: randomforestclassifier (2/3)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   26.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier3: lgbmclassifier (3/3)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   44.7s finished


StackingCVClassifier(classifiers=[ExtraTreesClassifier(bootstrap=False,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=32,
                                                       max_features=0.4,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=500,
                                                       n_jobs=None,
                        

In [41]:
print(stack.score(X_, y_))
print(stack.score(X_val, y_val))

0.9986772486772487
0.9103835978835979


In [42]:
y_pred = stack.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[331  59   0   0   7   0  10]
 [ 48 364   7   0  15   8   4]
 [  0   1 415  13   3  25   0]
 [  0   0   8 404   0   2   0]
 [  1  10   2   0 415   2   0]
 [  0   1  21  10   6 407   0]
 [  8   0   0   0   0   0 417]]
              precision    recall  f1-score   support

           1       0.85      0.81      0.83       407
           2       0.84      0.82      0.83       446
           3       0.92      0.91      0.91       457
           4       0.95      0.98      0.96       414
           5       0.93      0.97      0.95       430
           6       0.92      0.91      0.92       445
           7       0.97      0.98      0.97       425

    accuracy                           0.91      3024
   macro avg       0.91      0.91      0.91      3024
weighted avg       0.91      0.91      0.91      3024



### Combinar com classificador da classe 1 x 2

In [43]:
y_pred_proba_1x2 = etc12.predict_proba(X_val)

In [44]:
y_pred_proba = stack.predict_proba(X_val)

In [45]:
y_pred_ensembled = np.concatenate([y_pred_proba_1x2, y_pred_proba], axis=1)

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures


poly = PolynomialFeatures(2)
poly_y_pred_ensembled = poly.fit_transform(y_pred_ensembled)

In [47]:
log = LogisticRegression()
log.fit(y_pred_ensembled, y_val)
y_pred = log.predict(y_pred_ensembled)

In [48]:
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))


0.9454365079365079
              precision    recall  f1-score   support

           1       0.97      0.95      0.96       407
           2       0.95      0.93      0.94       446
           3       0.91      0.91      0.91       457
           4       0.95      0.98      0.96       414
           5       0.95      0.96      0.95       430
           6       0.92      0.91      0.92       445
           7       0.97      0.98      0.98       425

    accuracy                           0.95      3024
   macro avg       0.95      0.95      0.95      3024
weighted avg       0.95      0.95      0.95      3024

[[388   5   0   0   5   0   9]
 [  2 417   6   0  11   8   2]
 [  0   2 414  13   3  25   0]
 [  0   0   8 404   0   2   0]
 [  1  11   3   0 413   2   0]
 [  0   2  22   9   5 407   0]
 [  7   2   0   0   0   0 416]]


### Fit with all data

In [49]:
X_filtered2 = X[y < 3]
y_filtered2 = y[y < 3]
etc12.fit(X_filtered2, y_filtered2)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=42, max_features=0.6, max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=600,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [50]:
stack.fit(X, y)

Fitting 3 classifiers...
Fitting classifier1: extratreesclassifier (1/3)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   39.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier2: randomforestclassifier (2/3)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   33.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier3: lgbmclassifier (3/3)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   51.3s finished


StackingCVClassifier(classifiers=[ExtraTreesClassifier(bootstrap=False,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=32,
                                                       max_features=0.4,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=500,
                                                       n_jobs=None,
                        

In [51]:
y_pred_proba = stack.predict_proba(X)
y_pred_proba_12 = etc12.predict_proba(X)
y_ensembled = np.concatenate([y_pred_proba_12, y_pred_proba], axis=1)

In [52]:
log.fit(y_ensembled, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
y_pred_proba_val = stack.predict_proba(X_val)
y_pred_proba_12_val = etc12.predict_proba(X_val)
y_ensembled_val = np.concatenate([y_pred_proba_12_val, y_pred_proba_val], axis=1)
y_pred_val = log.predict(y_ensembled_val)
print(confusion_matrix(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

[[407   0   0   0   0   0   0]
 [  0 446   0   0   0   0   0]
 [  0   0 457   0   0   0   0]
 [  0   0   0 413   0   1   0]
 [  0   0   0   0 430   0   0]
 [  0   0   0   0   0 445   0]
 [  0   0   0   0   0   0 425]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       407
           2       1.00      1.00      1.00       446
           3       1.00      1.00      1.00       457
           4       1.00      1.00      1.00       414
           5       1.00      1.00      1.00       430
           6       1.00      1.00      1.00       445
           7       1.00      1.00      1.00       425

    accuracy                           1.00      3024
   macro avg       1.00      1.00      1.00      3024
weighted avg       1.00      1.00      1.00      3024



## Predictions

In [55]:
print(X_test.shape)

(565892, 40)


In [56]:
test_pred_proba_12 = etc12.predict_proba(X_test)

In [57]:
test_pred_proba = stack.predict_proba(X_test)

In [58]:
test_pred = log.predict(np.concatenate([test_pred_proba_12, test_pred_proba], axis=1))

In [59]:
# Save test predictions to file
output = pd.DataFrame({'ID': test_ids,
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)

In [None]:
# Submit only stack
test_pred_stack = stack.predict(X_test)
# Save test predictions to file
output = pd.DataFrame({'ID': test_ids,
                       'Cover_Type': test_pred_stack})
output.to_csv('submission_stack.csv', index=False)