In [1]:
DATA_DIR = '/kaggle/input/learn-together'
DATA_DIR = 'data'

## Import libs and data

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

train_df = train_df.set_index('Id')
X_train = train_df.drop(['Cover_Type'], axis=1)
y_train = train_df[['Cover_Type']]

X_test = test_df.set_index('Id')

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

## Train three diferent models

In [5]:
mm = MinMaxScaler((0,1))
kb = KBinsDiscretizer(encode='ordinal', n_bins=20, strategy='quantile')
pca = PCA(random_state=60, n_components=47)
knn = KNeighborsClassifier(algorithm='ball_tree', n_jobs=-1, n_neighbors=1)

pipe = Pipeline(steps=[
        ('scale', mm),
        ('pca', pca),
        ('disc', kb),
        ('pred', knn)
])

pipe.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params)


Pipeline(memory=None,
         steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=47,
                     random_state=60, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('disc',
                 KBinsDiscretizer(encode='ordinal', n_bins=20,
                                  strategy='quantile')),
                ('pred',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=-1, n_neighbors=1, p=2,
                                      weights='uniform'))],
         verbose=False)

In [6]:
print(pipe.score(X_train, y_train))
y_pred1 = pipe.predict(X_val)
accuracy_score(y_val, y_pred1)


1.0


0.841931216931217

In [7]:
rf = RandomForestClassifier(
    **{'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1}
)
rf.fit(X_train, y_train)
print(rf.score(X_train, y_train))
y_pred2 = rf.predict(X_val)
accuracy_score(y_val, y_pred2)


  after removing the cwd from sys.path.


1.0


0.8640873015873016

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()


gbc.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [9]:
print(gbc.score(X_train, y_train))
y_pred3 = gbc.predict(X_val)
accuracy_score(y_val, y_pred3)


0.8439980158730159


0.8032407407407407

## Compare errors

In [10]:
def report(y_true, y_pred):
    print('Accuracy: %s' % accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

In [11]:
report(y_val, y_pred1)

Accuracy: 0.841931216931217
              precision    recall  f1-score   support

           1       0.79      0.71      0.75       444
           2       0.74      0.71      0.73       433
           3       0.83      0.79      0.81       451
           4       0.93      0.94      0.93       424
           5       0.87      0.94      0.90       423
           6       0.80      0.84      0.82       424
           7       0.93      0.97      0.95       425

    accuracy                           0.84      3024
   macro avg       0.84      0.84      0.84      3024
weighted avg       0.84      0.84      0.84      3024

[[317  86   0   0  15   0  26]
 [ 71 309   5   0  33  11   4]
 [  0   7 355  23   8  58   0]
 [  0   0  10 399   0  15   0]
 [  2  12   5   0 398   6   0]
 [  0   4  51   8   5 356   0]
 [ 13   0   0   0   0   0 412]]


In [12]:
report(y_val, y_pred2)

Accuracy: 0.8640873015873016
              precision    recall  f1-score   support

           1       0.80      0.77      0.78       444
           2       0.79      0.72      0.75       433
           3       0.87      0.80      0.83       451
           4       0.94      0.97      0.96       424
           5       0.90      0.95      0.92       423
           6       0.82      0.88      0.85       424
           7       0.93      0.97      0.95       425

    accuracy                           0.86      3024
   macro avg       0.86      0.87      0.86      3024
weighted avg       0.86      0.86      0.86      3024

[[340  65   0   0  13   0  26]
 [ 69 313   7   0  28  10   6]
 [  0   2 362  17   5  65   0]
 [  0   0   7 413   0   4   0]
 [  1  11   5   0 401   5   0]
 [  0   5  37   8   1 373   0]
 [ 13   1   0   0   0   0 411]]


In [13]:
report(y_val, y_pred3)

Accuracy: 0.8032407407407407
              precision    recall  f1-score   support

           1       0.72      0.70      0.71       444
           2       0.73      0.56      0.63       433
           3       0.79      0.76      0.77       451
           4       0.93      0.96      0.94       424
           5       0.78      0.91      0.84       423
           6       0.76      0.79      0.77       424
           7       0.91      0.96      0.93       425

    accuracy                           0.80      3024
   macro avg       0.80      0.81      0.80      3024
weighted avg       0.80      0.80      0.80      3024

[[312  69   0   0  27   2  34]
 [105 242   9   0  60  12   5]
 [  0   0 342  19  13  77   0]
 [  0   0  10 407   0   7   0]
 [  0  20   7   0 386  10   0]
 [  0   1  66  14   9 334   0]
 [ 18   0   0   0   1   0 406]]


In [14]:
(y_pred1 == y_pred2).sum() / len(y_pred2)

0.8647486772486772

In [15]:
(y_pred2 == y_pred3).sum() / len(y_pred3)

0.8776455026455027

In [16]:
(y_pred1 == y_pred3).sum() / len(y_pred3)

0.7982804232804233

## Train model just for 1 x 2 class 

In [17]:
y_train_1_2 = y_train[y_train['Cover_Type'] < 3 ]
X_train_1_2 = X_train[y_train['Cover_Type'] < 3 ]
y_val_1_2 = y_val[y_val['Cover_Type'] < 3 ]
X_val_1_2 = X_val[y_val['Cover_Type'] < 3 ]



In [18]:
X_train_1_2.shape

(3443, 54)

In [19]:
y_train['Cover_Type'].value_counts()

5    1737
6    1736
4    1736
7    1735
2    1727
1    1716
3    1709
Name: Cover_Type, dtype: int64

In [20]:
1736+1748

3484

In [21]:
pipe_1_2 = Pipeline(steps=[
        ('scale', mm),
        ('pca', pca),
        ('disc', kb),
        ('pred', knn)
])

pipe_1_2.fit(X_train_1_2, y_train_1_2)
pipe_1_2.score(X_train_1_2, y_train_1_2)


  'decreasing the number of bins.' % jj)
  self._final_estimator.fit(Xt, y, **fit_params)


1.0

In [22]:
y_pred1_1_2 = pipe_1_2.predict(X_val_1_2)
report(y_val_1_2, y_pred1_1_2)

Accuracy: 0.7913340935005702
              precision    recall  f1-score   support

           1       0.82      0.76      0.79       444
           2       0.77      0.83      0.80       433

    accuracy                           0.79       877
   macro avg       0.79      0.79      0.79       877
weighted avg       0.79      0.79      0.79       877

[[336 108]
 [ 75 358]]


In [23]:
rf_1_2 = RandomForestClassifier(
    **{'n_estimators': 50, 'max_depth': 24, 'min_samples_split': 4, 'min_samples_leaf': 2}
)
rf_1_2.fit(X_train_1_2, y_train_1_2)
print(rf_1_2.score(X_train_1_2, y_train_1_2))
y_pred2_1_2 = rf_1_2.predict(X_val_1_2)
accuracy_score(y_val_1_2, y_pred2_1_2)

  after removing the cwd from sys.path.


0.9747313389485913


0.7833523375142531

In [24]:
report(y_val_1_2, y_pred2_1_2)

Accuracy: 0.7833523375142531
              precision    recall  f1-score   support

           1       0.78      0.79      0.79       444
           2       0.78      0.77      0.78       433

    accuracy                           0.78       877
   macro avg       0.78      0.78      0.78       877
weighted avg       0.78      0.78      0.78       877

[[352  92]
 [ 98 335]]


In [25]:
from sklearn.ensemble import VotingClassifier

from mlxtend.classifier import EnsembleVoteClassifier

eclf = EnsembleVoteClassifier(
    clfs=[pipe, rf, 
          RandomForestClassifier(
              n_estimators=50,
              max_depth=30,
              min_samples_split=4,
              min_samples_leaf=2,
              class_weight= {0:100, 1: 100, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}
          ),
          RandomForestClassifier(
              n_estimators=200,
              max_depth=50,
              min_samples_split=2,
              min_samples_leaf=1,
              class_weight= {0:2, 1: 2, 2: 1, 3: 1, 4: 2, 5: 1, 6: 1}
          ),
          RandomForestClassifier(
              n_estimators=200,
              max_depth=30,
              min_samples_split=10,
              min_samples_leaf=5,
              class_weight= {0:10, 1: 10, 2: 100, 3: 1, 4: 100, 5: 1, 6: 1}
          )

        ],
    voting='hard',
    # refit=False
)

eclf = eclf.fit(X_train, y_train)
# eclf = eclf.fit(None, [1, 2, 3, 4, 5, 6, 7])
y_pred_train = eclf.predict(X_train)
accuracy_score(y_train, y_pred_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


1.0

In [26]:
y_pred_e = eclf.predict(X_val)
report(y_val, y_pred_e)

Accuracy: 0.8660714285714286
              precision    recall  f1-score   support

           1       0.78      0.78      0.78       444
           2       0.79      0.71      0.75       433
           3       0.85      0.84      0.84       451
           4       0.94      0.97      0.96       424
           5       0.90      0.95      0.92       423
           6       0.85      0.86      0.85       424
           7       0.95      0.96      0.95       425

    accuracy                           0.87      3024
   macro avg       0.87      0.87      0.87      3024
weighted avg       0.86      0.87      0.86      3024

[[347  63   0   0  14   0  20]
 [ 81 309   8   0  26   7   2]
 [  0   3 377  16   5  50   0]
 [  0   0   8 413   0   3   0]
 [  1  10   4   0 403   5   0]
 [  0   4  47   9   1 363   0]
 [ 18   0   0   0   0   0 407]]


In [27]:
!pip install mlxtend



In [28]:
y_train['Cover_Type'].values

array([5, 5, 3, ..., 4, 1, 4])

In [None]:
test_pred = eclf.predict(X_test)

In [None]:
# Save test predictions to file
output = pd.DataFrame({'ID': X_test.index,
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)