In [1]:
DATA_DIR = '/kaggle/input/learn-together'
DATA_DIR = 'data'

# Import libs and data

In [47]:
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split


In [65]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

train_df = train_df.set_index('Id')
X_train = train_df.drop(['Cover_Type'], axis=1)
y_train = train_df['Cover_Type']

X_test = test_df.set_index('Id')

In [66]:
print(X_train.shape)
print(X_test.shape)

(15120, 54)
(565892, 54)


In [67]:
def report(y_true, y_pred):
    print('Accuracy: %s' % accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

# Apply Scaler and KMeans Cluster

In [68]:
X = np.vstack([X_train, X_test])

In [69]:
ss = StandardScaler()
X_scaled = ss.fit_transform(X)

In [70]:
kms = MiniBatchKMeans(n_clusters=7)
kms.fit(X_scaled)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=7, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [71]:
X_scaled = np.hstack((X_scaled, np.expand_dims(kms.labels_, axis=1)))

In [72]:
X_train = X_scaled[:15120]
X_test = X_scaled[15120:]
print(X_train.shape)
print(X_test.shape)

(15120, 55)
(565892, 55)


In [73]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

# Train an ensemble

In [74]:
from sklearn.ensemble import VotingClassifier

from mlxtend.classifier import EnsembleVoteClassifier

eclf = EnsembleVoteClassifier(
    clfs=[
          RandomForestClassifier(
              n_estimators=200,
              max_depth=50
          ),
          RandomForestClassifier(
              n_estimators=500,
              max_depth=60,
              min_samples_split=2,
              min_samples_leaf=1
          ),
          RandomForestClassifier(
              n_estimators=200,
              max_depth=30,
              min_samples_split=4,
              min_samples_leaf=2,
              class_weight= {0:100, 1: 100, 2: 10, 3: 1, 4: 1, 5: 10, 6: 1}
          ),
          RandomForestClassifier(
              n_estimators=400,
              max_depth=50,
              class_weight= {0:100, 1: 100, 2: 100, 3: 1, 4: 1, 5: 1, 6: 1}
          ),

        ],
    voting='soft',
    # refit=False
)

eclf = eclf.fit(X_train, y_train)
y_pred_train = eclf.predict(X_train)
accuracy_score(y_train, y_pred_train)

1.0

In [75]:
y_pred = eclf.predict(X_val)
report(y_val, y_pred)

Accuracy: 0.857473544973545
              precision    recall  f1-score   support

           1       0.78      0.76      0.77       423
           2       0.77      0.69      0.72       416
           3       0.86      0.82      0.84       456
           4       0.93      0.96      0.95       447
           5       0.88      0.93      0.91       442
           6       0.81      0.87      0.84       409
           7       0.94      0.95      0.95       431

    accuracy                           0.86      3024
   macro avg       0.85      0.86      0.85      3024
weighted avg       0.86      0.86      0.86      3024

[[322  68   0   0   9   0  24]
 [ 72 285   8   0  39  12   0]
 [  0   0 374  21   2  59   0]
 [  0   0   9 431   0   7   0]
 [  1  16   7   0 413   5   0]
 [  0   1  38   9   4 357   0]
 [ 19   1   0   0   0   0 411]]


# Run on test

In [76]:
test_pred = eclf.predict(X_test)

In [78]:
# Save test predictions to file
output = pd.DataFrame({'ID': test_df['Id'],
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)