In [1]:
DATA_DIR = '/kaggle/input/learn-together'
DATA_DIR = 'data'

# Import libs and data

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split


In [22]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

train_df = train_df.set_index('Id')
X = train_df.drop(['Cover_Type'], axis=1)
y = train_df['Cover_Type']

X_test = test_df.set_index('Id')
y_ids = test_df['Id']

In [23]:
print(X.shape)
print(X_test.shape)

(15120, 54)
(565892, 54)


In [24]:
def report(y_true, y_pred):
    print('Accuracy: %s' % accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

# Apply Scaler and KMeans Cluster

In [25]:
X_all = np.vstack([X, X_test])

In [26]:
ss = StandardScaler()
X_scaled = ss.fit_transform(X_all)

In [27]:
kms = MiniBatchKMeans(n_clusters=7)
kms.fit(X_scaled)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=7, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [29]:
X_scaled = np.hstack((X_scaled, np.expand_dims(kms.labels_, axis=1)))
X_cluster_labels = np.hstack((X_all, np.expand_dims(kms.labels_, axis=1)))

In [30]:
X_cluster = X_cluster_labels[:15120]
X_cluster_test = X_cluster_labels[15120:]
X_cluster_scaled = X_scaled[:15120]
X_cluster_test_scaled = X_scaled[15120:]

print(X_cluster.shape)
print(X_cluster_test.shape)

(15120, 55)
(565892, 55)


# Simple trees test

## Original data

In [32]:
rf = RandomForestClassifier(n_estimators=640
                            ,n_jobs = -1,max_features=0.5,oob_score=True)
rf.fit(X, y)
rf.oob_score_

0.8755952380952381

In [33]:
xrf = ExtraTreesClassifier(n_estimators=640,n_jobs=-1,max_features=0.5,oob_score=True,bootstrap = True)
xrf.fit(X, y)
xrf.oob_score_

0.8770502645502646

## With cluster label

In [35]:
rfc = RandomForestClassifier(n_estimators=640
                            ,n_jobs = -1,max_features=0.5,oob_score=True)
rfc.fit(X_cluster, y)
rfc.oob_score_

0.8739417989417989

In [40]:
xrfc = ExtraTreesClassifier(n_estimators=640,n_jobs=-1,max_features=0.5,oob_score=True,bootstrap = True)
xrfc.fit(X_cluster, y)
xrfc.oob_score_

0.8788359788359789

## Scaled with cluster label

In [38]:
rfs = RandomForestClassifier(n_estimators=640
                            ,n_jobs = -1,max_features=0.5,oob_score=True)
rfs.fit(X_cluster_scaled, y)
rfs.oob_score_

0.874537037037037

In [39]:
xrfs = ExtraTreesClassifier(n_estimators=640,n_jobs=-1,max_features=0.5,oob_score=True,bootstrap = True)
xrfs.fit(X_cluster_scaled, y)
xrfs.oob_score_

0.8780423280423281

# Final evaluation and submission

In [42]:
y_pred = xrfc.predict(X_cluster)
report(y, y_pred)

Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2160
           2       1.00      1.00      1.00      2160
           3       1.00      1.00      1.00      2160
           4       1.00      1.00      1.00      2160
           5       1.00      1.00      1.00      2160
           6       1.00      1.00      1.00      2160
           7       1.00      1.00      1.00      2160

    accuracy                           1.00     15120
   macro avg       1.00      1.00      1.00     15120
weighted avg       1.00      1.00      1.00     15120

[[2160    0    0    0    0    0    0]
 [   0 2160    0    0    0    0    0]
 [   0    0 2160    0    0    0    0]
 [   0    0    0 2160    0    0    0]
 [   0    0    0    0 2160    0    0]
 [   0    0    0    0    0 2160    0]
 [   0    0    0    0    0    0 2160]]


# Run on test

In [43]:
test_pred = xrfc.predict(X_cluster_test)

In [44]:
# Save test predictions to file
output = pd.DataFrame({'ID': test_df['Id'],
                       'Cover_Type': test_pred})
output.to_csv('submission.csv', index=False)