Forest Cover Type Classification

In [4]:
# Import Libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


In [5]:
# Load the dataset
#
# The file is read with header=None, so column names are supplied
# explicitly: Feature_1 .. Feature_54 followed by the target, Cover_Type.

data_file = "covtype.data.gz"

columns = [*("Feature_%d" % n for n in range(1, 55)), "Cover_Type"]
df = pd.read_csv(data_file, names=columns, header=None)

In [7]:
# Sanity check: report (rows, columns), then preview the frame as the cell output.
print("Dataset Shape:", (len(df.index), len(df.columns)))
df

Dataset Shape: (581012, 55)


Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_46,Feature_47,Feature_48,Feature_49,Feature_50,Feature_51,Feature_52,Feature_53,Feature_54,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [13]:
# Splitting features and target
#
# Every column except Cover_Type is a feature. The target is shifted down
# by one so that class labels start at 0.

X = df.drop(columns = "Cover_Type")
y = df["Cover_Type"].sub(1)

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 42,
    test_size = 0.2,
)


In [14]:
# Standardize the features
#
# The scaler's statistics are learned from the training split only and
# then reused unchanged on the test split.

scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
# Random Forest: fit, predict, score
#
# 100 trees with a fixed seed, trained on the standardized training split.

rf = RandomForestClassifier(random_state = 42, n_estimators = 100)
rf.fit(X_train_scaled, y_train)

# Predictions on the held-out split and the resulting overall accuracy.
y_pred_rf = rf.predict(X_test_scaled)
rf_acc = accuracy_score(y_true = y_test, y_pred = y_pred_rf)

In [24]:
# Per-class precision/recall/F1 and the confusion matrix for the Random Forest.
rf_report = classification_report(y_true = y_test, y_pred = y_pred_rf)
rf_confusion = confusion_matrix(y_true = y_test, y_pred = y_pred_rf)

print("\nRandom Forest Result")
print(rf_report)
print("\nConfusion Matrix (Random Forest):\n", rf_confusion)


Random Forest Result
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     42368
           1       0.95      0.97      0.96     56661
           2       0.94      0.96      0.95      7151
           3       0.92      0.86      0.89       549
           4       0.95      0.77      0.85      1899
           5       0.93      0.89      0.91      3473
           6       0.97      0.95      0.96      4102

    accuracy                           0.95    116203
   macro avg       0.95      0.91      0.92    116203
weighted avg       0.95      0.95      0.95    116203


Confusion Matrix (Random Forest):
 [[39900  2366     1     0     6     2    93]
 [ 1305 55110   106     1    56    69    14]
 [    0   122  6859    23     7   140     0]
 [    0     0    57   471     0    21     0]
 [   25   385    20     0  1460     9     0]
 [    5    91   258    18     5  3096     0]
 [  197    28     0     0     0     0  3877]]


In [18]:
# Training XGBoost
#
# Multi-class gradient-boosted trees over the 7 zero-based cover types.
# The model is fitted on the unscaled X_train / X_test frames (not the
# standardized arrays used for the Random Forest).
#
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
# warning on every fit.

xgb_clf = xgb.XGBClassifier(
    objective = "multi:softmax",
    num_class = 7, # 7 cover types
    eval_metric = "mlogloss",
    random_state = 42
)

xgb_clf.fit(X_train, y_train)

# Predictions on the held-out split and the resulting overall accuracy.
y_pred_xgb = xgb_clf.predict(X_test)

xgb_acc = accuracy_score(y_test, y_pred_xgb)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [19]:
# Per-class precision/recall/F1 and the confusion matrix for XGBoost.
xgb_report = classification_report(y_true = y_test, y_pred = y_pred_xgb)
xgb_confusion = confusion_matrix(y_true = y_test, y_pred = y_pred_xgb)

print("\nXGBoost Results:")
print(xgb_report)
print("\nConfusion Matrix (XGBoost):\n", xgb_confusion)


XGBoost Results:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85     42368
           1       0.87      0.90      0.88     56661
           2       0.89      0.91      0.90      7151
           3       0.88      0.86      0.87       549
           4       0.89      0.61      0.72      1899
           5       0.85      0.80      0.82      3473
           6       0.95      0.91      0.93      4102

    accuracy                           0.87    116203
   macro avg       0.88      0.83      0.85    116203
weighted avg       0.87      0.87      0.87    116203


Confusion Matrix (XGBoost):
 [[35619  6550     4     0    25     6   164]
 [ 5311 50786   268     1   116   149    30]
 [    5   302  6504    40     2   298     0]
 [    0     0    52   471     0    26     0]
 [   19   686    28     0  1154    12     0]
 [    4   223   453    24     1  2768     0]
 [  331    22     0     0     0     0  3749]]


In [21]:
# Feature importance
#
# Rank the fitted XGBoost model's importances from largest to smallest and
# show the ten highest. Positions in `importances` are looked up in
# `columns`, whose leading entries are the feature names.

importances = xgb_clf.feature_importances_
top_idx = importances.argsort()[-1:-11:-1]  # last ten of the ascending sort, reversed

print("\nTop 10 Important Features (XGBoost):")
for idx in top_idx:
    print("{}: {:.4f}".format(columns[idx], importances[idx]))


Top 10 Important Features (XGBoost):
Feature_1: 0.0926
Feature_11: 0.0581
Feature_36: 0.0502
Feature_46: 0.0475
Feature_16: 0.0471
Feature_26: 0.0452
Feature_18: 0.0427
Feature_53: 0.0356
Feature_52: 0.0354
Feature_13: 0.0335


In [25]:
# Bonus: side-by-side accuracy of the two models, followed by a one-line verdict
print("\n=== Model Comparison Summary ===")
print(f"Random Forest Accuracy: {rf_acc:.4f}")
print(f"XGBoost Accuracy:      {xgb_acc:.4f}")

# Same comparison order as an if/elif/else chain: strict ">" each way,
# falling back to the "equal" message when neither is strictly greater.
verdict = (
    "Random Forest performed better on this dataset."
    if rf_acc > xgb_acc
    else "XGBoost performed better on this dataset."
    if xgb_acc > rf_acc
    else "Both models performed equally well."
)
print(verdict)


=== Model Comparison Summary ===
Random Forest Accuracy: 0.9533
XGBoost Accuracy:      0.8696
Random Forest performed better on this dataset.
