In [None]:
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score




## Presprocessing

In [11]:
raw_df = pd.read_csv("../forestCover.csv", na_values="?")
raw_df = raw_df.rename(columns={"Observation_ID":"Water_Level" , "Water_Level":"Observation_ID"})
# display(raw_df)

In [12]:
na_col = raw_df.columns[raw_df.isna().any()]
# print(na_col)
raw_df["Slope"].isna().sum()

np.int64(298)

In [13]:
print(f"Corr of Inclination and Cover: {raw_df['Inclination'].corr(raw_df['Cover_Type'])}")
print(f"Corr of aspect and facet: {raw_df['Facet'].corr(raw_df['Aspect'])}")
print(f"Cardinality of water level: {raw_df['Water_Level'].nunique()}")
print(f"Cardinality of obs id: {raw_df['Observation_ID'].nunique()}")
df = raw_df.drop(columns={'Facet','Water_Level','Observation_ID', 'Inclination' })



Corr of Inclination and Cover: 0.0002417782055169697
Corr of aspect and facet: 0.9999980536139016
Cardinality of water level: 1
Cardinality of obs id: 581012


In [14]:
na_rows = df.index[df.isna().any(axis=1)]
na_proportions = (df["Cover_Type"][na_rows].value_counts() /  df["Cover_Type"].value_counts() * 100).round(3)
print(na_proportions)
df = df.dropna()



Cover_Type
1    0.045
2    0.055
3    0.053
4    0.036
5    0.021
6    0.069
7    0.054
Name: count, dtype: float64


In [15]:
df["Soil_Type1"] = (df["Soil_Type1"] == 'positive').astype(int)
df["Soil_Type1"].value_counts()

Soil_Type1
1    577685
0      3029
Name: count, dtype: int64

In [None]:
continuous_features = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points"
]

# # Plot distributions
# fig, axes = plt.subplots(len(continuous_features), 1, figsize=(5, 15))

# for i, col in enumerate(continuous_features):
#     sns.histplot(df[col], bins=50, kde=True, ax=axes[i], color="steelblue")
#     axes[i].set_title(f"Distribution of {col}")
#     axes[i].set_xlabel("")
#     axes[i].set_ylabel("Count")

# plt.tight_layout()
# plt.show()

## Data Augmentation

In [17]:
df_eng = df.copy()
df_eng["Relative_hoz_dist_mean"] = df[[
    "Horizontal_Distance_To_Hydrology",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Roadways"
]].mean(axis=1)

df_eng["total_hillshade"] = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].sum(axis=1)
# df_eng["min_hillshade"]   = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].min(axis=1)
# df_eng["max_hillshade"]   = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].max(axis=1)

df_eng["Relative_height_from_water"] = df["Elevation"] - df["Vertical_Distance_To_Hydrology"]
df_eng = df_eng.drop(columns={"Elevation",  "Vertical_Distance_To_Hydrology","Horizontal_Distance_To_Hydrology", "Horizontal_Distance_To_Fire_Points","Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon", "Hillshade_3pm"  })

## Train test

In [18]:
X = df.drop(columns={"Cover_Type"})
X_eng = df_eng.drop(columns={"Cover_Type"})
y = df["Cover_Type"]
y_eng = df_eng["Cover_Type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

eX_train, eX_test, ey_train, ey_test = train_test_split(X_eng, y_eng, test_size=0.3, random_state=42)



## PCA

In [None]:
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)



## Tomek

In [20]:
tomek = TomekLinks(sampling_strategy='auto')
X_train_t, y_train_t = tomek.fit_resample(X_train, y_train)
eX_train_t, ey_train_t = tomek.fit_resample(eX_train, ey_train)


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [None]:
smote = SMOTETomek(random_state=42)
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)
eX_train_s, ey_train_s = smote.fit_resample(eX_train, ey_train)
pca_Xtr_s, pca_ytr_s = smote.fit_resample(X_train_pca, y_train_pca)

In [23]:
eX_train.describe()

Unnamed: 0,Aspect,Slope,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,...,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Relative_hoz_dist_mean,total_hillshade,Relative_height_from_water
count,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,...,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0,406499.0
mean,155.796942,14.097516,0.449551,0.051762,0.435032,0.063656,0.994868,0.012819,0.008344,0.021388,...,0.002772,0.003262,0.000187,0.000539,0.026866,0.023798,0.015117,3406.9,578.004839,3269994.0
std,111.948114,7.488743,0.497449,0.221545,0.495762,0.244139,0.071451,0.112494,0.090966,0.144673,...,0.052581,0.057021,0.013672,0.023205,0.161692,0.152421,0.122018,349497.9,43.396714,309577.6
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,179.0,2054184.0
25%,58.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,934.0,557.0,3103909.0
50%,127.0,13.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1398.333,586.0,3310516.0
75%,261.0,18.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1972.0,609.0,3495162.0
max,360.0,66.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,124764800.0,641.0,4263008.0


### KNN dataset

In [None]:
outlier_features = [
    "Horizontal_Distance_To_Hydrology",
    "Hillshade_Noon",
    "Vertical_Distance_To_Hydrology",
]

outlier_features_eng = [
    "Relative_hoz_dist_mean",
    
]

def clamp_series(s, lower_q=0.01, upper_q=0.99):
    lower, upper = s.quantile([lower_q, upper_q])
    return s.clip(lower, upper)

# apply clamping
for col in outlier_features:
    X_train_s[col] = clamp_series(X_train_s[col], 0.01, 0.99)
    X_train_t[col] = clamp_series(X_train_t[col], 0.01, 0.99)

eX_train_s["Relative_hoz_dist_mean"] = clamp_series(eX_train_s["Relative_hoz_dist_mean"], 0.01, 0.99)
eX_train_t["Relative_hoz_dist_mean"] = clamp_series(eX_train_t["Relative_hoz_dist_mean"], 0.01, 0.99)
    


continuous_features = [

    "Elevation",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

continuous_features_eng = [

    'Slope', 
    'Relative_hoz_dist_mean', 
    'total_hillshade',
    'Relative_height_from_water'
]

knn_X_t = X_train_t.copy()
knn_X_s = X_train_s.copy()
knn_eX_t = eX_train_t.copy()
knn_eX_s = eX_train_s.copy()

knn_X_test = X_test.copy()
knn_eX_test = eX_test.copy()



def scale_knn(knn_df, continuous_features = continuous_features):
    scaler = MinMaxScaler()
    knn_df[continuous_features] = scaler.fit_transform(knn_df[continuous_features])
    knn_df["Aspect_rad"] = np.deg2rad(knn_df["Aspect"])
    knn_df["Aspect_sin"] = np.sin(knn_df["Aspect_rad"])
    knn_df["Aspect_cos"] = np.cos(knn_df["Aspect_rad"])
    knn_df.drop(columns=["Aspect", "Aspect_rad"] , inplace=True)


scale_knn(knn_X_t)
scale_knn(knn_eX_s, continuous_features_eng)
scale_knn(knn_eX_t, continuous_features_eng)
scale_knn(knn_X_s)
scale_knn(knn_X_test)
scale_knn(knn_eX_test, continuous_features_eng)

    


In [54]:
display(knn_X_s)

Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Aspect_sin,Aspect_cos
0,0.579790,0.333333,0.294055,0.525424,0.379935,0.787402,1.000000,0.708661,0.238394,0,...,1,0,0,0,0,0,0,0,-0.453990,-0.891007
1,0.659330,0.045455,0.707006,0.379661,0.755093,0.858268,0.771739,0.598425,0.150565,1,...,0,0,0,0,0,0,0,0,0.406737,0.913545
2,0.671336,0.212121,0.363057,0.345763,0.413095,0.897638,0.521739,0.440945,0.624286,1,...,0,0,0,0,0,0,0,0,0.819152,0.573576
3,0.590295,0.151515,0.171975,0.257627,0.336378,0.759843,0.750000,0.700787,0.238952,0,...,0,0,0,0,0,0,0,0,-0.681998,0.731354
4,0.497249,0.303030,0.131635,0.311864,0.018828,0.874016,0.336957,0.381890,0.191552,1,...,0,0,0,0,0,0,0,0,0.731354,0.681998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374750,0.431216,0.166667,0.031847,0.145763,0.323732,0.791339,0.630435,0.614173,0.576328,1,...,0,0,0,0,0,0,0,0,-0.087156,0.996195
374751,0.447224,0.151515,0.063694,0.152542,0.215962,0.755906,0.847826,0.740157,0.153771,0,...,0,0,0,0,0,0,0,0,-0.965926,0.258819
374752,0.665833,0.363636,0.284501,0.345763,0.241253,0.594488,0.847826,0.881890,0.304615,0,...,1,0,0,0,0,0,0,0,-0.999848,0.017452
374753,0.591296,0.181818,0.114650,0.196610,0.216243,0.937008,0.804348,0.496063,0.090339,0,...,0,0,0,0,0,0,0,0,0.642788,-0.766044


# Models

## KNN

In [None]:
def test_knn(X_train , y_train, X_test = knn_X_test, y_test = y_test, weights = 'distance' , k = 5, metric = "euclidean"):
    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights=weights)
    knn.fit(X_train, y_train)
    train_acc = knn.score(X_train, y_train)
    test_acc = knn.score(X_test, y_test)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    
    

In [57]:
# test_knn(X_train, y_train)

In [58]:
test_knn(knn_eX_s, ey_train_s, knn_eX_test, ey_test)
test_knn(knn_eX_t, ey_train_t, knn_eX_test, ey_test)

Train Accuracy: 1.0000
Test Accuracy: 0.6305
Train Accuracy: 1.0000
Test Accuracy: 0.6305


In [59]:
# display(knn_X_test)

In [60]:
test_knn(knn_eX_s, ey_train_s,  knn_eX_test, ey_test, weights='uniform')
test_knn(knn_eX_t, ey_train_t, knn_eX_test, ey_test, weights='uniform')

Train Accuracy: 0.8927
Test Accuracy: 0.6315
Train Accuracy: 0.8927
Test Accuracy: 0.6315


In [61]:
test_knn(knn_X_s, y_train_s)
test_knn(knn_X_t, y_train_t)

Train Accuracy: 1.0000
Test Accuracy: 0.7257
Train Accuracy: 1.0000
Test Accuracy: 0.7257


In [62]:
test_knn(knn_X_s, y_train_s, metric="minkowski")
test_knn(knn_X_t, y_train_t, metric="minkowski")

Train Accuracy: 1.0000
Test Accuracy: 0.7257
Train Accuracy: 1.0000
Test Accuracy: 0.7257


In [63]:
test_knn(knn_X_s, y_train_s, k=10)
test_knn(knn_X_t, y_train_t, k = 15)

test_knn(knn_X_t, y_train_t, k = 45)

Train Accuracy: 1.0000
Test Accuracy: 0.7341
Train Accuracy: 1.0000
Test Accuracy: 0.7344
Train Accuracy: 1.0000
Test Accuracy: 0.7330


## Testing the other notebooks knn

NameError: name 'X_train_bal' is not defined

## Trees

In [2]:
def test_tree(X_train, y_train, X_test = X_test, y_test = y_test, metric = 'entropy', max_depth = None):
    tree = DecisionTreeClassifier(criterion = metric, max_depth= max_depth, random_state=42)
    tree.fit(X_train, y_train)
    train_acc = tree.score(X_train, y_train)
    test_acc = tree.score(X_test, y_test)

    print(f"Smote train acc: {train_acc}")
    print(f"Smote test acc: {test_acc}")

NameError: name 'X_test' is not defined

In [None]:
test_tree(X_train, y_train)

Smote train acc: 1.0
Smote test acc: 0.9394828229486554


In [None]:
test_tree(eX_train_t, ey_train_t, eX_test, ey_test)
test_tree(eX_train_s, ey_train_s, eX_test, ey_test)

Smote train acc: 0.9999916260763981
Smote test acc: 0.8700111930660391
Smote train acc: 0.9999916260763981
Smote test acc: 0.8700111930660391


In [None]:
test_tree(X_train_t, y_train_t, X_test, y_test)
test_tree(X_train_s, y_train_s, X_test, y_test)

Smote train acc: 1.0
Smote test acc: 0.9342306919610825
Smote train acc: 1.0
Smote test acc: 0.9342306919610825


In [None]:
test_tree(X_train_t, y_train_t, X_test, y_test)
test_tree(X_train_s, y_train_s, X_test, y_test)

In [1]:
test_tree(X_train_t, y_train_t, X_test, y_test, max_depth=20)
test_tree(X_train_s, y_train_s, X_test, y_test, max_depth= 30)
test_tree(X_train_t, y_train_t, X_test, y_test, max_depth=10)
test_tree(X_train_s, y_train_s, X_test, y_test, max_depth= 25)


NameError: name 'test_tree' is not defined

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score


In [None]:
def evaluate_model(model, X, y, cv=5):
    # Accuracy with mean ± std
    scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    print(f"Accuracy: {scores.mean():.3f} ± {scores.std():.3f}")
    
    # Cross-validated predictions for confusion matrix etc.
    y_pred = cross_val_predict(model, X, y, cv=cv)
    
    # Confusion matrix
    cm = confusion_matrix(y, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Confusion Matrix ({type(model).__name__})")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()
    
    # Classification report (precision, recall, f1 per class)
    print("Classification Report:")
    print(classification_report(y, y_pred, digits=3))
    
    # Cohen’s kappa (agreement beyond chance)
    kappa = cohen_kappa_score(y, y_pred)
    print(f"Cohen’s Kappa: {kappa:.3f}")


In [None]:
def plot_cv_boxplot(model, X, y, cv=5):
    scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    sns.boxplot(data=scores, orient="h", color="skyblue")
    plt.title(f"Accuracy per fold ({type(model).__name__})")
    plt.xlabel("Accuracy")
    plt.show()


In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 5)):
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, cv=cv, train_sizes=train_sizes, scoring="accuracy"
    )
    
    train_mean = np.mean(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    plt.figure(figsize=(7,5))
    plt.plot(train_sizes, train_mean, "o-", label="Training")
    plt.plot(train_sizes, val_mean, "o-", label="Validation")
    plt.fill_between(train_sizes, train_mean-train_std, train_mean+train_std, alpha=0.2)
    plt.fill_between(train_sizes, val_mean-val_std, val_mean+val_std, alpha=0.2)
    plt.title(f"Learning Curve ({type(model).__name__})")
    plt.xlabel("Training set size")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()


In [None]:
def plot_feature_importance(model, feature_names):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(8,5))
    sns.barplot(x=importances[indices][:10], y=np.array(feature_names)[indices][:10], palette="viridis")
    plt.title("Top 10 Feature Importances (Decision Tree)")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()


In [None]:
# knn_final = KNeighborsClassifier(n_neighbors=5, metric="euclidean", weights='distance')
# knn_final.fit(X_train_s, y_train_s)
