In [1]:
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score




In [None]:
# Load the raw table; "?" entries in the CSV are parsed as missing values.
# The rename exchanges the "Observation_ID" and "Water_Level" headers
# (presumably they are swapped in the source file -- TODO confirm).
raw_df = pd.read_csv("../forestCover.csv", na_values="?").rename(
    columns={"Observation_ID": "Water_Level", "Water_Level": "Observation_ID"}
)

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = raw_df.isnull().any(axis=0)
na_col = raw_df.columns[has_missing]

# Number of missing "Slope" entries (shown as the cell output).
raw_df["Slope"].isnull().sum()

In [None]:
# Diagnostics printed before the column drop (presumably the justification
# for removing these four columns -- TODO confirm in the write-up).
inclination_cover_corr = raw_df["Inclination"].corr(raw_df["Cover_Type"])
print(f"Corr of Inclination and Cover: {inclination_cover_corr}")
facet_aspect_corr = raw_df["Facet"].corr(raw_df["Aspect"])
print(f"Corr of aspect and facet: {facet_aspect_corr}")
water_level_cardinality = raw_df["Water_Level"].nunique()
print(f"Cardinality of water level: {water_level_cardinality}")
obs_id_cardinality = raw_df["Observation_ID"].nunique()
print(f"Cardinality of obs id: {obs_id_cardinality}")

df = raw_df.drop(columns=["Facet", "Water_Level", "Observation_ID", "Inclination"])

# Percentage of each cover type's rows that contain a missing value,
# printed before those rows are discarded.
na_rows = df.index[df.isna().any(axis=1)]
na_counts_per_class = df["Cover_Type"].loc[na_rows].value_counts()
counts_per_class = df["Cover_Type"].value_counts()
na_proportions = (na_counts_per_class / counts_per_class * 100).round(3)
print(na_proportions)

# Drop incomplete rows and encode Soil_Type1 as 1 for 'positive', 0 otherwise.
df = df.dropna().assign(
    Soil_Type1=lambda frame: frame["Soil_Type1"].eq("positive").astype(int)
)
df["Soil_Type1"].value_counts()

In [None]:
continuous_features = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points"
]

# One histogram (50 bins, KDE overlay) per continuous feature, stacked vertically.
n_panels = len(continuous_features)
fig, axes = plt.subplots(n_panels, 1, figsize=(5, 15))

for ax, feature in zip(axes, continuous_features):
    sns.histplot(df[feature], bins=50, kde=True, ax=ax, color="steelblue")
    # The title already names the feature, so the x label is blanked.
    ax.set(title=f"Distribution of {feature}", xlabel="", ylabel="Count")

plt.tight_layout()
plt.show()

In [None]:
# Source columns that are collapsed into aggregate features below.
horizontal_distance_cols = [
    "Horizontal_Distance_To_Hydrology",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Roadways",
]
hillshade_cols = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

# Engineered frame: three aggregate features replace the eight raw columns
# they are derived from. `assign` returns a new frame, so `df` is untouched.
df_eng = df.assign(
    Relative_hoz_dist_mean=df[horizontal_distance_cols].mean(axis=1),
    total_hillshade=df[hillshade_cols].sum(axis=1),
    Relative_height_from_water=df["Elevation"] - df["Vertical_Distance_To_Hydrology"],
).drop(
    columns=[
        "Elevation",
        "Vertical_Distance_To_Hydrology",
        *horizontal_distance_cols,
        *hillshade_cols,
    ]
)

In [None]:
# Separate the predictors from the "Cover_Type" target for the raw and
# engineered frames.
y = df["Cover_Type"]
X = df.drop(columns=["Cover_Type"])
y_eng = df_eng["Cover_Type"]
X_eng = df_eng.drop(columns=["Cover_Type"])

# 70/30 hold-out split; both calls share the same seed.
split_kwargs = dict(test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
eX_train, eX_test, ey_train, ey_test = train_test_split(X_eng, y_eng, **split_kwargs)



In [None]:
# Columns whose extreme values are clamped before distance-based modelling.
outlier_features = [
    "Horizontal_Distance_To_Hydrology",
    "Hillshade_Noon",
    "Vertical_Distance_To_Hydrology",
]

# Same idea for the engineered frame.
outlier_features_eng = [
    "Relative_hoz_dist_mean",
]

# Work on copies so `df` / `df_eng` stay untouched for the tree models.
knn_df = df.copy()
# Fix: this used to be `df.copy()`, which has no engineered columns, so the
# clamp on "Relative_hoz_dist_mean" below raised a KeyError.
knn_eng = df_eng.copy()


def clamp_series(s, lower_q=0.01, upper_q=0.99):
    """Clip a Series to its own quantile range.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to clamp.
    lower_q, upper_q : float
        Quantiles (0-1) used as the lower and upper clipping bounds.

    Returns
    -------
    pandas.Series
        Copy of ``s`` with values outside the two quantiles replaced by the
        corresponding bound.
    """
    lower, upper = s.quantile([lower_q, upper_q])
    return s.clip(lower, upper)


# NOTE(review): the clamping quantiles and the scaler below are fitted on the
# full data set before the train/test split, so test rows influence the
# preprocessing. Consider fitting on the training split only.
for col in outlier_features:
    knn_df[col] = clamp_series(knn_df[col], 0.01, 0.99)

for col in outlier_features_eng:
    knn_eng[col] = clamp_series(knn_eng[col], 0.01, 0.99)


# Columns rescaled to [0, 1]. "Aspect" is excluded because `scale_knn`
# encodes it as sine/cosine instead.
continuous_features = [
    "Elevation",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

continuous_features_eng = [
    "Slope",
    "Relative_hoz_dist_mean",
    "total_hillshade",
    "Relative_height_from_water",
]


def scale_knn(knn_df, continuous_features = continuous_features):
    """Min-max scale the continuous columns and encode "Aspect" cyclically.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    knn_df : pandas.DataFrame
        Frame containing every column in ``continuous_features`` plus an
        "Aspect" column in degrees.
    continuous_features : list of str
        Columns to rescale with ``MinMaxScaler``. The default is bound when
        the function is defined, so pass the list explicitly for frames with
        a different schema.

    Notes
    -----
    "Aspect" is replaced by "Aspect_sin" and "Aspect_cos". Calling this twice
    on the same frame therefore fails, because "Aspect" is gone after the
    first call.
    """
    scaler = MinMaxScaler()
    knn_df[continuous_features] = scaler.fit_transform(knn_df[continuous_features])
    # Sine/cosine encoding keeps 0 and 360 degrees adjacent.
    aspect_rad = np.deg2rad(knn_df["Aspect"])
    knn_df["Aspect_sin"] = np.sin(aspect_rad)
    knn_df["Aspect_cos"] = np.cos(aspect_rad)
    knn_df.drop(columns=["Aspect"], inplace=True)


scale_knn(knn_df)
# Fix: the engineered frame must be scaled with its own feature list; the
# default list names raw columns that `df_eng` no longer contains.
scale_knn(knn_eng, continuous_features_eng)
    


In [None]:
# Rebuilds the tree-model splits; this repeats the earlier split cell with the
# same inputs, test size and seed.
target_col = "Cover_Type"
X, y = df.drop(columns=[target_col]), df[target_col]
X_eng, y_eng = df_eng.drop(columns=[target_col]), df_eng[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
eX_train, eX_test, ey_train, ey_test = train_test_split(
    X_eng, y_eng, random_state=42, test_size=0.3
)



In [None]:
# KNN inputs come from the KNN-specific frames (`knn_df` / `knn_eng`).
# Fix: this cell previously rebuilt them from `df` / `df_eng` and split
# `X, y` / `X_eng, y_eng`, so the KNN models never saw the KNN preprocessing.
kX = knn_df.drop(columns=["Cover_Type"])
kX_eng = knn_eng.drop(columns=["Cover_Type"])
ky = knn_df["Cover_Type"]
ky_eng = knn_eng["Cover_Type"]

# Same test size and seed as the tree-model splits.
kX_train, kX_test, ky_train, ky_test = train_test_split(kX, ky, test_size=0.3, random_state=42)

keX_train, keX_test, key_train, key_test = train_test_split(kX_eng, ky_eng, test_size=0.3, random_state=42)

In [None]:
# Project each feature matrix onto the smallest set of principal components
# that explains at least 90% of the variance.
# NOTE(review): both PCAs are fitted on the full data before splitting, so the
# test rows influence the projection. Consider fitting on the training split.
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X)

# Fix: this used to be `pca.fit(kX)`, which returns the estimator rather than
# projected data and also refitted the shared `pca`. The KNN matrix now gets
# its own PCA and is actually transformed.
knn_pca = PCA(n_components=0.9)
kX_pca = knn_pca.fit_transform(kX)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)
# Fix: split the KNN projection with the KNN target (was `X_pca, y`).
kX_train_pca, kX_test_pca, ky_train_pca, ky_test_pca = train_test_split(kX_pca, ky, test_size=0.3, random_state=42)

## Dealing with class imbalance

In [None]:
# Tomek-link under-sampling, applied to the training splits only so the test
# sets keep their original class distribution.
tomek = TomekLinks(sampling_strategy="auto")
undersample = tomek.fit_resample

# Tree-model inputs (raw and engineered features).
X_tomek, y_tomek = undersample(X_train, y_train)
eX_tomek, ey_tomek = undersample(eX_train, ey_train)

# KNN inputs (raw and engineered features).
kX_tomek, ky_tomek = undersample(kX_train, ky_train)
keX_tomek, key_tomek = undersample(keX_train, key_train)


In [None]:
# SMOTE over-sampling followed by Tomek-link cleaning, applied to the
# training splits only; seeded for reproducibility.
smote = SMOTETomek(random_state=42)
rebalance = smote.fit_resample

# Tree-model inputs: raw, engineered and PCA-projected features.
X_smote, y_smote = rebalance(X_train, y_train)
eX_smote, ey_smote = rebalance(eX_train, ey_train)
pca_X_smote, pca_y_smote = rebalance(X_train_pca, y_train_pca)

# KNN inputs: raw, engineered and PCA-projected features.
kX_smote, ky_smote = rebalance(kX_train, ky_train)
keX_smote, key_smote = rebalance(keX_train, key_train)
pca_kXsmote, pca_kysmote = rebalance(kX_train_pca, ky_train_pca)

## Evaluation

In [None]:
def evaluate_model(model, X, y, cv=5):
    """Print and plot cross-validated classification diagnostics for ``model``.

    Reports accuracy (mean ± std over folds), a confusion-matrix heatmap, the
    per-class classification report and Cohen's kappa. Nothing is returned.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    X, y : array-like
        Features and target labels.
    cv : int or CV splitter
        Cross-validation strategy, passed unchanged to ``cross_val_score``
        and ``cross_val_predict``.

    Notes
    -----
    The model is cross-validated twice, once for the fold scores and once for
    the out-of-fold predictions. With a splitter that shuffles without a fixed
    seed, the two passes may use different folds.
    """
    # Accuracy with mean ± std
    scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    print(f"Accuracy: {scores.mean():.3f} ± {scores.std():.3f}")

    # Cross-validated predictions for confusion matrix etc.
    y_pred = cross_val_predict(model, X, y, cv=cv)

    # Confusion matrix. The sorted label set is made explicit so the heatmap
    # axes show the real class labels instead of positional indices 0..n-1.
    labels = np.unique(np.concatenate([np.asarray(y), np.asarray(y_pred)]))
    cm = confusion_matrix(y, y_pred, labels=labels)
    plt.figure(figsize=(6,5))
    sns.heatmap(
        cm, annot=True, fmt="d", cmap="Blues",
        xticklabels=labels, yticklabels=labels,
    )
    plt.title(f"Confusion Matrix ({type(model).__name__})")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    # Classification report (precision, recall, f1 per class)
    print("Classification Report:")
    print(classification_report(y, y_pred, digits=3))

    # Cohen’s kappa (agreement beyond chance)
    kappa = cohen_kappa_score(y, y_pred)
    print(f"Cohen’s Kappa: {kappa:.3f}")

def plot_cv_boxplot(model, X, y, cv=5):
    """Show a horizontal box plot of the per-fold cross-validated accuracy.

    ``model`` is scored on ``X``/``y`` with ``cross_val_score`` using the
    given ``cv`` strategy. The figure is displayed and nothing is returned.
    """
    fold_accuracies = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    ax = sns.boxplot(data=fold_accuracies, orient="h", color="skyblue")
    ax.set_title(f"Accuracy per fold ({type(model).__name__})")
    ax.set_xlabel("Accuracy")
    plt.show()

from sklearn.model_selection import learning_curve

def plot_learning_curve(model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and validation accuracy against training-set size.

    Scores come from ``learning_curve`` with accuracy scoring. Each curve is
    the mean over the CV folds, with a shaded band of one standard deviation
    either side. The figure is displayed and nothing is returned.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, cv=cv, train_sizes=train_sizes, scoring="accuracy"
    )

    # Aggregate over the fold axis (one row per training size).
    train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

    _, ax = plt.subplots(figsize=(7, 5))
    ax.plot(train_sizes, train_mean, "o-", label="Training")
    ax.plot(train_sizes, val_mean, "o-", label="Validation")
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2)
    ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.2)
    ax.set_title(f"Learning Curve ({type(model).__name__})")
    ax.set_xlabel("Training set size")
    ax.set_ylabel("Accuracy")
    ax.legend()
    plt.show()


## Modeling

In [None]:
def test_knn(X_train , y_train, X_test = kX_test, y_test = ky_test, weights = 'distance' , k = 5, metric = "euclidean"):
    """Fit a k-nearest-neighbours classifier and print train/test accuracy.

    ``k``, ``metric`` and ``weights`` are forwarded to
    ``KNeighborsClassifier``. ``X_test``/``y_test`` default to the KNN
    hold-out split as it existed when this cell was executed.
    """
    classifier = KNeighborsClassifier(n_neighbors=k, metric=metric, weights=weights)
    classifier.fit(X_train, y_train)

    # Score the training data first, then the hold-out data.
    train_acc, test_acc = (
        classifier.score(features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test))
    )

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    
    

In [None]:
# KNN on the Tomek-cleaned training sets: raw features, then engineered.
test_knn(kX_tomek, ky_tomek)
# Pass the matching labels explicitly instead of relying on the default
# `y_test`, which belongs to the raw-feature split.
test_knn(keX_tomek, key_tomek, X_test=keX_test, y_test=key_test)

In [None]:
# KNN on the SMOTE-Tomek training sets: raw, engineered and PCA features.
# Fix: `kX_smote` was paired with `ky_tomek`, whose length differs.
test_knn(kX_smote, ky_smote)
test_knn(keX_smote, key_smote, X_test=keX_test, y_test=key_test)
# Fix: a model trained on PCA components must be scored on the PCA test set;
# the default `X_test` is not PCA-projected.
test_knn(pca_kXsmote, pca_kysmote, X_test=kX_test_pca, y_test=ky_test_pca)

### Decision tree

In [None]:
def test_tree(X_train, y_train, X_test = X_test, y_test = y_test, metric = 'entropy', max_depth = None):
    """Fit a decision tree and print its train/test accuracy.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Hold-out features and labels. The defaults are the tree-model split
        as it existed when this cell was executed.
    metric : str
        Split criterion passed to ``DecisionTreeClassifier(criterion=...)``.
    max_depth : int or None
        Maximum tree depth; ``None`` leaves the depth unrestricted.
    """
    tree = DecisionTreeClassifier(criterion = metric, max_depth= max_depth, random_state=42)
    tree.fit(X_train, y_train)
    train_acc = tree.score(X_train, y_train)
    test_acc = tree.score(X_test, y_test)

    # The labels used to read "Smote ..." even for Tomek-resampled input.
    # They are now neutral and formatted like `test_knn`.
    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Decision tree on the Tomek-cleaned training sets: raw, then engineered.
test_tree(X_tomek, y_tomek)
# Pass the matching labels explicitly instead of relying on the default
# `y_test`, which belongs to the raw-feature split.
test_tree(eX_tomek, ey_tomek, X_test=eX_test, y_test=ey_test)

In [None]:
# Decision tree on the SMOTE-Tomek training sets: raw, engineered and PCA.
# Fix: `X_smote` was paired with `y_tomek`, whose length differs.
test_tree(X_smote, y_smote)
test_tree(eX_smote, ey_smote, X_test=eX_test, y_test=ey_test)
# Fix: a model trained on PCA components must be scored on the PCA test set;
# the default `X_test` is not PCA-projected.
test_tree(pca_X_smote, pca_y_smote, X_test=X_test_pca, y_test=y_test_pca)