In [1]:
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import GridSearchCV



## Preprocessing

In [3]:
# Load the data ("?" cells are parsed as NaN) and swap the labels of the
# "Observation_ID" and "Water_Level" columns in the same step.
column_label_swap = {"Observation_ID": "Water_Level", "Water_Level": "Observation_ID"}
raw_df = pd.read_csv("../forestCover.csv", na_values="?").rename(columns=column_label_swap)

In [4]:
# Names of the columns that contain at least one missing value.
has_missing = raw_df.isna().any(axis=0)
na_col = raw_df.columns[has_missing]

# Cell output: how many "Slope" entries are missing.
raw_df["Slope"].isna().sum()

np.int64(298)

In [6]:
# Diagnostics printed before dropping columns: correlation of Inclination with
# the target, correlation between Facet and Aspect, and the number of distinct
# values in Water_Level and Observation_ID.
print(f"Corr of Inclination and Cover: {raw_df['Inclination'].corr(raw_df['Cover_Type'])}")
print(f"Corr of aspect and facet: {raw_df['Facet'].corr(raw_df['Aspect'])}")
print(f"Cardinality of water level: {raw_df['Water_Level'].nunique()}")
print(f"Cardinality of obs id: {raw_df['Observation_ID'].nunique()}")

# `drop` returns a new frame, so raw_df itself stays untouched.
dropped_columns = ['Facet', 'Water_Level', 'Observation_ID', 'Inclination']
df = raw_df.drop(columns=dropped_columns)



Corr of Inclination and Cover: 0.0002417782055169697
Corr of aspect and facet: 0.9999980536139016
Cardinality of water level: 1
Cardinality of obs id: 581012


In [None]:
# For every cover type: the percentage of its rows that have at least one
# missing value. Printed before those rows are removed.
rows_with_na = df.isna().any(axis=1)
na_rows = df.index[rows_with_na]

na_class_counts = df.loc[na_rows, "Cover_Type"].value_counts()
all_class_counts = df["Cover_Type"].value_counts()
na_proportions = (na_class_counts / all_class_counts * 100).round(3)
print(na_proportions)

# From here on `df` only holds complete rows.
df = df.dropna()



Cover_Type
1    0.045
2    0.055
3    0.053
4    0.036
5    0.021
6    0.069
7    0.054
Name: count, dtype: float64


In [8]:
# Encode Soil_Type1 as 1 where the value is the string 'positive', else 0.
# Note: running this cell a second time compares the already-converted integers
# against 'positive' and therefore turns the whole column into 0.
df["Soil_Type1"] = df["Soil_Type1"].eq('positive').astype(int)

# Cell output: class balance of the encoded column.
df["Soil_Type1"].value_counts()

Soil_Type1
1    577981
0      3031
Name: count, dtype: int64

In [9]:
# Continuous (non-indicator) columns, grouped by theme.
continuous_features = [
    # terrain
    "Elevation", "Aspect", "Slope",
    # distances to water and roads
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    # hillshade indices
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    # distance to fire points
    "Horizontal_Distance_To_Fire_Points",
]

# # Plot distributions
# fig, axes = plt.subplots(len(continuous_features), 1, figsize=(5, 15))

# for i, col in enumerate(continuous_features):
#     sns.histplot(df[col], bins=50, kde=True, ax=axes[i], color="steelblue")
#     axes[i].set_title(f"Distribution of {col}")
#     axes[i].set_xlabel("")
#     axes[i].set_ylabel("Count")

# plt.tight_layout()
# plt.show()

## Feature Engineering

In [None]:
# Build the engineered frame: three aggregate features replace the eight raw
# columns they are derived from. `assign` works on a copy, so `df` is unchanged.
horizontal_distance_cols = [
    "Horizontal_Distance_To_Hydrology",
    "Horizontal_Distance_To_Fire_Points",
    "Horizontal_Distance_To_Roadways",
]
hillshade_cols = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

df_eng = (
    df.assign(
        # row-wise mean of the three horizontal distances
        Relative_hoz_dist_mean=df[horizontal_distance_cols].mean(axis=1),
        # row-wise sum of the three hillshade readings
        total_hillshade=df[hillshade_cols].sum(axis=1),
        # elevation minus the vertical distance to hydrology
        Relative_height_from_water=df["Elevation"] - df["Vertical_Distance_To_Hydrology"],
    )
    .drop(columns=["Elevation", "Vertical_Distance_To_Hydrology"]
                  + horizontal_distance_cols
                  + hillshade_cols)
)

## Train test

In [None]:
# Separate target and features for the raw and the engineered frame.
y = df["Cover_Type"]
X = df.drop(columns=["Cover_Type"])
y_eng = df_eng["Cover_Type"]
X_eng = df_eng.drop(columns=["Cover_Type"])

# Both splits use the same test fraction and the same seed.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
eX_train, eX_test, ey_train, ey_test = train_test_split(X_eng, y_eng, **split_options)



## PCA

In [None]:
# Keep as many components as are needed to explain 90% of the variance.
pca = PCA(n_components=0.9)

# Learn the projection from the training rows only, so that held-out rows do
# not influence the components (fitting on all of X leaks test information),
# then project every row of X with that fitted projection.
pca.fit(X_train)
X_pca = pca.transform(X)


## Tomek

In [None]:
# Tomek-link under-sampling of the raw and the engineered training split,
# using one shared sampler object.
tomek = TomekLinks(sampling_strategy='auto')
(X_train_t, y_train_t), (eX_train_t, ey_train_t) = [
    tomek.fit_resample(features, labels)
    for features, labels in ((X_train, y_train), (eX_train, ey_train))
]


In [None]:
# SMOTE over-sampling followed by Tomek-link cleaning of the training splits.
# The resampling must go through `smote`; calling the plain TomekLinks sampler
# here would only reproduce the `_t` datasets under a different name.
smote = SMOTETomek(random_state=42)
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)
eX_train_s, ey_train_s = smote.fit_resample(eX_train, ey_train)

### KNN dataset

In [None]:
# Raw-feature columns whose extreme values get clamped further down.
outlier_features = [
    "Horizontal_Distance_To_Hydrology", "Hillshade_Noon", "Vertical_Distance_To_Hydrology",
]

def clamp_series(s, lower_q=0.01, upper_q=0.99):
    """Clip a Series to the range spanned by two of its own quantiles.

    Parameters
    ----------
    s : pandas.Series
        Values to clamp.
    lower_q, upper_q : float
        Quantile levels used as the lower and upper clipping bound.

    Returns
    -------
    pandas.Series
        A new Series; values below the lower quantile are raised to it and
        values above the upper quantile are lowered to it.
    """
    bounds = s.quantile([lower_q, upper_q])
    return s.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

# Apply clamping to the resampled raw-feature training frames.
# The engineered frames (eX_train_s / eX_train_t) cannot be used here: the
# columns in `outlier_features` were dropped when the engineered features were
# built, so indexing them raises a KeyError.
for frame in (X_train_s, X_train_t):
    for col in outlier_features:
        frame[col] = clamp_series(frame[col], 0.01, 0.99)
    


# Columns of the raw-feature frames that get min-max scaled for KNN.
# "Aspect" is not listed: scale_knn encodes it as sine/cosine instead.
continuous_features = [
    "Elevation", "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Columns of the engineered frames that get min-max scaled for KNN.
# The target 'Cover_Type' must not be listed: it is removed from the feature
# frames before the train/test split, so selecting it raises a KeyError
# (and a label should not be rescaled in any case).
continuous_features_eng = [
    'Slope',
    'Relative_hoz_dist_mean',
    'total_hillshade',
    'Relative_height_from_water'
]

# Independent working copies for KNN, so that scaling does not touch the
# resampled training frames themselves.
knn_X_t, knn_X_s, knn_eX_t, knn_eX_s = (
    frame.copy() for frame in (X_train_t, X_train_s, eX_train_t, eX_train_s)
)


def scale_knn(knn_df, continuous_features = continuous_features):
    """Prepare a feature frame for KNN, modifying it in place.

    The columns in ``continuous_features`` are min-max scaled with a scaler
    fitted on ``knn_df`` itself, and ``Aspect`` (degrees) is replaced by the
    two columns ``Aspect_sin`` and ``Aspect_cos``.

    Parameters
    ----------
    knn_df : pandas.DataFrame
        Frame to transform; must contain ``Aspect`` and every column named in
        ``continuous_features``.
    continuous_features : list of str
        Columns to min-max scale.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.

    Notes
    -----
    The fitted scaler is not kept, so another frame (e.g. a test split)
    cannot be given the identical scaling through this function.
    """
    scaler = MinMaxScaler()
    knn_df[continuous_features] = scaler.fit_transform(knn_df[continuous_features])

    # Cyclic encoding of the angle; the radian values are only an intermediate.
    aspect_rad = np.deg2rad(knn_df["Aspect"])
    knn_df["Aspect_sin"] = np.sin(aspect_rad)
    knn_df["Aspect_cos"] = np.cos(aspect_rad)

    # Drop in place: callers ignore the return value, so rebinding the local
    # name to a new frame would leave "Aspect" in the caller's frame.
    knn_df.drop(columns=["Aspect"], inplace=True)
    return knn_df


# Scale every KNN working copy: raw-feature frames use the default column
# list, engineered frames use continuous_features_eng.
scale_knn(knn_X_t)
for engineered_frame in (knn_eX_s, knn_eX_t):
    scale_knn(engineered_frame, continuous_features_eng)
scale_knn(knn_X_s)


    


# Models

## KNN

In [None]:
def test_knn(X_train, y_train, X_test = X_test, y_test = y_test, weights = 'distance' , k = 5, metric = "euclidean"):
    """Fit a KNN classifier and print its train and test accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; they must have the same number of rows.
    X_test, y_test
        Evaluation features and labels. The defaults are the raw-feature test
        split as it exists when this function is defined.
    weights : str
        Neighbour weighting passed to ``KNeighborsClassifier``.
    k : int
        Number of neighbours.
    metric : str
        Distance metric passed to ``KNeighborsClassifier``.
    """
    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights=weights)
    knn.fit(X_train, y_train)
    # Accuracy is scored against the labels (not against the features again).
    train_acc = knn.score(X_train, y_train)
    test_acc = knn.score(X_test, y_test)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    
    

In [None]:
# Baseline: KNN on the raw training split, default settings and test split.
test_knn(X_train=X_train, y_train=y_train)

In [None]:
# Engineered features, distance-weighted KNN.
# Each resampled feature frame is paired with the labels produced by the same
# resampling call (the un-resampled ey_train has a different number of rows),
# and evaluation uses the engineered test split rather than the raw default.
# NOTE(review): in the visible cells eX_test is never passed through
# scale_knn, so it still differs from the training frames in scaling and in
# the Aspect encoding; confirm it is transformed the same way before this runs.
test_knn(knn_eX_s, ey_train_s, X_test=eX_test, y_test=ey_test)
test_knn(knn_eX_t, ey_train_t, X_test=eX_test, y_test=ey_test)

In [None]:
# Engineered features, uniformly weighted KNN.
# Each resampled feature frame is paired with the labels produced by the same
# resampling call (the un-resampled ey_train has a different number of rows),
# and evaluation uses the engineered test split rather than the raw default.
# NOTE(review): in the visible cells eX_test is never passed through
# scale_knn, so it still differs from the training frames in scaling and in
# the Aspect encoding; confirm it is transformed the same way before this runs.
test_knn(knn_eX_s, ey_train_s, X_test=eX_test, y_test=ey_test, weights='uniform')
test_knn(knn_eX_t, ey_train_t, X_test=eX_test, y_test=ey_test, weights='uniform')

In [None]:
# Raw features after resampling and scaling, default KNN settings.
# NOTE(review): the default test split is the unscaled X_test; in the visible
# cells it is never passed through scale_knn. Confirm this is intended.
test_knn(X_train=knn_X_s, y_train=y_train_s)
test_knn(X_train=knn_X_t, y_train=y_train_t)

In [None]:
# Same raw-feature datasets with larger neighbourhoods (k=10 and k=15).
test_knn(X_train=knn_X_s, y_train=y_train_s, k=10)
test_knn(X_train=knn_X_t, y_train=y_train_t, k=15)