In [1]:
import sys
sys.path.append('../')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Load the raw Covertype table from the sibling `datasets` directory,
# resolved relative to the kernel's current working directory.
raw_csv_path = f'{Path.cwd()}/../datasets/Covtype_original.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [4]:
# Keep the target as a one-column DataFrame (not a Series).
target_df = df.loc[:, ['Cover_Type']]

In [5]:
# Map the class labels onto consecutive integer codes 0..n_classes-1.
# `le` is kept so the original labels can be recovered via `le.classes_`.
le = LabelEncoder()
raw_labels = target_df['Cover_Type'].to_numpy()
target_encoded = le.fit_transform(raw_labels)

# Wrap the codes in a frame that reuses the original column name.
target_encoded_df = pd.DataFrame({'Cover_Type': target_encoded})

In [7]:
# Work on an independent (deep, the default) copy so the raw frame stays untouched.
df_to_save = df.copy()

In [8]:
# Swap the target column for its integer-encoded version (aligned on the row index).
df_to_save = df_to_save.assign(Cover_Type=target_encoded_df['Cover_Type'])

In [None]:
# Show the cells where the encoded copy differs from the raw frame.
df.compare(df_to_save)

In [14]:
# Columns that are standardised later in the notebook.
numerical_columns = [
    # terrain
    "Elevation",
    "Aspect",
    "Slope",
    # distances
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    # hillshade
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    # distances (continued; original column order is preserved)
    "Horizontal_Distance_To_Fire_Points",
]

In [None]:
# Separate predictors from the (encoded) target before splitting.
feature_frame = df_to_save.drop(columns="Cover_Type")
label_frame = df_to_save[["Cover_Type"]]

# Hold out 50,000 rows, stratified on the target so the class proportions
# of the full data are preserved in the held-out part. Fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    label_frame,
    test_size=50000,
    random_state=42,
    stratify=label_frame.to_numpy().ravel(),
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [12]:
# Re-attach the labels to the held-out features (column-wise, aligned on the index).
merged_df = pd.concat((X_test, y_test), axis="columns")

In [None]:
# Class counts in the held-out labels (value_counts orders by frequency).
y_test['Cover_Type'].value_counts()

In [None]:
# Class counts in the merged frame, ordered by class label.
merged_df["Cover_Type"].value_counts().sort_index()

In [None]:
# The same counts taken from y_test directly, ordered by class label,
# for comparison with the merged frame.
y_test['Cover_Type'].value_counts().sort_index()

In [25]:
# Standardise each numerical feature on its own: every column gets a freshly
# fitted StandardScaler, so no statistics are shared between features.
for col in numerical_columns:
    column_2d = merged_df[col].to_numpy().reshape(-1, 1)  # scikit-learn expects 2-D input
    _scaler = StandardScaler().fit(column_2d)
    merged_df[col] = _scaler.transform(column_2d)

In [None]:
# Preview the first rows of merged_df.
merged_df.head()

In [18]:
# Persist the sample next to the raw data, without the index column.
# NOTE(review): the execution counts show this cell (In [18]) was last run
# before the scaling cell (In [25]); run the notebook top to bottom to be
# sure the file on disk contains the scaled values.
preprocessed_csv_path = f'{Path.cwd()}/../datasets/Covtype_preprocessed.csv'
merged_df.to_csv(preprocessed_csv_path, index=False)

Correlation matrix

In [None]:
# Pairwise Pearson correlations between the columns of the raw frame.
# Only numeric/boolean columns are kept: pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr() instead of silently dropping them.
# NOTE(review): if Cover_Type is stored as text it is left out here; use
# df_to_save instead of df to include the encoded target.
corr = df.select_dtypes(include=["number", "bool"]).corr()

# Generate a mask for the upper triangle (diagonal included): the matrix is
# symmetric, so that half would only repeat the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, explicitly on the
# axes created above rather than on whatever the current axes happen to be.
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Correlation matrix")

# Render the figure without echoing the Axes repr as cell output.
plt.show()