In [14]:
import pandas as pd
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [15]:
# Load the cover-type dataset from a CSV file (path is relative to the working directory)
COV_TYPES_CSV = "cov_types.csv"
base_cov = pd.read_csv(COV_TYPES_CSV)
base_cov

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2767.0,66.0,17.0,210.0,18.0,1190.0,234.0,204.0,96.0,2251.0,2,30,Lodgepole Pine
1,2724.0,160.0,19.0,60.0,4.0,1350.0,236.0,240.0,127.0,2514.0,2,16,Lodgepole Pine
2,2360.0,65.0,7.0,127.0,21.0,1377.0,227.0,226.0,134.0,339.0,3,5,Ponderosa Pine
3,2995.0,45.0,4.0,285.0,30.0,5125.0,221.0,231.0,146.0,5706.0,0,11,Lodgepole Pine
4,2400.0,106.0,27.0,150.0,63.0,342.0,253.0,196.0,51.0,811.0,2,3,Ponderosa Pine
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2917.0,90.0,9.0,247.0,25.0,4095.0,235.0,225.0,121.0,3901.0,0,28,Lodgepole Pine
9996,3015.0,38.0,8.0,361.0,74.0,4846.0,220.0,223.0,138.0,1611.0,0,28,Lodgepole Pine
9997,3052.0,79.0,19.0,90.0,11.0,1003.0,241.0,203.0,85.0,1490.0,2,22,Spruce/Fir
9998,2958.0,58.0,6.0,319.0,19.0,2468.0,225.0,227.0,137.0,2280.0,0,28,Lodgepole Pine


In [16]:
# Remove the two columns that are not used as clustering features.
# NOTE: base_cov is rebound here, so re-running this cell alone raises KeyError
# (the columns are already gone) — re-run the load cell first.
unused_columns = ["Wilderness_Area", "Soil_Type"]
base_cov = base_cov.drop(columns=unused_columns)
base_cov

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type
0,2767.0,66.0,17.0,210.0,18.0,1190.0,234.0,204.0,96.0,2251.0,Lodgepole Pine
1,2724.0,160.0,19.0,60.0,4.0,1350.0,236.0,240.0,127.0,2514.0,Lodgepole Pine
2,2360.0,65.0,7.0,127.0,21.0,1377.0,227.0,226.0,134.0,339.0,Ponderosa Pine
3,2995.0,45.0,4.0,285.0,30.0,5125.0,221.0,231.0,146.0,5706.0,Lodgepole Pine
4,2400.0,106.0,27.0,150.0,63.0,342.0,253.0,196.0,51.0,811.0,Ponderosa Pine
...,...,...,...,...,...,...,...,...,...,...,...
9995,2917.0,90.0,9.0,247.0,25.0,4095.0,235.0,225.0,121.0,3901.0,Lodgepole Pine
9996,3015.0,38.0,8.0,361.0,74.0,4846.0,220.0,223.0,138.0,1611.0,Lodgepole Pine
9997,3052.0,79.0,19.0,90.0,11.0,1003.0,241.0,203.0,85.0,1490.0,Spruce/Fir
9998,2958.0,58.0,6.0,319.0,19.0,2468.0,225.0,227.0,137.0,2280.0,Lodgepole Pine


In [17]:
# Target variable: the cover-type label of every row, followed by the
# number of rows in each class (most frequent first)
y = base_cov.loc[:, "Cover_Type"]
y.value_counts()

Cover_Type
Lodgepole Pine       4847
Spruce/Fir           3714
Ponderosa Pine        581
Krummholz             362
Douglas-fir           278
Aspen                 163
Cottonwood/Willow      55
Name: count, dtype: int64

In [None]:
# Feature matrix: every remaining column except the target, as a NumPy array
feature_frame = base_cov.drop(columns="Cover_Type")
X = feature_frame.to_numpy()
X.shape

(10000, 10)

In [None]:
# Standardize the features: learn each column's mean and standard deviation,
# then rescale every column with them.
# NOTE: X is rebound to the scaled array, so re-running this cell on its own
# would refit the scaler on already-scaled data — re-run the feature-matrix
# cell first.
scaler_x = StandardScaler().fit(X)
X = scaler_x.transform(X)
X

array([[-0.71700375, -0.78934465,  0.38187624, ..., -0.96994893,
        -1.19597298,  0.19948581],
       [-0.87120088,  0.05516249,  0.6497092 , ...,  0.84361411,
        -0.38987904,  0.39804796],
       [-2.17649753, -0.79832877, -0.95728858, ...,  0.1383396 ,
        -0.20785782, -1.2440535 ],
       ...,
       [ 0.30500049, -0.67255111,  0.6497092 , ..., -1.02032568,
        -1.48200632, -0.37506096],
       [-0.03208161, -0.8612176 , -1.09120506, ...,  0.18871635,
        -0.12984873,  0.22138049],
       [-1.02181203, -0.56474169, -0.15378969, ..., -0.21429766,
        -0.88393662, -0.24595781]])

In [20]:
# Apply KMeans clustering to the standardized features (seeded for reproducibility)
N_CLUSTERS = 7  # presumably one cluster per Cover_Type class — confirm
km = KMeans(random_state=1, n_clusters=N_CLUSTERS)
km.fit(X)

In [21]:
# Cluster centres of the fitted KMeans model; they live in the same
# (standardized) feature space as the X the model was fitted on
centroids = km.cluster_centers_
centroids

array([[-0.29638   , -0.64869461, -0.36653759, -0.32522762, -0.42824874,
        -0.62769829,  0.55072083,  0.04763309, -0.36369826, -0.21396897],
       [-0.09901304, -0.17845554, -0.59259295,  0.06454506, -0.25186715,
         0.89230545,  0.23882474,  0.25083418,  0.0282891 ,  2.3424372 ],
       [ 0.37151224,  1.14823869, -0.29142083, -0.13731779, -0.27304327,
         0.12551443, -0.58200147,  0.69594178,  0.90315741, -0.30034968],
       [ 0.53873374, -0.60493221, -0.49455369, -0.23815786, -0.38834721,
         1.13572827,  0.52394559,  0.13888019, -0.28387876,  0.01165048],
       [-1.01038147,  1.33324381,  1.40266195, -0.23328241,  0.36717982,
        -0.54186367, -2.09643633, -0.31105117,  1.32970227, -0.50907718],
       [-0.51720783, -0.76904327,  1.34413278, -0.16468552,  0.24535341,
        -0.50521199,  0.57794063, -1.65572688, -1.58827671, -0.33989667],
       [ 0.74438479, -0.01869457, -0.02899445,  1.85534582,  1.89282891,
        -0.10979514,  0.01591413,  0.21642616

In [None]:
# Inverse transform the centroids to the original feature scale.
# The centres are read straight from the fitted model instead of from
# `centroids` itself, so re-running this cell does not apply the inverse
# transform a second time to already-unscaled values.
centroids = scaler_x.inverse_transform(km.cluster_centers_)
centroids

array([[2884.29674306,   81.65540812,   11.41133896,  201.96823482,
          21.69240048, 1375.35705669,  227.19219944,  224.19943707,
         128.00683554, 1703.3699236 ],
       [2939.33524684,  133.99655568,    9.72330654,  284.39380023,
          31.95522388, 3752.5717566 ,  218.88289323,  228.23306544,
         143.0815155 , 5089.3869116 ],
       [3070.54771979,  281.66760696,   11.9722614 ,  241.70568876,
          30.72308416, 2553.34649741,  197.01504466,  237.06864128,
         176.72637518, 1588.95674659],
       [3117.17974514,   86.52649229,   10.45539906,  220.38095238,
          24.01408451, 4133.27364185,  226.47887324,  226.01073105,
         131.07645875, 2002.20791415],
       [2685.18757192,  302.26006904,   24.62255466,  221.41196778,
          67.97468354, 1509.59838895,  156.66858458,  217.07940161,
         193.13003452, 1312.49252014],
       [2822.7158516 ,   68.25969646,   24.18549747,  235.91821248,
          60.88617201, 1566.91989882,  227.91736931,  190

In [None]:
# Cluster index assigned by the fitted KMeans model to each row of X
labels = km.labels_
labels

array([5, 0, 0, ..., 5, 0, 0])

In [None]:
# Determine the optimal number of clusters using the elbow method:
# fit one KMeans model per candidate cluster count and record its inertia
# (within-cluster sum of squares).
# Each candidate model is bound to its own name so the sweep does not
# overwrite `km`; anything that reads `km` afterwards still sees the model
# fitted earlier rather than the last model of this loop.
wcss = []
for n_clusters in range(1, 10):
    elbow_km = KMeans(n_clusters=n_clusters, random_state=1)
    elbow_km.fit(X)
    wcss.append(elbow_km.inertia_)

In [None]:
# Plot the elbow curve: within-cluster sum of squares against cluster count.
# The x values are derived from len(wcss) (counting from 1 cluster) so the
# axis stays in sync with however many models were fitted, and the figure
# carries its own axis labels and title.
fig = px.line(
    x=list(range(1, len(wcss) + 1)),
    y=wcss,
    labels={"x": "Number of clusters", "y": "WCSS (inertia)"},
    title="Elbow method for KMeans",
)
fig.show()

In [34]:
# Perform PCA for visualization: project the standardized features onto
# their first two principal components.
# random_state is fixed because, depending on the scikit-learn version, the
# default solver choice for this data shape can be the randomized SVD, which
# would otherwise make the projection vary between runs.
pca = PCA(n_components=2, random_state=1)
X_pca = pca.fit_transform(X)

In [36]:
# Plot the PCA projection, coloured by KMeans cluster.
# Cluster ids are nominal, so they are passed as strings: Plotly Express maps
# numeric colour values onto a continuous colour scale, which would wrongly
# suggest an ordering between clusters. category_orders keeps the legend in
# ascending cluster order instead of order of first appearance.
cluster_names = labels.astype(str)
cluster_order = [str(cluster_id) for cluster_id in sorted(set(labels))]
fig = px.scatter(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    color=cluster_names,
    category_orders={"color": cluster_order},
    labels={
        "x": "Principal component 1",
        "y": "Principal component 2",
        "color": "Cluster",
    },
    title="KMeans clusters in PCA space",
)
fig.show()

In [None]:
# Contingency table of the clustering result: one row per KMeans cluster id,
# one column per cover type, each cell counting the rows that fall in both
pd.crosstab(index=labels, columns=y)

Cover_Type,Aspen,Cottonwood/Willow,Douglas-fir,Krummholz,Lodgepole Pine,Ponderosa Pine,Spruce/Fir
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,69,20,80,30,1394,164,732
1,5,0,0,3,613,0,250
2,25,1,16,85,957,30,1012
3,2,0,0,95,634,1,758
4,8,9,119,8,354,196,175
5,43,24,61,50,479,172,357
6,11,1,2,91,416,18,430
