In [1]:
# Importing Important Libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE # T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
# Importing the dataset
df = pd.read_csv('https://raw.githubusercontent.com/Hackveda/Data-Analyst-Course-by-Mr-Uttam/main/covtype_Data.csv', index_col = 0)
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [3]:
# Getting the list of columns
print(df.columns)

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

In [4]:
# Getting unique values of target variable
df['Cover_Type'].unique()

array([5, 2, 1, 7, 3, 6, 4])

In [5]:
# Creating the backup
X = df.copy()

In [6]:
# Checking null values across columns
df.isnull().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0
Soil_Type11                           0


In [7]:
# Adding a Feature
X["Distance_To_Hydrology"] = ((X["Horizontal_Distance_To_Hydrology"] ** 2) + (X["Vertical_Distance_To_Hydrology"] ** 2)) ** (0.5)

In [8]:
# Dropping irrelevant columns
X.drop(["Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology"], axis=1, inplace=True)

In [9]:
# head()
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Distance_To_Hydrology
0,2596,51,3,510,221,232,148,6279,1,0,...,0,0,0,0,0,0,0,0,5,258.0
1,2590,56,2,390,220,235,151,6225,1,0,...,0,0,0,0,0,0,0,0,5,212.084889
2,2804,139,9,3180,234,238,135,6121,1,0,...,0,0,0,0,0,0,0,0,2,275.769832
3,2785,155,18,3090,238,238,122,6211,1,0,...,0,0,0,0,0,0,0,0,2,269.235956
4,2595,45,2,391,220,234,150,6172,1,0,...,0,0,0,0,0,0,0,0,5,153.003268


In [10]:
# Converting numerical values into categorical
X['Cover_Type'].replace({1:'Spruce/Fir', 2:'Lodgepole Pine', 3:'Ponderosa Pine', 4:'Cottonwood/Willow', 5:'Aspen', 6:'Douglas-fir', 7:'Krummholz'}, inplace=True)

In [11]:
# head()
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Distance_To_Hydrology
0,2596,51,3,510,221,232,148,6279,1,0,...,0,0,0,0,0,0,0,0,Aspen,258.0
1,2590,56,2,390,220,235,151,6225,1,0,...,0,0,0,0,0,0,0,0,Aspen,212.084889
2,2804,139,9,3180,234,238,135,6121,1,0,...,0,0,0,0,0,0,0,0,Lodgepole Pine,275.769832
3,2785,155,18,3090,238,238,122,6211,1,0,...,0,0,0,0,0,0,0,0,Lodgepole Pine,269.235956
4,2595,45,2,391,220,234,150,6172,1,0,...,0,0,0,0,0,0,0,0,Aspen,153.003268


In [12]:
# Getting data types of columns in X
X.dtypes

Elevation                               int64
Aspect                                  int64
Slope                                   int64
Horizontal_Distance_To_Roadways         int64
Hillshade_9am                           int64
Hillshade_Noon                          int64
Hillshade_3pm                           int64
Horizontal_Distance_To_Fire_Points      int64
Wilderness_Area1                        int64
Wilderness_Area2                        int64
Wilderness_Area3                        int64
Wilderness_Area4                        int64
Soil_Type1                              int64
Soil_Type2                              int64
Soil_Type3                              int64
Soil_Type4                              int64
Soil_Type5                              int64
Soil_Type6                              int64
Soil_Type7                              int64
Soil_Type8                              int64
Soil_Type9                              int64
Soil_Type10                       

In [13]:
#We use pandas's 'get_dummies()' method
X = pd.get_dummies(X)
X.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type39,Soil_Type40,Distance_To_Hydrology,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir
0,2596,51,3,510,221,232,148,6279,1,0,...,0,0,258.0,1,0,0,0,0,0,0
1,2590,56,2,390,220,235,151,6225,1,0,...,0,0,212.084889,1,0,0,0,0,0,0
2,2804,139,9,3180,234,238,135,6121,1,0,...,0,0,275.769832,0,0,0,0,1,0,0
3,2785,155,18,3090,238,238,122,6211,1,0,...,0,0,269.235956,0,0,0,0,1,0,0
4,2595,45,2,391,220,234,150,6172,1,0,...,0,0,153.003268,1,0,0,0,0,0,0


In [14]:
#numer is the DataFrame that holds all of X's numerical variables
numer = X[["Elevation","Aspect","Slope","Horizontal_Distance_To_Roadways","Hillshade_9am","Hillshade_Noon","Hillshade_3pm","Horizontal_Distance_To_Fire_Points","Distance_To_Hydrology"]]

In [15]:
#cater is the DataFrame that holds all of X's categorical variables
cater = X[["Wilderness_Area1","Wilderness_Area2","Wilderness_Area3","Wilderness_Area4","Soil_Type1","Soil_Type2","Soil_Type3","Soil_Type4","Soil_Type5","Soil_Type6","Soil_Type7","Soil_Type8","Soil_Type9","Soil_Type10","Soil_Type11","Soil_Type12","Soil_Type13","Soil_Type14","Soil_Type15","Soil_Type16","Soil_Type17","Soil_Type18","Soil_Type19","Soil_Type20","Soil_Type21","Soil_Type22","Soil_Type23","Soil_Type24","Soil_Type25","Soil_Type26","Soil_Type27","Soil_Type28","Soil_Type29","Soil_Type30","Soil_Type31","Soil_Type32","Soil_Type33","Soil_Type34","Soil_Type35","Soil_Type36","Soil_Type37","Soil_Type38","Soil_Type39","Soil_Type40","Cover_Type_Aspen","Cover_Type_Cottonwood/Willow","Cover_Type_Douglas-fir","Cover_Type_Krummholz","Cover_Type_Lodgepole Pine","Cover_Type_Ponderosa Pine","Cover_Type_Spruce/Fir"]]

In [16]:
numer.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Distance_To_Hydrology
0,2596,51,3,510,221,232,148,6279,258.0
1,2590,56,2,390,220,235,151,6225,212.084889
2,2804,139,9,3180,234,238,135,6121,275.769832
3,2785,155,18,3090,238,238,122,6211,269.235956
4,2595,45,2,391,220,234,150,6172,153.003268


In [17]:
cater.head()

Unnamed: 0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,...,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [18]:
# Initializing Scaler
scaler = StandardScaler()

In [19]:
# Scaling
numer = pd.DataFrame(scaler.fit_transform(numer))
numer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.473738,-0.824599,-1.40073,-1.699341,0.177018,0.437966,0.24583,2.055236,0.016471
1,-1.502714,-0.778347,-1.56163,-1.770138,0.129943,0.626714,0.341543,2.022805,-0.221737
2,-0.469223,-0.010552,-0.435334,-0.124101,0.788986,0.815461,-0.168927,1.960347,0.108661
3,-0.560982,0.137456,1.012761,-0.177199,0.977283,0.815461,-0.583684,2.014397,0.074763
4,-1.478567,-0.880102,-1.56163,-1.769548,0.129943,0.563798,0.309638,1.990976,-0.528253


In [20]:
# Renaming the columns
numer.columns = ["Elevation_Scaled","Aspect_Scaled",
                 "Slope_Scaled","Horizontal_Distance_To_Roadways_Scaled",
                 "Hillshade_9am_Scaled","Hillshade_Noon_Scaled",
                 "Hillshade_3pm_Scaled","Horizontal_Distance_To_Fire_Points_Scaled",
                 "Distance_To_Hydrology_Scaled"]

In [21]:
numer.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled
0,-1.473738,-0.824599,-1.40073,-1.699341,0.177018,0.437966,0.24583,2.055236,0.016471
1,-1.502714,-0.778347,-1.56163,-1.770138,0.129943,0.626714,0.341543,2.022805,-0.221737
2,-0.469223,-0.010552,-0.435334,-0.124101,0.788986,0.815461,-0.168927,1.960347,0.108661
3,-0.560982,0.137456,1.012761,-0.177199,0.977283,0.815461,-0.583684,2.014397,0.074763
4,-1.478567,-0.880102,-1.56163,-1.769548,0.129943,0.563798,0.309638,1.990976,-0.528253


In [22]:
# Concatenating numer and cater
X = pd.concat([numer, cater], axis = 1, join = 'inner')
X.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir
0,-1.473738,-0.824599,-1.40073,-1.699341,0.177018,0.437966,0.24583,2.055236,0.016471,1,...,0,0,0,1,0,0,0,0,0,0
1,-1.502714,-0.778347,-1.56163,-1.770138,0.129943,0.626714,0.341543,2.022805,-0.221737,1,...,0,0,0,1,0,0,0,0,0,0
2,-0.469223,-0.010552,-0.435334,-0.124101,0.788986,0.815461,-0.168927,1.960347,0.108661,1,...,0,0,0,0,0,0,0,1,0,0
3,-0.560982,0.137456,1.012761,-0.177199,0.977283,0.815461,-0.583684,2.014397,0.074763,1,...,0,0,0,0,0,0,0,1,0,0
4,-1.478567,-0.880102,-1.56163,-1.769548,0.129943,0.563798,0.309638,1.990976,-0.528253,1,...,0,0,0,1,0,0,0,0,0,0


In [23]:
# Initializing kmeans
kmeans = KMeans(n_clusters = 3)

In [24]:
# Fitting our model
kmeans.fit(X)

KMeans(n_clusters=3)

In [26]:
# Find out the cluster on same dataset
clusters = kmeans.predict(X)

In [27]:
clusters

array([0, 0, 2, ..., 2, 2, 1], dtype=int32)

In [28]:
# Adding the cluster column to X
X['cluster'] = clusters

In [29]:
X.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir,cluster
0,-1.473738,-0.824599,-1.40073,-1.699341,0.177018,0.437966,0.24583,2.055236,0.016471,1,...,0,0,1,0,0,0,0,0,0,0
1,-1.502714,-0.778347,-1.56163,-1.770138,0.129943,0.626714,0.341543,2.022805,-0.221737,1,...,0,0,1,0,0,0,0,0,0,0
2,-0.469223,-0.010552,-0.435334,-0.124101,0.788986,0.815461,-0.168927,1.960347,0.108661,1,...,0,0,0,0,0,0,1,0,0,2
3,-0.560982,0.137456,1.012761,-0.177199,0.977283,0.815461,-0.583684,2.014397,0.074763,1,...,0,0,0,0,0,0,1,0,0,2
4,-1.478567,-0.880102,-1.56163,-1.769548,0.129943,0.563798,0.309638,1.990976,-0.528253,1,...,0,0,1,0,0,0,0,0,0,0


In [30]:
print(X.shape)

(180000, 61)


In [33]:
np.random.seed(3)

In [48]:
#plotX is a DataFrame containing 5000 values sampled randomly from X
plotX = pd.DataFrame(np.array(X.sample(5000)))

#Rename plotX's columns since it was briefly converted to an np.array above
plotX.columns = X.columns
plotX.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir,cluster
0,-0.425759,-1.231623,-0.596233,-0.560684,-0.293727,-0.002444,0.341543,-0.210679,-0.881054,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.858861,-0.769096,-0.596233,0.957918,0.41239,-0.128276,-0.26464,-0.430484,-1.322035,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
2,-1.05358,-0.297319,-0.757133,-1.304646,0.694837,0.437966,-0.232736,-0.07195,1.396604,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.119963,-1.213122,-0.274435,0.729596,-0.387876,-0.254107,0.277734,-0.578222,0.094139,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
4,1.158284,0.970005,0.047364,-1.925302,-1.093993,1.507534,1.649621,0.15266,0.098835,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [32]:
print(plotX.shape)

(5000, 61)


In [34]:
# PCA
pca_1d = PCA(n_components = 1)
pca_2d = PCA(n_components = 2)
pca_3d = PCA(n_components = 3)

In [37]:
pd.DataFrame(pca_1d.fit_transform(plotX.drop(['cluster'], axis = 1)))

Unnamed: 0,0
0,-2.155380
1,0.532802
2,0.713658
3,-0.981947
4,-0.317682
...,...
4995,-0.227473
4996,0.549225
4997,1.819697
4998,0.889315


In [38]:
pd.DataFrame(pca_2d.fit_transform(plotX.drop(['cluster'], axis = 1)))

Unnamed: 0,0,1
0,-2.155380,-0.221815
1,0.532802,-0.493686
2,0.713658,-0.045606
3,-0.981947,3.879404
4,-0.317682,-0.602063
...,...,...
4995,-0.227473,-1.242969
4996,0.549225,-1.237549
4997,1.819697,-0.922404
4998,0.889315,-0.774393


In [39]:
pd.DataFrame(pca_3d.fit_transform(plotX.drop(['cluster'], axis = 1)))

Unnamed: 0,0,1,2
0,-2.155380,-0.221815,0.148445
1,0.532802,-0.493686,-0.349625
2,0.713658,-0.045606,-1.091859
3,-0.981947,3.879404,1.191879
4,-0.317682,-0.602063,-0.651617
...,...,...,...
4995,-0.227473,-1.242969,1.864968
4996,0.549225,-1.237549,-0.037399
4997,1.819697,-0.922404,2.205066
4998,0.889315,-0.774393,0.498390


In [41]:
#This DataFrame holds that single principal component mentioned above
PCs_1d = pd.DataFrame(pca_1d.fit_transform(plotX.drop(["cluster"], axis=1)))

#This DataFrame contains the two principal components that will be used
#for the 2-D visualization mentioned above
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(["cluster"], axis=1)))

#And this DataFrame contains three principal components that will aid us
#in visualizing our clusters in 3-D
PCs_3d = pd.DataFrame(pca_3d.fit_transform(plotX.drop(["cluster"], axis=1)))

In [42]:
PCs_1d.head()

Unnamed: 0,0
0,-2.15538
1,0.532802
2,0.713658
3,-0.981947
4,-0.317682


In [43]:
PCs_2d.head()

Unnamed: 0,0,1
0,-2.15538,-0.221815
1,0.532802,-0.493686
2,0.713658,-0.045606
3,-0.981947,3.879404
4,-0.317682,-0.602063


In [44]:
PCs_1d.columns = ["PC1_1d"]

#"PC1_2d" means: 'The first principal component of the components created for 2-D visualization, by PCA.'
#And "PC2_2d" means: 'The second principal component of the components created for 2-D visualization, by PCA.'

PCs_2d.columns = ["PC1_2d", "PC2_2d"]
PCs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]

In [45]:
PCs_3d.head()

Unnamed: 0,PC1_3d,PC2_3d,PC3_3d
0,-2.15538,-0.221815,0.148445
1,0.532802,-0.493686,-0.349625
2,0.713658,-0.045606,-1.091859
3,-0.981947,3.879404,1.191879
4,-0.317682,-0.602063,-0.651617


In [49]:
plotX = pd.concat([plotX, PCs_1d, PCs_2d, PCs_3d], axis = 1, join = 'inner')
plotX.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir,cluster,PC1_1d,PC1_2d,PC2_2d,PC1_3d,PC2_3d,PC3_3d
0,-0.425759,-1.231623,-0.596233,-0.560684,-0.293727,-0.002444,0.341543,-0.210679,-0.881054,1.0,...,1.0,0.0,0.0,0.0,-2.15538,-2.15538,-0.221815,-2.15538,-0.221815,0.148445
1,0.858861,-0.769096,-0.596233,0.957918,0.41239,-0.128276,-0.26464,-0.430484,-1.322035,1.0,...,0.0,0.0,1.0,2.0,0.532802,0.532802,-0.493686,0.532802,-0.493686,-0.349625
2,-1.05358,-0.297319,-0.757133,-1.304646,0.694837,0.437966,-0.232736,-0.07195,1.396604,1.0,...,1.0,0.0,0.0,0.0,0.713658,0.713658,-0.045606,0.713658,-0.045606,-1.091859
3,0.119963,-1.213122,-0.274435,0.729596,-0.387876,-0.254107,0.277734,-0.578222,0.094139,1.0,...,1.0,0.0,0.0,2.0,-0.981947,-0.981947,3.879404,-0.981947,3.879404,1.191879
4,1.158284,0.970005,0.047364,-1.925302,-1.093993,1.507534,1.649621,0.15266,0.098835,0.0,...,1.0,0.0,0.0,1.0,-0.317682,-0.317682,-0.602063,-0.317682,-0.602063,-0.651617


In [50]:
plotX['dummy'] = 0
plotX.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir,cluster,PC1_1d,PC1_2d,PC2_2d,PC1_3d,PC2_3d,PC3_3d,dummy
0,-0.425759,-1.231623,-0.596233,-0.560684,-0.293727,-0.002444,0.341543,-0.210679,-0.881054,1.0,...,0.0,0.0,0.0,-2.15538,-2.15538,-0.221815,-2.15538,-0.221815,0.148445,0
1,0.858861,-0.769096,-0.596233,0.957918,0.41239,-0.128276,-0.26464,-0.430484,-1.322035,1.0,...,0.0,1.0,2.0,0.532802,0.532802,-0.493686,0.532802,-0.493686,-0.349625,0
2,-1.05358,-0.297319,-0.757133,-1.304646,0.694837,0.437966,-0.232736,-0.07195,1.396604,1.0,...,0.0,0.0,0.0,0.713658,0.713658,-0.045606,0.713658,-0.045606,-1.091859,0
3,0.119963,-1.213122,-0.274435,0.729596,-0.387876,-0.254107,0.277734,-0.578222,0.094139,1.0,...,0.0,0.0,2.0,-0.981947,-0.981947,3.879404,-0.981947,3.879404,1.191879,0
4,1.158284,0.970005,0.047364,-1.925302,-1.093993,1.507534,1.649621,0.15266,0.098835,0.0,...,0.0,0.0,1.0,-0.317682,-0.317682,-0.602063,-0.317682,-0.602063,-0.651617,0


In [52]:
#Note that all of the DataFrames below are sub-DataFrames of 'plotX'.
#This is because we intend to plot the values contained within each of these DataFrames.

cluster0 = plotX[plotX["cluster"] == 0]
cluster1 = plotX[plotX["cluster"] == 1]
cluster2 = plotX[plotX["cluster"] == 2]

In [53]:
# This is essential to make sure that your Jupyter Notebook is in connected mode.
init_notebook_mode(connected = True)

In [54]:
#Instructions for building the 1-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_1d"],
                    y = cluster0["dummy"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_1d"],
                    y = cluster1["dummy"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["PC1_1d"],
                    y = cluster2["dummy"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in One Dimension Using PCA"
layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= '',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [55]:
#Instructions for building the 2-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_2d"],
                    y = cluster0["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_2d"],
                    y = cluster1["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["PC1_2d"],
                    y = cluster2["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using PCA"
layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [56]:
#Instructions for building the 3-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter3d(
                    x = cluster0["PC1_3d"],
                    y = cluster0["PC2_3d"],
                    z = cluster0["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter3d(
                    x = cluster1["PC1_3d"],
                    y = cluster1["PC2_3d"],
                    z = cluster1["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter3d(
                    x = cluster2["PC1_3d"],
                    y = cluster2["PC2_3d"],
                    z = cluster2["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)
data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Three Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [58]:
# Set our perplexity
perplexity = 50

In [59]:
#T-SNE with one dimension
tsne_1d = TSNE(n_components=1, perplexity=perplexity)

#T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity)

#T-SNE with three dimensions
tsne_3d = TSNE(n_components=3, perplexity=perplexity)

In [60]:
#plotX will hold the values we wish to plot
plotX = pd.DataFrame(np.array(X.sample(5000)))
plotX.columns = X.columns
plotX.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Soil_Type39,Soil_Type40,Cover_Type_Aspen,Cover_Type_Cottonwood/Willow,Cover_Type_Douglas-fir,Cover_Type_Krummholz,Cover_Type_Lodgepole Pine,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir,cluster
0,-0.164971,-0.621087,-0.435334,0.132539,0.647762,-0.128276,-0.519875,1.874467,0.820989,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
1,1.684689,1.987563,-1.40073,1.310724,-0.105429,0.563798,0.532969,-0.097173,1.782825,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.110305,1.969062,-0.274435,-1.278097,-0.670323,-0.128276,0.564873,-0.755988,-0.543816,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.65757,-1.166869,0.047364,-1.176621,-0.434951,-0.694518,0.022499,-1.011827,0.452385,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.628593,-0.537833,-0.596233,-1.050366,0.694837,-0.002444,-0.456066,-0.191461,0.080839,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
#This DataFrame holds a single dimension,built by T-SNE
TCs_1d = pd.DataFrame(tsne_1d.fit_transform(plotX.drop(["cluster"], axis=1)))

#This DataFrame contains two dimensions, built by T-SNE
TCs_2d = pd.DataFrame(tsne_2d.fit_transform(plotX.drop(["cluster"], axis=1)))

#And this DataFrame contains three dimensions, built by T-SNE
TCs_3d = pd.DataFrame(tsne_3d.fit_transform(plotX.drop(["cluster"], axis=1)))

In [63]:
TCs_1d.columns = ["TC1_1d"]

PCs_1d.columns = ["PC1_1d"]

#"TC1_2d" means: 'The first component of the components created for 2-D visualization, by T-SNE.'
#And "TC2_2d" means: 'The second component of the components created for 2-D visualization, by T-SNE.'
TCs_2d.columns = ["TC1_2d","TC2_2d"]

TCs_3d.columns = ["TC1_3d","TC2_3d","TC3_3d"]

In [64]:
plotX = pd.concat([plotX,TCs_1d,TCs_2d,TCs_3d], axis=1, join='inner')

In [65]:
plotX['dummy'] = 0

In [71]:
plotX.head()

Unnamed: 0,Elevation_Scaled,Aspect_Scaled,Slope_Scaled,Horizontal_Distance_To_Roadways_Scaled,Hillshade_9am_Scaled,Hillshade_Noon_Scaled,Hillshade_3pm_Scaled,Horizontal_Distance_To_Fire_Points_Scaled,Distance_To_Hydrology_Scaled,Wilderness_Area1,...,Cover_Type_Ponderosa Pine,Cover_Type_Spruce/Fir,cluster,TC1_1d,TC1_2d,TC2_2d,TC1_3d,TC2_3d,TC3_3d,dummy
0,-0.164971,-0.621087,-0.435334,0.132539,0.647762,-0.128276,-0.519875,1.874467,0.820989,1.0,...,0.0,1.0,2.0,22.446621,-6.970609,-16.709356,-8.99149,14.970486,1.717221,0
1,1.684689,1.987563,-1.40073,1.310724,-0.105429,0.563798,0.532969,-0.097173,1.782825,1.0,...,0.0,0.0,1.0,42.482433,15.645441,-41.133461,-5.70573,2.089988,-10.315873,0
2,0.110305,1.969062,-0.274435,-1.278097,-0.670323,-0.128276,0.564873,-0.755988,-0.543816,1.0,...,0.0,0.0,1.0,69.032593,-29.133841,-44.246025,4.149238,4.738729,-11.903035,0
3,-0.65757,-1.166869,0.047364,-1.176621,-0.434951,-0.694518,0.022499,-1.011827,0.452385,1.0,...,0.0,0.0,0.0,-49.517193,-41.802177,14.697855,9.455491,9.968051,5.504207,0
4,-0.628593,-0.537833,-0.596233,-1.050366,0.694837,-0.002444,-0.456066,-0.191461,0.080839,1.0,...,0.0,0.0,0.0,-46.909973,-39.884995,9.207149,6.847211,11.057826,2.809161,0


In [66]:
# Seprating rows based on cluster
cluster0 = plotX[plotX["cluster"] == 0]
cluster1 = plotX[plotX["cluster"] == 1]
cluster2 = plotX[plotX["cluster"] == 2]

In [67]:
#Instructions for building the 1-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["TC1_1d"],
                    y = cluster0["dummy"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["TC1_1d"],
                    y = cluster1["dummy"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["TC1_1d"],
                    y = cluster2["dummy"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in One Dimension Using T-SNE (perplexity=" + str(perplexity) + ")"
layout = dict(title = title,
              xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= '',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [68]:
trace1 = go.Scatter(
                    x = cluster0["TC1_2d"],
                    y = cluster0["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["TC1_2d"],
                    y = cluster1["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["TC1_2d"],
                    y = cluster2["TC2_2d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using T-SNE (perplexity=" + str(perplexity) + ")"
layout = dict(title = title,
              xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [69]:
#Instructions for building the 3-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter3d(
                    x = cluster0["TC1_3d"],
                    y = cluster0["TC2_3d"],
                    z = cluster0["TC3_3d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter3d(
                    x = cluster1["TC1_3d"],
                    y = cluster1["TC2_3d"],
                    z = cluster1["TC3_3d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter3d(
                    x = cluster2["TC1_3d"],
                    y = cluster2["TC2_3d"],
                    z = cluster2["TC3_3d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)
data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Three Dimensions Using T-SNE (perplexity=" + str(perplexity) + ")"

layout = dict(title = title,
              xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)