In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.graph_objs as go
import plotly.figure_factory as ff


**These libraries help us in analyzing and visualizing data**

In [None]:
 
# Location of the municipalities CSV (Colab upload path); change it here if the
# file lives elsewhere.
DATA_PATH = "/content/Municipalities (1).csv"

# Load the raw table and take a first look at it.
dataset = pd.read_csv(DATA_PATH)
print(dataset.head())      # first five rows
print(dataset.shape)       # (rows, columns)
dataset.info()             # info() prints its own report and returns None,
                           # so wrapping it in print() would emit a stray "None"
print(dataset.describe())  # summary statistics of the numeric columns

           tc        area    pop        pden          wden  urb      paper  \
0  502.250000  283.750000  22648   79.816742  119671.47660    2   4.363508   
1  228.050003   34.439999   4952  143.786301   70030.98438    3   9.887817   
2  268.010010   26.620001   3895  146.318558   81116.52344    3  11.991079   
3  199.089996   84.300003   7140   84.697502   43320.46094    3   9.762878   
4  233.639999   35.700001  12193  341.540619  201565.26560    2   6.601569   

      glass     metal   plastic      msw_so    msw_un         msw    sor  geo  \
0  3.592508  0.462317  1.131815  20396261.0  13560520  33956781.0  60.07    3   
1  9.518352  1.860965  4.643623   1831407.0    580460   2411867.0  75.93    3   
2  6.653014  0.744725  5.224834   1694922.0    464400   2159322.0  78.49    3   
3  7.551381  0.746540  5.202531   2881055.0    770860   3651915.0  78.89    3   
4  4.334883  0.103101  5.120555   3026700.0   4169180   7195880.0  42.06    1   

   roads  s_wteregio  s_landfill  
0  285.0 

**Importing the dataset and examining its features and description**

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3642 entries, 0 to 3641
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tc          3642 non-null   float64
 1   area        3642 non-null   float64
 2   pop         3642 non-null   int64  
 3   pden        3642 non-null   float64
 4   wden        3642 non-null   float64
 5   urb         3642 non-null   int64  
 6   paper       3642 non-null   float64
 7   glass       3642 non-null   float64
 8   metal       3642 non-null   float64
 9   plastic     3642 non-null   float64
 10  msw_so      3642 non-null   float64
 11  msw_un      3642 non-null   int64  
 12  msw         3642 non-null   float64
 13  sor         3642 non-null   float64
 14  geo         3642 non-null   int64  
 15  roads       3642 non-null   float64
 16  s_wteregio  3642 non-null   float64
 17  s_landfill  3642 non-null   float64
dtypes: float64(14), int64(4)
memory usage: 512.3 KB


**This is to drop the 'geo' column from the dataset as it is categorical and cannot be used in PCA**

In [None]:
# Remove the 'geo' column before the numeric analysis. errors='ignore' keeps the
# cell idempotent: because `dataset` is reassigned, re-running this cell would
# otherwise raise KeyError once 'geo' is already gone.
dataset = dataset.drop(columns='geo', errors='ignore')

**Standardize the features of the dataset using StandardScaler from sklearn.preprocessing**

In [None]:
# Standardize every remaining column: learn the per-column statistics first,
# then apply them to the same data.
# NOTE(review): 'urb' is still among the scaled features and is later used to
# colour the scatter plots; confirm that this is intended.
scaler = StandardScaler()
scaler.fit(dataset)
data_scaled = scaler.transform(dataset)

**This step performs PCA to reduce the dimensionality of the municipalities dataset; it also reports the proportion of variance explained by each of the components involved**

In [None]:
# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_scaled)

# Share of the total variance carried by each retained component, and their sum.
explained_ratio = pca.explained_variance_ratio_
print("Variance explained by each of the n_components: ", explained_ratio)
print("Total variance explained by the n_components: ", sum(explained_ratio))


Variance explained by each of the n_components:  [0.28440736 0.15746944]
Total variance explained by the n_components:  0.4418768031256871


**This creates a scatter plot to visualize the data**

In [None]:
# Marker styling: points are coloured by the 'urb' column, with a colour bar.
pca_marker = dict(
    color=dataset['urb'],
    size=5,
    opacity=0.8,
    showscale=True,
    colorscale='Plasma',
)

# One point per row, positioned by its two principal-component scores.
pca_trace = go.Scatter(
    x=data_pca[:, 0],
    y=data_pca[:, 1],
    mode='markers',
    marker=pca_marker,
    text=dataset.index,  # hover text: row index of each point
)

fig = go.Figure(data=[pca_trace])
fig.update_layout(
    title='PCA Visualization of the Municipalities Dataset',
    xaxis_title='Principal Component 1',
    yaxis_title='Principal Component 2',
)

fig.show()

**In the scatter plot obtained, each point represents a municipality, colored by its urbanization index (1 is low, 3 is high). The x and y coordinates of the points correspond to the first and second principal components obtained from PCA.**

**There are distinct clusters visible in the plot; this indicates that the waste produced and the urbanization index are highly correlated and can be used to group municipalities into different categories**

**If the clusters are not distinct, it might suggest that waste production and urbanization are not strongly related, and other variables may need to be considered to clearly understand the patterns in the data**

**Installing the umap-learn package**

In [None]:
!pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82830 sha256=27a1412992f23c8251a97a8b0a743cad1a67c135ce26a7949f268b0bfde59a4f
  Stored in directory: /root/.cache/pip/wheels/f4/3e/1c/596d0a463d17475af648688443fa4846fef624d1390339e7e9
  Buil

**Standardize the features of the dataset using StandardScaler from sklearn.preprocessing.**

In [None]:
# Standardize the features again ahead of UMAP. This repeats the scaling done
# before PCA on the same `dataset`, so `data_scaled` holds the same values.
scaler = StandardScaler().fit(dataset)
data_scaled = scaler.transform(dataset)

**This step performs UMAP to reduce the dimensionality of the dataset and also reports the variance of the resulting embedding**

In [None]:
import umap

# Configure a two-component UMAP reducer with a fixed random_state seed.
umap_reducer = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
)
umap_embeddings = umap_reducer.fit_transform(data_scaled)

# Variance pooled over every coordinate of the embedding (both components together).
umap_var = umap_embeddings.var()
print('UMAP variance:', umap_var)

UMAP variance: 24.558424


**This step creates a scatter plot to visualize the data**

In [None]:
# Marker styling: points are coloured by the 'urb' column, with a colour bar.
umap_marker = dict(
    color=dataset['urb'],
    size=5,
    opacity=0.8,
    showscale=True,
    colorscale='Magma',
)

# One point per row, positioned by its two UMAP coordinates.
umap_trace = go.Scatter(
    x=umap_embeddings[:, 0],
    y=umap_embeddings[:, 1],
    mode='markers',
    marker=umap_marker,
    text=dataset.index,  # hover text: row index of each point
)

fig = go.Figure(data=[umap_trace])
fig.update_layout(
    title='UMAP Visualization of Municipalities Dataset',
    xaxis_title='UMAP Component 1',
    yaxis_title='UMAP Component 2',
)

fig.show()

**In the scatter plot obtained, each point represents a municipality, colored by its urbanization index (1 is low, 3 is high). The x and y coordinates correspond to the two components obtained from UMAP.**

**There are distinct clusters visible in the plot; this indicates that the waste produced and the urbanization index are highly correlated and can be used to group municipalities into different categories**

**If the clusters are not distinct, it might suggest that waste production and urbanization are not strongly related, and other variables may need to be considered to clearly understand the patterns in the data**

**In this case, we can't directly compare the explained-variance ratio of PCA (0.4418) with the variance of the UMAP embedding (24.5584), as they are on different scales. Variance is a measure of the spread of data, but it doesn't directly indicate the quality of the visualization; the two techniques also reduce the dimensionality of the data in different ways**

**To determine which technique works better, we need to look at the visualizations and assess how well they separate the data into distinct clusters or groups**


**So, in our visualisations, UMAP seems to be the better technique, as its clusters seem to be quite distinctly separated when compared to the PCA clusters**