# Unsupervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of unsupervised learning model evaluation.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
                    
                    
from sklearn.datasets import load_boston, load_iris
from sklearn.decomposition import PCA
import statsmodels.formula.api as smf 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn.cluster import DBSCAN                                     

In [2]:
from sklearn import datasets

data = datasets.load_wine()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

## 1. Train a [KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) clustering model on the data set using 8 clusters and compute the [silhouette score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) for the model.

In [4]:
pca_pipeline = make_pipeline(
                    
                    StandardScaler(),
                    
                    PCA(2) )           


pca_pipeline.fit(X)

pca = pca_pipeline.named_steps['pca']

pca.explained_variance_ratio_

array([0.36198848, 0.1920749 ])

transformed_X = pca_pipeline.transform(X)
pd.DataFrame(transformed_X)

In [5]:
wine = pd.DataFrame(transformed_X, columns = ['PCA1', 'PCA2'])
wine.head()


Unnamed: 0,PCA1,PCA2
0,3.316751,-1.443463
1,2.209465,0.333393
2,2.51674,-1.031151
3,3.757066,-2.756372
4,1.008908,-0.869831


## 2. Train a KMeans clustering model on the data set using 5 clusters and compute the silhouette score for the model.

In [6]:
k_means = KMeans(n_clusters=5, random_state=0)

k_means.fit(X)

data['cluster'] = k_means.labels_

sns.scatterplot('PCA1', 'PCA2', data=data, hue='cluster', s=100)



ValueError: Could not interpret value `PCA1` for parameter `x`

## 3. Train a KMeans clustering model on the data set using 3 clusters and compute the silhouette score for the model.

In [7]:
k_means = KMeans(n_clusters=3, random_state=0)

k_means.fit(X)

data['cluster'] = k_means.labels_

sns.scatterplot('PCA1', 'PCA2', data=data, hue='cluster', s=100)



ValueError: Could not interpret value `PCA1` for parameter `x`

## 4. Use elbow curve visualizations to see if you can determine the best number of clusters to use.

## 5. Try performing the same elbow tests with an [AgglomerativeClustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html) model and compare the results you get to the KMeans results.

## 6. Create and plot a scatter matrix (pairplot) showing how the clusters are grouped across all the different combinations of variables in the data.

Use the model and number of clusters that returned the best result above.

## 7. Apply a [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) transform and plot the first two principle components with the plot point colors determined by cluster.

## 8. Generate a series of [t-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) plots showing the clusters at a variety of perplexities.