# Unsupervised Learning in Python

## Clustering for dataset exploration

### Clustering 2D points


In [None]:
import os
import pandas as pd
import numpy as np

# Relative directory that holds the raw CSV inputs used throughout the notebook.
DATA_PATH = "../data/raw"

# Read both point sets the same way and keep only the underlying NumPy arrays.
points, new_points = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name)).values
    for csv_name in ("points.csv", "new_points.csv")
)


In [None]:
# Import KMeans
from sklearn.cluster import KMeans

# Create a KMeans instance with 3 clusters: model
# random_state pins the centroid initialisation so the printed cluster ids are
# the same on every Restart & Run All; n_init=10 fixes the number of restarts
# instead of relying on the installed scikit-learn's default.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit model to points
model.fit(points)

# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)

# Print cluster labels of new_points
print(labels)


### Inspect your clustering


In [None]:
# Import pyplot
import matplotlib.pyplot as plt

# Split new_points into its first two columns: xs and ys
xs, ys = new_points[:, 0], new_points[:, 1]

# Semi-transparent scatter of the new points, coloured by cluster label
plt.scatter(xs, ys, c=labels, alpha=0.5)

# Cluster centres of the fitted model and their coordinate columns
centroids = model.cluster_centers_
centroids_x, centroids_y = centroids[:, 0], centroids[:, 1]

# Overlay the centroids as diamond markers of size 50
plt.scatter(centroids_x, centroids_y, marker="D", s=50)
plt.show()


### How many clusters of grain?


In [None]:
samples = pd.read_csv(os.path.join(DATA_PATH, "samples.csv")).values
ks = range(1, 6)
inertias = []

for k in ks:
    # Create a KMeans instance with k clusters: model
    # (fixed seed and restart count so the elbow curve is identical on every run)
    model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit model to samples
    model.fit(samples)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.plot(ks, inertias, "-o")
plt.xlabel("number of clusters, k")
plt.ylabel("inertia")
plt.xticks(ks)
plt.show()


### Evaluating the grain clustering


In [None]:
# Variety label per grain sample: 70 Kama, then 70 Rosa, then 70 Canadian (210 total).
varieties = (
    ["Kama wheat"] * 70
    + ["Rosa wheat"] * 70
    + ["Canadian wheat"] * 70
)


In [None]:
# Create a KMeans model with 3 clusters: model
# Seeded (and with an explicit restart count) so the label numbering in the
# crosstab below does not change from one run to the next.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

# Use fit_predict to fit model and obtain cluster labels: labels
labels = model.fit_predict(samples)

# Create a DataFrame with labels and varieties as columns: df
df = pd.DataFrame({"labels": labels, "varieties": varieties})

# Create crosstab: ct
ct = pd.crosstab(df["labels"], df["varieties"])

# Display ct
print(ct)


### Scaling fish data for clustering


In [None]:
# Perform the necessary imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Create scaler: scaler
scaler = StandardScaler()

# Create KMeans instance: kmeans
# Seeded with a fixed restart count so the clustering is reproducible.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Create pipeline: pipeline (standardise the features, then cluster them)
pipeline = make_pipeline(scaler, kmeans)


### Clustering the fish data


In [None]:
# Load fish.csv from DATA_PATH and keep it as a plain NumPy array.
samples = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "fish.csv"),
).values
# Species label per fish sample, in order: 34 Bream, 20 Roach, 14 Smelt, 17 Pike (85 total).
species = (
    ["Bream"] * 34
    + ["Roach"] * 20
    + ["Smelt"] * 14
    + ["Pike"] * 17
)


In [None]:
# Import pandas
import pandas as pd

# Fit the pipeline to samples, then assign every sample to a cluster: labels
labels = pipeline.fit(samples).predict(samples)

# Pair each cluster label with the known species: df
df = pd.DataFrame({"labels": labels, "species": species})

# Cross-tabulate cluster labels (rows) against species (columns): ct
ct = pd.crosstab(index=df["labels"], columns=df["species"])

# Display ct
print(ct)


### Clustering stocks using KMeans


In [None]:
# Load movements.csv from DATA_PATH and keep it as a plain NumPy array.
movements = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "movements.csv"),
).values


In [None]:
# Import Normalizer
from sklearn.preprocessing import Normalizer

# Create a normalizer: normalizer
normalizer = Normalizer()

# Create a KMeans model with 10 clusters: kmeans
# Seeded with a fixed restart count so the stock groupings are reproducible.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)

# Make a pipeline chaining normalizer and kmeans: pipeline
pipeline = make_pipeline(normalizer, kmeans)

# Fit pipeline to the daily price movements
pipeline.fit(movements)


### Which stocks move together?


In [None]:
# Company name for each entry of the stock data, in order (60 names).
# The spellings are reproduced exactly as given because they are used as labels.
companies = [
    "Apple", "AIG", "Amazon", "American express", "Boeing",
    "Bank of America", "British American Tobacco", "Canon", "Caterpillar", "Colgate-Palmolive",
    "ConocoPhillips", "Cisco", "Chevron", "DuPont de Nemours", "Dell",
    "Ford", "General Electrics", "Google/Alphabet", "Goldman Sachs", "GlaxoSmithKline",
    "Home Depot", "Honda", "HP", "IBM", "Intel",
    "Johnson & Johnson", "JPMorgan Chase", "Kimberly-Clark", "Coca Cola", "Lookheed Martin",
    "MasterCard", "McDonalds", "3M", "Microsoft", "Mitsubishi",
    "Navistar", "Northrop Grumman", "Novartis", "Pepsi", "Pfizer",
    "Procter Gamble", "Philip Morris", "Royal Dutch Shell", "SAP", "Schlumberger",
    "Sony", "Sanofi-Aventis", "Symantec", "Toyota", "Total",
    "Taiwan Semiconductor Manufacturing", "Texas instruments", "Unilever", "Valero Energy", "Walgreen",
    "Wells Fargo", "Wal-Mart", "Exxon", "Xerox", "Yahoo",
]


In [None]:
# Import pandas
import pandas as pd

# Cluster assignment of every row of movements from the fitted pipeline: labels
labels = pipeline.predict(movements)

# One row per company, pairing its cluster label with its name: df
df = pd.DataFrame({"labels": labels, "companies": companies})

# Show the companies grouped by cluster label
print(df.sort_values("labels"))


---

## Visualization with hierarchical clustering and t-SNE

### Hierarchical clustering of the grain data


In [None]:
# Reload samples.csv (`samples` was last assigned from fish.csv earlier in the notebook).
samples = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "samples.csv"),
).values


In [None]:
# Perform the necessary imports
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Complete-linkage hierarchical clustering of samples: mergings
mergings = linkage(samples, method="complete")

# Draw the dendrogram with one leaf per sample, labelled by variety
dendrogram(mergings, labels=varieties, leaf_rotation=90, leaf_font_size=6)
plt.show()


### Hierarchies of stocks


In [None]:
# Import normalize
from sklearn.preprocessing import normalize

# Rescale the rows of movements with normalize's defaults: normalized_movements
normalized_movements = normalize(movements)

# Complete-linkage hierarchical clustering of the normalized rows: mergings
mergings = linkage(normalized_movements, method="complete")

# Draw the dendrogram with company names on the leaves
dendrogram(
    mergings,
    labels=companies,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.show()


### Different linkage, different hierarchical clustering!


In [None]:
# Load eurovision.csv from DATA_PATH and keep it as a plain NumPy array.
samples = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "eurovision.csv"),
).values
# Country label for each entry of the Eurovision data, in order (42 names).
country_names = [
    "Albania", "Armenia", "Australia", "Austria", "Azerbaijan", "Belarus",
    "Belgium", "Bosnia & Herzegovina", "Bulgaria", "Croatia", "Cyprus", "Czech Republic",
    "Denmark", "Estonia", "F.Y.R. Macedonia", "Finland", "France", "Georgia",
    "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Israel",
    "Italy", "Latvia", "Lithuania", "Malta", "Moldova", "Montenegro",
    "Norway", "Poland", "Russia", "San Marino", "Serbia", "Slovenia",
    "Spain", "Sweden", "Switzerland", "The Netherlands", "Ukraine", "United Kingdom",
]


In [None]:
# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

# Single-linkage hierarchical clustering of samples: mergings
mergings = linkage(samples, method="single")

# Draw the dendrogram with country names on the leaves
dendrogram(
    mergings,
    labels=country_names,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.show()


### Extracting the cluster labels


In [None]:
# Reload samples.csv (`samples` was last assigned from eurovision.csv).
samples = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "samples.csv"),
).values
# Shorter variety list for this section: 14 Kama, 14 Rosa, 14 Canadian (42 total).
varieties = (
    ["Kama wheat"] * 14
    + ["Rosa wheat"] * 14
    + ["Canadian wheat"] * 14
)


In [None]:
# Perform the necessary imports
import pandas as pd
from scipy.cluster.hierarchy import fcluster

# NOTE(review): `mergings` is not recomputed from the `samples` loaded in the
# previous cell. When the notebook runs top to bottom it still holds the
# single-linkage result built from eurovision.csv, whose 42 leaves appear to
# match len(varieties) only by coincidence -- confirm which linkage is
# intended before trusting this crosstab.

# Cut the hierarchy at a distance threshold of 6 to get flat clusters: labels
labels = fcluster(mergings, t=6, criterion="distance")

# Pair each flat-cluster label with the corresponding variety: df
df = pd.DataFrame({"labels": labels, "varieties": varieties})

# Cross-tabulate cluster labels (rows) against varieties (columns): ct
ct = pd.crosstab(index=df["labels"], columns=df["varieties"])

# Display ct
print(ct)


### t-SNE visualization of grain dataset


In [None]:
# Integer code per grain sample, used for colouring: 70 ones, 70 twos, 70 threes.
variety_numbers = [1] * 70 + [2] * 70 + [3] * 70


In [None]:
# Import TSNE
from sklearn.manifold import TSNE

# Create a TSNE instance: model
# t-SNE is stochastic; a fixed random_state makes the plotted map reproducible
# across Restart & Run All.
model = TSNE(learning_rate=200, random_state=42)

# Apply fit_transform to samples: tsne_features
tsne_features = model.fit_transform(samples)

# Select the 0th feature: xs
xs = tsne_features[:, 0]

# Select the 1st feature: ys
ys = tsne_features[:, 1]

# Scatter plot, coloring by variety_numbers
plt.scatter(xs, ys, c=variety_numbers)
plt.show()


### A t-SNE map of the stock market


In [None]:
# Import TSNE
from sklearn.manifold import TSNE

# Create a TSNE instance: model
# t-SNE is stochastic; a fixed random_state makes the plotted map reproducible
# across Restart & Run All.
model = TSNE(learning_rate=50, random_state=42)

# Apply fit_transform to normalized_movements: tsne_features
tsne_features = model.fit_transform(normalized_movements)

# Select the 0th feature: xs
xs = tsne_features[:, 0]

# Select the 1st feature: ys
ys = tsne_features[:, 1]

# Scatter plot
plt.scatter(xs, ys, alpha=0.5)

# Annotate each point with its company name
for x, y, company in zip(xs, ys, companies):
    plt.annotate(company, (x, y), fontsize=5, alpha=0.75)
plt.show()


---

## Decorrelating your data and dimension reduction

### Correlated data in nature


In [None]:
# Load grains.csv from DATA_PATH and keep it as a plain NumPy array.
grains = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "grains.csv"),
).values


In [None]:
# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Columns 0 and 1 of grains: width and length
width, length = grains[:, 0], grains[:, 1]

# Scatter plot of width vs length on equally scaled axes
plt.scatter(width, length)
plt.axis("equal")
plt.show()

# Pearson correlation between the two columns (and its p-value)
correlation, pvalue = pearsonr(width, length)

# Display the correlation
print(correlation)


### Decorrelating the grain measurements with PCA


In [None]:
# Import PCA
from sklearn.decomposition import PCA

# Create PCA instance: model
model = PCA()

# Fit the model to grains and project grains onto the components: pca_features
pca_features = model.fit_transform(grains)

# Columns 0 and 1 of pca_features: xs and ys
xs, ys = pca_features[:, 0], pca_features[:, 1]

# Scatter plot of xs vs ys on equally scaled axes
plt.scatter(xs, ys)
plt.axis("equal")
plt.show()

# Pearson correlation of the two PCA features (and its p-value)
correlation, pvalue = pearsonr(xs, ys)

# Display the correlation
print(correlation)


### The first principal component


In [None]:
# Make a scatter plot of the untransformed points
plt.scatter(grains[:, 0], grains[:, 1])

# Create a PCA instance and fit it to the grain samples: model
model = PCA(copy=True)
_ = model.fit(grains)

# Mean of the grain samples and the first row of the fitted components
mean = model.mean_
first_pc = model.components_[0]

# Draw first_pc as a red arrow anchored at the mean
plt.arrow(
    x=mean[0],
    y=mean[1],
    dx=first_pc[0],
    dy=first_pc[1],
    color="red",
    width=0.01,
)

# Keep axes on same scale
plt.axis("equal")
plt.show()


### Variance of the PCA features


In [None]:
# Reload fish.csv into `samples` for the PCA variance section.
samples = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "fish.csv"),
).values


In [None]:
# Perform the necessary imports
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Build the two steps and chain them: scaler, pca, pipeline
scaler = StandardScaler()
pca = PCA(copy=True)
pipeline = make_pipeline(scaler, pca)

# Fit the pipeline to 'samples' (result bound to `_` to keep the cell output quiet)
_ = pipeline.fit(samples)

# Bar chart of the explained variance of each PCA feature
features = range(pca.n_components_)
plt.bar(x=features, height=pca.explained_variance_)
plt.xlabel("PCA feature")
plt.ylabel("variance")
plt.xticks(features)
plt.show()


### Dimension reduction of the fish measurements


In [None]:
# Hard-coded 85 x 6 float array (one row per sample, six features per row).
# NOTE(review): presumably the standardized fish measurements, given the
# section title and the 85 entries in `species` -- confirm against fish.csv.
scaled_samples = np.array(
    [
        [-0.50109735, -0.36878558, -0.34323399, -0.23781518, 1.0032125, 0.25373964],
        [-0.37434344, -0.29750241, -0.26893461, -0.14634781, 1.15869615, 0.44376493],
        [-0.24230812, -0.30641281, -0.25242364, -0.15397009, 1.13926069, 1.0613471],
        [-0.18157187, -0.09256329, -0.04603648, 0.02896467, 0.96434159, 0.20623332],
        [-0.00464454, -0.0747425, -0.04603648, 0.06707608, 0.8282934, 1.0613471],
        [0.04816959, -0.04801131, 0.01175193, 0.12043205, 1.08095432, 0.63379021],
        [0.18020491, -0.04801131, 0.01175193, 0.10518748, 1.26559115, 1.15635974],
        [-0.11027279, 0.02327186, 0.03651839, 0.14329889, 0.78942248, 0.25373964],
        [0.04816959, 0.02327186, 0.03651839, 0.15092117, 1.14897842, 0.44376493],
        [0.18020491, 0.10346543, 0.09430679, 0.23476627, 1.09067205, 0.39625861],
        [0.11418725, 0.09455503, 0.11907325, 0.23476627, 1.10038978, 0.58628389],
        [0.18020491, 0.12128622, 0.11907325, 0.23476627, 1.12954296, 0.20623332],
        [0.18020491, 0.1569278, 0.16035069, 0.25001083, 0.94490613, -0.41134885],
        [0.44427556, 0.18365899, 0.20162812, 0.31098909, 1.1781316, 0.49127125],
        [0.44427556, 0.18365899, 0.20162812, 0.31098909, 1.30446206, 1.01384078],
        [0.7083462, 0.27276296, 0.28418298, 0.39483418, 1.04208341, 0.44376493],
        [0.7083462, 0.27276296, 0.28418298, 0.41007875, 1.04208341, 0.30124596],
        [0.47068262, 0.31731494, 0.32546042, 0.41770103, 1.20728478, 0.20623332],
        [0.57631088, 0.32622533, 0.32546042, 0.42532331, 0.90603522, 0.91882813],
        [0.3782579, 0.35295652, 0.36673785, 0.48630156, 0.99349477, 0.58628389],
        [0.66873561, 0.36186692, 0.36673785, 0.46343472, 1.23643797, 0.39625861],
        [0.49708969, 0.37077732, 0.40801528, 0.50154612, 1.07123659, 0.20623332],
        [0.65553208, 0.3975085, 0.44929271, 0.57014666, 0.97405931, 1.0613471],
        [0.7083462, 0.4064189, 0.44929271, 0.56252438, 1.16841387, 0.44376493],
        [0.77436387, 0.3975085, 0.44929271, 0.5930135, 1.15869615, 0.91882813],
        [0.76116033, 0.4153293, 0.44929271, 0.57014666, 1.18784933, 1.01384078],
        [0.74531609, 0.47770207, 0.53184758, 0.63874719, 1.13926069, 0.58628389],
        [1.10445217, 0.48661247, 0.53184758, 0.64636947, 1.21700251, 0.96633446],
        [1.50055814, 0.54898524, 0.61440245, 0.72259229, 1.5959939, 1.25137238],
        [1.28930162, 0.68264119, 0.73823474, 0.83692651, 1.2461557, 0.68129653],
        [1.38172635, 0.68264119, 0.73823474, 0.82930423, 1.26559115, 0.68129653],
        [1.30250516, 0.78956594, 0.82078961, 0.92839389, 1.29474434, 0.96633446],
        [1.43454048, 0.8964907, 0.94462191, 0.97412758, 1.21700251, 0.87132181],
        [1.36852282, 0.94995308, 0.94462191, 1.01986127, 0.95462386, 0.39625861],
        [-1.03452005, -1.2865564, -1.27610397, -1.28969003, -0.24065667, 0.53877757],
        [-0.95793956, -0.96578213, -0.93762902, -0.97717649, -0.19206803, 0.49127125],
        [-0.93417321, -0.87667817, -0.8880961, -0.90857596, -0.17263258, 0.39625861],
        [-0.91040685, -0.8143054, -0.80554124, -0.83235314, -0.26980986, 0.68129653],
        [-0.82326354, -0.77866381, -0.78903027, -0.83235314, -0.0074312, 1.5364103],
        [-1.14014831, -0.74302223, -0.74775283, -0.78661945, 0.03143971, 0.87132181],
        [-0.8496706, -0.73411183, -0.72298637, -0.76375261, -0.13376167, 0.87132181],
        [-0.82326354, -0.70738064, -0.7064754, -0.71801892, -0.22122122, 0.49127125],
        [-0.74404234, -0.61827668, -0.62392054, -0.6417961, -0.44472896, 1.10885342],
        [-0.75724587, -0.60936628, -0.62392054, -0.67228523, -0.0754553, 0.82381549],
        [-0.71763528, -0.60936628, -0.5826431, -0.59606241, -0.02686666, 1.0613471],
        [-0.77044941, -0.5648143, -0.5826431, -0.61892926, -0.18235031, 0.20623332],
        [-0.71763528, -0.5559039, -0.5826431, -0.61892926, -0.24065667, 1.10885342],
        [-0.69386892, -0.47571034, -0.4588108, -0.45123907, -0.03658439, 0.58628389],
        [-0.71499457, -0.47571034, -0.50834372, -0.48935047, -0.21150349, 0.34875228],
        [-0.61200702, -0.46679994, -0.50008824, -0.48172819, -0.04630212, 1.20386606],
        [-0.66482115, -0.33314399, -0.35974497, -0.39788309, -0.26009213, 0.53877757],
        [-0.37434344, -0.29750241, -0.29370107, -0.29879344, 0.22579427, 1.20386606],
        [-0.42187616, -0.20839845, -0.21114621, -0.19208149, -0.0074312, 1.2988787],
        [-0.11027279, 0.19256939, 0.17686166, 0.14329889, -0.09489075, 1.15635974],
        [-1.12245558, -1.60733067, -1.63108989, -1.70129323, -1.16384082, -1.50399423],
        [-1.12034301, -1.5449579, -1.57330149, -1.64031498, -1.07638127, -1.36147526],
        [-1.12166336, -1.5360475, -1.565046, -1.64031498, -1.28045356, -1.40898159],
        [-1.11453346, -1.50931631, -1.53202405, -1.60982586, -0.95005081, -0.64888045],
        [-1.11426939, -1.48258512, -1.51551308, -1.57933673, -1.09581673, -1.2189563],
        [-1.11717416, -1.47367473, -1.5072576, -1.56409217, -1.20271174, -1.26646262],
        [-1.11374125, -1.42912274, -1.46598016, -1.52598076, -1.086099, -1.45648791],
        [-1.11400532, -1.42912274, -1.46598016, -1.52598076, -1.086099, -1.88404479],
        [-1.11426939, -1.42021235, -1.44946919, -1.51835848, -1.10553446, -1.97905744],
        [-1.10793169, -1.41130195, -1.43295822, -1.50311391, -1.21242946, -1.17144998],
        [-1.10476284, -1.39348116, -1.41644724, -1.49549163, -0.97920399, -1.64651319],
        [-1.10793169, -1.35783957, -1.36691432, -1.47262479, -1.12496991, -1.78903215],
        [-1.08812639, -1.25982521, -1.259593, -1.36591285, -0.89174444, 0.34875228],
        [-1.08759825, -1.20636284, -1.20180459, -1.28969003, -0.96948627, -0.60137413],
        [-0.61200702, 0.23712137, 0.22639458, 0.12805433, -1.17355855, -1.50399423],
        [-0.34793638, 0.38859811, 0.36673785, 0.35672277, -1.2610181, -0.88641206],
        [-0.34793638, 0.47770207, 0.44929271, 0.43294559, -1.24158265, -0.74389309],
        [-0.34793638, 0.6648204, 0.6391689, 0.5091684, -1.19299401, -1.31396894],
        [-0.00464454, 0.72719317, 0.69695731, 0.56252438, -0.97920399, -0.74389309],
        [-0.22910458, 0.77174515, 0.73823474, 0.60063578, -1.21242946, -1.50399423],
        [0.06401383, 1.128161, 1.0684542, 0.94363845, -1.17355855, -1.59900687],
        [0.20661198, 1.128161, 1.0684542, 0.94363845, -1.27073583, -1.45648791],
        [0.28583317, 1.1370714, 1.10973164, 0.9665053, -1.07638127, -0.79139942],
        [0.18020491, 1.30636893, 1.27484137, 1.13419549, -1.31932447, -1.26646262],
        [0.35713225, 1.41329369, 1.35739623, 1.18755146, -1.17355855, -1.36147526],
        [0.89319566, 1.55586003, 1.52250596, 1.3781085, -1.27073583, -1.12394366],
        [1.36852282, 1.8677239, 1.82795897, 1.67537748, -1.1541231, -0.79139942],
        [2.16073475, 2.19740857, 2.18294489, 2.02600243, -0.98892172, -0.55386781],
        [3.08498201, 2.55382442, 2.51316435, 2.35376053, -1.27073583, -1.55150055],
        [2.95294669, 2.55382442, 2.51316435, 2.35376053, -1.27073583, -1.55150055],
        [3.21701733, 2.82113631, 2.79385089, 2.65865179, -1.18327628, -0.88641206],
    ]
)


In [None]:
# Import PCA
from sklearn.decomposition import PCA

# PCA model that keeps only 2 components: pca
pca = PCA(n_components=2)

# Learn the components from the scaled samples (result bound to `_` to keep the cell quiet)
_ = pca.fit(scaled_samples)

# Project the scaled samples onto the 2 components: pca_features
pca_features = pca.transform(scaled_samples)

# Print the shape of pca_features
print(pca_features.shape)


### A tf-idf word-frequency array


In [None]:
# Toy corpus of three short documents for the tf-idf example.
documents = [
    "cats say meow",
    "dogs say woof",
    "dogs chase cats",
]


In [None]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()

# Apply fit_transform to documents: csr_mat
csr_mat = tfidf.fit_transform(documents)

# Print result of toarray() method
print(csr_mat.toarray())

# Get the words: words
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name only on installations
# that predate the new one. Either way `words` ends up a plain list of str.
if hasattr(tfidf, "get_feature_names_out"):
    words = tfidf.get_feature_names_out().tolist()
else:
    words = tfidf.get_feature_names()

# Print words
print(words)


### Clustering Wikipedia part I


In [None]:
# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Create a TruncatedSVD instance: svd
# Seeded so the reduced features are reproducible.
svd = TruncatedSVD(n_components=50, random_state=42)

# Create a KMeans instance: kmeans
# Seeded with a fixed restart count so cluster ids are stable between runs.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)

# Create a pipeline: pipeline (reduce dimensionality, then cluster)
pipeline = make_pipeline(svd, kmeans)


### Clustering Wikipedia part II


In [None]:
# from scipy.sparse import csr_matrix

# articles = csr_matrix(np.fromfile(os.path.join(DATA_PATH, "articles.csv"), sep=","))
# titles = [
#     "HTTP 404",
#     "Alexa Internet",
#     "Internet Explorer",
#     "HTTP cookie",
#     "Google Search",
#     "Tumblr",
#     "Hypertext Transfer Protocol",
#     "Social search",
#     "Firefox",
#     "LinkedIn",
#     "Global warming",
#     "Nationally Appropriate Mitigation Action",
#     "Nigel Lawson",
#     "Connie Hedegaard",
#     "Climate change",
#     "Kyoto Protocol",
#     "350.org",
#     "Greenhouse gas emissions by the United States",
#     "2010 United Nations Climate Change Conference",
#     "2007 United Nations Climate Change Conference",
#     "Angelina Jolie",
#     "Michael Fassbender",
#     "Denzel Washington",
#     "Catherine Zeta-Jones",
#     "Jessica Biel",
#     "Russell Crowe",
#     "Mila Kunis",
#     "Dakota Fanning",
#     "Anne Hathaway",
#     "Jennifer Aniston",
#     "France national football team",
#     "Cristiano Ronaldo",
#     "Arsenal F.C.",
#     "Radamel Falcao",
#     "Zlatan Ibrahimović",
#     "Colombia national football team",
#     "2014 FIFA World Cup qualification",
#     "Football",
#     "Neymar",
#     "Franck Ribéry",
#     "Tonsillitis",
#     "Hepatitis B",
#     "Doxycycline",
#     "Leukemia",
#     "Gout",
#     "Hepatitis C",
#     "Prednisone",
#     "Fever",
#     "Gabapentin",
#     "Lymphoma",
#     "Chad Kroeger",
#     "Nate Ruess",
#     "The Wanted",
#     "Stevie Nicks",
#     "Arctic Monkeys",
#     "Black Sabbath",
#     "Skrillex",
#     "Red Hot Chili Peppers",
#     "Sepsis",
#     "Adam Levine",
# ]


```python
# Import pandas
import pandas as pd

# NOTE(review): `articles` and `titles` are only assigned in the commented-out
# cell of this section, so this snippet cannot run as-is -- confirm.

# Fit the pipeline to articles, then compute the cluster labels: labels
_ = pipeline.fit(articles)
labels = pipeline.predict(articles)

# One row per article, pairing its cluster label with its title: df
df = pd.DataFrame({"label": labels, "article": titles})

# Display df sorted by cluster label
print(df.sort_values(by="label"))

```

---

## Discovering interpretable features

### NMF applied to Wikipedia articles

```python
# Import NMF
from sklearn.decomposition import NMF

# NMF model with 6 components: model
model = NMF(n_components=6)

# Learn the components from articles (result bound to `_` to keep the cell quiet)
_ = model.fit(articles)

# Express every article in terms of the learned components: nmf_features
nmf_features = model.transform(articles)

# Print the NMF features rounded to 2 decimals
print(nmf_features.round(decimals=2))

```

### NMF features of the Wikipedia articles


```python
# Import pandas
import pandas as pd

# Wrap the NMF features in a DataFrame indexed by article title: df
df = pd.DataFrame(data=nmf_features, index=titles)

# Print the rows for 'Anne Hathaway' and then 'Denzel Washington'
for actor in ("Anne Hathaway", "Denzel Washington"):
    print(df.loc[actor])

```

### NMF learns topics of documents


```python
# Import pandas
import pandas as pd

# One row per NMF component, one column per entry of `words`: components_df
components_df = pd.DataFrame(data=model.components_, columns=words)

# Print the shape of the DataFrame
print(components_df.shape)

# Row at position 3: component
component = components_df.iloc[3]

# Print the largest entries of that row (nlargest's default count)
print(component.nlargest())

```

### Explore the LED digits dataset


In [None]:
# Load led.csv from DATA_PATH and keep it as a plain NumPy array.
samples = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, "led.csv"),
).values


In [None]:
# Import pyplot
from matplotlib import pyplot as plt

# First row of samples: digit
digit = samples[0]
print(digit)

# View the flat row as a 13-row by 8-column array: bitmap
bitmap = digit.reshape(13, 8)
print(bitmap)

# Show bitmap as a grayscale image with a colour scale
plt.imshow(bitmap, cmap="gray", interpolation="nearest")
plt.colorbar()
plt.show()


### NMF learns the parts of images


In [None]:
def show_as_image(sample, shape=(13, 8)):
    """Display a flat vector of pixel values as a grayscale image.

    Parameters
    ----------
    sample : numpy.ndarray (or any object with a compatible ``reshape``)
        Flat array holding ``shape[0] * shape[1]`` values.
    shape : tuple of int, optional
        ``(rows, columns)`` of the bitmap. Defaults to ``(13, 8)``, the layout
        this notebook uses for the LED digit samples.

    Returns
    -------
    None
        The image is drawn in a new figure and shown immediately.
    """
    bitmap = sample.reshape(shape)
    # Open a fresh figure so repeated calls do not draw over one another.
    plt.figure()
    plt.imshow(bitmap, cmap="gray", interpolation="nearest")
    plt.colorbar()
    plt.show()


In [None]:
# Import NMF
from sklearn.decomposition import NMF

# NMF model with 7 components: model
model = NMF(n_components=7)

# Fit the model to samples and get their NMF representation: features
features = model.fit_transform(samples)

# Render every learned component as an image
for component in model.components_:
    show_as_image(component)

# NMF features of the first sample: digit_features
digit_features = features[0]
print(digit_features)


### PCA doesn't learn parts


In [None]:
# Import PCA
from sklearn.decomposition import PCA

# PCA model with 7 components: model
model = PCA(n_components=7)

# Fit the model to samples and get their PCA representation: features
features = model.fit_transform(samples)

# Render every principal component as an image
for component in model.components_:
    show_as_image(component)


### Which articles are similar to 'Cristiano Ronaldo'?


In [None]:
# Load the NMF feature rows from nmf.csv as a NumPy array; header=None makes pandas
# treat the first line of the file as data rather than as column names.
nmf_features = pd.read_csv(os.path.join(DATA_PATH, "nmf.csv"), header=None).values
# Article titles, used in the next cell as the row index of the normalized features.
# NOTE(review): assumes titles[i] labels row i of nmf.csv -- confirm against the data file.
titles = [
    "HTTP 404",
    "Alexa Internet",
    "Internet Explorer",
    "HTTP cookie",
    "Google Search",
    "Tumblr",
    "Hypertext Transfer Protocol",
    "Social search",
    "Firefox",
    "LinkedIn",
    "Global warming",
    "Nationally Appropriate Mitigation Action",
    "Nigel Lawson",
    "Connie Hedegaard",
    "Climate change",
    "Kyoto Protocol",
    "350.org",
    "Greenhouse gas emissions by the United States",
    "2010 United Nations Climate Change Conference",
    "2007 United Nations Climate Change Conference",
    "Angelina Jolie",
    "Michael Fassbender",
    "Denzel Washington",
    "Catherine Zeta-Jones",
    "Jessica Biel",
    "Russell Crowe",
    "Mila Kunis",
    "Dakota Fanning",
    "Anne Hathaway",
    "Jennifer Aniston",
    "France national football team",
    "Cristiano Ronaldo",
    "Arsenal F.C.",
    "Radamel Falcao",
    "Zlatan Ibrahimović",
    "Colombia national football team",
    "2014 FIFA World Cup qualification",
    "Football",
    "Neymar",
    "Franck Ribéry",
    "Tonsillitis",
    "Hepatitis B",
    "Doxycycline",
    "Leukemia",
    "Gout",
    "Hepatitis C",
    "Prednisone",
    "Fever",
    "Gabapentin",
    "Lymphoma",
    "Chad Kroeger",
    "Nate Ruess",
    "The Wanted",
    "Stevie Nicks",
    "Arctic Monkeys",
    "Black Sabbath",
    "Skrillex",
    "Red Hot Chili Peppers",
    "Sepsis",
    "Adam Levine",
]


In [None]:
# pandas for the labelled table, normalize for rescaling the feature rows
import pandas as pd
from sklearn.preprocessing import normalize

# Rescale each row of nmf_features (normalize defaults to unit L2 norm per row): norm_features
norm_features = normalize(nmf_features)

# Index the normalized rows by article title: df
df = pd.DataFrame(data=norm_features, index=titles)

# Row belonging to 'Cristiano Ronaldo': article
article = df.loc["Cristiano Ronaldo"]

# Dot product of every row with that article, i.e. the cosine similarities: similarities
similarities = df.dot(article)

# Show the five largest similarities (nlargest defaults to n=5)
print(similarities.nlargest(n=5))


### Recommend musical artists part I


In [None]:
from scipy.sparse import csr_matrix

# Read artists.csv (header=None: the first line is data, not column names) and
# store the table in compressed sparse row form. csr_matrix accepts the 2-D
# ndarray directly, so the data is not routed through np.asmatrix: the np.matrix
# class it returns is no longer recommended by NumPy and adds nothing here.
artists = csr_matrix(
    pd.read_csv(os.path.join(DATA_PATH, "artists.csv"), header=None).values
)


In [None]:
# Perform the necessary imports
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# Create a MaxAbsScaler: scaler
scaler = MaxAbsScaler()

# Create an NMF model with 20 components: nmf
# random_state is fixed so the learned features -- and therefore the similarity
# rankings computed from them later -- are the same after every Restart & Run All.
nmf = NMF(n_components=20, random_state=0)

# Create a Normalizer: normalizer
normalizer = Normalizer()

# Chain the three steps in order (scale -> factorize -> normalize): pipeline
pipeline = make_pipeline(scaler, nmf, normalizer)

# Apply fit_transform to artists: norm_features
norm_features = pipeline.fit_transform(artists)


### Recommend musical artists part II


In [None]:
# Artist names, used in the next cell as the row index of norm_features.
# NOTE(review): assumes artist_names[i] labels row i of artists.csv -- confirm
# against the data file.
# "Motörhead" was previously stored as the mojibake "MotÃ¶rhead" (UTF-8 bytes
# decoded as Latin-1); it is spelled correctly here so lookups by name work.
artist_names = [
    "Massive Attack",
    "Sublime",
    "Beastie Boys",
    "Neil Young",
    "Dead Kennedys",
    "Orbital",
    "Miles Davis",
    "Leonard Cohen",
    "Van Morrison",
    "NOFX",
    "Rancid",
    "Lamb",
    "Korn",
    "Dropkick Murphys",
    "Bob Dylan",
    "Eminem",
    "Nirvana",
    "Van Halen",
    "Damien Rice",
    "Elvis Costello",
    "Everclear",
    "Jimi Hendrix",
    "PJ Harvey",
    "Red Hot Chili Peppers",
    "Ryan Adams",
    "Soundgarden",
    "The White Stripes",
    "Madonna",
    "Eric Clapton",
    "Bob Marley",
    "Dr. Dre",
    "The Flaming Lips",
    "Tom Waits",
    "Moby",
    "Cypress Hill",
    "Garbage",
    "Fear Factory",
    "50 Cent",
    "Ani DiFranco",
    "Matchbox Twenty",
    "The Police",
    "Eagles",
    "Phish",
    "Stone Temple Pilots",
    "Black Sabbath",
    "Britney Spears",
    "Fatboy Slim",
    "System of a Down",
    "Simon & Garfunkel",
    "Snoop Dogg",
    "Aimee Mann",
    "Less Than Jake",
    "Rammstein",
    "Reel Big Fish",
    "The Prodigy",
    "Pantera",
    "Foo Fighters",
    "The Beatles",
    "Incubus",
    "Audioslave",
    "Bright Eyes",
    "Machine Head",
    "AC/DC",
    "Dire Straits",
    "Motörhead",
    "Ramones",
    "Slipknot",
    "Me First and the Gimme Gimmes",
    "Bruce Springsteen",
    "Queens of the Stone Age",
    "The Chemical Brothers",
    "Bon Jovi",
    "Goo Goo Dolls",
    "Alice in Chains",
    "Howard Shore",
    "Barenaked Ladies",
    "Anti-Flag",
    "Nick Cave and the Bad Seeds",
    "Static-X",
    "Misfits",
    "2Pac",
    "Sparta",
    "Interpol",
    "The Crystal Method",
    "The Beach Boys",
    "Goldfrapp",
    "Bob Marley & the Wailers",
    "Kylie Minogue",
    "The Blood Brothers",
    "Mirah",
    "Ludacris",
    "Snow Patrol",
    "The Mars Volta",
    "Yeah Yeah Yeahs",
    "Iced Earth",
    "Fiona Apple",
    "Rilo Kiley",
    "Rufus Wainwright",
    "Flogging Molly",
    "Hot Hot Heat",
    "Dredg",
    "Switchfoot",
    "Tegan and Sara",
    "Rage Against the Machine",
    "Keane",
    "Jet",
    "Franz Ferdinand",
    "The Postal Service",
    "The Dresden Dolls",
    "The Killers",
    "Death From Above 1979",
]


In [None]:
# pandas supplies the labelled table for lookups by artist name
import pandas as pd

# Rows of norm_features indexed by artist name: df
df = pd.DataFrame(data=norm_features, index=artist_names)

# Row belonging to 'Bruce Springsteen': artist
artist = df.loc["Bruce Springsteen"]

# Dot product of every row with that artist, i.e. the cosine similarities: similarities
similarities = df.dot(artist)

# Show the five largest similarities (nlargest defaults to n=5)
print(similarities.nlargest(n=5))
