# Unsupervised learning with Python

## Cluster

```
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fit a 3-cluster k-means model on new_points, then label every point
model = KMeans(n_clusters=3)
model.fit(new_points)
labels = model.predict(new_points)
print(labels)

# Columns 0 and 1 of new_points become the plot coordinates
xs, ys = new_points[:, 0], new_points[:, 1]

# Scatter the points, coloured by their cluster label
plt.scatter(xs, ys, c=labels, alpha=0.5)

# Overlay the fitted cluster centres as diamond markers
centroids = model.cluster_centers_
centroids_x, centroids_y = centroids[:, 0], centroids[:, 1]
plt.scatter(centroids_x, centroids_y, marker='D', s=50)

plt.show()
```

## Evaluate cluster
1. Inertia

```
inertias = []
ks = range(1, 6)

# Fit one k-means model per candidate k and record its inertia
for k in ks:
    model = KMeans(n_clusters=k).fit(samples)  # fit() returns the fitted model
    inertias.append(model.inertia_)

# Line plot of inertia against the number of clusters
plt.plot(ks, inertias, '-o')
plt.xticks(ks)
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.show()
```
2. Crosstab

```
# pandas is needed for DataFrame/crosstab below; import it here so the snippet
# does not depend on an import that only appears later in these notes.
import pandas as pd
from sklearn.cluster import KMeans

# Cluster samples into 3 groups and get one label per sample
model = KMeans(n_clusters=3)
labels = model.fit_predict(samples)

# Create a DataFrame with labels and varieties as columns: df
df = pd.DataFrame({'labels': labels, 'varieties': varieties})

# Create crosstab: ct (rows: cluster labels, columns: varieties, cells: counts)
ct = pd.crosstab(df['labels'], df['varieties'])

print(ct)
```
## Transform features
1. StandardScaler

```
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain standardisation and 4-cluster k-means into a single estimator
scaler = StandardScaler()
kmeans = KMeans(n_clusters=4)
pipeline = make_pipeline(scaler, kmeans)

# Fit the whole pipeline, then assign a cluster label to every sample
pipeline.fit(samples)
labels = pipeline.predict(samples)

# Cross-tabulate cluster labels against species
df = pd.DataFrame({'labels': labels, 'species': species})
ct = pd.crosstab(df['labels'], df['species'])
print(ct)
```
2. Normalizer

```
import pandas as pd
from sklearn.preprocessing import Normalizer

# Pipeline: Normalizer step followed by 10-cluster k-means
normalizer = Normalizer()
kmeans = KMeans(n_clusters=10)
pipeline = make_pipeline(normalizer, kmeans)

# Fit on movements, then label each row of movements
pipeline.fit(movements)
labels = pipeline.predict(movements)

# Show the companies ordered by their assigned cluster label
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(df.sort_values('labels'))

```

## Visualisation with t-SNE

```
from sklearn.manifold import TSNE

# Embed normalized_movements with t-SNE
model = TSNE(learning_rate=50)
tsne_features = model.fit_transform(normalized_movements)

# Scatter the first two embedding coordinates
xs, ys = tsne_features[:, 0], tsne_features[:, 1]
plt.scatter(xs, ys, alpha=0.5)

# Write each company name next to its point
for x, y, company in zip(xs, ys, companies):
    plt.annotate(company, (x, y), fontsize=5, alpha=0.75)

plt.show()
```
## Decorrelating and dimension reduction
1. Correlate data

```
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Columns 0 and 1 of grains
width, length = grains[:, 0], grains[:, 1]

# Scatter width against length with equal axis scaling
plt.scatter(width, length)
plt.axis('equal')
plt.show()

# Pearson correlation between the two columns
correlation, pvalue = pearsonr(width, length)
print(correlation)
```
2. Decorrelate data

```
from sklearn.decomposition import PCA

# Fit PCA on grains and project grains onto the principal components
model = PCA()
pca_features = model.fit_transform(grains)

# First two PCA features
xs, ys = pca_features[:, 0], pca_features[:, 1]

# Scatter them with equal axis scaling
plt.scatter(xs, ys)
plt.axis('equal')
plt.show()

# Pearson correlation of the two PCA features
correlation, pvalue = pearsonr(xs, ys)
print(correlation)
```
3. Variance of the PCA features

```
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the samples, then run PCA, as one pipeline
scaler = StandardScaler()
pca = PCA()
pipeline = make_pipeline(scaler, pca)
pipeline.fit(samples)  # fits scaler and pca in place

# Bar chart of the variance explained by each PCA feature
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()
```
4. Dimension reduction of the fish measurements

```
from sklearn.decomposition import PCA

# Fit a PCA that keeps 2 components (fit() returns the fitted estimator)
pca = PCA(n_components=2).fit(scaled_samples)

# Project scaled_samples onto those 2 components
pca_features = pca.transform(scaled_samples)

# Shape of the reduced feature array
print(pca_features.shape)
```
5. Simple tf-idf word-frequency array

```
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()

# Learn the vocabulary from documents and build the tf-idf matrix: csr_mat
csr_mat = tfidf.fit_transform(documents)

# Print result of toarray() method (dense view of the matrix)
print(csr_mat.toarray())

# Get the words: words
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps `words` a plain list of strings as before.
words = tfidf.get_feature_names_out().tolist()

# Print words
print(words)
```
6. Cluster Wikipedia

```
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Reduce articles to 50 SVD components, then cluster into 6 groups
svd = TruncatedSVD(n_components=50)
kmeans = KMeans(n_clusters=6)
pipeline = make_pipeline(svd, kmeans)

# Fit the pipeline and label every article
pipeline.fit(articles)
labels = pipeline.predict(articles)

# Show the titles ordered by cluster label
df = pd.DataFrame({'label': labels, 'article': titles})
print(df.sort_values(by='label'))
```
## Non-negative matrix factorisation
1. NMF features

```
import pandas as pd
from sklearn.decomposition import NMF

# Fit a 6-component NMF model on articles (fit() returns the fitted model)
model = NMF(n_components=6).fit(articles)

# NMF feature values for every article
nmf_features = model.transform(articles)
print(nmf_features)

# Index the features by title so single articles can be looked up by name
df = pd.DataFrame(nmf_features, index=titles)
for title in ('Anne Hathaway', 'Denzel Washington'):
    print(df.loc[title])
```
2. NMF components

```
import pandas as pd

# One row per NMF component, one column per entry of words
components_df = pd.DataFrame(data=model.components_, columns=words)
print(components_df.shape)

# Row 3 of the components table
component = components_df.iloc[3]

# Its largest entries (nlargest() with its default count)
print(component.nlargest())
```
3. Decompose images

```
from matplotlib import pyplot as plt
from sklearn.decomposition import NMF


def show_as_image(sample, shape=(13, 8)):
    """Display a flattened image vector as a grayscale bitmap.

    sample: 1-D array whose length equals shape[0] * shape[1].
    shape:  (rows, columns) to reshape the vector into; defaults to the
            13x8 layout used for the digit samples in this snippet.
    """
    bitmap = sample.reshape(shape)
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    plt.show()


# Take the first row of samples and print its raw values
digit = samples[0, :]
print(digit)

# Reshape the flat vector into a 13x8 grid and print it
bitmap = digit.reshape(13, 8)
print(bitmap)

# Display the digit as an image
show_as_image(digit)

# Factorise samples into 7 non-negative components
model = NMF(n_components=7)
features = model.fit_transform(samples)

# Show every learned component as an image
for component in model.components_:
    show_as_image(component)

# NMF feature values of the first sample
digit_features = features[0, :]

print(digit_features)
```
4. Recommender system

```
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, Normalizer

# Pipeline: MaxAbsScaler -> 20-component NMF -> Normalizer
scaler = MaxAbsScaler()
nmf = NMF(n_components=20)
normalizer = Normalizer()
pipeline = make_pipeline(scaler, nmf, normalizer)

# Normalised NMF features for every row of artists
norm_features = pipeline.fit_transform(artists)

# Index the features by artist name
df = pd.DataFrame(norm_features, index=artist_names)

# Dot product of every artist's feature row with Bruce Springsteen's row
artist = df.loc['Bruce Springsteen']
similarities = df.dot(artist)

# Artists with the highest similarity scores
print(similarities.nlargest())
```