# Decomposition

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

# Seed NumPy's global generator so the sample below (and every later cell that
# draws from np.random) gives the same result on each "Restart & Run All".
np.random.seed(320)

# 100 x-values drawn uniformly from [0.1, 5)
x = np.random.uniform(0.1, 5, 100)
# Gaussian noise (standard deviation 0.3), one value per x
noise = np.random.normal(scale=0.3, size=x.size)

## Intuition: factorization
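Why is it useful to express something as a few parts multiplied together?
Because the factored form conveys more information: for example, the roots of a polynomial can be read directly off its factors.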

In [None]:
# Factored form of the cubic  y = -x**3 + 7*x**2 - 14*x + 8.
# The points where y == 0 can be read straight off the factors: x = 4, 2, 1.
# x is the independent variable; y is the dependent variable.
root_hi, root_mid, root_lo = 4, 2, 1
y = (root_hi - x) * (root_mid - x) * (root_lo - x)

In [None]:
# Scatter the noisy samples and draw the y = 0 baseline so the roots stand out.
noisy_points = pd.DataFrame({"x": x, "y": y + noise})
noisy_points.plot.scatter(x="x", y="y")
plt.hlines(0, -1, 6, color="k")

## Some cool dimensionality reduction examples:
https://pair-code.github.io/understanding-umap/ \
https://distill.pub/2016/misread-tsne/ 

# Matrix Multiplication

In [None]:
# Four random matrices whose shapes chain together: each matrix has as many
# columns as the next one has rows (9x7, 7x14, 14x3, 3x10).
A, B, C, D = (
    np.random.normal(size=dims)
    for dims in [(9, 7), (7, 14), (14, 3), (3, 10)]
)

In [None]:
# Show the four shapes side by side
print(*(matrix.shape for matrix in (A, B, C, D)))

In [None]:
# 1. A chain of matrix products is possible when each matrix's column count
#    equals the row count of the matrix that follows it.
# 2. The result has the row count of the first matrix and the column count of
#    the last one.
product = A @ B @ C @ D
product.shape

Question: Is it possible to use fewer columns to represent this dataframe?

In [None]:
# Columns A and B come from make_blobs; C and D are computed from them, so they
# add no new information.
blob_features = make_blobs(centers=2, random_state=320)[0]
df = pd.DataFrame(blob_features, columns=["A", "B"]).assign(
    C=lambda frame: frame["A"] * 2,
    D=lambda frame: frame["A"] - frame["B"],
)
df.head()

A: Yes. C is two times A and D is A - B, so we only need A and B, plus their relationships to C and D, to represent the dataframe.

## Decomposition with Principal Component Analysis (PCA)

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition

```python
from sklearn.decomposition import PCA
```

In [None]:
# Fit PCA on df without limiting the number of components.
p = PCA()
# W: df expressed in terms of the fitted components
W = p.fit_transform(df)
# C: the component matrix (one row per component).
# NOTE: this rebinds C, which previously held the random 14x3 matrix.
C = p.components_
# m: the per-column mean PCA computed while fitting
m = p.mean_

In [None]:
# Shape of the transformed data returned by fit_transform
W.shape

In [None]:
# W @ C

In [None]:
# Shape of the component matrix
C.shape

In [None]:
# Original data, shown for comparison with the reconstruction attempts below
df.head()

In [None]:
# First attempt at rebuilding the original dataframe from W and C alone
# (the mean is not added back yet).
attempt = W @ C
pd.DataFrame(attempt).head()

In [None]:
# PCA starts by computing the mean of the data it is fitted on
m = p.mean_
print(m.shape, m, sep="\n")

In [None]:
# Column means computed by pandas, for comparison with p.mean_
df.mean()

In [None]:
# Rebuild the original dataframe from W and C, adding the mean back
reconstructed = W @ C + m
pd.DataFrame(reconstructed).head()

In [None]:
# Original data again, to compare against the reconstruction
df.head()

C is called the **component matrix** \
first row of C is the most important component \
second row of C is the second most important component \
and so on ...

Each row gives the direction (slope) of the corresponding component.

In [None]:
# The component matrix: one row per component.
# df's columns C and D are computed from A and B, so presumably only the first
# two components carry meaningful variance -- verify with explained_variance_.
C

For the first component, PCA fits a line that crosses the mean point and 
along which the points are spread out the most. \
The second component is perpendicular to the first, also crosses the mean point, 
and has the largest spread of points along its own direction. 

First column of W represents relative positions of points along the first component \
Second column of W represents relative positions of points along the second component \
and so on ...

In [None]:
# Shapes of the weights and of the component matrix
print(*(arr.shape for arr in (W, C)))

In [None]:
# Use only the first two components to reconstruct the dataframe:
# the first k columns of W times the first k rows of C, plus the mean.
k = 2
pd.DataFrame(W[:, :k] @ C[:k, :] + m).head()

In [None]:
# Original data, to compare against the two-component reconstruction
df.head()

In [None]:
# Approximate reconstruction from the first component only:
# the first column of W (positions of the points along the first component)
# multiplied by the first row of C (the first component), plus the mean.
first_weights = W[:, :1]
first_component = C[:1, :]
pd.DataFrame(first_weights @ first_component + p.mean_).head()

## Explained Variance
 * Let's check how close the above dataframe is to the original dataframe

In [None]:
# Toy example: a holds the original values (its variance is the baseline used below)
a = np.array([1.1, 1.9, 3.2])
a

In [None]:
# b is the approximation of a that gets subtracted from it below
b = np.array([1, 2, 3])
b

In [None]:
# What is left of a after subtracting the approximation b
a - b

In [None]:
# Variance of the original values
before = np.var(a)

In [None]:
# Variance that remains once the approximation b has been subtracted
leftover = a - b
after = np.var(leftover)

In [None]:
# Fraction of the original variance that subtracting b removed
improvement = (before - after)/before
improvement

In [None]:
# The same quantity in one expression: 1 - (remaining variance / original variance)
1 - (a - b).var() / a.var()

In [None]:
# The amount of variance explained by each component:
# the first component explains the most variance,
# the second component explains the second most,
# and so on.
# Keep full precision in the variable and round only what is displayed, so
# that anything computed from `explained_variance` afterwards (such as a ratio)
# is not distorted by rounding.
explained_variance = p.explained_variance_
explained_variance.round(2)

In [None]:
# Each component's share of the total explained variance
variance_share = explained_variance / explained_variance.sum()
variance_share.round(2)

In [None]:
# Explained variance as a fraction of the total, as reported by the fitted PCA
p.explained_variance_ratio_.round(2)

### Cumulative plot of explained variance ratio

In [None]:
# cumsum() computes the running total of the per-component ratios
cumulative_ratio = p.explained_variance_ratio_.cumsum()
# Index by number of components (1..k). k is taken from the fitted model rather
# than hard-coded, so the cell still works if `p` has a different number of
# components.
s = pd.Series(cumulative_ratio, index=range(1, len(cumulative_ratio) + 1))
ax = s.plot.line()
ax.set_ylabel("Cumulative Explained Variance")
ax.set_xlabel("Number of Components")

In [None]:
# Same cumulative plot, with the y-axis starting at 0 (ylim=0).
# cumsum() computes the running total of the per-component ratios
cumulative_ratio = p.explained_variance_ratio_.cumsum()
# Index by number of components (1..k). k is taken from the fitted model rather
# than hard-coded, so the cell still works if `p` has a different number of
# components.
s = pd.Series(cumulative_ratio, index=range(1, len(cumulative_ratio) + 1))
ax = s.plot.line(ylim=0)
ax.set_ylabel("Cumulative Explained Variance")
ax.set_xlabel("Number of Components")

# Dimensionality Reduction on Feature Columns

In [None]:
# Use the W columns for machine learning and visualization, because they tell
# us a lot about the original 4 columns
pd.DataFrame(W).head()

In [None]:
# Passing an int chooses how many weight columns and component rows to keep
p = PCA(n_components=2)
W = p.fit_transform(df)
C, m = p.components_, p.mean_

In [None]:
# Preview the first few rows of the weights instead of dumping the whole array
W[:5]

In [None]:
# Shapes after keeping 2 components
print(*(arr.shape for arr in (W, C)))

In [None]:
# Passing a float says how much variance we want to explain
# (see explained_variance_ratio_)
p = PCA(n_components=0.96)
W = p.fit_transform(df)
C, m = p.components_, p.mean_

In [None]:
# Shapes after asking PCA to explain 96% of the variance
print(*(arr.shape for arr in (W, C)))

In [None]:
# The original four-column dataframe that the pipeline below is fitted on
df

In [None]:
# Reduce df to 2 principal components, then cluster the reduced data.
pipe = Pipeline([
    ("pca", PCA(n_components=2)),
    # random_state pins the centroid initialisation so the cluster labels are
    # the same on every run; n_init is set explicitly because its default
    # differs between scikit-learn releases.
    ("km", KMeans(n_clusters=3, n_init=10, random_state=320)),
])

# fit_predict: fit PCA, transform df with it, fit KMeans on the PCA output,
# and return the cluster label of every row.
groups = pipe.fit_predict(df)
groups

In [None]:
# pipe["pca"].transform(df)

In [None]:
# Plot the rows in PCA space (component 0 vs component 1), coloured by cluster
reduced = pd.DataFrame(pipe["pca"].transform(df))
reduced.plot.scatter(x=0, y=1, c=groups)

# Lossy Compression

Use PCA to keep the most important information and throw away the less important parts

In [None]:
# Read the image file into an array and display it
img = plt.imread("bug.jpg")
plt.imshow(img)

In [None]:
# Check what kind of object imread returned
type(img)

In [None]:
# Dimensions of the loaded image array (the next cell averages over axis 2)
img.shape

In [None]:
# Average over the colour axis so the image becomes a single 2-D array, which
# is easier to handle.
# The ndim guard makes the cell safe to re-run: once img is already 2-D,
# img.mean(axis=2) would raise because that axis no longer exists.
if img.ndim == 3:
    img = img.mean(axis=2)
img.shape

In [None]:
# Display the averaged image with a gray colormap
plt.imshow(img, cmap="gray")

In [None]:
# img

In [None]:
# We want to explain 95% of the variance of the image
p = PCA(n_components=0.95)
W = p.fit_transform(img)
C, m = p.components_, p.mean_

In [None]:
# Number of values needed to store the image as-is
original_size = img.size
original_size

In [None]:
# Number of values needed to store the PCA pieces: weights, components and mean
compressed_size = sum(part.size for part in (W, C, m))
compressed_size

In [None]:
# Compression ratio: how many times more values the original image holds than
# W, C and m combined
original_size / compressed_size

In [None]:
# Shape of the component matrix kept for the image
C.shape

In [None]:
# Rebuild the image from its PCA pieces and display it
restored = W @ C + m
plt.imshow(restored, cmap="gray")

In [None]:
# Save the full image array in .npz format.
# The file is opened with "wb" because the data is written in binary format.
with open("img1.npz", "wb") as f: 
    np.savez(f, img)

In [None]:
# Save the three PCA pieces (W, C, m) instead of the image itself.
# They are passed positionally, in the order W, C, m.
with open("img2.npz", "wb") as f: 
    np.savez(f, W, C, m)

In [None]:
# Load the three arrays back in the order they were saved; np.savez names
# positional arrays arr_0, arr_1, arr_2.
with np.load("img2.npz") as archive:
    W, C, m = (archive[key] for key in ("arr_0", "arr_1", "arr_2"))

In [None]:
# Rebuild the image from the arrays loaded from disk and display it
restored_from_disk = W @ C + m
plt.imshow(restored_from_disk, cmap="gray")

In [None]:
# original plot size vs the compressed plot size
!ls -lh