<a target="_blank" href="https://colab.research.google.com/github/LuWidme/uk259/blob/main/demos/Dimensionality%20Reduction%20Demo.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>



# Dimensionality Reduction

The following is a small demonstration of PCA and some practical applications of the method.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

# Toy example: six 2-D sample points.
X = np.array(
    [
        [-1, -1],
        [-2, -1],
        [-3, -2],
        [1, 1],
        [2, 1],
        [3, 2],
    ]
)

# Keep both components so the variance split between them can be inspected.
pca = PCA(n_components=2).fit(X)

# Fraction of the total variance captured by each principal component.
print(pca.explained_variance_ratio_)




## Visualizing 4D data
### Idea 1

3D Plot with 4th dimension encoded as colour / point size / symbol.

In [None]:

# Optional: uncomment the magic below to enable interactive 3D plots
# %matplotlib widget

# Read the iris data set into a pandas DataFrame; the column names are
# supplied explicitly through `names`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(
    url,
    names=[
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'target',
    ],
)
df

In [None]:

%matplotlib widget

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')

x = df['sepal length']
y = df['sepal width']
z = df['petal length']
s = df['petal width']
c = df['target']
colors = {'Iris-setosa':"red", 'Iris-versicolor':"blue", 'Iris-virginica':"green"}
print(c.unique())

ax.set_xlabel("sepal length")
ax.set_ylabel("sepal width")
ax.set_zlabel("petal length")

ax.scatter(x, y, z, s = df['petal width']*15 , c=c.map(colors))

plt.show()

### Idea 2: Principal Component Analysis
Principal Component Analysis (PCA) attempts to identify the principal components (the eigenvectors of the data's covariance matrix) that best describe the variance in the data. Using these PCs, we can visualize points in a lower dimension (*d*) by using only the first *d* PCs.

In [None]:
from sklearn import datasets
from sklearn.decomposition import PCA

# as_frame=True returns pandas objects: `iris.data` (features),
# `iris.target` (class labels) and `iris.frame` (both combined).
iris = datasets.load_iris(as_frame=True)

X = iris.data
y = iris.target
target_names = iris.target_names
print(X.shape)

# Pairwise scatter plots of all features, coloured by class label.
# Requires seaborn to be imported as `sns` before this cell runs.
sns.pairplot(data=iris.frame, hue='target')

Calculate the variance explained by each PC:

In [None]:

# Fit a PCA that keeps 4 components and project the samples onto them.
pca = PCA(n_components=4)
X_r = pca.fit_transform(X)

# Percentage of variance explained for each component
ratio_text = str(pca.explained_variance_ratio_)
print("explained variance ratio for each component: %s" % ratio_text)

# Share of the total variance retained when only the first two PCs are kept
first_two_total = sum(pca.explained_variance_ratio_[:2])
print("\ntotal variance explained by first 2 components:\n%f" % first_two_total)


In [None]:


# Scatter the samples in the plane spanned by the first two principal
# components, one colour per class.
plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2

for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    # Convert the comparison result to a plain NumPy boolean mask so that the
    # indexing of X_r works whether y is a pandas Series or an array.
    mask = np.asarray(y == i)
    plt.scatter(
        X_r[mask, 0], X_r[mask, 1], color=color, alpha=0.8, lw=lw, label=target_name
    )
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA of IRIS dataset")
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

In [None]:
# Generates the base diagram
# PCA example
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# There is nearly a linear shape
rng = np.random.RandomState(1)
X = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
plt.scatter(X[:, 0], X[:, 1])
plt.axis('equal')


In [None]:
# Overlay the principal axes as arrows on top of the data
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X)


def draw_vector(v0, v1, ax=None):
    """Draw a red arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : start point of the arrow (x, y).
    v1 : end point of the arrow (x, y), where the arrow head is drawn.
    ax : axes to draw on; the current matplotlib axes are used when omitted.
    """
    target_ax = ax if ax else plt.gca()
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        'shrinkA': 0,
        'shrinkB': 0,
        'color': 'red',
    }
    # An annotation with empty text renders only the arrow (xytext -> xy).
    target_ax.annotate('', xy=v1, xytext=v0, arrowprops=arrow_style)


# plot data
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
for variance, direction in zip(pca.explained_variance_, pca.components_):
    # Arrow length: three standard deviations along this component
    offset = direction * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, pca.mean_ + offset)
plt.axis('equal')


In [None]:
# Removing an axis:
# using PCA to reduce the dimensionality of the data
from sklearn.decomposition import PCA

pca = PCA(n_components=1)
X_pca = pca.fit(X).transform(X)

print("original shape:   ", X.shape)
print("transformed shape:", X_pca.shape)  # reduced to a single component per sample

# Density estimate of the samples along the single remaining component
sns.displot(X_pca, kind="kde")

In [None]:

# Map the 1-D PCA representation back into the original 2-D space and plot it
# (blue) together with the original data (red).
# All points get pulled onto the new axis, yet the overall location of the
# cluster is still well preserved.
X_new = pca.inverse_transform(X_pca)

original_style = dict(alpha=0.2, color="red", s=5)
reconstructed_style = dict(alpha=0.6, color="dodgerblue", s=5)

plt.scatter(X[:, 0], X[:, 1], **original_style)
plt.scatter(X_new[:, 0], X_new[:, 1], **reconstructed_style)
plt.axis('equal')
