In [1]:
import numpy as np

np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

In [2]:
X_centred = X - X.mean(axis = 0)
U, s, V = np.linalg.svd(X_centred)
c1 = V.T[:,0]
c2 = V.T[:,1]

In [3]:
W2 = V.T[:,:2]
X2D = X_centred.dot(W2)

In [4]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X2D = pca.fit_transform(X)
pca.components_

array([[-0.93636116, -0.29854881, -0.18465208],
       [ 0.34027485, -0.90119108, -0.2684542 ]])

In [5]:
pca.explained_variance_ratio_

array([ 0.84248607,  0.14631839])

In [6]:
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmin(cumsum >= 0.95) + 1

In [7]:
pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X)

In [8]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata("MNIST Original")
X = mnist["data"]
y = mnist["target"]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [10]:
from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)
    
X_reduced = inc_pca.transform(X_train)

In [13]:
m = len(X_train)
n = len(X_train[0])
X_mm = np.memmap(X_train, dtype="float32", mode="readonly", shape=(m,n))

batch_size = m // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
inc_pca.fit(X_mm)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8d in position 213: invalid start byte

In [14]:
rnd_pca = PCA(n_components=154, svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_train)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression()),
])

param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]
}]

In [None]:
grid_search = GridSearchCV(clf, param_grid)
grid_search.fit(X, y)

In [None]:
rbf_pca = PCA(n_components=2, kernel="rbf", gamma = 0.0433, fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(X, X_preimage)

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components = 2, n_neighbors = 10)
X_reduced = lle.fit_transform(X)

In [12]:
import time
from sklearn.ensemble import RandomForestClassifier

start = time.time()
forest_clf = RandomForestClassifier()
forest_clf.fit(X_train, y_train)
print(time.time() - start, "seconds to train")

predictions = forest_clf.predict(X_test)

8.213322639465332 seconds to train


In [14]:
start_2 = time.time()

pca = PCA(n_components = 0.95)
X_train_reduced = pca.fit_transform(X_train)

forest_clf_pca = RandomForestClassifier()
forest_clf_pca = forest_clf_pca.fit(X_train_reduced, y_train)
print(time.time() - start_2, "seconds to train")

dimensions = pca.n_components_
pca_test = PCA(n_components=dimensions)
X_test_reduced = pca.fit_transform(X_test)
predictions_pca = forest_clf_pca.predict(X_test_reduced)

32.61481523513794 seconds to train


ValueError: Number of features of the model must match the input. Model n_features is 154 and input n_features is 153 

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
dimension_reduced_accuracy = accuracy_score(y_test, predictions_pca)
print("Accuracy:",accuracy)
print("Accuracy after PCA:", dimension_reduced_accuracy)

In [None]:
from sklearn.manifold import TSNE

t_sne = TSNE(n_components=2, random_state=42)
sne_train = t_sne.fit_transform(X_train)
sne_train

In [None]:
from matplotlib.pyplot import plt