In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt                           # plotting
import matplotlib.ticker as mtick
cmap = plt.cm.inferno

# Problem 1.3

## **Read Data**

In [2]:
df = pd.read_csv('w5.csv')
print(df.shape)
df.head()

(200, 6)


Unnamed: 0,y,x1,x2,x3,x4,x5
0,1.764742,0.925627,1.204426,0.949487,0.921375,0.95883
1,3.209386,0.459292,1.420216,-0.311133,0.967812,1.251093
2,-1.393265,0.844528,-0.634767,0.971616,0.020121,-0.224823
3,3.289348,0.946019,0.641351,1.06961,0.717315,0.612031
4,2.728297,0.024489,1.213368,0.26171,0.681006,0.864209


In [3]:
X = df.iloc[:,1:].to_numpy()
y = df.iloc[:,0].to_numpy().reshape(-1,1)

## Single Value Decomposition

$$ X = P \cdot D \cdot Q^{T} $$

In [25]:
P, d, Q_T = np.linalg.svd(X, full_matrices=False)

**1.1 Reconstruction**

In [26]:
X_reconstructed = P @ D @ Q_T
np.std(X - X_reconstructed)

6.090673275959986e-16

In [28]:
Q = Q_T.T
P = P.T
D = np.diag(d)  # D**2 are the eigenvalues
print('X: ', X.shape)
print('y: ', y.shape)
print('P: ', P.shape)
print('D: ', D.shape)
print('Q: ',Q_T.shape)

X:  (200, 5)
y:  (200, 1)
P:  (200, 5)
D:  (5, 5)
Q:  (5, 5)


**1.1 Reconstruction**

In [24]:
X_reconstructed = P @ D @ Q_T
np.std(X - X_reconstructed)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 5 is different from 200)

In [8]:
with np.printoptions(precision=2, suppress=True, threshold=5):
    print("PXQ':\n", P @ X @ Q)
    print("\n\nD:\n", D)

PXQ':
 [[18.91  0.   -0.    0.    0.  ]
 [-0.   10.88 -0.   -0.    0.  ]
 [ 0.   -0.    3.37  0.    0.  ]
 [ 0.    0.    0.    1.25 -0.  ]
 [-0.    0.    0.   -0.    0.88]]


D:
 [[18.91  0.    0.    0.    0.  ]
 [ 0.   10.88  0.    0.    0.  ]
 [ 0.    0.    3.37  0.    0.  ]
 [ 0.    0.    0.    1.25  0.  ]
 [ 0.    0.    0.    0.    0.88]]


**1.2 Transform the Data**

In [9]:
z = P.T @ y

**1.3 Linear Regression**

Recall Multiple Linear Regression

$$ \hat\beta = (X'X)^{-1}X'y $$

Recall Transform Model

$$ z = D \cdot \gamma $$

In [10]:
reg = LinearRegression(fit_intercept = False).fit(D, z) 
reg.coef_ 

array([[-1.09360122, -0.37678565, -0.67855824,  2.15723033, -1.26843182]])

In [13]:
gamma_hat = np.linalg.inv(D.T @ D) @ D.T @ z
print("gamma_hat: ", gamma_hat.flatten())

gamma_hat:  [-1.09360122 -0.37678565 -0.67855824  2.15723033 -1.26843182]


Get Beta: Two options:
- gamma = Q * B
- Get Beta from Data ( works because we are using all the PC)

In [15]:
beta_hat = gamma_hat.flatten() @ np.linalg.inv(Q)
print("beta_hat: ", beta_hat)

beta_hat:  [ 0.67177036  1.31224093 -0.27019562  1.99255642 -1.35808537]


In [17]:
beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y
print("beta_hat: ", beta_hat.flatten())

beta_hat:  [ 0.67177036  1.31224093 -0.27019562  1.99255642 -1.35808537]


# Problem 1.4: using only 4 principle components.

In [19]:
D_4 = D[:4, :4]
z_4 = P.T[:4, :] @ y
gamma_hat_4 = np.linalg.inv(D_4.T @ D_4) @ D_4.T @ z_4
print("gamma_hat_4: ", gamma_hat_4.flatten())

gamma_hat_4:  [-1.09360122 -0.37678565 -0.67855824  2.15723033]


In [21]:
print("beta_hat_4: ", (Q_T[:, :4] @ gamma_hat_4).flatten())

beta_hat_4:  [-0.46332873  0.69326518 -0.39259119 -0.11947841 -2.36396319]


In [22]:
print("beta_hat_4: ", (Q[:, :4] @ gamma_hat_4).flatten())

beta_hat_4:  [ 1.12427654  1.7212518  -0.28414886  0.89901372 -1.15600315]


In [None]:
pca = PCA(n_components = 4)
X_pca_4 = pca.fit_transform(X) # Data is Centered
pca.components_

In [None]:
reg = LinearRegression(fit_intercept = False).fit(D, z) 
reg.coef_ 

In [None]:
z_4.shape

In [None]:
X = df.iloc[:,1:].to_numpy()
X = (X - np.mean(X, axis = 0))
P, d, Q_T = np.linalg.svd(X, full_matrices=False)
D = np.diag(d)  # D**2 are the eigenvalues
X_reconstructed = P @ D @ Q_T
np.std(X - X_reconstructed)

In [None]:
Q_T

In [None]:
pca = PCA(n_components = 5)
X_pca_5 = pca.fit_transform(X)
pca.components_

In [None]:
X_pca_5.shape

In [None]:
pca = PCA(n_components = 4)
X_pca_4 = pca.fit_transform(X)
pca.components_

In [None]:
X_pca_4.shape

In [None]:
pca.singular_values_

In [None]:
P.T.shape

**1.3 Linear Regression**

**Feature Vector & Data Transform**

Each row is a component, top row is the first principal component (PC1). 

The columns are the features ordered from x1 through x5

In [None]:
pca_f5 = PCA(n_components=5)
X_pca = pca_f5.fit_transform(X)
print(np.round(pca_f5.components_,3))    

**Explained Variance Vector**

In [None]:
print('Variance explained by PC1 thru PC5 =', np.round(pca_f5.explained_variance_ratio_,3))

In [None]:
f, (ax10, ax11) = plt.subplots(1, 2,figsize=(15,6))
f.subplots_adjust(wspace=0.5,hspace = 0.3)

ax10.plot(np.arange(1,6,1),pca_f5.explained_variance_ratio_*100,color='blue',alpha=0.4)
ax10.scatter(np.arange(1,6,1),pca_f5.explained_variance_ratio_*100,color='blue',alpha=0.4,edgecolor='black')
ax10.set_xlabel('PC #'); ax10.set_ylabel('Variance Explained'); ax10.set_title('Variance Explained by each PC')
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt); ax10.set_xlim(1,6); ax10.set_ylim(0,100.0)
ax10.yaxis.set_major_formatter(yticks)

ax11.plot(np.arange(1,6,1),np.cumsum(pca_f5.explained_variance_ratio_*100),color='blue',alpha=0.4)
ax11.scatter(np.arange(1,6,1),np.cumsum(pca_f5.explained_variance_ratio_*100),color='blue',alpha=0.4,edgecolor='black')
ax11.plot([1,6],[95,95], color='red',linestyle='dashed')
ax11.set_xlabel('PC #'); ax11.set_ylabel('Cum. Variance Explained'); ax11.set_title('Cum. Variance Explained by each PC')
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt); ax11.set_xlim(1,6); ax11.set_ylim(0,100.0); ax11.annotate('95% variance explained',[4.05,90])
ax11.yaxis.set_major_formatter(yticks)

# plt.show()

**Transform Original Data**

In [None]:
X.shape

In [None]:
transformdata = np.matmul(pca_f5.components_ , X.T)  # Each row is an eigenvector and each row in X is a feature
transformdata.shape 

In [None]:
transformdata = transformdata.T 

In [None]:
reg = LinearRegression().fit(transformdata, y)

In [None]:
reg.coef_

## 1.3

$$ \hat\gamma = [-1.094,-0.377,-0.679,2.157,-1.268] $$

$$ \hat\beta =  [0.672,1.312,-0.27,1.993,-1.358] $$

## 1.4

$$ \hat\gamma = [-1.094,-0.377,-0.679,2.157] $$

$$ \hat\beta =  [1.124,1.721,-0.284,0.899,-1.156] $$

## 1.5

$$ \hat\gamma = [-1.094,-0.377] $$

$$ \hat\beta = [0.381,0.698,0.299,0.523,0.585] $$