In [None]:
import numpy as np

# Raw town data: one row per person, one column per feature.
# The last column is income (it is the column rescaled in Step 1); the meaning
# of the first three columns is not stated in this notebook -- presumably
# height, weight and age, TODO confirm.
data_matrix = np.array(
    (
        (170, 68, 30, 40000),
        (168, 70, 32, 42000),
        (170, 65, 29, 39000),
        (160, 55, 25, 35000),
        (172, 67, 33, 41000),
        (170, 68, 31, 40000),
        (160, 56, 26, 34000),
        (175, 75, 40, 50000),
        (170, 68, 30, 40000),
        (162, 58, 27, 36000),
    )
)

Normalization: The income column is on a much larger scale than the other columns, so we divide it by 1,000 (expressing income in thousands) to keep it from dominating the decomposition.

In [None]:
# Step 1: Express income (the last column) in thousands so that its magnitude
# is comparable to the other columns.
# Caution: this overwrites data_matrix in place, so running the cell a second
# time divides the column again. The array was built from integers, so the
# quotient is cast back to an integer when it is stored in the column.
income_in_thousands = np.true_divide(data_matrix[:, -1], 1000)
data_matrix[:, -1] = income_in_thousands
print(data_matrix)

[[170  68  30  40]
 [168  70  32  42]
 [170  65  29  39]
 [160  55  25  35]
 [172  67  33  41]
 [170  68  31  40]
 [160  56  26  34]
 [175  75  40  50]
 [170  68  30  40]
 [162  58  27  36]]


SVD: We apply Singular Value Decomposition to decompose the matrix into three parts: U, S, and VT.

In [None]:
# Step 2: Factor the (income-scaled) matrix with a singular value decomposition.
# S comes back as a 1-D array of singular values, not as a diagonal matrix;
# full_matrices=True requests the full square U and VT factors.
svd_factors = np.linalg.svd(data_matrix, full_matrices=True)
U, S, VT = svd_factors

Reduced Rank Selection: We choose a rank (e.g., rank 2) for the low-rank approximation. This rank determines the "amount of information" we keep in the approximation.

In [None]:
# Show the singular values so a cut-off rank can be chosen by inspection.
print(str(S))

[590.81035003  17.66003324   5.41904229   1.69926461]


In [None]:
# Step 3: Choose how many singular values/vectors to keep (here the top 2)
rank = 2

# Keep the leading `rank` columns of U and the leading `rank` rows of VT ...
U_reduced, VT_reduced = U[:, :rank], VT[:rank, :]
# ... and place the leading `rank` singular values on a square diagonal matrix
# so that the three factors can be multiplied back together.
S_reduced = np.diag(S[:rank])

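**U_reduced:** This is the compressed version of U, containing only the first 2 columns. Each row gives one person's weights on the two strongest patterns in the data. (Because the matrix is not mean-centered before the SVD, these patterns are not strictly directions of variance; the first one mostly reflects the overall magnitude of each row.)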

**S_reduced:** This is a diagonal matrix with only the top 2 singular values. These values represent the strength of each pattern retained in the approximation.

**VT_reduced:** This is the compressed version of VT, containing only the first 2 rows. This matrix represents the relationship between the features in the data, corresponding to the key patterns.

In [None]:
# Display the three truncated factors, rounded to 2 decimals for readability.
# The rank shown in each heading is taken from `rank` rather than hard-coded,
# so the labels stay correct if a different rank is chosen in Step 3.
print(f"Compressed U (Rank {rank}):")
print(np.round(U_reduced, 2))

print(f"\nCompressed S (Rank {rank}):")
print(np.round(S_reduced, 2))

print(f"\nCompressed VT (Rank {rank}):")
print(np.round(VT_reduced, 2))

Compressed U (Rank 2):
[[-0.32  0.03]
 [-0.32  0.26]
 [-0.32 -0.12]
 [-0.3  -0.43]
 [-0.32  0.06]
 [-0.32  0.06]
 [-0.3  -0.4 ]
 [-0.34  0.69]
 [-0.32  0.03]
 [-0.3  -0.3 ]]

Compressed S (Rank 2):
[[590.81   0.  ]
 [  0.    17.66]]

Compressed VT (Rank 2):
[[-0.9  -0.35 -0.16 -0.21]
 [-0.43  0.59  0.49  0.47]]


Reconstruction: We reconstruct the matrix using only the top 2 singular values and corresponding vectors, resulting in a rank-2 approximation of the original matrix.

In [None]:
# Step 4: Rebuild the matrix from the truncated factors: U_r (S_r VT_r).
# The singular values are applied to the rows of VT first, then the result is
# combined with the retained columns of U.
weighted_components = S_reduced.dot(VT_reduced)
low_rank_approximation = U_reduced.dot(weighted_components)

Rescaling: After obtaining the approximation, we rescale the income column back to the original scale.

In [None]:
# Step 5: Undo the Step 1 scaling on the approximation's income column
# (thousands -> original units). This modifies low_rank_approximation in place,
# so re-running the cell multiplies the column again.
low_rank_approximation[:, -1] *= 1000

In [None]:
# Print the original and the low-rank approximation matrices for comparison.
#
# data_matrix had its income column divided by 1000 in Step 1, whereas
# low_rank_approximation had its income column multiplied back by 1000 in
# Step 5. To compare like with like, rebuild the original-scale matrix on a
# copy (data_matrix itself is left untouched).
original_matrix = data_matrix.copy()
original_matrix[:, -1] = original_matrix[:, -1] * 1000

# suppress=True makes NumPy print fixed-point numbers instead of switching the
# whole array to scientific notation because of the large income values.
with np.printoptions(suppress=True):
    print("Original Matrix:")
    print(original_matrix)
    # Take the rank in the heading from `rank` so it matches the truncation used.
    print(f"\nLow-Rank Approximation (Rank {rank}):")
    print(np.round(low_rank_approximation, 2))

Original Matrix:
[[170  68  30  40]
 [168  70  32  42]
 [170  65  29  39]
 [160  55  25  35]
 [172  67  33  41]
 [170  68  31  40]
 [160  56  26  34]
 [175  75  40  50]
 [170  68  30  40]
 [162  58  27  36]]

Low-Rank Approximation (Rank 2):
[[1.701700e+02 6.657000e+01 3.119000e+01 4.072840e+04]
 [1.681500e+02 6.881000e+01 3.308000e+01 4.252206e+04]
 [1.700700e+02 6.444000e+01 2.961000e+01 3.913531e+04]
 [1.598900e+02 5.632000e+01 2.466000e+01 3.358510e+04]
 [1.718900e+02 6.760000e+01 3.177000e+01 4.141302e+04]
 [1.701000e+02 6.691000e+01 3.145000e+01 4.099332e+04]
 [1.598900e+02 5.678000e+01 2.501000e+01 3.393727e+04]
 [1.747700e+02 7.719000e+01 3.861000e+01 4.846343e+04]
 [1.701700e+02 6.657000e+01 3.119000e+01 4.072840e+04]
 [1.619000e+02 5.890000e+01 2.637000e+01 3.542022e+04]]
