### Singular Value Decomposition Example

In [1]:
import numpy as np
import pandas as pd
from scipy.linalg import svd

In [2]:
# Create a sample matrix with uniform random float numbers between 1 and 5
np.random.seed(42)
matrix = np.random.uniform(1, 5, size = (5, 5))

# Randomly assign NaN values
matrix[np.random.random(matrix.shape) < 0.3] = np.nan

# Create a DataFrame from the matrix
df = pd.DataFrame(matrix)
df

Unnamed: 0,0,1,2,3,4
0,2.49816,,3.927976,3.394634,
1,1.623978,,,3.40446,3.83229
2,1.082338,4.879639,,1.849356,1.7273
3,,2.216969,,2.72778,
4,3.447412,1.557975,2.168579,2.465447,


In [3]:
def svd_impute(matrix, k):

    """
    Impute missing values in a matrix using Singular Value Decomposition.

    Parameters:
    - matrix: Input matrix with missing values
    - k: Number of singular values and vectors to use for reconstruction
    
    Returns:
    - Matrix with imputed values
    """

    # Fill NaN values in the matrix with the mean of each column
    filled_matrix = np.where(np.isnan(matrix), np.nanmean(matrix, axis = 0), matrix)
    
    # Perform Singular Value Decomposition
    U, s, Vt = svd(filled_matrix, full_matrices = False)

    print("\nSigma matrix:")
    print(s)

    # Reconstruct matrix using top k singular values and vactors
    reconstructed = np.dot(U[:, :k], np.dot(np.diag(s[:k]), Vt[:k, :]))

    # Replace the NaN values in the filled_matrix with the corresponding values from the reconstructed matrix
    filled_matrix[np.isnan(matrix)] = reconstructed[np.isnan(matrix)]
    
    return filled_matrix, s

In [4]:
# Perform Singular Value Decomposition imputation
k = 2  # Number of singular values to use
imputed_matrix, Sigma = svd_impute(matrix, k) # Impute the missing values in the matrix using SVD

# Clip values to ensure they stay within 1 to 5
imputed_matrix = np.clip(imputed_matrix, 1, 5)

# Convert the imputed matrix into a DataFrame
df_imputed = pd.DataFrame(imputed_matrix)

print("\nImputed matrix:")
print(df_imputed)

# Create a mask to identify the positions of the original NaN values in the matrix
imputed_mask = np.isnan(matrix)

def highlight_imputed(matrix, imputed_matrix, mask):
        
    """
    Highlight imputed values in the DataFrame.

    Parameters:
    - matrix: Original DataFrame with NaN values
    - imputed_matrix: Imputed matrix with filled values
    - mask: Boolean mask identifying original NaN positions

    Returns:
    - DataFrame with imputed values highlighted
    """

    # Create a new DataFrame for highlighting
    highlight_df = pd.DataFrame(imputed_matrix, columns = matrix.columns)
    
    # If the original value was NaN, format it with brackets
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            if mask[i, j]:
                highlight_df.iloc[i, j] = f"[{highlight_df.iloc[i, j]:.3f}]"
            else:
                highlight_df.iloc[i, j] = f"{highlight_df.iloc[i, j]:.3f}"

    return highlight_df

# Highlight imputed values in the DataFrame
highlighted_imputed_matrix = highlight_imputed(df, df_imputed, imputed_mask)
print("\nImputed matrix (imputed values in brackets):")
print(highlighted_imputed_matrix)


Sigma matrix:
[13.79267417  3.16189592  1.46989499  0.95074141  0.15299826]

Imputed matrix:
          0         1         2         3         4
0  2.498160  3.102215  3.927976  3.394634  3.231569
1  1.623978  2.991536  3.330003  3.404460  3.832290
2  1.082338  4.879639  3.200333  1.849356  1.727300
3  2.266037  2.216969  2.839124  2.727780  2.791106
4  3.447412  1.557975  2.168579  2.465447  2.967974

Imputed matrix (imputed values in brackets):
         0        1        2      3        4
0    2.498  [3.102]    3.928  3.395  [3.232]
1    1.624  [2.992]  [3.330]  3.404    3.832
2    1.082    4.880  [3.200]  1.849    1.727
3  [2.266]    2.217  [2.839]  2.728  [2.791]
4    3.447    1.558    2.169  2.465  [2.968]


  highlight_df.iloc[i, j] = f"{highlight_df.iloc[i, j]:.3f}"
  highlight_df.iloc[i, j] = f"[{highlight_df.iloc[i, j]:.3f}]"
  highlight_df.iloc[i, j] = f"{highlight_df.iloc[i, j]:.3f}"
  highlight_df.iloc[i, j] = f"{highlight_df.iloc[i, j]:.3f}"
  highlight_df.iloc[i, j] = f"[{highlight_df.iloc[i, j]:.3f}]"


Way to determine K

In [5]:
squared_values = Sigma ** 2 # Square the singular values
total_variance = np.sum(squared_values) # Calculate the total variance
variance_explained = squared_values / total_variance # Calculate the variance explained by each singular value
cumulative_variance = np.cumsum(variance_explained) # Calculate the cumulative variance explained by the singular values

print(variance_explained)
print(cumulative_variance)

[9.35641949e-01 4.91708678e-02 1.06263802e-02 4.44567339e-03
 1.15129265e-04]
[0.93564195 0.98481282 0.9954392  0.99988487 1.        ]
