CS3101 - PREFINALS

Stan Kiefer E. Gallego

PCA & SVD - own code

In [12]:
def parse_arff(file_path):
    data_started = False
    attrs = []
    data = []
    nom_mapping = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attrs.append((attr_name, 'nominal', values))
                    attr_info = ((attr_name, 'nominal', values))

                    attr_name, attr_type, attr_values = attr_info

                    nom_mapping.append({value: index for index, value in enumerate(attr_values)})
                else:
                    attrs.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')

                data.append(line.split(','))

    return attrs, data

def standardize_data(features):
    mean_X = []
    std_X = []
    std_Features = features

    for index in range(len(features)):
        mean_X.append(sum(features[index]) / len(features[index]))

        std_X.append(((sum((x - mean_X[index]) ** 2 for x in features[index])) / (len(features[index]))) ** 0.5)

    for indexStd in range(len(std_Features)):
        std_Features[indexStd] = [((x - mean_X[indexStd]) / std_X[indexStd]) for x in std_Features[indexStd]]

    return std_Features, mean_X, std_X

def calculate_covariance_matrix(features):

    n = len(features[0])
    num_samples = len(features)
    
    mean_values = [sum(feature) / num_samples for feature in features]

    cov_matrix = [[0] * num_samples for _ in range(num_samples)]

    for x in range(num_samples):
        for y in range(num_samples):
            cov_matrix[x][y] = sum((features[x][i] - mean_values[x]) * (features[y][i] - mean_values[y]) for i in range(n)) / (n - 1)
    
    return cov_matrix

def dot_prod(vec1, vec2):
    return sum(x * y for x, y in zip(vec1, vec2))

def scalar_mult(scalar, vec):
    return [scalar * x for x in vec]

def subtract_vecs(vec1, vec2):
    return [x - y for x, y in zip(vec1, vec2)]

def transpose(matrix):
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]

def mat_mult(mat1, mat2):
    return [[dot_prod(row, col) for col in transpose(mat2)] for row in mat1]

def svd(data):
    mean_vec = [sum(feature) / len(data) for feature in transpose(data)]
    centered_data = [subtract_vecs(row, mean_vec) for row in data]

    cov_matrix = mat_mult(transpose(centered_data), centered_data)

    singular_vec = [1.0] * len(cov_matrix[0])
    for _ in range(50):
        singular_vec = mat_mult([singular_vec], cov_matrix)[0]
        magnitude = sum(x ** 2 for x in singular_vec) ** 0.5
        singular_vec = scalar_mult(1.0 / magnitude, singular_vec)

    u_mat = [singular_vec]
    v_mat = [singular_vec]

    singular_val = dot_prod(centered_data[0], singular_vec)
    s_mat = [[singular_val if i == j else 0.0 for j in range(len(u_mat))] for i in range(len(v_mat))]

    return u_mat, s_mat, transpose(v_mat)

def perform_pca(data, num_components):
    mean_vec = [sum(feature) / len(data) for feature in transpose(data)]
    centered_data = [subtract_vecs(row, mean_vec) for row in data]

    cov_matrix = mat_mult(transpose(centered_data), centered_data)
    u_mat, _, v_t = svd(cov_matrix)

    principal_comp = [row[:num_components] for row in transpose(v_t)]
    projected_data = mat_mult(centered_data, principal_comp)

    return projected_data

def main():
    datasetFile = ['./dataset/2017.arff','./dataset/2018.arff','./dataset/2019.arff','./dataset/2020.arff','./dataset/2021 Q1.arff']
    
    for file_path in datasetFile:
            dataset_label = file_path.split('/')[-1].split('.')[0]
    
            attrs, data = parse_arff(file_path)
    
            for row in data:
                for i in range(len(attrs)):
                    attr_name, attr_type, attr_values = attrs[i]
                    if attr_type == 'nominal':
                        nom_mapping = {value: index for index, value in enumerate(attr_values)}
                        row[i] = nom_mapping.get(row[i])
                    elif attr_type == 'numeric':
                        try:
                            row[i] = float(row[i])
                        except Exception as e:
                            row[i] = -1
    
    data_as_list = [list(map(float, row)) for row in data]
    
    num_comp_pca = 3
    projected_data_pca = perform_pca(data_as_list, num_comp_pca)
    
    print("\nProjected Data (PCA):")
    for row in projected_data_pca:
                print(row)
    
    u_svd, s_svd, v_t_svd = svd(data_as_list)
    
    print("\nMatrix U (SVD):")
    for row in u_svd:
        print(row)
    
    print("\nMatrix S (SVD):")
    for row in s_svd:
        print(row)
    
    print("\nMatrix V^T (SVD):")
    for row in v_t_svd:
        print(row)

if __name__ == "__main__":
    main()


Projected Data (PCA):
[0.06595656696010836, 2.7955231469703128e-05, 0.00020791176080777593]
[0.062283811491332025, 2.6398559647724302e-05, 0.00019633430776975594]
[0.06075349671267522, 2.5749946388566456e-05, 0.00019151036900391427]
[0.046674600749032596, 1.9782704404314278e-05, 0.0001471301323581709]
[0.04636853779330124, 1.965298175248271e-05, 0.00014616534460500257]
[0.03841090094428584, 1.6280192804861916e-05, 0.0001210808630226259]
[0.026474445670762754, 1.1221009383430722e-05, 8.345414064906087e-05]
[0.02555625680356867, 1.0831841427936014e-05, 8.055977738955587e-05]
[0.015456179264433746, 6.550993917494236e-06, 4.8721781535000856e-05]
[0.002601535123716571, 1.1026425405683369e-06, 8.200695901930837e-06]
[-0.009640983105537881, -4.086263532694425e-06, -3.0390814224802516e-05]
[-0.013619801530045578, -5.7726580065048225e-06, -4.293305501599086e-05]
[-0.015456179264433746, -6.550993917494236e-06, -4.8721781535000856e-05]
[-0.019741060644672802, -8.367111043136203e-06, -6.222881007

**CONCLUSION**

The two approaches have significant differences. Notably, the first method uses a proprietary mapping to convert nominal attributes to numeric values, whereas the second approach uses scikit-learn's LabelEncoder. The former returns -1 for missing numeric values, while the latter uses scikit-learn's SimpleImputer for imputation. Furthermore, the first technique uses a simplified Singular Value Decomposition (SVD) with a fixed iteration count, whereas the latter uses scikit-learn's TruncatedSVD, which may use more advanced algorithms. In terms of Principal Component Analysis (PCA), the first method derives PCA manually from SVD results, whereas the second approach incorporates scikit-learn's PCA module. Despite the fact that they both project onto three components, the former handles data more manually, whereas the latter processes data using Pandas DataFrames and scikit-learn utilities. Discrepancies in the SVD results' scales and values can be attributed to factors such as the initial approach's custom SVD implementation, differences in handling nominal attributes and missing values, and potential differences in the PCA computation algorithm. The observed variations may also be influenced by numerical precision during calculations.

