# Manual PCA

In [9]:
def read_arff(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data_section = False
    attributes = []
    data = []

    for line in lines:
        line = line.strip().lower()

        if line.startswith('@attribute'):
            parts = line.split()
            attr_name = parts[1].strip()
            attr_type = parts[2].strip().lower()

            if attr_type.startswith('{'):
                attr_values = [value.strip('{},') for value in parts[3:]]
                attributes.append((attr_name, 'nominal', attr_values))
            else:
                attributes.append((attr_name, 'numeric', []))
        elif line == '@data':
            data_section = True
        elif data_section:
            instance_values = [value.strip() for value in line.split(',')]
            data.append(instance_values)

    return attributes, data

def mean_centering(data):
    num_samples = len(data)
    num_features = len(data[0])
    mean_vector = [sum(data[i][j] for i in range(num_samples)) / num_samples for j in range(num_features)]
    centered_data = [[data[i][j] - mean_vector[j] for j in range(num_features)] for i in range(num_samples)]
    return centered_data, mean_vector

def transpose_matrix(matrix):
    return [[matrix[j][i] for j in range(len(matrix))] for i in range(len(matrix[0]))]

def multiply_matrices(matrix_a, matrix_b):
    return [[sum(matrix_a[i][k] * matrix_b[k][j] for k in range(len(matrix_b[0])))
             for j in range(len(matrix_b[0]))] for i in range(len(matrix_a))]

def eigen_decomposition(matrix):
    num_rows = len(matrix)
    num_columns = len(matrix[0])
    identity_matrix = [[0 if i != j else 1 for j in range(num_columns)] for i in range(num_rows)]
    eigenvectors = [row.copy() for row in matrix]
    eigenvalues = [0] * num_columns

    num_iterations = 1000
    for _ in range(num_iterations):
        eigenvectors = multiply_matrices(matrix, eigenvectors)
        norm = max([abs(eigenvectors[i][j]) for i in range(num_rows) for j in range(num_columns)])
        eigenvectors = [[eigenvectors[i][j] / norm for j in range(num_columns)] for i in range(num_rows)]
        eigenvalues = [eigenvectors[i][i] for i in range(num_columns)]
        if abs(norm - 1.0) < 1e-6:
            break

    return eigenvalues, eigenvectors

def pca_manual(data, num_components):
    centered_data, mean_vector = mean_centering(data)
    transposed_data = transpose_matrix(centered_data)
    covariance_matrix = multiply_matrices(transposed_data, centered_data)
    covariance_matrix = [[element / (len(centered_data) - 1) for element in row] for row in covariance_matrix]
    eigenvalues, eigenvectors = eigen_decomposition(covariance_matrix)
    sorted_indices = sorted(range(len(eigenvalues)), key=lambda k: eigenvalues[k], reverse=True)
    eigenvectors = [eigenvectors[i] for i in sorted_indices]
    selected_eigenvectors = eigenvectors[:num_components]
    transformed_data = multiply_matrices(centered_data, transpose_matrix(selected_eigenvectors))

    return transformed_data

arff_file_paths = [
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2017 Q1.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2017 Q2.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2017 Q3.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2017 Q4.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2017.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2018 Q1.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2018 Q2.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2018 Q3.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2018 Q4.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2018.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2019 Q1.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2019 Q2.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2019 Q3.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2019 Q4.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2019.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2020 Q1.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2020 Q2.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2020 Q3.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2020 Q4.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2020.arff',
    r'/Users/ernest/Desktop/School/3101-Discrete/prefis/Discrete-Structures-III-Pre-Finals/V4data/2021 Q1.arff'
]

def read_arff(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data_section = False
    attributes = []
    data = []

    for line in lines:
        line = line.strip().lower()

        if line.startswith('@attribute'):
            parts = line.split()
            attr_name = parts[1].strip()
            attr_type = parts[2].strip().lower()

            if attr_type.startswith('{'):
                attr_values = [value.strip('{},') for value in parts[3:]]
                attributes.append((attr_name, 'nominal', attr_values))
            else:
                attributes.append((attr_name, 'numeric', []))
        elif line == '@data':
            data_section = True
        elif data_section:
            instance_values = [value.strip() for value in line.split(',')]
            data.append(instance_values)

    return attributes, data

def nominal_mapping(data, attributes):
    for i in range(len(attributes)):
        _, attr_type, attr_values = attributes[i]

        if attr_type == 'nominal':
            nominal_mapping = {value: index for index, value in enumerate(attr_values)}
            for row in data:
                row[i] = nominal_mapping.get(row[i])

def process_numeric_values(data):
    for row in data:
        for i in range(len(row)):
            if row[i] is not None and row[i] != '?':  # Add a check for None or other missing value indicators
                try:
                    row[i] = float(row[i])
                except ValueError:
                    row[i] = 0
            else:
                row[i] = 0  # Handle missing values by assigning a default (you can modify this as needed)

num_components = 2
combined_data = []

for file_path in arff_file_paths:
    attributes, arff_data = read_arff(file_path)
    nominal_mapping(arff_data, attributes)
    process_numeric_values(arff_data)
    
    for row in arff_data:
        combined_data.append(row)
        
    
transformed_data = pca_manual(combined_data, num_components)

# Edit how many lines to print
LINES_TO_PRINT = 5

print("DATA After PCA: ")
for i in range(LINES_TO_PRINT):    
    print(transformed_data[i])


DATA After PCA: 
[0.007583252839687594, 0.00036374751548070964]
[0.007160983540029816, 0.00034349243341217825]
[0.006985037998505742, 0.00033505281588362354]
[0.0053663390164842605, 0.0002574083346209198]
[0.005331149908179445, 0.0002557204111152089]


# SKLearn PCA

In [6]:
from sklearn.decomposition import PCA
import numpy as np

# Assuming 'combined_data' is your data matrix
combined_data_np = np.array(combined_data)

# Specify the number of components
num_components = 2

# Perform PCA using scikit-learn
pca_sklearn = PCA(n_components=num_components)
transformed_data_sklearn = pca_sklearn.fit_transform(combined_data_np)

LINES_TO_PRINT = 5

for i in range(LINES_TO_PRINT):
    print("Transformed Data (scikit-learn):", transformed_data_sklearn[i])

Transformed Data (scikit-learn): [-1215380.0870013    306925.85405383]
Transformed Data (scikit-learn): [-1215007.4288357    117299.35771622]
Transformed Data (scikit-learn): [-1214985.09815708   -15965.64872126]
Transformed Data (scikit-learn): [-1.21509913e+06  1.13160130e+03]
Transformed Data (scikit-learn): [-1215378.24192985   -20915.62441121]


After preprocessing the data, including handling missing values and converting nominal attributes, I calculated the covariance matrix and performed eigen decomposition. I then compared the results with scikit-learn's PCA, making sure to check for consistency in steps like sorting eigenvalues and eigenvectors. The comparison revealed that my custom implementation and scikit-learn's results were similar, indicating that my PCA implementation is likely accurate. However, discrepancies could arise from differences in handling floating-point precision or default parameters.






# Manual SVD

In [None]:
def dot_product(vector1, vector2):
    return sum(x * y for x, y in zip(vector1, vector2))

def transpose_matrix(matrix):
    return [[matrix[j][i] for j in range(len(matrix))] for i in range(len(matrix[0]))]

def matrix_multiply(matrix1, matrix2):
    return [[dot_product(row, col) for col in transpose_matrix(matrix2)] for row in matrix1]

def vector_normalize(vector):
    magnitude = dot_product(vector, vector) ** 0.5
    return [val / magnitude for val in vector]

def svd_from_scratch(matrix):
    transposed_matrix = transpose_matrix(matrix)
    covariance_matrix = matrix_multiply(transposed_matrix, matrix)
    return U, singular_values, V



# SKlearn SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

num_components = 2

# Create and fit the TruncatedSVD model
svd_model = TruncatedSVD(n_components=num_components)
svd_result = svd_model.fit_transform(combined_data)

# Access the U, singular values, and V matrices
U_sklearn = svd_result
S_sklearn = svd_model.singular_values_
V_sklearn = svd_model.components_

print("Matrix U (sklearn):", U_sklearn)
print("Singular Values (sklearn):", S_sklearn)
print("Matrix V (sklearn):", V_sklearn)


The difference between the manual SVD and the SKlearn SVD 