# CS 3101 - PREFINAL EXAM

## Part 1: PCA and SVD from Scratch

In [34]:
def parse_arff(file_path):
    """Parse a simple (dense) ARFF file.

    Blank lines and ``%`` comment lines are skipped. Header keywords
    (``@relation``, ``@attribute``, ``@data``) are matched case-insensitively.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A ``(attributes, data)`` tuple where

        * ``attributes`` is a list of ``(name, kind, values)`` tuples;
          ``kind`` is ``'nominal'`` (``values`` is the list of labels found
          between ``{`` and ``}``) or ``'numeric'`` (``values`` is ``0``);
        * ``data`` is a list of rows, each a list of raw string fields.

    Nominal labels and data fields are stripped of surrounding whitespace so
    that ``{red, green}`` in the header matches ``green`` in the data section.
    """
    attributes = []
    data = []
    data_started = False

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            lowered = line.lower()

            if lowered.startswith('@relation'):
                continue

            if lowered.startswith('@attribute'):
                attr_name = line.split()[1].strip()
                if '{' in line:
                    raw_values = line[line.index('{') + 1:line.index('}')]
                    values = [value.strip() for value in raw_values.split(',')]
                    attributes.append((attr_name, 'nominal', values))
                else:
                    attributes.append((attr_name, 'numeric', 0))
                # A header line is never a data row.
                continue

            if lowered.startswith('@data'):
                data_started = True
                continue

            if data_started:
                data.append([field.strip() for field in line.split(',')])

    return attributes, data


def standardize(matrix):
    """Return a z-scored copy of ``matrix`` (rows = samples, columns = features).

    Each value becomes ``(x - column_mean) / column_std`` using the population
    standard deviation (divide by the number of samples).

    A constant column has zero standard deviation; its standardized values
    are reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        matrix: List of equally long numeric rows.

    Returns:
        A new list of rows with the same shape; ``[]`` for an empty matrix.
    """
    if not matrix:
        return []

    columns = list(zip(*matrix))
    means = [sum(col) / len(col) for col in columns]
    std_devs = [
        (sum((x - mu) ** 2 for x in col) / len(col)) ** 0.5
        for col, mu in zip(columns, means)
    ]

    return [
        [(x - mu) / sd if sd else 0.0 for x, mu, sd in zip(row, means, std_devs)]
        for row in matrix
    ]

def mean(vector):
    """Return the arithmetic mean of the numbers in ``vector``.

    Raises ``ZeroDivisionError`` when ``vector`` is empty.
    """
    total = sum(vector)
    count = len(vector)
    return total / count

def subtract_mean(matrix):
    """Return a copy of ``matrix`` with each column's mean subtracted.

    Rows are samples and columns are features; the input is not modified.
    """
    col_means = []
    for column in zip(*matrix):
        col_means.append(mean(column))

    centered = []
    for row in matrix:
        centered.append([value - col_means[idx] for idx, value in enumerate(row)])
    return centered

def covariance_matrix(matrix):
    """Return the sample covariance matrix of the features of ``matrix``.

    Rows of ``matrix`` are samples and columns are features, so the result is
    ``num_features x num_features``. The unbiased estimator (division by
    ``n - 1``) is used.

    Args:
        matrix: List of equally long rows whose entries are convertible to
            ``float``.

    Returns:
        A symmetric list-of-lists covariance matrix.

    Raises:
        ValueError: If fewer than two samples are supplied, because the
            ``n - 1`` denominator would be zero.
    """
    n = len(matrix)
    if n < 2:
        raise ValueError("covariance_matrix needs at least two samples")

    # Work column-wise: covariance is taken between features, not samples.
    columns = [[float(val) for val in col] for col in zip(*matrix)]
    centered = []
    for col in columns:
        mu = sum(col) / n
        centered.append([val - mu for val in col])

    num_features = len(centered)
    cov_matrix = [[0.0] * num_features for _ in range(num_features)]

    for i in range(num_features):
        # The matrix is symmetric, so fill both halves from one computation.
        for j in range(i, num_features):
            value = sum(a * b for a, b in zip(centered[i], centered[j])) / (n - 1)
            cov_matrix[i][j] = value
            cov_matrix[j][i] = value

    return cov_matrix

def normalize(vector):
    """Scale ``vector`` to unit Euclidean length and return it as a new list.

    Raises ``ZeroDivisionError`` for a non-empty all-zero vector.
    """
    squared_total = sum(component ** 2 for component in vector)
    length = squared_total ** 0.5
    return [component / length for component in vector]

def multiply(matrix, vector):
    """Return the matrix-vector product as a list, one entry per matrix row."""
    product = []
    for row in matrix:
        product.append(sum(a * b for a, b in zip(row, vector)))
    return product

def eigenvalues_and_eigenvectors(matrix, num_simulations=1000):
    """Estimate the dominant eigenpair of a square matrix by power iteration.

    Despite the plural name, only ONE eigenvalue/eigenvector pair is returned:
    the one that power iteration converges to from a uniform start vector.

    Args:
        matrix: Square list-of-lists matrix.
        num_simulations: Number of power-iteration steps.

    Returns:
        ``(eigenvalue, eigenvector)`` where ``eigenvalue`` is the Rayleigh
        quotient of the final unit-length iterate and ``eigenvector`` is that
        iterate as a list. If the matrix maps the current iterate to the zero
        vector (for example an all-zero matrix), that iterate is an
        eigenvector for eigenvalue 0 and ``(0.0, iterate)`` is returned
        instead of dividing by zero. An empty matrix yields ``(0.0, [])``.
    """
    n = len(matrix)
    if n == 0:
        return 0.0, []

    def apply(vector):
        return [sum(a * b for a, b in zip(row, vector)) for row in matrix]

    # Start from a unit vector so the Rayleigh quotient below is meaningful
    # even when num_simulations is 0.
    vec = [1.0 / n ** 0.5] * n

    for _ in range(num_simulations):
        product = apply(vec)
        norm = sum(x ** 2 for x in product) ** 0.5
        if norm == 0:
            # matrix @ vec == 0: vec lies in the null space.
            return 0.0, vec
        vec = [x / norm for x in product]

    eigenvalue = sum(x * y for x, y in zip(apply(vec), vec))
    return eigenvalue, vec

def transform(matrix, eigenvectors, k):
    """Project every row of ``matrix`` onto the first ``k`` eigenvectors.

    ``eigenvectors[i]`` is treated as the i-th projection axis; the result
    has one row per input row and ``k`` values per row.
    """
    projected = []
    for row in matrix:
        width = len(row)
        coords = []
        for comp in range(k):
            coords.append(sum(row[j] * eigenvectors[comp][j] for j in range(width)))
        projected.append(coords)
    return projected
    
def pca(matrix, k):
    """Project ``matrix`` onto its first ``k`` principal components.

    The data is standardized, its covariance matrix is computed, and the
    leading ``k`` eigenpairs are extracted one at a time:
    ``eigenvalues_and_eigenvectors`` yields only the dominant pair, so after
    each extraction that component is removed from the working matrix
    (Hotelling deflation) and power iteration is run again.

    Args:
        matrix: List of numeric rows (samples x features).
        k: Number of principal components to keep.

    Returns:
        A list with one row per sample, each holding ``k`` projected values,
        ordered from largest to smallest eigenvalue.

    Raises:
        ValueError: If ``k`` is negative or exceeds the number of features.
    """
    num_features = len(matrix[0])
    if not 0 <= k <= num_features:
        raise ValueError(
            f"k must be between 0 and the number of features ({num_features}), got {k}"
        )

    standardized_matrix = standardize(matrix)
    cov_matrix = covariance_matrix(standardized_matrix)

    # Work on a copy so the covariance matrix itself is left untouched.
    residual = [list(row) for row in cov_matrix]
    eigenpairs = []
    for _ in range(k):
        eigenvalue, eigenvector = eigenvalues_and_eigenvectors(residual)
        eigenpairs.append((eigenvalue, eigenvector))
        # Hotelling deflation: subtract lambda * v * v^T so the next power
        # iteration converges to the next-largest component.
        residual = [
            [
                residual[i][j] - eigenvalue * eigenvector[i] * eigenvector[j]
                for j in range(num_features)
            ]
            for i in range(num_features)
        ]

    eigenpairs.sort(key=lambda pair: pair[0], reverse=True)
    eigenvectors = [vector for _, vector in eigenpairs]

    return transform(standardized_matrix, eigenvectors, k)


def subtractVector(vevtor1, vector2):
    """Return the element-wise difference ``vevtor1 - vector2`` as a list.

    Pairs are formed with ``zip``, so the result is as long as the shorter
    input.
    """
    difference = []
    for left, right in zip(vevtor1, vector2):
        difference.append(left - right)
    return difference

def transposeMatrix(matrix):
    """Return the transpose of a rectangular list-of-lists matrix.

    The width is taken from the first row. An empty matrix transposes to an
    empty list instead of raising ``IndexError``.
    """
    if not matrix:
        return []
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]

def mutilpySVD(mult1, mult):
    """Return the matrix product ``mult1 @ mult`` as a list of rows.

    Args:
        mult1: Left operand, a list of rows.
        mult: Right operand, a list of rows.

    Returns:
        A list with one row per row of ``mult1`` and one entry per column of
        ``mult``.
    """
    # Transpose the right operand once rather than once per left-hand row.
    columns = list(zip(*mult))
    return [
        [sum(x * y for x, y in zip(row, col)) for col in columns]
        for row in mult1
    ]

def dotProd(vevtor1, vector2):
    """Return the dot product of two vectors (paired element-wise via ``zip``)."""
    products = [left * right for left, right in zip(vevtor1, vector2)]
    return sum(products)
    
def scalarMultiply(scalar, vec):
    """Return a new list holding every element of ``vec`` times ``scalar``."""
    scaled = []
    for component in vec:
        scaled.append(scalar * component)
    return scaled

def svd(data, num_iterations=50):
    """Compute a rank-1 truncated SVD of the column-centered data.

    The column means are subtracted from ``data`` to obtain ``A``. The
    dominant right singular vector ``v`` is found by power iteration on
    ``A^T A``; the singular value is ``sigma = ||A v||`` and the left
    singular vector is ``u = A v / sigma``.

    Args:
        data: List of equally long numeric rows (samples x features).
        num_iterations: Number of power-iteration steps (default 50).

    Returns:
        ``(U, S, V_T)`` as lists of rows such that ``U @ S @ V_T`` is the
        rank-1 approximation of the centered data:

        * ``U``   -- ``n_samples x 1`` column holding ``u``;
        * ``S``   -- ``1 x 1`` matrix holding ``sigma``;
        * ``V_T`` -- ``1 x n_features`` row holding ``v``.

        If the centered data is all zeros, ``sigma`` is ``0.0`` and ``U`` is
        a zero column.

    Raises:
        ValueError: If ``data`` has no rows or no columns.
    """
    if not data or not data[0]:
        raise ValueError("svd needs a non-empty data matrix")

    mean_vec = [sum(feature) / len(data) for feature in transposeMatrix(data)]
    centered_data = [subtractVector(row, mean_vec) for row in data]

    # A^T A shares its eigenvectors with the right singular vectors of A.
    gram_matrix = mutilpySVD(transposeMatrix(centered_data), centered_data)

    num_features = len(gram_matrix)
    right_vec = [1.0 / num_features ** 0.5] * num_features
    for _ in range(num_iterations):
        candidate = mutilpySVD([right_vec], gram_matrix)[0]
        magnitude = dotProd(candidate, candidate) ** 0.5
        if magnitude == 0:
            # Centered data carries no variance along this direction; stop
            # rather than divide by zero.
            break
        right_vec = scalarMultiply(1.0 / magnitude, candidate)

    # sigma = ||A v|| and u = A v / sigma.
    projections = [dotProd(row, right_vec) for row in centered_data]
    singular_val = dotProd(projections, projections) ** 0.5
    if singular_val > 0:
        left_vec = scalarMultiply(1.0 / singular_val, projections)
    else:
        left_vec = [0.0] * len(projections)

    u_mat = transposeMatrix([left_vec])
    s_mat = [[singular_val]]
    v_t_mat = [right_vec]

    return u_mat, s_mat, v_t_mat





def _encode_row(row, attrs, nominal_maps):
    """Convert one raw ARFF row (list of strings) into a list of floats.

    Nominal fields become the index of their label; numeric fields are parsed
    with ``float``. Unknown labels and unparsable numbers become ``-1``.
    """
    encoded = list(row)
    for i, (_attr_name, attr_type, _attr_values) in enumerate(attrs):
        if attr_type == 'nominal':
            encoded[i] = nominal_maps[i].get(row[i].strip(), -1)
        elif attr_type == 'numeric':
            try:
                encoded[i] = float(row[i])
            except ValueError:
                # Missing/unparsable value (e.g. '?'): use the -1 sentinel.
                encoded[i] = -1
    return [float(value) for value in encoded]


def main():
    """Run PCA and SVD on each yearly ARFF dataset and print the results.

    For every file the rows are parsed, encoded as floats, reduced to five
    principal components with ``pca`` and decomposed with ``svd``.
    """
    dataset_files = ['./samples/2019.arff',
                     './samples/2018.arff',
                     './samples/2017.arff']

    for file_path in dataset_files:
        dataset_label = file_path.split('/')[-1].split('.')[0]

        print(f"\n Year dataset: {dataset_label}")
        attrs, data = parse_arff(file_path)

        # Build each label -> index table once per file, not once per cell.
        nominal_maps = [
            {value.strip(): index for index, value in enumerate(attr_values)}
            if attr_type == 'nominal' else None
            for _attr_name, attr_type, attr_values in attrs
        ]

        data_as_list = [_encode_row(row, attrs, nominal_maps) for row in data]

        numPCA = 5
        transformed_data = pca(data_as_list, numPCA)

        print(f"\nTransformed Data Size: {len(transformed_data)}")
        print("\nTransformed Data (after PCA):")
        for sample in transformed_data:
            print(sample)

        u_svd, s_svd, v_t_svd = svd(data_as_list)

        print("\nMatrix U (SVD):")
        for row in u_svd:
            print(row)

        print("\nMatrix S (SVD):")
        for row in s_svd:
            print(row)

        print("\nMatrix V^T (SVD):")
        for row in v_t_svd:
            print(row)


if __name__ == "__main__":
    main()


 Year dataset: 2019


NameError: name 'num_components' is not defined