In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

In [2]:
def prepareRawData(file):
    """Load a labelled feature CSV and return cleaned features and labels.

    Feature candidates are all columns whose dtype is not ``int64``; the
    labels come from the ``label`` column. A row is discarded when any
    ``float64`` feature is ``+inf`` or ``-inf``, or when any feature
    candidate is missing (NaN). The ``"0"`` and ``"ID"`` columns are then
    removed from the feature frame.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    clean_X : pandas.DataFrame
        Remaining feature columns for the kept rows, re-indexed 0..n-1.
    finite_Y : numpy.ndarray
        ``label`` values of the kept rows, shaped ``(n_rows, 1)``.

    Raises
    ------
    KeyError
        If ``label`` is absent, or if ``"0"`` / ``"ID"`` are not among the
        non-``int64`` columns.
    """
    d = pd.read_csv(file)
    X = d.select_dtypes(exclude="int64")
    Y = d["label"]

    # Build one explicit per-row mask. Checking with np.isinf(...).any(axis=1)
    # catches both +inf and -inf and does not depend on how np.max dispatches
    # on a DataFrame, nor on the index labels matching row positions.
    has_inf = np.isinf(X.select_dtypes("float64")).any(axis=1)
    has_nan = X.isna().any(axis=1)
    keep = ~(has_inf | has_nan)

    clean_X = X.loc[keep].drop(["0", "ID"], axis=1).reset_index(drop=True)
    # Column vector, the layout expected by 2-D consumers such as encoders.
    finite_Y = Y.loc[keep].to_numpy().reshape(-1, 1)

    return clean_X, finite_Y



In [3]:
def oneHotEncode(Y):
    """One-hot encode ``Y`` and return the encoding as a dense array.

    A fresh ``OneHotEncoder`` with default settings is fitted on ``Y`` and
    immediately applied to it; the encoder itself is not returned, so the
    category-to-column mapping is not available to the caller afterwards.
    """
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(Y)
    # The encoder's output exposes .toarray(); densify it before returning.
    return encoded.toarray()

In [4]:
def split(X, Y, test_size=0.2, random_state=0):
    """Split features and labels into stratified train and test partitions.

    Parameters
    ----------
    X, Y
        Features and labels, forwarded to ``train_test_split``. ``Y`` is
        also used as the stratification target.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 0
        Forwarded to ``train_test_split``; the fixed default keeps the
        split reproducible across runs.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X,
        Y,
        test_size=test_size,
        random_state=random_state,
        stratify=Y,
    )

    return x_train, x_test, y_train, y_test

In [5]:
def _scaleRowsToNorm(x, scale):
    """Rescale every row of ``x`` to Euclidean length ``scale``; all-zero rows stay zero."""
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    # A zero row has norm 0; dividing by it would produce NaN. Dividing by 1
    # instead leaves the row as zeros.
    norms[norms == 0] = 1.0
    return (x / norms) * scale


def normalize(x_train, x_test, scale = 1):
    """Normalize each sample (row) of both sets to a fixed Euclidean length.

    Each set is normalized independently, row by row, so no statistics are
    shared between the training and test data.

    Parameters
    ----------
    x_train, x_test : 2-D array-like
        Samples in rows, features in columns.
    scale : float, default 1
        Target Euclidean norm of every non-zero row.

    Returns
    -------
    tuple
        ``(normalized_x_train, normalized_x_test)``. Rows that are entirely
        zero are returned unchanged instead of becoming NaN.
    """
    normalized_x_train = _scaleRowsToNorm(x_train, scale)
    normalized_x_test = _scaleRowsToNorm(x_test, scale)

    return normalized_x_train, normalized_x_test
    

In [6]:
def pcaReduction(normalized_x_train, explained_variance_ratio_threshold):
    """Fit a PCA whose size is chosen from a cumulative explained-variance threshold.

    A full PCA is fitted first to obtain the explained-variance ratios. The
    number of components kept is the count of leading components whose
    cumulative ratio is less than or equal to the threshold, with a minimum
    of one component. A second PCA with that many components is then fitted
    on the same data and returned.

    Parameters
    ----------
    normalized_x_train : 2-D array-like
        Training samples used for both fits.
    explained_variance_ratio_threshold : float
        Upper bound on the cumulative explained-variance ratio used to
        select the number of components.

    Returns
    -------
    PCA
        The fitted, reduced PCA instance.
    """
    test_p = PCA()
    test_p.fit(normalized_x_train)

    cumulative_ratio = np.cumsum(test_p.explained_variance_ratio_)
    n_components = int(
        np.count_nonzero(cumulative_ratio <= explained_variance_ratio_threshold)
    )
    # If the first component alone exceeds the threshold the count is 0;
    # keep at least one component so the reducer is usable.
    n_components = max(n_components, 1)

    p = PCA(n_components=n_components)
    p.fit(normalized_x_train)
    return p
    

In [7]:
# End-to-end preprocessing: load -> encode labels -> split -> normalize -> PCA.
# Load features and labels from the CSV in the working directory.
X, Y = prepareRawData("CH3.csv")
# Replace the label column vector with its one-hot encoding.
Y = oneHotEncode(Y)
# Train/test split of the features and the encoded labels.
x_train, x_test, y_train, y_test = split(X, Y)
# Row-wise normalization; x_train / x_test are rebound to the normalized data.
x_train, x_test = normalize(x_train, x_test)
# The reducer is fitted on the training split only; 0.9999 is passed as the
# explained-variance threshold argument.
reducer = pcaReduction(x_train, 0.9999)

# Project both splits with the reducer fitted on the training data.
compressed_x_train = reducer.transform(x_train)
compressed_x_test = reducer.transform(x_test)