In [2]:
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
import seaborn as sns
import pandas as pd
import pickle

## Data pre-processing

In [3]:
# Utility functions
def load_data(path="dataset/x_train.csv"):
    """Load the feature table and its column names from a CSV file.

    Args:
        path: location of the CSV file. Its first line is treated as a
            comma-separated header. Defaults to "dataset/x_train.csv".

    Returns:
        A tuple (data, feature_names). data is the array of strings that
        np.loadtxt reads from every line after the header; feature_names is
        the list of header fields.
    """
    # The context manager closes the handle as soon as the header is read.
    with open(path) as f:
        header = f.readline()
    # Strip the line terminator so the last column name does not end in "\n".
    feature_names = header.rstrip("\r\n").split(',')
    data = np.loadtxt(path, delimiter=",", skiprows=1, dtype=str)
    return data, feature_names

def convert_row_to_float(row):
    """Parse every entry of `row` as a float, substituting np.nan on failure.

    Returns a NumPy array with one element per entry of `row`.
    """
    def _parse(value):
        # float() raises ValueError for text it cannot parse (e.g. "" or "NA").
        try:
            return float(value)
        except ValueError:
            return np.nan

    return np.array([_parse(value) for value in row])

def convert_all_rows(data):
    """Run convert_row_to_float on each row of `data`.

    Returns a NumPy array built from the list of converted rows.
    """
    return np.array([convert_row_to_float(row) for row in data])

def column_NAN(array):
    """Count the NaN entries in a numeric sequence.

    Args:
        array: array-like of numbers (e.g. one column of the data matrix).

    Returns:
        The number of NaN entries, as a Python int.
    """
    # Vectorized test instead of a Python-level loop over every element.
    return int(np.count_nonzero(np.isnan(np.asarray(array))))

def train_validation_split(data, ratio, seed):
    """Shuffle `data` and cut it into a training part and a validation part.

    The global NumPy random generator is re-seeded with `seed`, and `data` is
    shuffled in place, so the caller's object is reordered as a side effect.
    The first int(len(data) * ratio) items form the training part; the
    remaining items form the validation part.
    """
    n_train = int(len(data) * ratio)  # the length is unaffected by the shuffle
    np.random.seed(seed)
    np.random.shuffle(data)
    train_part = data[:n_train]
    validation_part = data[n_train:]
    return train_part, validation_part

def k_fold_split(data, k, seed):
    """Shuffle `data` and divide it into `k` folds.

    The global NumPy random generator is re-seeded with `seed`, and `data` is
    shuffled in place before being handed to np.array_split, whose list of
    sub-arrays is returned.
    """
    np.random.seed(seed)
    np.random.shuffle(data)
    folds = np.array_split(data, k)
    return folds

def standardize_data(data):
    """Standardize each column to zero mean and unit standard deviation.

    The mean and standard deviation are computed along axis 0 with
    np.nanmean / np.nanstd, so NaN entries are ignored when computing the
    statistics and stay NaN in the result.

    Args:
        data: numeric array whose columns are standardized.

    Returns:
        (data - mean) / std, where a standard deviation of exactly 0 is
        replaced by 1 so that constant columns map to 0.
    """
    mean = np.nanmean(data, axis=0)
    std = np.nanstd(data, axis=0)
    # A constant column has std == 0; dividing by it would yield NaN/inf.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

#### Loading data

In [4]:
# numpy_data holds the raw cell values as strings (every line after the header);
# features holds the header's column names.
numpy_data, features = load_data()

#### Converting to float

In [5]:
# Cast the string table to floats; entries that float() rejects become np.nan.
data = convert_all_rows(numpy_data)

#### Finding number of NaN for each feature

In [7]:
# NaNs[j] is the number of NaN values found in column j of `data`.
NaNs = np.zeros(len(features))
# Walk the columns by position: looking each name up with features.index() is
# a linear search per column and resolves duplicate names to the first match.
for col_idx in range(len(features)):
    NaNs[col_idx] = column_NAN(data[:, col_idx])

#### Removing features with too many NaNs

In [10]:
# Columns with NaN in more than this fraction of the rows are dropped.
MAX_NAN_FRACTION = 0.1
# Computed once; len(data) is already an integer, so no rounding is needed.
nan_limit = len(data) * MAX_NAN_FRACTION
# Column positions of the features to remove.
Removed_features = [
    col_idx for col_idx, nan_count in enumerate(NaNs) if nan_count > nan_limit
]

In [13]:
# Plain assignment: reduced_data is a second name for the same array as data, not a copy.
reduced_data = data

In [14]:
# np.delete returns a new array and leaves its input untouched, so the result
# must be assigned for the columns to actually be removed. Deleting from `data`
# keeps this cell idempotent when it is re-run.
reduced_data = np.delete(data, Removed_features, axis=1)
reduced_data

array([[0.00000e+00, 5.30000e+01, 1.10000e+01, ..., 1.00000e+00,
        1.00000e+00, 2.00000e+00],
       [1.00000e+00, 3.30000e+01, 1.20000e+01, ..., 9.00000e+00,
        9.00000e+00,         nan],
       [2.00000e+00, 2.00000e+01, 1.00000e+01, ..., 1.00000e+00,
        1.00000e+00, 2.00000e+00],
       ...,
       [3.28132e+05, 3.90000e+01, 1.00000e+01, ..., 1.00000e+00,
        2.00000e+00, 2.00000e+00],
       [3.28133e+05, 3.30000e+01, 1.20000e+01, ..., 1.00000e+00,
        1.00000e+00, 2.00000e+00],
       [3.28134e+05, 3.20000e+01, 9.00000e+00, ..., 1.00000e+00,
        1.00000e+00, 2.00000e+00]])