## Load data

#### Without using third-party packages
- Load a file
- Load CSV
- Load JSON
- Load TXT

In [None]:
def load_txt(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one string per line, in file order. Trailing newline
        characters are kept; nothing is stripped.
    """
    with open(file_name, 'r') as f:
        # Iterating a file object yields one line at a time.
        return list(f)

In [None]:
# Load data from a CSV file
def load_csv_data(file_path):
    """Parse a CSV file into a list of rows.

    The standard-library ``csv`` reader is used so that quoted fields
    (for example ``"Doe, Jane"``) stay a single value instead of being
    split on the embedded comma.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of strings. Blank lines in
        the file are skipped.
    """
    import csv

    # newline='' leaves line-ending handling to the csv module, which is
    # what its documentation asks for when passing a file object.
    with open(file_path, 'r', newline='') as file:
        return [row for row in csv.reader(file) if row]

# Load data from a JSON file
def load_json_data(file_path):
    """Deserialize the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.
    """
    import json

    with open(file_path, 'r') as handle:
        return json.load(handle)

# Load data from a text file
def load_text_data(file_path):
    """Read a text file and return its lines with whitespace trimmed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line; leading and trailing whitespace
        (including the newline) is stripped from every line.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()
    # The file is already closed here; only the in-memory lines are used.
    return [raw.strip() for raw in raw_lines]

# Usage examples
# Each call opens the named file via a relative path, so the files are
# looked up in the current working directory and must exist there.
csv_data = load_csv_data('data.csv')  # list of rows, each a list of strings
json_data = load_json_data('data.json')  # object decoded from the JSON content
text_data = load_text_data('data.txt')  # list of whitespace-stripped lines

## Preprocessing

### Missing values
- Remove entire rows that contain a missing value
- Fill missing values with a specific value (e.g., 0 or a placeholder)
- Fill missing values with the column mean (for numeric columns)

In [None]:
def remove_missing(data):
    """Drop every row that contains a missing value.

    Args:
        data: Iterable of rows; ``None`` marks a missing value.

    Returns:
        A new list holding only the rows in which ``None`` does not occur.
    """
    complete_rows = []
    for row in data:
        if None in row:
            continue
        complete_rows.append(row)
    return complete_rows

In [None]:
def fill_with_value(data, value=0):
    """Replace every ``None`` entry with a fixed value.

    Args:
        data: Iterable of rows; ``None`` marks a missing value.
        value: Replacement used for missing entries (default 0).

    Returns:
        A new list of new rows; the input rows are left untouched.
    """
    filled = []
    for row in data:
        new_row = []
        for item in row:
            new_row.append(value if item is None else item)
        filled.append(new_row)
    return filled

In [None]:
def fill_with_mean(data):
    """Replace ``None`` entries with the mean of their column.

    The mean of each column is computed from its non-missing values only.
    A column without any observed value keeps a mean of 0, so its gaps are
    filled with 0.

    Args:
        data: List of rows (lists) of numbers, with ``None`` marking a
            missing value. The column count is taken from the first row.
            The rows are modified in place.

    Returns:
        The same ``data`` list, with the missing values filled in.
    """
    if not data:
        # Nothing to fill; also avoids indexing data[0] on an empty list.
        return data

    n_cols = len(data[0])
    column_means = [0] * n_cols

    for j in range(n_cols):
        observed = [row[j] for row in data if row[j] is not None]
        if observed:
            column_means[j] = sum(observed) / len(observed)

    for row in data:
        for j in range(n_cols):
            if row[j] is None:
                row[j] = column_means[j]

    return data

#### Filling missing values with numpy

In [None]:
import numpy as np

def fill_missing_values(data, fill_value=0):
    """Return a copy of ``data`` in which every NaN is replaced.

    Args:
        data: Array-like of numbers that may contain ``np.nan``.
        fill_value: Value written wherever a NaN is found (default 0).

    Returns:
        A new NumPy array; the input is not modified.
    """
    filled = np.copy(data)
    nan_mask = np.isnan(filled)
    # Boolean-mask assignment overwrites only the NaN positions.
    filled[nan_mask] = fill_value
    return filled

### Normalization

#### Min-max scaling $\frac{x - \min}{\max - \min}$

In [None]:
# without numpy
def min_max_scaling(data):
    """Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Args:
        data: Iterable of numbers. It is materialised into a list first, so
            one-shot iterables such as generators are supported.

    Returns:
        A list of scaled values in input order. An empty input gives an
        empty list; if all values are equal (zero range) every entry is 0.
    """
    values = list(data)
    if not values:
        # min()/max() raise on an empty sequence.
        return []

    min_val, max_val = min(values), max(values)
    span = max_val - min_val
    if span == 0:
        # Constant input: avoid dividing by zero.
        return [0 for _ in values]
    return [(x - min_val) / span for x in values]

# with numpy
def normalize_data(data):
    """Min-max scale ``data`` to the range [0, 1] using its global min and max.

    Args:
        data: Array-like of numbers.

    Returns:
        A NumPy array of the same shape. An empty input gives an empty
        float array; if all values are equal the result is all zeros
        instead of NaN.
    """
    values = np.asarray(data)
    if values.size == 0:
        # np.min/np.max raise on zero-size arrays.
        return values.astype(float)

    low = np.min(values)
    span = np.max(values) - low
    if span == 0:
        # Constant input: the numerator is all zeros, so dividing by 1
        # yields zeros and avoids the 0/0 -> NaN result.
        span = 1
    return (values - low) / span

#### Z-score normalization

In [None]:
# without numpy
def z_score_normalization(data):
    """Standardise values to zero mean and unit (population) standard deviation.

    Args:
        data: Iterable of numbers. It is materialised into a list first, so
            one-shot iterables such as generators are supported.

    Returns:
        A list of z-scores in input order. An empty input gives an empty
        list; if all values are equal (standard deviation 0) every entry
        is 0.
    """
    values = list(data)
    if not values:
        # Avoids dividing by len(values) == 0.
        return []

    count = len(values)
    mean = sum(values) / count
    # Population variance: divide by N, not N - 1.
    variance = sum((x - mean) ** 2 for x in values) / count
    std_dev = variance ** 0.5
    if std_dev == 0:
        # Constant input: every deviation is 0, so skip the division.
        return [0 for _ in values]
    return [(x - mean) / std_dev for x in values]

# with numpy
def standardize_data(data):
    """Standardise ``data`` along axis 0 to zero mean and unit standard deviation.

    Args:
        data: Array-like of numbers; mean and standard deviation are taken
            along axis 0.

    Returns:
        A NumPy array of the same shape. Where the standard deviation along
        axis 0 is 0 (constant values) the result is 0 instead of NaN.
    """
    values = np.asarray(data)
    std = np.std(values, axis=0)
    # Where std is 0 the centred values are all 0 as well, so dividing by 1
    # gives 0 and avoids the 0/0 -> NaN result.
    safe_std = np.where(std == 0, 1, std)
    return (values - np.mean(values, axis=0)) / safe_std

### Noise

#### Statistical: 3-sigma

In [None]:
def remove_outliers(data, n_sigma=3):
    """Drop values that lie more than ``n_sigma`` standard deviations from the mean.

    Args:
        data: Iterable of numbers. It is materialised into a list first, so
            one-shot iterables such as generators are supported.
        n_sigma: Half-width of the accepted band, in population standard
            deviations (default 3, the 3-sigma rule).

    Returns:
        A new list with the values inside
        ``[mean - n_sigma * std, mean + n_sigma * std]``, in input order.
        An empty input gives an empty list.
    """
    values = list(data)
    if not values:
        # Avoids dividing by len(values) == 0.
        return []

    count = len(values)
    mean = sum(values) / count
    variance = sum((d - mean) ** 2 for d in values) / count
    std_dev = variance ** 0.5

    lower = mean - n_sigma * std_dev
    upper = mean + n_sigma * std_dev
    # Keep the inliers; values outside the band are the outliers to drop.
    return [d for d in values if lower <= d <= upper]

## Feature Engineering

### One-hot vector

In [None]:
class OneHotEncoder:
    """Map categorical labels to one-hot vectors.

    Each distinct label seen at construction time gets an index, assigned
    in order of first occurrence so that the encoding is reproducible from
    run to run.
    """

    def __init__(self, data: list):
        """Build the label-to-index mapping.

        Args:
            data: Hashable labels; duplicates are allowed.
        """
        # dict.fromkeys drops duplicates while keeping first-occurrence
        # order; iterating a set would give an arbitrary order.
        unique_labels = dict.fromkeys(data)
        # element_to_index: label -> position of the 1 in its one-hot vector
        self.element_to_index = {element: i for i, element in enumerate(unique_labels)}
        # num_elements: length of every one-hot vector
        self.num_elements = len(self.element_to_index)

    def category_to_one_hot(self, input_data: list):
        """Encode labels as one-hot vectors.

        Args:
            input_data: Labels to encode; each must have been present in
                the data passed to the constructor.

        Returns:
            A list with one vector (list of 0/1 ints of length
            ``num_elements``) per input label.

        Raises:
            KeyError: If a label was not seen at construction time.
        """
        encoded_data = []
        for d in input_data:
            try:
                index = self.element_to_index[d]
            except KeyError:
                raise KeyError("unknown category: {!r}".format(d)) from None
            vector = [0] * self.num_elements
            vector[index] = 1
            encoded_data.append(vector)
        return encoded_data