In [60]:
import os 
from math import sqrt
from csv import reader

In [61]:
def load_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to open.

    Returns:
        A list with one entry per non-empty row, each entry being the list
        of string fields produced by ``csv.reader``. Rows that parse to an
        empty list (blank lines) are skipped.
    """
    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields are kept exactly as written in the file.
    with open(filename, "r", newline="") as file:
        return [row for row in reader(file) if row]

def str_col_to_float(dataset, column):
    """Convert one column of every row to ``float``, in place.

    Each value is turned into a string and stripped of surrounding
    whitespace before conversion. Nothing is returned; the rows of
    ``dataset`` are modified directly.
    """
    for record in dataset:
        text = str(record[column])
        record[column] = float(text.strip())

def str_col_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes are assigned in order of first appearance in ``dataset``: the first
    distinct value seen becomes 0, the next distinct value 1, and so on.

    Args:
        dataset: List of rows (each a mutable sequence).
        column: Index of the column to encode.

    Returns:
        The dict mapping each original value to its integer code.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings instead can yield a different order on each interpreter
    # run, which would make the encoding non-reproducible.
    distinct = dict.fromkeys(row[column] for row in dataset)
    lookup = {value: code for code, value in enumerate(distinct)}

    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [62]:
def dataset_minmax(dataset):
    """Return ``[min, max]`` for every column of ``dataset``.

    The number of columns is taken from the first row. The result is a list
    with one two-element list per column, in column order.
    """
    n_columns = len(dataset[0])
    columns = ([record[idx] for record in dataset] for idx in range(n_columns))
    return [[min(values), max(values)] for values in columns]

In [71]:
# Normalize dataset
# scaled_value = (value - min)/ (max - min)
# rescaling an input variable to the range between 0 and 1
def normalize_dataset(dataset, minmax):
    """Min-max rescale every value of ``dataset`` in place.

    Each value becomes ``(value - min) / (max - min)`` using the column's
    entry in ``minmax``.

    Args:
        dataset: List of rows of numbers; modified directly.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.

    Returns:
        None.
    """
    for row in dataset:
        for i in range(len(row)):
            low = minmax[i][0]
            # A constant column has max == min; treat its zero range as 1 so
            # the division cannot raise ZeroDivisionError (the column then
            # simply becomes value - min).
            span = (minmax[i][1] - low) or 1
            row[i] = (row[i] - low) / span

In [76]:
# Standardization is a rescaling technique that centers the distribution of the data on
# the value 0 and scales its standard deviation to the value 1.
# Together, the mean and the standard deviation can be used to summarize a normal distribution,
# also called the Gaussian distribution or bell curve.
# It requires that the mean and standard deviation of the values for each column be known
# prior to scaling. 

# mean = ( sum of values  )/ ( number of values )


def column_mean(dataset):
    """Return the arithmetic mean of every column of ``dataset``.

    The number of columns is taken from the first row. The result is a list
    with one mean per column, in column order.
    """
    n_rows = len(dataset)
    return [
        sum([record[col] for record in dataset]) / n_rows
        for col in range(len(dataset[0]))
    ]

# The standard deviation describes the average spread of values from the mean. It can be
# calculated as the square root of the sum of the squared differences between each value and the
# mean, divided by the number of values minus 1.
# standard deviation = square_root( (Summation i=1 to N (value_i - mean)^2) / (count(values) - 1) )

def column_stdevs(dataset, means):
    """Return the sample standard deviation of every column of ``dataset``.

    For each column, the squared deviations from ``means[column]`` are summed,
    divided by the number of rows minus 1, and square-rooted. The number of
    columns is taken from the first row.
    """
    denominator = len(dataset) - 1
    result = []
    for col in range(len(dataset[0])):
        center = means[col]
        squared_devs = [pow(record[col] - center, 2) for record in dataset]
        result.append(sqrt(sum(squared_devs) / denominator))
    return result

# standardize_value = (value - mean)/ std
def standardize_dataset(dataset, means, stdevs):
    """Standardize every value of ``dataset`` in place.

    Each value becomes ``(value - mean) / stdev`` using the column's entries
    in ``means`` and ``stdevs``.

    Args:
        dataset: List of rows of numbers; modified directly.
        means: One mean per column, indexed like the rows.
        stdevs: One standard deviation per column, indexed like the rows.

    Returns:
        None.
    """
    for row in dataset:
        for i in range(len(row)):
            # A column with zero spread cannot be scaled to unit deviation;
            # treat its deviation as 1 so the division cannot raise
            # ZeroDivisionError (the column is then only centered).
            scale = stdevs[i] or 1.0
            row[i] = (row[i] - means[i]) / scale

In [77]:
# Load the CSV from the local datasets/ directory; os.path.join builds the
# path with the platform's separator instead of a hard-coded "/".
filename = "pima-indians-diabetes.data.csv"
dataset = load_csv(os.path.join("datasets", filename))
print(f"Loaded {filename} dataset with {len(dataset)} rows and {len(dataset[0])} columns.")

# First row exactly as read from disk (every field is still a string).
print(dataset[0])

# Convert every column to float in place, then show the same row again.
for i in range(len(dataset[0])):
    str_col_to_float(dataset, i)
print(dataset[0])

Loaded pima-indians-diabetes.data.csv dataset with 768 rows and 9 columns.
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]


In [78]:
# Per-column [min, max] pairs of the loaded dataset.
minmax = dataset_minmax(dataset)
print(minmax)

[[0.0, 17.0], [0.0, 199.0], [0.0, 122.0], [0.0, 99.0], [0.0, 846.0], [0.0, 67.1], [0.078, 2.42], [21.0, 81.0], [0.0, 1.0]]


In [79]:
# Per-column arithmetic means of the loaded dataset.
means = column_mean(dataset)
print(means)

[3.8450520833333335, 120.89453125, 69.10546875, 20.536458333333332, 79.79947916666667, 31.992578124999977, 0.4718763020833327, 33.240885416666664, 0.3489583333333333]


In [80]:
# Per-column sample standard deviations (n - 1 denominator), measured around `means`.
stdevs = column_stdevs(dataset, means)
print(stdevs)

[3.3695780626988623, 31.97261819513622, 19.355807170644777, 15.952217567727677, 115.24400235133837, 7.8841603203754405, 0.33132859501277484, 11.76023154067868, 0.4769513772427971]


In [85]:
# Tiny 3-row, 2-column dataset for checking the standardization helpers by hand.
dummy = [[50, 30], [20, 90], [30, 50]]


In [87]:
# Show the toy data, then its per-column means.
print(dummy)
dummy_mean = column_mean(dummy)
print(dummy_mean)

[[50, 30], [20, 90], [30, 50]]
[33.333333333333336, 56.666666666666664]


In [89]:
# Per-column sample standard deviations of the toy data.
dummy_stds = column_stdevs(dummy, dummy_mean)
print(dummy_stds)
# standardize_dataset rewrites `dummy` in place and has no return statement,
# so dummy_standardized is None; the standardized values live in `dummy`.
# NOTE(review): re-running this cell would standardize the already-rescaled
# rows again using the stale dummy_mean / dummy_stds.
dummy_standardized = standardize_dataset(dummy, dummy_mean, dummy_stds)


[15.275252316519467, 30.550504633038933]
None


In [90]:
# `dummy` now holds the standardized values: standardize_dataset modified it in place.
print(dummy)

[[1.0910894511799618, -0.8728715609439694], [-0.8728715609439697, 1.091089451179962], [-0.21821789023599253, -0.2182178902359923]]
