<h1>Data Preprocessing</h1>

For detailed explanations, see https://machinelearningmastery.com/prepare-data-machine-learning-python-scikit-learn/

In [94]:
import numpy
import scipy
import pandas
# Path of the diabetes CSV and the labels assigned to its nine columns
# (the labels are supplied here through `names`).
url = "inputDiabetesData.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
# Work on the underlying NumPy values: the first eight columns are the
# inputs (X) and the ninth column ('class') is the output (Y).
array = dataframe.values
X, Y = array[:, :8], array[:, 8]
print(X)

[[6.00 nan 72.00 ... 33.60 0.63 50.00]
 [1.00 85.00 66.00 ... 26.60 0.35 31.00]
 [8.00 183.00 64.00 ... 23.30 0.67 32.00]
 ...
 [5.00 121.00 72.00 ... 26.20 0.24 30.00]
 [1.00 126.00 60.00 ... 30.10 0.35 47.00]
 [1.00 93.00 70.00 ... 30.40 0.32 23.00]]


<h3>Cleaning - Drop the whole row</h3>

In [95]:
# Cleaning by deletion: discard every row that contains a missing value (NaN).
cleaneddata = dataframe.dropna()
array = cleaneddata.values


# Print floats with two fixed decimals instead of NumPy's default formatting
# (which may switch to exponential notation).
# https://stackoverflow.com/questions/21008858/formatting-floats-in-a-numpy-array/21009774#21009774
def float_formatter(x):
    """Render a float as text with exactly two decimal places."""
    return "%.2f" % x


numpy.set_printoptions(formatter={'float_kind': float_formatter})
# NOTE: only X is re-derived in this cell; Y is not updated here.
X = array[:,0:8]
print(X)

# Write the 2D array to a CSV file, one array row per line.
# https://stackoverflow.com/questions/21023523/writing-multi-dimensional-list-value-to-a-file-in-python/21023654#21023654
import csv
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output0.1CleanDeletion.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(X)

[[1.00 85.00 66.00 ... 26.60 0.35 31.00]
 [8.00 183.00 64.00 ... 23.30 0.67 32.00]
 [1.00 89.00 66.00 ... 28.10 0.17 21.00]
 ...
 [5.00 121.00 72.00 ... 26.20 0.24 30.00]
 [1.00 126.00 60.00 ... 30.10 0.35 47.00]
 [1.00 93.00 70.00 ... 30.40 0.32 23.00]]


<h3>Cleaning - Substitution</h3>

In [96]:
# Cleaning by substitution: keep every row and replace missing values (NaN) with 0.
cleaneddata = dataframe.fillna(0)
array = cleaneddata.values


# Print floats with two fixed decimals instead of NumPy's default formatting
# (which may switch to exponential notation).
# https://stackoverflow.com/questions/21008858/formatting-floats-in-a-numpy-array/21009774#21009774
def float_formatter(x):
    """Render a float as text with exactly two decimal places."""
    return "%.2f" % x


numpy.set_printoptions(formatter={'float_kind': float_formatter})
# From here on X holds the zero-filled inputs (first eight columns).
X = array[:,0:8]
print(X)
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output0.2CleanSubstitution.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(X)

[[6.00 0.00 72.00 ... 33.60 0.63 50.00]
 [1.00 85.00 66.00 ... 26.60 0.35 31.00]
 [8.00 183.00 64.00 ... 23.30 0.67 32.00]
 ...
 [5.00 121.00 72.00 ... 26.20 0.24 30.00]
 [1.00 126.00 60.00 ... 30.10 0.35 47.00]
 [1.00 93.00 70.00 ... 30.40 0.32 23.00]]


<h3>Rescaling/MinMaxScaling</h3>

In [102]:
# Rescale data: map every feature (column) of X into the range 0 to 5
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 5))
rescaledX = scaler.fit_transform(X)
# summarize transformed data: first five rows, three decimals
# (calling set_printoptions again also resets any custom float formatter)
numpy.set_printoptions(precision=3)
print(rescaledX[0:5,:])
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output1.1MinMaxScaling.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(rescaledX)

[[1.765 0.    2.951 1.768 0.    2.504 1.172 2.417]
 [0.294 2.136 2.705 1.465 0.    1.982 0.583 0.833]
 [2.353 4.598 2.623 0.    0.    1.736 1.268 0.917]
 [0.294 2.236 2.705 1.162 0.556 2.094 0.19  0.   ]
 [0.    3.442 1.639 1.768 0.993 3.212 4.718 1.   ]]


<h3>Standardization/StandardScaling</h3>

In [98]:
# Standardize data: shift and scale every feature (column) of X
# to 0 mean and 1 standard deviation
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
# summarize transformed data: first five rows, three decimals
numpy.set_printoptions(precision=3)
print(rescaledX[0:5,:])
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output1.2Standardization.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(rescaledX)

[[ 0.64  -3.745  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.108 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]
 [ 1.234  1.933 -0.264 -1.288 -0.693 -1.103  0.604 -0.106]
 [-0.845 -0.984 -0.161  0.155  0.123 -0.494 -0.921 -1.042]
 [-1.142  0.506 -1.505  0.907  0.766  1.41   5.485 -0.02 ]]


<h3>Normalization</h3>

In [99]:
# Normalize data: scale every row (sample) of X to a vector of length 1
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
# summarize transformed data: first five rows, three decimals
numpy.set_printoptions(precision=3)
print(normalizedX[0:5,:])
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output1.3Normalization.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(normalizedX)

[[0.06  0.    0.717 0.349 0.    0.335 0.006 0.498]
 [0.008 0.716 0.556 0.244 0.    0.224 0.003 0.261]
 [0.04  0.924 0.323 0.    0.    0.118 0.003 0.162]
 [0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139]
 [0.    0.596 0.174 0.152 0.731 0.188 0.01  0.144]]


<h3>Binarization</h3>

In [100]:
# Binarization: values greater than the threshold (10) become 1, all others 0
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=10).fit(X)
binaryX = binarizer.transform(X)
# The transform returns floats; convert the 2D array to ints for cleaner output
# https://stackoverflow.com/questions/43197594/convert-numbers-in-a-list-of-lists-to-float-in-python/48838744#48838744
binaryX = binaryX.astype(int)
# summarize transformed data: first five rows
print(binaryX[0:5,:])
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output1.4Binarization.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(binaryX)

[[0 0 1 1 0 1 0 1]
 [0 1 1 1 0 1 0 1]
 [0 1 1 0 0 1 0 1]
 [0 1 1 1 1 1 0 1]
 [0 1 1 1 1 1 0 1]]


<h3>Discretization</h3>

In [101]:
# Discretization: round every value to one decimal place.
# NOTE: the input is `rescaledX`, i.e. whatever the most recently executed
# scaling cell produced. When the notebook is run top to bottom that is the
# StandardScaler output, not the MinMaxScaler output.
roundedX = numpy.round(rescaledX, 1)
# summarize transformed data: first five rows
print(roundedX[0:5,:])
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' (e.g. Windows) get a blank line after every row.
with open("output4Discretization.csv", 'w', newline='') as outputFile:
    writer = csv.writer(outputFile, delimiter=',')
    writer.writerows(roundedX)

[[ 0.6 -3.7  0.1  0.9 -0.7  0.2  0.5  1.4]
 [-0.8 -1.1 -0.2  0.5 -0.7 -0.7 -0.4 -0.2]
 [ 1.2  1.9 -0.3 -1.3 -0.7 -1.1  0.6 -0.1]
 [-0.8 -1.  -0.2  0.2  0.1 -0.5 -0.9 -1. ]
 [-1.1  0.5 -1.5  0.9  0.8  1.4  5.5 -0. ]]
