<a href="https://colab.research.google.com/github/Mulat-K/Machine-Learning-Mastery-with-Python/blob/main/PDFML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Prepare Your Data For Machine Learning**



# **Rescale Data**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Path of the CSV to read and the labels to assign to its nine columns.
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Read the header-less file, coerce every column to numeric (entries that
# cannot be parsed become NaN), then discard any row containing a NaN.
dataframe = (
    pd.read_csv(filename, names=column_names, header=None)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# The first eight columns are the inputs; the ninth column is the output.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# Fit a min-max scaler with target range (0, 1) on the inputs and transform them.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Print the first five rescaled rows using three digits of precision.
np.set_printoptions(precision=3)
print("Rescaled data sample:", rescaledX[:5], sep="\n")


Rescaled data sample:
[[0.353 0.744 0.59  0.354 0.    0.501 0.234 0.483]
 [0.059 0.427 0.541 0.293 0.    0.396 0.117 0.167]
 [0.471 0.92  0.525 0.    0.    0.347 0.254 0.183]
 [0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.   ]
 [0.    0.688 0.328 0.354 0.199 0.642 0.944 0.2  ]]


# **Standardize Data**

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Path of the CSV to read and the labels to assign to its nine columns.
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Read the header-less file, coerce every column to numeric (entries that
# cannot be parsed become NaN), then discard any row containing a NaN.
dataframe = (
    pd.read_csv(filename, names=column_names, header=None)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# The first eight columns are the inputs; the ninth column is the output.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# Fit a StandardScaler (default settings) on the inputs and transform them.
scaler = StandardScaler()
standardizedX = scaler.fit_transform(X)

# Print the first five standardized rows using three digits of precision.
np.set_printoptions(precision=3)
print("Standardized data sample:", standardizedX[:5], sep="\n")


Standardized data sample:
[[ 0.64   0.848  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 -1.288 -0.693 -1.103  0.604 -0.106]
 [-0.845 -0.998 -0.161  0.155  0.123 -0.494 -0.921 -1.042]
 [-1.142  0.504 -1.505  0.907  0.766  1.41   5.485 -0.02 ]]


# **Normalize Data**

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer

# Path of the CSV to read and the labels to assign to its nine columns.
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Read the header-less file, coerce every column to numeric (entries that
# cannot be parsed become NaN), then discard any row containing a NaN.
dataframe = (
    pd.read_csv(filename, names=column_names, header=None)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# The first eight columns are the inputs; the ninth column is the output.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# Apply a Normalizer with default settings (L2 norm) to the inputs.
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)

# Print the first five normalized rows using three digits of precision.
np.set_printoptions(precision=3)
print("Normalized data sample:", normalizedX[:5], sep="\n")


Normalized data sample:
[[0.034 0.828 0.403 0.196 0.    0.188 0.004 0.28 ]
 [0.008 0.716 0.556 0.244 0.    0.224 0.003 0.261]
 [0.04  0.924 0.323 0.    0.    0.118 0.003 0.162]
 [0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139]
 [0.    0.596 0.174 0.152 0.731 0.188 0.01  0.144]]


# **Binarize Data (Make Binary)**

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Binarizer

# Path of the CSV to read and the labels to assign to its nine columns.
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Read the header-less file, coerce every column to numeric (entries that
# cannot be parsed become NaN), then discard any row containing a NaN.
dataframe = (
    pd.read_csv(filename, names=column_names, header=None)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# The first eight columns are the inputs; the ninth column is the output.
X = dataframe.iloc[:, :8].to_numpy()
Y = dataframe.iloc[:, 8].to_numpy()

# Binarize the inputs: values above the 0.0 threshold become 1, the rest 0.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)

# Print the first five binarized rows using three digits of precision.
np.set_printoptions(precision=3)
print("Binarized data sample:", binaryX[:5], sep="\n")


Binarized data sample:
[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]]
