In [28]:
# Handling numerical values in the context of machine learning means processing and working with numerical data,
# which represents quantitative values.
# Numeric values can be attributes or features that characterize objects in a data set.


# One type of processing of numeric data is converting it into categorical data (called "decoding" in this notebook).
# Methods:

# Discretization - turning continuous data into discrete data by dividing its range into a limited number of intervals (bins), each treated as a category
# Binarization - turning data into a binary (0/1) format, e.g. by comparing each value with a threshold

# Example code for binarization (without using a library):

# data['column_name'] = data['column_name'].apply(lambda x: 1 if x == 'one_data_type' else 0)


# --------------------------------------------------------------------------------------------------------------------------------------------------

# There are several other aspects of handling numerical values in machine learning problems, including:
# Normalization and standardization of data:

# Normalization: The process of rescaling numerical data to a range from 0 to 1. This is often used when feature values have different scales.
# Standardization: The process of rescaling numerical data so that it has a mean of 0 and a standard deviation of 1 (this shifts and scales the data but does not change the shape of its distribution).
# Handling outliers:

# Identification and processing of outliers (anomalies) in numerical data to improve model quality.
# Feature engineering:

# Create new features based on existing numerical data to improve the predictive ability of the model.
# Handling missing values:

# Fill in or remove missing values in numeric data.
# Dimension reduction:

# Apply dimensionality reduction techniques such as Principal Component Analysis (PCA) to work with large numerical data sets.
# Feature selection:

# Selection of the most important numerical features to build a more efficient model.


In [29]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import KBinsDiscretizer

In [30]:
# Discretizer that maps every continuous feature value to the index of the bin it falls into.
#
# n_bins=3: the number of bins (intervals) each feature's range is divided into.
#
# encode="ordinal": how the binned values are encoded. "ordinal" means each value is
# replaced by the integer index of its interval (0, 1, 2), preserving the interval order.
#
# strategy="uniform": all bins of a feature have the same width.
decoder = KBinsDiscretizer(
    n_bins=3,
    encode="ordinal",
    strategy="uniform",
)

In [41]:
# Load the iris dataset and keep its numeric feature columns as a DataFrame.
# The raw dataset object gets its own name so that `x` only ever refers to the
# feature table (previously `x` was first the dataset object, then the DataFrame).
iris = load_iris()
x = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [42]:
# Display the feature table as the cell output (long frames are rendered truncated).
x

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [43]:
# Fit the discretizer on the feature table and print the bin index assigned to every value
# (one row per observation, one column per feature).
print(decoder.fit_transform(x))

[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 2. 0. 0.]
 [1. 2. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [1. 2. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [1. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [0. 2. 0. 0.]
 [0. 1. 0. 0.]
 [2. 1. 1. 1.]
 [1. 1. 1. 1.]
 [2. 1. 1. 1.]
 [1. 0. 1. 1.]
 [1. 0. 1. 1.]
 [1. 0. 1. 1.]
 [1. 1. 1. 1.]
 [0. 0. 1. 1.]
 [1. 1. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [1. 1. 1. 1.]
 [1. 0. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [2. 1. 1. 1.]
 [1. 1. 1.



In [34]:
# The output is the result of discretizing the data.
# For each observation (row) and each feature (column) of the Iris dataset,
# values were converted to ordinal bin indices (integers, printed as floats), according to a uniform split into 3 intervals.
# Each number in the output indicates which interval the original value of the feature fell into.

# [0., 1., 0., 0.]
# First number (0.): The value of the first feature (sepal length) fell into the first interval.
# Second number (1.): The value of the second feature (sepal width) fell into the second interval.
# Third and fourth numbers (0. and 0.): The values of the third and fourth features (petal length and petal width) fell into the first interval.



In [35]:
# Display the boundaries (edges) of the bins that the KBinsDiscretizer used to discretize the data.
bins = decoder.bin_edges_
print(bins)

# First feature (at index 0):
# Bin boundaries: [4.3, 5.5, 6.7, 7.9]
# These four edges delimit three intervals (n_bins=3): values from 4.3 (inclusive) to 5.5 (not inclusive) belong to the first interval, and so on.

# Second feature (at index 1):
# Bin boundaries: [2. , 2.8, 3.6, 4.4]
# The second feature was divided into three intervals with boundaries from 2.0 to 4.4.

# Third feature (at index 2):
# Bin boundaries: [1. , 2.96666667, 4.93333333, 6.9]
# The third feature was divided into three intervals with boundaries from 1.0 to 6.9.

# Fourth feature (at index 3):
# Bin boundaries: [0.1, 0.9, 1.7, 2.5]
# The fourth feature was divided into three intervals with boundaries from 0.1 to 2.5.

# Each entry of `bins` corresponds to one feature (column) of the dataset, in column order.

[array([4.3, 5.5, 6.7, 7.9]) array([2. , 2.8, 3.6, 4.4])
 array([1.        , 2.96666667, 4.93333333, 6.9       ])
 array([0.1, 0.9, 1.7, 2.5])]


In [46]:
# Preview the first rows to see the scale of the values before binarizing.
print(x.head())

# threshold=5: every value strictly greater than 5 is converted to 1 (one),
# and every value less than or equal to 5 is converted to 0 (zero).
binarizer = Binarizer(threshold=5)



   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [47]:
# Binarize the "sepal length (cm)" feature against the binarizer's threshold.
# The column is reshaped to a single-column 2-D array before being passed in;
# each value in the returned array is 1 if the corresponding original value
# is greater than the threshold and 0 otherwise.
binarizer.fit_transform(
    x["sepal length (cm)"].to_numpy().reshape(-1, 1)
)

array([[1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],