In [None]:
import numpy as np
import pandas as pd


In [None]:
# create an array of numpy, which is immediately placed in the dataframe
# dataframe format (number of rows and columns) is formed by the reshape(5, 3) function - 5 rows, 3 columns
x = pd.DataFrame(np.array([2, 4, 3, 1, 5, np.NaN, 4, 1, np.NaN, 2, 5, 3, 2, np.NaN, 2]).reshape(5, 3))

In [None]:
# set column names
x.columns = ["f_1", "f_2", "f_3"]

In [None]:
x

Unnamed: 0,f_1,f_2,f_3
0,2.0,4.0,3.0
1,1.0,5.0,
2,4.0,1.0,
3,2.0,5.0,3.0
4,2.0,,2.0


In [None]:
# number of empty (missing) values ​​in each column
x.isnull().sum()

f_1    0
f_2    1
f_3    2
dtype: int64

In [None]:
# create an indicator of missing values ​​in the data.

# The Missing Value Indicator is a special variable or matrix
# which helps track and indicate the location of missing values ​​in a data set.

from sklearn.impute import MissingIndicator

# A MissingIndicator object is created indicating
# that missing values ​​in the data are represented by the value np.NaN (Not a Number).
indicator = MissingIndicator(missing_values=np.NaN)

# Training and applying the indicator to data x. The result will be a new matrix,
# in which a flag will be set for each missing value.
# If a value in the data was missing, the corresponding element in the new matrix will be 1 (true),
# otherwise - 0 (false).
indicator = indicator.fit_transform(x)

In [None]:
# convert the indicator matrix into a dataframe
# we mark only two columns in it - “f1”, “f2”, since the missing values ​​in the X dataframe are only in two columns
# if the number of columns in the new dataframe does not correspond to the number of columns with missing values ​​in the result, then this
# will cause errors

# simply put, the indicator is a dataframe with exclusively empty values

indicator = pd.DataFrame(indicator, columns=["f1", "f2"])

In [None]:
indicator

Unnamed: 0,f1,f2
0,False,False
1,False,True
2,False,True
3,False,False
4,True,False


In [30]:
# implementation of the imputing method for processing null (missing) dataset values
from sklearn.impute import SimpleImputer


# initialize the object for imputing
# defines a strategy for replacing missing values. In this case strategy='mean',
# which means using the average to replace missing values.

# in general strategy can take the following values:

# 'mean' (default): Replaces missing values ​​with the mean of the column.
# 'median': Replaces missing values ​​with the median value of the column.
# 'most_frequent': Replaces missing values ​​with the most frequently occurring value in the column.
# 'constant': Replaces missing values ​​with a constant specified by fill_value.


imp = SimpleImputer(missing_values=np.NaN, strategy='mean')
res = imp.fit_transform(x)

new_res = pd.DataFrame(res)
new_res.columns = ["f1", "f2", "f3"]
print(new_res, '\n')

print(x)


    f1    f2        f3
0  2.0  4.00  3.000000
1  1.0  5.00  2.666667
2  4.0  1.00  2.666667
3  2.0  5.00  3.000000
4  2.0  3.75  2.000000 

   f_1  f_2  f_3
0  2.0  4.0  3.0
1  1.0  5.0  NaN
2  4.0  1.0  NaN
3  2.0  5.0  3.0
4  2.0  NaN  2.0


In [31]:
new_res.isnull().sum()

f1    0
f2    0
f3    0
dtype: int64

In [32]:
# repeat the experiment using a different strategy

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.NaN, strategy='median')
res = imp.fit_transform(x)

new_res = pd.DataFrame(res)
new_res.columns = ["f1", "f2", "f3"]
print(new_res, '\n')

print(x)


    f1   f2   f3
0  2.0  4.0  3.0
1  1.0  5.0  3.0
2  4.0  1.0  3.0
3  2.0  5.0  3.0
4  2.0  4.5  2.0 

   f_1  f_2  f_3
0  2.0  4.0  3.0
1  1.0  5.0  NaN
2  4.0  1.0  NaN
3  2.0  5.0  3.0
4  2.0  NaN  2.0


In [33]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
res = imp.fit_transform(x)

new_res = pd.DataFrame(res)
new_res.columns = ["f1", "f2", "f3"]
print(new_res, '\n')

print(x)


    f1   f2   f3
0  2.0  4.0  3.0
1  1.0  5.0  3.0
2  4.0  1.0  3.0
3  2.0  5.0  3.0
4  2.0  5.0  2.0 

   f_1  f_2  f_3
0  2.0  4.0  3.0
1  1.0  5.0  NaN
2  4.0  1.0  NaN
3  2.0  5.0  3.0
4  2.0  NaN  2.0


In [34]:
# operation to remove rows containing missing values ​​from DataFrame x in pandas.


# axis=0 is a parameter indicating that the delete operation is applied to rows (observations) in the DataFrame.
# If axis=1, then the operation will be applied to columns.

new_data = x.dropna(axis=0)

In [35]:
new_data

Unnamed: 0,f_1,f_2,f_3
0,2.0,4.0,3.0
3,2.0,5.0,3.0
