# Демонстрация класса DataPreprocessor

В этом ноутбуке показана работа класса `DataPreprocessor` на датасете **Titanic**.

## Импорт библиотек и класса DataPreprocessor

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from c_task import DataPreprocessor

## Загрузка и обзор датасета Titanic

In [2]:
# Load seaborn's "titanic" example dataset into a DataFrame.
df = sns.load_dataset("titanic")
# Print the per-column non-null counts and dtypes.
df.info()
# Last expression of the cell: the notebook renders the first five rows.
df.head()

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    str     
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    str     
 8   class        891 non-null    category
 9   who          891 non-null    str     
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    str     
 13  alive        891 non-null    str     
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), str(5)
memory usage: 80.7 KB


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Анализ пропущенных значений

In [3]:
# Share of missing values per column, sorted from the most to the least sparse.
# NOTE: .mean() over the boolean mask yields fractions in [0, 1], not percent values.
missing = df.isna().mean().sort_values(ascending=False)

# Report only the columns that actually contain gaps.
with_gaps = missing[missing > 0]
print("null values percentage:")
print(with_gaps.to_string())

null values percentage:
deck           0.772166
age            0.198653
embarked       0.002245
embark_town    0.002245


## Демонстрация `remove_missing()`

Удалим столбцы, в которых доля пропусков превышает 50 %, а оставшиеся пропуски заполним модой соответствующего столбца.

In [4]:
# Run only the missing-value step and inspect what it recorded.
prep = DataPreprocessor(df)
prep.remove_missing()
log = prep.transform_log

# Columns recorded as removed (the trailing "\n" leaves a blank line in the output).
print("Removed cols:", log["removed_cols"], "\n")

# Columns recorded as filled, together with the value used for each of them.
print("Filled columns:")
for name, fill_value in log["filled_cols"].items():
    print(f"{name}: mode - {fill_value}")

# Sanity check: total number of NaN cells left in the processed frame.
remaining = prep.data.isna().sum().sum()
print("Remaining missing values:", remaining)
prep.data.head()

Removed cols: ['deck'] 

Filled columns:
age: mode - 24.0
embarked: mode - S
embark_town: mode - Southampton
Remaining missing values: 0


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


## Демонстрация `encode_categorical()`

One-Hot Encoding всех категориальных столбцов.

In [5]:
# Fresh preprocessor: missing values are handled first, then encoding is applied.
prep2 = DataPreprocessor(df)
prep2.remove_missing()

# Capture the frame shape on both sides of the encoding step.
shape_before = prep2.data.shape
prep2.encode_categorical()
shape_after = prep2.data.shape
onehot_cols = prep2.transform_log["onehot_cols"]

# The "\n" inside the second string leaves a blank line before the column list.
print(f"size before:  {shape_before}", f"size after: {shape_after}\n", sep="\n")
print("one-hot columns:", onehot_cols)
prep2.data.head()

size before:  (891, 14)
size after: (891, 24)

one-hot columns: ['sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child', 'who_man', 'who_woman', 'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no', 'alive_yes']


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,class_Second,class_Third,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,True,False,0,1,...,0,1,0,1,0,0,0,1,1,0
1,1,1,38.0,1,0,71.2833,False,False,1,0,...,0,0,0,0,1,1,0,0,0,1
2,1,3,26.0,0,0,7.925,False,True,1,0,...,0,1,0,0,1,0,0,1,0,1
3,1,1,35.0,1,0,53.1,False,False,1,0,...,0,0,0,0,1,0,0,1,0,1
4,0,3,35.0,0,0,8.05,True,True,0,1,...,0,1,0,1,0,0,0,1,1,0


## Демонстрация `normalize_numeric()`

In [6]:
# Min-Max: the printout checks that every numeric column spans [0, 1] after scaling.
prep_mm = DataPreprocessor(df)
prep_mm.remove_missing(threshold=0.5)
prep_mm.encode_categorical()
prep_mm.normalize_numeric(method="minmax")

# Restrict the summary to numeric columns and keep only their extremes.
num_cols = prep_mm.data.select_dtypes(include=np.number).columns
extremes = prep_mm.data[num_cols].describe().loc[["min", "max"]]

print("Min-Max:")
print(extremes.to_string())

Min-Max:
     survived  pclass  age  sibsp  parch  fare  sex_female  sex_male  embarked_C  embarked_Q  embarked_S  class_First  class_Second  class_Third  who_child  who_man  who_woman  embark_town_Cherbourg  embark_town_Queenstown  embark_town_Southampton  alive_no  alive_yes
min       0.0     0.0  0.0    0.0    0.0   0.0         0.0       0.0         0.0         0.0         0.0          0.0           0.0          0.0        0.0      0.0        0.0                    0.0                     0.0                      0.0       0.0        0.0
max       1.0     1.0  1.0    1.0    1.0   1.0         1.0       1.0         1.0         1.0         1.0          1.0           1.0          1.0        1.0      1.0        1.0                    1.0                     1.0                      1.0       1.0        1.0


In [7]:
# std: the printout checks that every numeric column has mean ≈ 0 and std ≈ 1.
prep_std = DataPreprocessor(df)
prep_std.remove_missing(threshold=0.5)
prep_std.encode_categorical()
prep_std.normalize_numeric(method="std")

# Restrict the summary to numeric columns and keep only the first two moments.
num_cols = prep_std.data.select_dtypes(include=np.number).columns
moments = prep_std.data[num_cols].describe().loc[["mean", "std"]]

print("mean ≈ 0, std ≈ 1")
print(moments.to_string())

mean ≈ 0, std ≈ 1
          survived        pclass           age         sibsp         parch          fare    sex_female      sex_male    embarked_C  embarked_Q    embarked_S   class_First  class_Second   class_Third     who_child       who_man     who_woman  embark_town_Cherbourg  embark_town_Queenstown  embark_town_Southampton      alive_no     alive_yes
mean  3.987333e-17 -8.772133e-17  5.980999e-17  4.386066e-17  5.382900e-17  3.987333e-18  3.987333e-17 -1.156327e-16 -1.993666e-17    0.000000 -8.373399e-17 -7.575933e-17  1.993666e-17 -6.778466e-17 -3.389233e-17 -5.980999e-17  2.392400e-17          -1.993666e-17                0.000000            -8.373399e-17  9.170866e-17  3.987333e-17
std   1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00    1.000562  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00           1.000562e+00                1.000562             1.000562e+00  1.000562e+00  1.000562e+00

## Полный pipeline — `fit_transform()`

In [10]:
# Single-call variant: fit_transform() returns the processed frame directly.
pipeline = DataPreprocessor(df)
result = pipeline.fit_transform()

# Summarise the outcome: overall shape and how many columns ended up with each dtype.
shape = result.shape
dtype_counts = result.dtypes.value_counts()

print(f"res shape: {shape}")
print(f"dtype count:\n{dtype_counts}")
print()
result.describe()

res shape: (891, 24)
dtype count:
float64    22
bool        2
Name: count, dtype: int64



Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,...,class_Second,class_Third,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,0.654321,0.353694,0.065376,0.063599,0.062858,0.352413,0.647587,0.188552,0.08642,...,0.20651,0.551066,0.093154,0.602694,0.304153,0.188552,0.08642,0.725028,0.616162,0.383838
std,0.486592,0.418036,0.165865,0.137843,0.134343,0.096995,0.47799,0.47799,0.391372,0.281141,...,0.405028,0.497665,0.290811,0.489615,0.460306,0.391372,0.281141,0.446751,0.486592,0.486592
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.5,0.271174,0.0,0.0,0.01544,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.296306,0.0,0.0,0.028213,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,0.434531,0.125,0.0,0.060508,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Базовая обработка ошибок

In [9]:
# Case 1: the constructor is given a list instead of a DataFrame -> TypeError.
try:
    DataPreprocessor([1, 2, 3])
except TypeError as e:
    print(f"TypeError:  {e}")

# Case 2: a threshold outside the [0, 1] range -> ValueError.
try:
    DataPreprocessor(df).remove_missing(threshold=1.5)
except ValueError as e:
    print(f"ValueError: {e}")

# Case 3: a normalisation method other than 'minmax' / 'std' -> ValueError.
try:
    DataPreprocessor(df).normalize_numeric(method="invalid")
except ValueError as e:
    print(f"ValueError: {e}")

TypeError:  init value need to be pd.dataframe, got <class 'list'>
ValueError: threshold need to be in [0, 1]
ValueError: Invalid method, method need to be 'minmax' or 'std'
