# Feature Engineering

In [1]:
import datetime
import pandas as pd
import numpy as np
from main import one_hot_encoding, label_encoding, ordinal_encoding, create_interaction_features, create_polynomial_features, standardize_features, normalize_features, discretize_features, remove_highly_correlated_features, remove_features_with_high_missing_data, extract_time_features, create_time_features, extract_text_features, transform_text_to_features

In [2]:
# Load the heart-disease risk table. The path is relative, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv('heart_disease_risk.csv')
# Show the loaded frame. Per the output below it holds 13 numeric feature
# columns (age ... thal) plus `decision` (presumably the target label — confirm).
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,decision
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
293,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
294,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
295,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1


In [3]:
# Polynomial order handed to create_polynomial_features in a later cell.
degree = 2

# Per-column lookup from category label to integer code; passed to
# ordinal_encoding in a later cell.
ordinal_mapping = {
    # Chest-pain labels are numbered consecutively from 1 in the listed order.
    'cp': {
        label: code
        for code, label in enumerate(
            ('typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic'),
            start=1,
        )
    },
    # The thal codes are not consecutive, so labels and codes are paired explicitly.
    'thal': dict(zip(('normal', 'fixed defect', 'reversible defect'), (3, 6, 7))),
}

# Columns handed to the encoding helpers: exactly the columns that have a mapping.
categorical_columns = list(ordinal_mapping)

In [4]:
# 3. Standardization and normalization
# This cell is numbered step 3 but sits (and was executed) ahead of steps 1-2.
# Alternative scaling, left disabled: data = standardize_features(data)
# Rebinds `data` to the output of normalize_features. The frame displayed in the
# next cell shows every column rescaled into [0, 1], consistent with min-max
# scaling (e.g. age 63 -> 0.708333).
# NOTE(review): the rescaling also hits `decision` (now float 0.0/1.0) and the
# categorical codes in `cp` / `thal` — confirm that is intended before encoding.
data = normalize_features(data)

In [5]:
# Inspect the frame returned by normalize_features; in the output below all
# columns, `decision` included, lie in the range [0, 1].
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,decision
0,0.708333,1.0,0.000000,0.481132,0.244292,1.0,1.0,0.603053,0.0,0.370968,1.0,0.000000,0.75,0.0
1,0.791667,1.0,1.000000,0.622642,0.365297,0.0,1.0,0.282443,1.0,0.241935,0.5,1.000000,0.00,1.0
2,0.791667,1.0,1.000000,0.245283,0.235160,0.0,1.0,0.442748,1.0,0.419355,0.5,0.666667,1.00,1.0
3,0.166667,1.0,0.666667,0.339623,0.283105,0.0,0.0,0.885496,0.0,0.564516,1.0,0.000000,0.00,0.0
4,0.250000,0.0,0.333333,0.339623,0.178082,0.0,1.0,0.770992,0.0,0.225806,0.0,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,0.583333,0.0,1.000000,0.433962,0.262557,0.0,0.0,0.396947,1.0,0.032258,0.5,0.000000,1.00,1.0
293,0.333333,1.0,0.000000,0.150943,0.315068,0.0,0.0,0.465649,0.0,0.193548,0.5,0.000000,1.00,1.0
294,0.812500,1.0,1.000000,0.471698,0.152968,1.0,0.0,0.534351,0.0,0.548387,0.5,0.666667,1.00,1.0
295,0.583333,1.0,1.000000,0.339623,0.011416,0.0,0.0,0.335878,1.0,0.193548,0.5,0.333333,1.00,1.0


In [None]:
# 1. Encoding categorical variables
# Three encoders are applied one after another to the same columns
# (`categorical_columns` = ['cp', 'thal']); each call rebinds `data`.
# NOTE(review): if one_hot_encoding replaces the source columns, the two later
# calls will no longer find 'cp' / 'thal' — confirm against main.py; presumably
# only one of the three encoders should be active at a time.
# NOTE(review): ordinal_mapping is keyed by text labels, while the loaded data
# shows numeric codes in these columns (already rescaled to [0, 1] by the
# normalization cell above) — verify that the mapping can actually match.
data = one_hot_encoding(data, categorical_columns)
data = label_encoding(data, categorical_columns)
data = ordinal_encoding(data, categorical_columns, ordinal_mapping)

In [None]:
# 2. Creating new features
# Pass `data` through create_polynomial_features with `degree` (set to 2 in the
# parameter cell), then feed that result to create_interaction_features.
# NOTE(review): presumably a degree-2 polynomial expansion already contains the
# pairwise products, so the interaction step may add duplicate columns —
# confirm against main.py.
data = create_polynomial_features(data, degree)
data = create_interaction_features(data)

In [None]:
# 4. Discretization
# `continuous_columns` and `n_bins` were not defined anywhere in the notebook,
# so this cell raised NameError on Restart & Run All. They are defined here.
# The columns listed are the ones that hold continuous measurements in the
# loaded table (age, resting blood pressure, cholesterol, max heart rate,
# oldpeak); the remaining columns hold small sets of discrete codes.
# NOTE(review): both values are proposed defaults — tune as needed. Assumes these
# columns still exist under the same names after the earlier steps; confirm.
continuous_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
n_bins = 5
data = discretize_features(data, continuous_columns, n_bins)

In [None]:
# 5. Feature removal
# `threshold` and `missing_threshold` were not defined anywhere in the notebook,
# so this cell raised NameError on Restart & Run All. They are defined here.
# NOTE(review): proposed defaults. Presumably `threshold` is an absolute
# correlation cut-off and `missing_threshold` a fraction of missing values in
# [0, 1] — confirm the expected scale against main.py.
threshold = 0.9
missing_threshold = 0.5
data = remove_highly_correlated_features(data, threshold)
data = remove_features_with_high_missing_data(data, missing_threshold)

In [None]:
# 6. Time feature engineering
# `time_column` was not defined anywhere in the notebook (NameError on
# Restart & Run All), and the table loaded from 'heart_disease_risk.csv' shows
# only numeric clinical columns with no date/time field. The step is therefore
# disabled by default; set `time_column` to a datetime column name to enable it.
time_column = None
if time_column is not None:
    data = extract_time_features(data, time_column)
    data = create_time_features(data, time_column)

In [None]:
# 7. Creating features from text data
# `text_column` was not defined anywhere in the notebook (NameError on
# Restart & Run All), and the table loaded from 'heart_disease_risk.csv' shows
# only numeric columns with no free-text field. The step is therefore disabled
# by default; set `text_column` to a text column name to enable it.
text_column = None
if text_column is not None:
    data = extract_text_features(data, text_column)
    data = transform_text_to_features(data, text_column)