# [UCI Heart Disease Data](https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data) (Kaggle) – klasyfikacja choroby serca



In [1]:
import kagglehub
import os
import pandas as pd

# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download("redwankarimsony/heart-disease-data")
print("Path to dataset files:", path)

# List the directory contents to confirm which files were downloaded.
print(os.listdir(path))

# Read the CSV file from the download directory into a DataFrame.
csv_path = os.path.join(path, "heart_disease_uci.csv")
df = pd.read_csv(csv_path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/redwankarimsony/heart-disease-data?dataset_version_number=6...


100%|██████████| 12.4k/12.4k [00:00<00:00, 19.7MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/redwankarimsony/heart-disease-data/versions/6
['heart_disease_uci.csv']





In [2]:
# Print the DataFrame summary: index range, per-column non-null counts,
# dtypes and memory usage. The non-null counts show which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


# Wymiary
- Rekordy: 920
- Kolumny: 16 (w tym identyfikator `id` oraz zmienna docelowa `num`)

# Opis cech

### Numeryczne (int64 / float64): 8 kolumn

id – unikalny identyfikator pacjenta

age – wiek pacjenta

trestbps – ciśnienie krwi w spoczynku

chol – cholesterol w mg/dl

thalch – maksymalne osiągnięte tętno

oldpeak – ST depression wywołane wysiłkiem

ca – liczba naczyń głównych pokolorowanych fluoroskopowo

num – zmienna docelowa: etykieta choroby serca, którą przewidujemy


### Kategoryczne / obiektowe (object): 8 kolumn

sex – płeć

dataset – pochodzenie badania

cp – typ bólu w klatce piersiowej

fbs – cukier na czczo > 120 mg/dl (True/False)

restecg – wynik EKG w spoczynku

exang – dławica wywołana wysiłkiem

slope – nachylenie odcinka ST

thal – typ defektu (normal, fixed defect, reversible defect)


### Brakujące dane

trestbps – 861/920 → ok. 6.4% braków

chol – 890/920 → ok. 3.3% braków

fbs – 830/920 → ok. 9.8% braków

slope – 611/920 → ok. 33.6% braków

ca – 309/920 → ok. 66.4% braków

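thal – 434/920 → ok. 52.8% braków

thalch – 865/920 → ok. 6.0% braków

exang – 865/920 → ok. 6.0% braków

oldpeak – 858/920 → ok. 6.7% braków

restecg – 918/920 → ok. 0.2% braków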

In [3]:
# Dataset dimensions: df.shape is (number of rows, number of columns).
n_rows = df.shape[0]
n_cols = df.shape[1]
print(f"\nLiczba rekordów: {n_rows}")
print(f"Liczba cech (kolumn): {n_cols}")

# Preview of the first rows, printed as plain text under a section banner.
print("\n--- Przykładowe wiersze ---", df.head(), sep="\n")

# Summary statistics produced by DataFrame.describe().
print("\n--- Statystyki opisowe ---", df.describe(), sep="\n")



Liczba rekordów: 920
Liczba cech (kolumn): 16

--- Przykładowe wiersze ---
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable de

In [5]:
import numpy as np

# The label column is kept out of both feature lists; target_col is None
# when the frame has no 'num' column, in which case nothing is filtered out.
target_col = 'num' if 'num' in df.columns else None

# Numeric feature columns (every numpy-numeric dtype), minus the target.
num_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns.tolist()
    if name != target_col
]

# Categorical feature columns (object / category dtypes), minus the target.
cat_cols = [
    name
    for name in df.select_dtypes(include=['object', 'category']).columns.tolist()
    if name != target_col
]

print("num_cols:", num_cols)
print("cat_cols:", cat_cols)

num_cols: ['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
cat_cols: ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']


In [7]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

# Fill missing values. Numeric columns are imputed in two groups (mean and
# KNN), categorical columns with their most frequent value; the results are
# recombined into df_imputed. The source frame df is left untouched.
num_df = df[num_cols].copy()
cat_df = df[cat_cols].copy()

# The numeric columns are split by position: the first half gets mean
# imputation, the second half gets KNN imputation.
# NOTE(review): the split does not look at how many values each column is
# missing — confirm that a purely positional split is intended.
half = len(num_cols) // 2
num_easy = num_cols[:half]
num_hard = num_cols[half:]

# Every rebuilt frame reuses df.index: the imputers return bare arrays, and
# without an explicit index pd.concat(axis=1) would misalign rows whenever
# df does not carry a default RangeIndex.
simple_imputer = SimpleImputer(strategy='mean')
num_easy_imputed = pd.DataFrame(
    simple_imputer.fit_transform(num_df[num_easy]),
    columns=num_easy,
    index=df.index,
)

# NOTE(review): the KNN imputer sees only the num_hard columns and works on
# their raw, unscaled values — consider scaling first; confirm with author.
knn_imputer = KNNImputer(n_neighbors=5)
num_hard_imputed = pd.DataFrame(
    knn_imputer.fit_transform(num_df[num_hard]),
    columns=num_hard,
    index=df.index,
)

num_imputed = pd.concat([num_easy_imputed, num_hard_imputed], axis=1)

cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputed = pd.DataFrame(
    cat_imputer.fit_transform(cat_df),
    columns=cat_cols,
    index=df.index,
)

df_imputed = pd.concat([num_imputed, cat_imputed], axis=1)

print("✅ Dane uzupełnione:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df_imputed.info()

✅ Dane uzupełnione:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    float64
 1   age       920 non-null    float64
 2   trestbps  920 non-null    float64
 3   chol      920 non-null    float64
 4   thalch    920 non-null    float64
 5   oldpeak   920 non-null    float64
 6   ca        920 non-null    float64
 7   sex       920 non-null    object 
 8   dataset   920 non-null    object 
 9   cp        920 non-null    object 
 10  fbs       920 non-null    object 
 11  restecg   920 non-null    object 
 12  exang     920 non-null    object 
 13  slope     920 non-null    object 
 14  thal      920 non-null    object 
dtypes: float64(7), object(8)
memory usage: 107.9+ KB
None


In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Encode every categorical column numerically. Work on a copy so that
# df_imputed stays unchanged and this cell can be re-run safely.
df_encoded = df_imputed.copy()

# Columns encoded as integer ranks. Each list fixes the category order: the
# first entry maps to 0.0, the second to 1.0, and so on.
# NOTE(review): these orderings are a modelling assumption — confirm that
# treating cp / restecg / slope / thal as ordered is intended.
ordinal_features = {
    'cp': ['asymptomatic', 'non-anginal', 'atypical angina', 'typical angina'],
    'restecg': ['normal', 'st-t abnormality', 'lv hypertrophy'],
    'slope': ['upsloping', 'flat', 'downsloping'],
    'thal': ['normal', 'reversable defect', 'fixed defect']
}
ordinal_cols = list(ordinal_features)

# The category lists are passed in the same order as the selected columns.
encoder = OrdinalEncoder(categories=[ordinal_features[col] for col in ordinal_cols])
df_encoded[ordinal_cols] = encoder.fit_transform(df_encoded[ordinal_cols])

# The remaining categorical columns are one-hot encoded; drop_first=True
# drops the first level of each column.
nominal_features = ['sex', 'dataset', 'fbs', 'exang']
df_encoded = pd.get_dummies(df_encoded, columns=nominal_features, drop_first=True)

print("✅ Wszystkie kolumny zakodowane numerycznie:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df_encoded.info()
df_encoded.head()


✅ Wszystkie kolumny zakodowane numerycznie:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     920 non-null    float64
 1   age                    920 non-null    float64
 2   trestbps               920 non-null    float64
 3   chol                   920 non-null    float64
 4   thalch                 920 non-null    float64
 5   oldpeak                920 non-null    float64
 6   ca                     920 non-null    float64
 7   cp                     920 non-null    float64
 8   restecg                920 non-null    float64
 9   slope                  920 non-null    float64
 10  thal                   920 non-null    float64
 11  sex_Male               920 non-null    bool   
 12  dataset_Hungary        920 non-null    bool   
 13  dataset_Switzerland    920 non-null    bool   
 14  dataset_VA Lon

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,cp,restecg,slope,thal,sex_Male,dataset_Hungary,dataset_Switzerland,dataset_VA Long Beach,fbs_True,exang_True
0,1.0,63.0,145.0,233.0,150.0,2.3,0.0,3.0,2.0,2.0,2.0,True,False,False,False,True,False
1,2.0,67.0,160.0,286.0,108.0,1.5,3.0,0.0,2.0,1.0,0.0,True,False,False,False,False,True
2,3.0,67.0,120.0,229.0,129.0,2.6,2.0,0.0,2.0,1.0,1.0,True,False,False,False,False,True
3,4.0,37.0,130.0,250.0,187.0,3.5,0.0,1.0,0.0,2.0,0.0,True,False,False,False,False,False
4,5.0,41.0,130.0,204.0,172.0,1.4,0.0,2.0,2.0,0.0,0.0,False,False,False,False,False,False
