In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [131]:
df = pd.read_csv('/Users/madinayelmuratova/Desktop/dataset.csv')

In [132]:
print(df.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  


In [133]:
print(df.tail())
print(df.info())

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  target  
298      1   0     3       0  
299      1   0     3       0  
300      1   2     3       0  
301      1   1     3       0  
302      1   1     2       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303

In [134]:
# Проверяем пропущенные значения и типы данных
print(df.isnull().sum())
print(df.dtypes)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object


In [135]:
print("Перед удалением пропущенных значений:")
print(df.isnull().sum())



Перед удалением пропущенных значений:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [136]:
df_dropped = df.dropna()
print("После удаления пропущенных значений:")
print(df_dropped.isnull().sum())



После удаления пропущенных значений:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [137]:
df_filled = df.fillna(df.mean())  # Заполнение средним
print(df_filled.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [138]:
# Заполнение значений frwrd
df_ffill = df.ffill()
print(df_ffill.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [139]:
df_bfill = df.bfill()
print(df_bfill.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [140]:
# Определение числовых признаков
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

# Нормализация числовых признаков
min_max_scaler = MinMaxScaler()
df[numerical_features] = min_max_scaler.fit_transform(df[numerical_features])

print(df.head())



        age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333    1   3  0.481132  0.244292    1        0  0.603053      0   
1  0.166667    1   2  0.339623  0.283105    0        1  0.885496      0   
2  0.250000    0   1  0.339623  0.178082    0        0  0.770992      0   
3  0.562500    1   1  0.245283  0.251142    0        1  0.816794      0   
4  0.583333    0   0  0.245283  0.520548    0        1  0.702290      1   

    oldpeak  slope   ca  thal  target  
0  0.370968      0  0.0     1       1  
1  0.564516      0  0.0     2       1  
2  0.225806      2  0.0     2       1  
3  0.129032      2  0.0     2       1  
4  0.096774      2  0.0     2       1  


In [141]:
# Определение категориальных признаков
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Кодирование категориальных признаков
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

print(df_encoded.head())

        age  trestbps      chol   thalach   oldpeak   ca  target  sex_1  cp_1  \
0  0.708333  0.481132  0.244292  0.603053  0.370968  0.0       1      1     0   
1  0.166667  0.339623  0.283105  0.885496  0.564516  0.0       1      1     0   
2  0.250000  0.339623  0.178082  0.770992  0.225806  0.0       1      0     1   
3  0.562500  0.245283  0.251142  0.816794  0.129032  0.0       1      1     1   
4  0.583333  0.245283  0.520548  0.702290  0.096774  0.0       1      0     0   

   cp_2  cp_3  fbs_1  restecg_1  restecg_2  exang_1  slope_1  slope_2  thal_1  \
0     0     1      1          0          0        0        0        0       1   
1     1     0      0          1          0        0        0        0       0   
2     0     0      0          0          0        0        0        1       0   
3     0     0      0          1          0        0        0        1       0   
4     0     0      0          1          0        1        0        1       0   

   thal_2  thal_3  
0     

In [142]:
# Бинирование переменной age
df_encoded['age_bins'] = pd.cut(df_encoded['age'], bins=[20, 30, 40, 50, 60, 70, 80], labels=['20-30', '30-40', '40-50', '50-60', '60-70', '70-80'])

print(df_encoded.head())

        age  trestbps      chol   thalach   oldpeak   ca  target  sex_1  cp_1  \
0  0.708333  0.481132  0.244292  0.603053  0.370968  0.0       1      1     0   
1  0.166667  0.339623  0.283105  0.885496  0.564516  0.0       1      1     0   
2  0.250000  0.339623  0.178082  0.770992  0.225806  0.0       1      0     1   
3  0.562500  0.245283  0.251142  0.816794  0.129032  0.0       1      1     1   
4  0.583333  0.245283  0.520548  0.702290  0.096774  0.0       1      0     0   

   cp_2  ...  fbs_1  restecg_1  restecg_2  exang_1  slope_1  slope_2  thal_1  \
0     0  ...      1          0          0        0        0        0       1   
1     1  ...      0          1          0        0        0        0       0   
2     0  ...      0          0          0        0        0        1       0   
3     0  ...      0          1          0        0        0        1       0   
4     0  ...      0          1          0        1        0        1       0   

   thal_2  thal_3  age_bins  
0 

In [143]:
# Создание новых признаков
df['chol_age_interaction'] = df['chol'] * df['age']
df['age_squared'] = df['age'] ** 2
df['chol_squared'] = df['chol'] ** 2

# высокий риск
df['high_risk'] = ((df['chol'] > 240) | (df['trestbps'] > 140) | (df['oldpeak'] > 1.0)).astype(int)

print(df.head())

        age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333    1   3  0.481132  0.244292    1        0  0.603053      0   
1  0.166667    1   2  0.339623  0.283105    0        1  0.885496      0   
2  0.250000    0   1  0.339623  0.178082    0        0  0.770992      0   
3  0.562500    1   1  0.245283  0.251142    0        1  0.816794      0   
4  0.583333    0   0  0.245283  0.520548    0        1  0.702290      1   

    oldpeak  slope   ca  thal  target  chol_age_interaction  age_squared  \
0  0.370968      0  0.0     1       1              0.173040     0.501736   
1  0.564516      0  0.0     2       1              0.047184     0.027778   
2  0.225806      2  0.0     2       1              0.044521     0.062500   
3  0.129032      2  0.0     2       1              0.141267     0.316406   
4  0.096774      2  0.0     2       1              0.303653     0.340278   

   chol_squared  high_risk  
0      0.059679          0  
1      0.080148          0  
2    

In [144]:
# Признак взаимодействия высокого холестерина и высокого давления
df['high_risk_interaction'] = ((df['chol'] > 240) & (df['trestbps'] > 140)).astype(int)
print(df.head())

        age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333    1   3  0.481132  0.244292    1        0  0.603053      0   
1  0.166667    1   2  0.339623  0.283105    0        1  0.885496      0   
2  0.250000    0   1  0.339623  0.178082    0        0  0.770992      0   
3  0.562500    1   1  0.245283  0.251142    0        1  0.816794      0   
4  0.583333    0   0  0.245283  0.520548    0        1  0.702290      1   

    oldpeak  slope   ca  thal  target  chol_age_interaction  age_squared  \
0  0.370968      0  0.0     1       1              0.173040     0.501736   
1  0.564516      0  0.0     2       1              0.047184     0.027778   
2  0.225806      2  0.0     2       1              0.044521     0.062500   
3  0.129032      2  0.0     2       1              0.141267     0.316406   
4  0.096774      2  0.0     2       1              0.303653     0.340278   

   chol_squared  high_risk  high_risk_interaction  
0      0.059679          0              

In [145]:

df['age_bins'] = pd.cut(df['age'], bins=[20, 30, 40, 50, 60, 70, 80], labels=['20-30', '30-40', '40-50', '50-60', '60-70', '70-80'])

print(df.head())

        age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333    1   3  0.481132  0.244292    1        0  0.603053      0   
1  0.166667    1   2  0.339623  0.283105    0        1  0.885496      0   
2  0.250000    0   1  0.339623  0.178082    0        0  0.770992      0   
3  0.562500    1   1  0.245283  0.251142    0        1  0.816794      0   
4  0.583333    0   0  0.245283  0.520548    0        1  0.702290      1   

    oldpeak  slope   ca  thal  target  chol_age_interaction  age_squared  \
0  0.370968      0  0.0     1       1              0.173040     0.501736   
1  0.564516      0  0.0     2       1              0.047184     0.027778   
2  0.225806      2  0.0     2       1              0.044521     0.062500   
3  0.129032      2  0.0     2       1              0.141267     0.316406   
4  0.096774      2  0.0     2       1              0.303653     0.340278   

   chol_squared  high_risk  high_risk_interaction age_bins  
0      0.059679          0     

In [146]:
# полиномиальные признаки
df['age_squared'] = df['age'] ** 2
df['chol_squared'] = df['chol'] ** 2

print(df.head())

        age  sex  cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333    1   3  0.481132  0.244292    1        0  0.603053      0   
1  0.166667    1   2  0.339623  0.283105    0        1  0.885496      0   
2  0.250000    0   1  0.339623  0.178082    0        0  0.770992      0   
3  0.562500    1   1  0.245283  0.251142    0        1  0.816794      0   
4  0.583333    0   0  0.245283  0.520548    0        1  0.702290      1   

    oldpeak  slope   ca  thal  target  chol_age_interaction  age_squared  \
0  0.370968      0  0.0     1       1              0.173040     0.501736   
1  0.564516      0  0.0     2       1              0.047184     0.027778   
2  0.225806      2  0.0     2       1              0.044521     0.062500   
3  0.129032      2  0.0     2       1              0.141267     0.316406   
4  0.096774      2  0.0     2       1              0.303653     0.340278   

   chol_squared  high_risk  high_risk_interaction age_bins  
0      0.059679          0     

In [147]:
df = df.drop_duplicates()

print(f"Количество строк после удаления дубликатов: {len(df)}")


Количество строк после удаления дубликатов: 302


In [148]:
# Выбираем числовые признаки
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

In [149]:
# Метод Z-оценки
z_scores = stats.zscore(df[numerical_features])
df_z_cleaned = df[(abs(z_scores) < 3).all(axis=1)]
print(f"Количество строк после удаления выбросов по Z-оценке: {len(df_z_cleaned)}")


Количество строк после удаления выбросов по Z-оценке: 289


In [150]:
# Приведение текстовых значений к единому формату
df_cleaned = df_z_cleaned.copy()  # набор данных после Z-оценки


In [151]:
#стандартизации текстовых данных для столбца
df_cleaned['sex'] = df_cleaned['sex'].map({0: 'female', 1: 'male'}).str.lower()

In [152]:
# Пример объединения категорий для столбца 'cp' (chest pain type)
df_cleaned['cp'] = df_cleaned['cp'].replace({
    1: 'typical angina', 
    2: 'atypical angina', 
    3: 'non-anginal pain', 
    4: 'asymptomatic'
}).str.lower()

In [153]:
print(f"Количество строк после полной очистки данных: {len(df_cleaned)}")
print(df_cleaned.head())

Количество строк после полной очистки данных: 289
        age     sex                cp  trestbps      chol  fbs  restecg  \
0  0.708333    male  non-anginal pain  0.481132  0.244292    1        0   
1  0.166667    male   atypical angina  0.339623  0.283105    0        1   
2  0.250000  female    typical angina  0.339623  0.178082    0        0   
3  0.562500    male    typical angina  0.245283  0.251142    0        1   
4  0.583333  female               NaN  0.245283  0.520548    0        1   

    thalach  exang   oldpeak  slope   ca  thal  target  chol_age_interaction  \
0  0.603053      0  0.370968      0  0.0     1       1              0.173040   
1  0.885496      0  0.564516      0  0.0     2       1              0.047184   
2  0.770992      0  0.225806      2  0.0     2       1              0.044521   
3  0.816794      0  0.129032      2  0.0     2       1              0.141267   
4  0.702290      1  0.096774      2  0.0     2       1              0.303653   

   age_squared  ch

In [154]:
 # Сохранение очищенного набора данных в файл
df_cleaned.to_csv('heart_cleaned.csv', index=False)


In [155]:
df = pd.read_csv('heart_cleaned.csv')

# Проверка первых строк загруженного набора данных
print(df.head())

        age     sex                cp  trestbps      chol  fbs  restecg  \
0  0.708333    male  non-anginal pain  0.481132  0.244292    1        0   
1  0.166667    male   atypical angina  0.339623  0.283105    0        1   
2  0.250000  female    typical angina  0.339623  0.178082    0        0   
3  0.562500    male    typical angina  0.245283  0.251142    0        1   
4  0.583333  female               NaN  0.245283  0.520548    0        1   

    thalach  exang   oldpeak  slope   ca  thal  target  chol_age_interaction  \
0  0.603053      0  0.370968      0  0.0     1       1              0.173040   
1  0.885496      0  0.564516      0  0.0     2       1              0.047184   
2  0.770992      0  0.225806      2  0.0     2       1              0.044521   
3  0.816794      0  0.129032      2  0.0     2       1              0.141267   
4  0.702290      1  0.096774      2  0.0     2       1              0.303653   

   age_squared  chol_squared  high_risk  high_risk_interaction  age_

In [156]:
# Проверка уникальных значений в категориальном признаке 'sex'
print(df_cleaned['sex'].unique())

['male' 'female']


In [157]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
X = df.drop(columns=['target'])  # Все признаки (фичи)
y = df['target']  # Целевая переменная


In [158]:
#  (80-20)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Обучающая выборка (80-20): {X_train_80.shape}, Тестовая выборка: {X_test_80.shape}")

Обучающая выборка (80-20): (231, 19), Тестовая выборка: (58, 19)


In [166]:
# (70-30)
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Обучающая выборка (70-30): {X_train_70.shape}, Тестовая выборка: {X_test_70.shape}")

Обучающая выборка (70-30): (202, 19), Тестовая выборка: (87, 19)


In [171]:
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

In [172]:
# Функция для разделения данных
def evaluate_model(split_ratio):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)
    
    # пайплайн для обработки данных и модели
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_features),
            ('cat', categorical_pipeline, categorical_features)
        ])
    
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
    
    # Обучение модели
    model_pipeline.fit(X_train, y_train)
    
    # оценка точности модели
    y_pred = model_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


In [170]:

# Оценка точности модели при разделении 80-20 и 70-30
accuracy_80 = evaluate_model(0.2)
print(f"Точность модели при разделении 80-20: {accuracy_80}")

accuracy_70 = evaluate_model(0.3)
print(f"Точность модели при разделении 70-30: {accuracy_70}")

Точность модели при разделении 80-20: 0.8620689655172413
Точность модели при разделении 70-30: 0.8620689655172413
