Задача: Обучить нейронную сеть на основе критериев из датасета (Пол, общее состояние здоровья, физическая активность и т.д.) определять , происходил-ли у человека сердечный приступ.

In [1]:
import numpy as np              # Массивы (матрицы, векторы, линейная алгебра)
import matplotlib.pyplot as plt # Научная графика
%matplotlib inline 
    # Говорим jupyter'у, чтобы весь графический вывод был в браузере, а не в отдельном окне
import pandas as pd             # Таблицы и временные ряды (dataframe, series)
import seaborn as sns           # Еще больше красивой графики для визуализации данных
import sklearn                  # Алгоритмы машинного обучения


In [2]:
url = "./heart_2022_no_nans.csv"
data = pd.read_csv(url)
type(data)

pandas.core.frame.DataFrame

In [3]:
data.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [4]:
data.columns

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')

In [5]:
random_subset = np.random.choice(np.arange(data.shape[0]), size=10000, replace=False)

**Готовим данные**

In [6]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']
print(categorical_columns)
print(numerical_columns)

['State', 'Sex', 'GeneralHealth', 'LastCheckupTime', 'PhysicalActivities', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']
['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'HeightInMeters', 'WeightInKilograms', 'BMI']


In [7]:
for c in categorical_columns:
    print(c, data[c].unique())

State ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']
Sex ['Female' 'Male']
GeneralHealth ['Very good' 'Fair' 'Good' 'Excellent' 'Poor']
LastCheckupTime ['Within past year (anytime less than 12 months ago)'
 '5 or more years ago'
 'Within past 2 years (1 year but less than 2 years ago)'
 'Within past 5 years (2 years but less than 5 years ago)']
PhysicalActivities ['Yes' 'No']
RemovedTeeth ['None of t

In [8]:
data_describe = data.describe(include = [object])
for c in categorical_columns:
    data[c] = data[c].fillna(data_describe[c]['top'])

In [9]:
binary_columns    = [c for c in categorical_columns if data_describe[c]['unique'] == 2]
nonbinary_columns = [c for c in categorical_columns if data_describe[c]['unique'] > 2]
print(binary_columns, nonbinary_columns)

['Sex', 'PhysicalActivities', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'ChestScan', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'HighRiskLastYear'] ['State', 'GeneralHealth', 'LastCheckupTime', 'RemovedTeeth', 'HadDiabetes', 'SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory', 'AgeCategory', 'TetanusLast10Tdap', 'CovidPos']


In [10]:
for c in categorical_columns:
    print(c, data[c].unique())

State ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']
Sex ['Female' 'Male']
GeneralHealth ['Very good' 'Fair' 'Good' 'Excellent' 'Poor']
LastCheckupTime ['Within past year (anytime less than 12 months ago)'
 '5 or more years ago'
 'Within past 2 years (1 year but less than 2 years ago)'
 'Within past 5 years (2 years but less than 5 years ago)']
PhysicalActivities ['Yes' 'No']
RemovedTeeth ['None of t

In [11]:
data['Sex'].unique()

array(['Female', 'Male'], dtype=object)

In [12]:
data.loc[data['Sex'] == 'Female', 'Sex'] = 0
data.loc[data['Sex'] == 'Male', 'Sex'] = 1

In [13]:
data = data.replace(['No','Yes'],[0,1])

In [14]:
data['GeneralHealth'].unique()

array(['Very good', 'Fair', 'Good', 'Excellent', 'Poor'], dtype=object)

In [15]:
data.loc[data['GeneralHealth'] == 'Poor', 'GeneralHealth'] = 0
data.loc[data['GeneralHealth'] == 'Fair', 'GeneralHealth'] = 0.25
data.loc[data['GeneralHealth'] == 'Good', 'GeneralHealth'] = 0.5
data.loc[data['GeneralHealth'] == 'Very good', 'GeneralHealth'] = 0.75
data.loc[data['GeneralHealth'] == 'Excellent', 'GeneralHealth'] = 1

In [16]:
data['LastCheckupTime'].unique()

array(['Within past year (anytime less than 12 months ago)',
       '5 or more years ago',
       'Within past 2 years (1 year but less than 2 years ago)',
       'Within past 5 years (2 years but less than 5 years ago)'],
      dtype=object)

In [17]:
data.loc[data['LastCheckupTime'] == 'Within past year (anytime less than 12 months ago)', 'LastCheckupTime'] = 1
data.loc[data['LastCheckupTime'] == '5 or more years ago', 'LastCheckupTime'] = 0.5
data.loc[data['LastCheckupTime'] == 'Within past 2 years (1 year but less than 2 years ago)', 'LastCheckupTime'] = 0.25
data.loc[data['LastCheckupTime'] == 'Within past 5 years (2 years but less than 5 years ago)', 'LastCheckupTime'] = 0

In [18]:
data['RemovedTeeth'].unique()

array(['None of them', '6 or more, but not all', '1 to 5', 'All'],
      dtype=object)

In [19]:
data.loc[data['RemovedTeeth'] == 'None of them', 'RemovedTeeth'] = 0
data.loc[data['RemovedTeeth'] == '1 to 5', 'RemovedTeeth'] = 0.33
data.loc[data['RemovedTeeth'] == '6 or more, but not all', 'RemovedTeeth'] = 0.66
data.loc[data['RemovedTeeth'] == 'All', 'RemovedTeeth'] = 1

In [20]:
data['HadDiabetes'].unique()

array([0, 1, 'Yes, but only during pregnancy (female)',
       'No, pre-diabetes or borderline diabetes'], dtype=object)

In [21]:
data.loc[data['HadDiabetes'] == 'Yes, but only during pregnancy (female)', 'HadDiabetes'] = 0.5
data.loc[data['HadDiabetes'] == 'No, pre-diabetes or borderline diabetes', 'HadDiabetes'] = 0.5

In [22]:
data['SmokerStatus'].unique()

array(['Former smoker', 'Never smoked',
       'Current smoker - now smokes every day',
       'Current smoker - now smokes some days'], dtype=object)

In [23]:
data.loc[data['SmokerStatus'] == 'Former smoker', 'SmokerStatus'] = 0.5
data.loc[data['SmokerStatus'] == 'Never smoked', 'SmokerStatus'] = 0
data.loc[data['SmokerStatus'] == 'Current smoker - now smokes some days', 'SmokerStatus'] = 0.75
data.loc[data['SmokerStatus'] == 'Current smoker - now smokes every day', 'SmokerStatus'] = 1

In [24]:
data['ECigaretteUsage'].unique()

array(['Never used e-cigarettes in my entire life', 'Use them some days',
       'Not at all (right now)', 'Use them every day'], dtype=object)

In [25]:
data.loc[data['ECigaretteUsage'] == 'Never used e-cigarettes in my entire life', 'ECigaretteUsage'] = 0
data.loc[data['ECigaretteUsage'] == 'Not at all (right now)', 'ECigaretteUsage'] = 0.5
data.loc[data['ECigaretteUsage'] == 'Use them every day','ECigaretteUsage'] = 1
data.loc[data['ECigaretteUsage'] == 'Use them some days', 'ECigaretteUsage'] = 0.75

In [26]:
data['AgeCategory'].unique()

array(['Age 65 to 69', 'Age 70 to 74', 'Age 75 to 79', 'Age 80 or older',
       'Age 50 to 54', 'Age 40 to 44', 'Age 60 to 64', 'Age 55 to 59',
       'Age 45 to 49', 'Age 35 to 39', 'Age 25 to 29', 'Age 30 to 34',
       'Age 18 to 24'], dtype=object)

In [27]:
data.loc[data['AgeCategory'] == 'Age 65 to 69', 'AgeCategory'] = (65 + 69)/2
data.loc[data['AgeCategory'] == 'Age 70 to 74', 'AgeCategory'] = (70 + 74)/2
data.loc[data['AgeCategory'] == 'Age 75 to 79', 'AgeCategory'] = (75 + 79)/2
data.loc[data['AgeCategory'] == 'Age 80 or older', 'AgeCategory'] = (80 + 100)/2
data.loc[data['AgeCategory'] == 'Age 50 to 54', 'AgeCategory'] = (50 + 54)/2
data.loc[data['AgeCategory'] == 'Age 40 to 44', 'AgeCategory'] = (40 + 44)/2
data.loc[data['AgeCategory'] == 'Age 60 to 64', 'AgeCategory'] = (60 + 64)/2
data.loc[data['AgeCategory'] == 'Age 55 to 59', 'AgeCategory'] = (55 + 59)/2
data.loc[data['AgeCategory'] == 'Age 45 to 49', 'AgeCategory'] = (45 + 49)/2
data.loc[data['AgeCategory'] == 'Age 35 to 39', 'AgeCategory'] = (35 + 39)/2
data.loc[data['AgeCategory'] == 'Age 25 to 29', 'AgeCategory'] = (25 + 29)/2
data.loc[data['AgeCategory'] == 'Age 30 to 34', 'AgeCategory'] = (30 + 34)/2
data.loc[data['AgeCategory'] == 'Age 18 to 24', 'AgeCategory'] = (18 + 24)/2

In [28]:
data['TetanusLast10Tdap'].unique()

array(['Yes, received Tdap',
       'Yes, received tetanus shot but not sure what type',
       'No, did not receive any tetanus shot in the past 10 years',
       'Yes, received tetanus shot, but not Tdap'], dtype=object)

In [29]:
data.loc[data['TetanusLast10Tdap'] == 'Yes, received Tdap', 'TetanusLast10Tdap'] = 1
data.loc[data['TetanusLast10Tdap'] == 'Yes, received tetanus shot but not sure what type', 'TetanusLast10Tdap'] = 0.5
data.loc[data['TetanusLast10Tdap'] == 'No, did not receive any tetanus shot in the past 10 years','TetanusLast10Tdap'] = 0
data.loc[data['TetanusLast10Tdap'] == 'Yes, received tetanus shot, but not Tdap', 'TetanusLast10Tdap'] = 0.5

In [30]:
data['CovidPos'].unique()

array([0, 1,
       'Tested positive using home test without a health professional'],
      dtype=object)

In [31]:
data.loc[data['CovidPos'] == 'Tested positive using home test without a health professional', 'CovidPos'] = 0.75

In [32]:
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
print(categorical_columns)

['State', 'GeneralHealth', 'LastCheckupTime', 'RemovedTeeth', 'HadDiabetes', 'SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory', 'AgeCategory', 'TetanusLast10Tdap', 'CovidPos']


In [33]:
for c in categorical_columns:
    print(c, data[c].unique())

State ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']
GeneralHealth [0.75 0.25 0.5 1 0]
LastCheckupTime [1 0.5 0.25 0]
RemovedTeeth [0 0.66 0.33 1]
HadDiabetes [0 1 0.5]
SmokerStatus [0.5 0 1 0.75]
ECigaretteUsage [0 0.75 0.5 1]
RaceEthnicityCategory ['White only, Non-Hispanic' 'Black only, Non-Hispanic'
 'Other race only, Non-Hispanic' 'Multiracial, Non-Hispanic' 'Hispanic']
AgeCategory [67.0 72.0 77

In [34]:
sns.pairplot(dataWithHeart.iloc[random_subset], hue='HadHeartAttack', diag_kind='hist')

pass


NameError: name 'dataWithHeart' is not defined

In [None]:
sns.jointplot(x='PhysicalHealthDays', y='BMI',hue='HadHeartAttack', alpha=1, data=dataWithHeart.iloc[random_subset])
pass

In [None]:
data.describe()

In [None]:
data.corr()

Из визуализации и основных характеристик мало что можно сказать о том как от них зависит присутствие/отсутствие сердечных приступов у человека.