In [6]:
import pandas as pd

# Load the dataset from the CSV file.
file_path = 'heart_attack_prediction_dataset.csv'
data = pd.read_csv(file_path)

# Keep the first rows and the summary statistics for display.
data_head = data.head()
data_description = data.describe()

# DataFrame.info() prints its report and returns None, so it is called for
# its printed output only. data_info stays bound to None (the value it
# always held) in case a later cell still references the name.
data.info()
data_info = None

# Display only the objects that actually carry content.
data_head, data_description


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

(  Patient ID  Age     Sex  Cholesterol Blood Pressure  Heart Rate  Diabetes  \
 0    BMW7812   67    Male          208         158/88          72         0   
 1    CZE1114   21    Male          389         165/93          98         1   
 2    BNI9906   21  Female          324         174/99          72         1   
 3    JLN3497   84    Male          383        163/100          73         1   
 4    GFO8847   66    Male          318          91/88          93         1   
 
    Family History  Smoking  Obesity  ...  Sedentary Hours Per Day  Income  \
 0               0        1        0  ...                 6.615001  261404   
 1               1        1        1  ...                 4.963459  285768   
 2               0        0        0  ...                 9.463426  235282   
 3               1        1        0  ...                 7.648981  125640   
 4               1        1        1  ...                 1.514821  160555   
 
          BMI  Triglycerides  Physical Activity 

In [8]:
# Count fully duplicated rows; every occurrence after the first one is
# flagged (keep="first" is the pandas default, spelled out here).
duplicates_count = data.duplicated(keep="first").sum()


In [9]:

# Collect the distinct 'Blood Pressure' strings so that inconsistent
# formats can be spotted by inspection.
unique_blood_pressure = pd.unique(data["Blood Pressure"])



In [10]:
# Summarize the continuous numeric columns so their min/max can be compared
# against the range expected for each measurement.
outliers_check = {
    column: data[column].describe()
    for column in ("Cholesterol", "Triglycerides", "BMI", "Heart Rate")
}

# Show the duplicate count, the distinct blood-pressure strings and the
# per-column summaries together.
duplicates_count, unique_blood_pressure, outliers_check


(np.int64(0),
 array(['158/88', '165/93', '174/99', ..., '137/94', '94/76', '119/67'],
       dtype=object),
 {'Cholesterol': count    8763.000000
  mean      259.877211
  std        80.863276
  min       120.000000
  25%       192.000000
  50%       259.000000
  75%       330.000000
  max       400.000000
  Name: Cholesterol, dtype: float64,
  'Triglycerides': count    8763.000000
  mean      417.677051
  std       223.748137
  min        30.000000
  25%       225.500000
  50%       417.000000
  75%       612.000000
  max       800.000000
  Name: Triglycerides, dtype: float64,
  'BMI': count    8763.000000
  mean       28.891446
  std         6.319181
  min        18.002337
  25%        23.422985
  50%        28.768999
  75%        34.324594
  max        39.997211
  Name: BMI, dtype: float64,
  'Heart Rate': count    8763.000000
  mean       75.021682
  std        20.550948
  min        40.000000
  25%        57.000000
  50%        75.000000
  75%        93.000000
  max       110.0000

In [14]:
# Split the 'Blood Pressure' strings on '/' and store the two parts as
# float columns 'Systolic' and 'Diastolic' on `data` (in-place assignment).
data[["Systolic", "Diastolic"]] = (
    data["Blood Pressure"]
    .str.split("/", expand=True)
    .astype(float)
)



In [13]:
# Build the cleaned frame without the original 'Blood Pressure' column,
# whose content now lives in 'Systolic' / 'Diastolic'. `data` itself is
# left untouched because drop() returns a new frame.
data_cleaned = data.drop("Blood Pressure", axis="columns")



In [15]:
# Preview the first five rows of the cleaned frame to confirm the adjustment.
data_cleaned_head = data_cleaned.head(n=5)
data_cleaned_head


Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Systolic,Diastolic
0,BMW7812,67,Male,208,72,0,0,1,0,0,...,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0,158.0,88.0
1,CZE1114,21,Male,389,98,1,1,1,1,1,...,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0,165.0,93.0
2,BNI9906,21,Female,324,72,1,0,0,0,0,...,28.176571,587,4,4,France,Europe,Northern Hemisphere,0,174.0,99.0
3,JLN3497,84,Male,383,73,1,1,1,0,1,...,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0,163.0,100.0
4,GFO8847,66,Male,318,93,1,1,1,1,0,...,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0,91.0,88.0


In [17]:
# Describe the new 'Systolic' and 'Diastolic' columns so that possible
# outliers show up in their min/max values.
outliers_blood_pressure = {
    column: data_cleaned[column].describe()
    for column in ("Systolic", "Diastolic")
}



In [18]:
# Rows whose systolic reading is below 90 or above 180.
systolic_outliers = data_cleaned.loc[
    data_cleaned["Systolic"].lt(90) | data_cleaned["Systolic"].gt(180)
]
# Rows whose diastolic reading is below 60 or above 120.
diastolic_outliers = data_cleaned.loc[
    data_cleaned["Diastolic"].lt(60) | data_cleaned["Diastolic"].gt(120)
]

# Display the summaries alongside the number of flagged rows per column.
outliers_blood_pressure, len(systolic_outliers), len(diastolic_outliers)


({'Systolic': count    8763.000000
  mean      135.075659
  std        26.349976
  min        90.000000
  25%       112.000000
  50%       135.000000
  75%       158.000000
  max       180.000000
  Name: Systolic, dtype: float64,
  'Diastolic': count    8763.000000
  mean       85.156111
  std        14.676565
  min        60.000000
  25%        72.000000
  50%        85.000000
  75%        98.000000
  max       110.000000
  Name: Diastolic, dtype: float64},
 0,
 0)