In [73]:
import wfdb
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
from tqdm import tqdm
import math
from wfdb import processing
from numpy.linalg import norm
import pandas as pd
import plotly.express as px
from datetime import datetime

In [74]:
## Load the patient record table from its CSV file
record_info_csv = "../../Record_info.csv"
df = pd.read_csv(record_info_csv)

In [75]:
## Preview the first five rows of the table
df.head()

Unnamed: 0,age,sex,ECG date,Reason for admission,Acute infarction (localization),Former infarction (localization),Additional diagnoses,Smoker,Number of coronary vessels involved,Infarction date (acute),...,Infarction date,Admission date,Medication pre admission,Start lysis therapy (hh.mm),Lytic agent,Dosage (lytic agent),Additional medication,In hospital medication,Medication after discharge,idP
0,61.0,female,05/06/1997,Myocardial infarction,inferior,no,unknown,unknown,unknown,24-May-97,...,24-May-97,,,,,,,,,patient294
1,52.0,male,07/01/1993,Cardiomyopathy,no,no,"Recurrent pulmonary oedema, Hypertrophic obstr...",yes,0,,...,,,,,,,,,,patient137
2,63.0,male,22/05/1991,Myocardial infarction,anterior,no,"Arterial Hypertension, Hyperlipidemia, Hyperur...",no,unknown,10-May-91,...,10-May-91,,,,,,,,,patient128
3,63.0,male,18/10/1990,Myocardial infarction,infero-postero-lateral,no,no,yes,1,02-Oct-90,...,02-Oct-90,02-Oct-90,-,,Gamma-TPA,30 mg,Furosemide,ASA Isosorbit-Mononitrate,ASA Isosorbit-Mononitrate,patient003
4,61.0,male,06/03/1997,Myocarditis,no,no,no,unknown,unknown,,...,,,,,,,,,,patient272


## Data cleaning

In [76]:
## List all the column names of the table
df.columns

Index(['age', 'sex', 'ECG date', 'Reason for admission',
       'Acute infarction (localization)', 'Former infarction (localization)',
       'Additional diagnoses', 'Smoker', 'Number of coronary vessels involved',
       'Infarction date (acute)', 'Previous infarction (1) date',
       'Previous infarction (2) date', 'Catheterization date',
       'Ventriculography', 'Chest X-ray',
       'Peripheral blood Pressure (syst/diast)',
       'Pulmonary artery pressure (at rest) (syst/diast)',
       'Pulmonary artery pressure (at rest) (mean)',
       'Pulmonary capillary wedge pressure (at rest)',
       'Cardiac output (at rest)', 'Cardiac index (at rest)',
       'Stroke volume index (at rest)',
       'Pulmonary artery pressure (laod) (syst/diast)',
       'Pulmonary artery pressure (laod) (mean)',
       'Pulmonary capillary wedge pressure (load)', 'Cardiac output (load)',
       'Cardiac index (load)', 'Stroke volume index (load)',
       'Aorta (at rest) (syst/diast)', 'Aorta (at re

### Age

In [77]:
## Extract the age column as a plain array
ages = df['age'].values
## Box plot of the ages with plotly
fig = px.box(ages)
fig.show()


In [78]:
## Summary statistics of the age column
df['age'].describe()

count    275.000000
mean      57.247273
std       14.709497
min       17.000000
25%       48.000000
50%       59.000000
75%       68.000000
max       87.000000
Name: age, dtype: float64

In [79]:
## Count the missing values in the age column
df['age'].isnull().sum() 

15

In [80]:
## There are 15 missing values in age; they are replaced with the mean age.
## The result is assigned back to the column instead of calling
## fillna(inplace=True) on the selected column: the in-place form acts on an
## intermediate object and does not update `df` under pandas copy-on-write.
df['age'] = df['age'].fillna(df['age'].mean())


In [81]:
## Cast age to int (astype(int) truncates, so the imputed fractional mean loses its decimals)
df['age'] = df['age'].astype(int)

In [82]:
## Preview the table after cleaning the age column
df.head()

Unnamed: 0,age,sex,ECG date,Reason for admission,Acute infarction (localization),Former infarction (localization),Additional diagnoses,Smoker,Number of coronary vessels involved,Infarction date (acute),...,Infarction date,Admission date,Medication pre admission,Start lysis therapy (hh.mm),Lytic agent,Dosage (lytic agent),Additional medication,In hospital medication,Medication after discharge,idP
0,61,female,05/06/1997,Myocardial infarction,inferior,no,unknown,unknown,unknown,24-May-97,...,24-May-97,,,,,,,,,patient294
1,52,male,07/01/1993,Cardiomyopathy,no,no,"Recurrent pulmonary oedema, Hypertrophic obstr...",yes,0,,...,,,,,,,,,,patient137
2,63,male,22/05/1991,Myocardial infarction,anterior,no,"Arterial Hypertension, Hyperlipidemia, Hyperur...",no,unknown,10-May-91,...,10-May-91,,,,,,,,,patient128
3,63,male,18/10/1990,Myocardial infarction,infero-postero-lateral,no,no,yes,1,02-Oct-90,...,02-Oct-90,02-Oct-90,-,,Gamma-TPA,30 mg,Furosemide,ASA Isosorbit-Mononitrate,ASA Isosorbit-Mononitrate,patient003
4,61,male,06/03/1997,Myocarditis,no,no,no,unknown,unknown,,...,,,,,,,,,,patient272


### Sex

In [83]:
## Distinct (non-missing) values present in the sex column
df['sex'].value_counts().keys()

Index(['male', 'female'], dtype='object')

In [84]:
## Bar plot of the number of patients per sex with plotly.
## The counts are computed once and passed as plain lists; axis labels are set
## explicitly so the figure is readable on its own.
sex_counts = df['sex'].value_counts()
fig = px.bar(
    x=sex_counts.index.tolist(),
    y=sex_counts.tolist(),
    labels={'x': 'sex', 'y': 'count'},
)
fig.show()

In [85]:
## Count the missing values in the sex column
df['sex'].isnull().sum()

9

In [86]:
## Number of patients per sex
df['sex'].value_counts()

male      200
female     81
Name: sex, dtype: int64

In [87]:
## The database description states that data was collected from 290 patients: 209 men and 81 women.
## As we can see, 9 patients have no sex information; by the counts, these 9 patients are male.

In [88]:
## Fill the 9 missing sex values with 'male': the dataset description reports
## 209 men and 81 women, and only 200 men are labelled in the table.
## The result is assigned back instead of using fillna(inplace=True) on the
## selected column, which does not update `df` under pandas copy-on-write.
df['sex'] = df['sex'].fillna('male')
df['sex'].value_counts()

male      209
female     81
Name: sex, dtype: int64

### Reason for admission

In [89]:
## Count the missing values in 'Reason for admission'
df['Reason for admission'].isnull().sum()

22

In [90]:
## Number of patients per reason for admission
df['Reason for admission'].value_counts()

Myocardial infarction     148
Healthy control            52
Cardiomyopathy             15
Bundle branch block        15
Dysrhythmia                14
Hypertrophy                 7
Valvular heart disease      6
Myocarditis                 4
Stable angina               2
Palpitation                 1
Heart failure (NYHA 4)      1
Unstable angina             1
Heart failure (NYHA 2)      1
Heart failure (NYHA 3)      1
Name: Reason for admission, dtype: int64

In [91]:
## Bar plot of the number of patients per reason for admission with plotly.
## The counts are computed once and passed as plain lists; axis labels are set
## explicitly so the figure is readable on its own.
reason_counts = df['Reason for admission'].value_counts()
fig = px.bar(
    x=reason_counts.index.tolist(),
    y=reason_counts.tolist(),
    labels={'x': 'Reason for admission', 'y': 'count'},
)
fig.show()

### Acute infarction (localization)

In [92]:
## Number of patients per acute infarction localization
df['Acute infarction (localization)'].value_counts()

no                        140
inferior                   30
antero-septal              27
infero-lateral             22
anterior                   17
antero-lateral             16
infero-postero-lateral      7
postero-lateral             2
antero-septo-lateral        1
infero-poster-lateral       1
posterior                   1
unknown                     1
lateral                     1
infero-posterior            1
infero-latera               1
Name: Acute infarction (localization), dtype: int64

In [93]:
## Replace misspelled localization labels with their correct spelling.
## The result is assigned back instead of using replace(inplace=True) on the
## selected column, which does not update `df` under pandas copy-on-write.
localization_fixes = {
    'infero-latera': 'infero-lateral',
    'infero-poster-lateral': 'infero-postero-lateral',
}
df['Acute infarction (localization)'] = df['Acute infarction (localization)'].replace(localization_fixes)

In [94]:
## Re-check the counts after fixing the misspelled labels
df['Acute infarction (localization)'].value_counts()

no                        140
inferior                   30
antero-septal              27
infero-lateral             23
anterior                   17
antero-lateral             16
infero-postero-lateral      8
postero-lateral             2
antero-septo-lateral        1
posterior                   1
unknown                     1
lateral                     1
infero-posterior            1
Name: Acute infarction (localization), dtype: int64

### Former infarction (localization)

In [95]:
## Number of patients per former infarction localization
df['Former infarction (localization)'].value_counts()

no                                    209
anterior                               14
inferior                               13
unknown                                13
antero-lateral                          3
infero-posterior                        3
anterior (1), inferior (2)              2
infero-lateral                          2
antero-septal ?                         1
inferior ?                              1
infero-posterior (1), inferior (2)      1
inferior (1+2)                          1
anterior (1), anterior (2)              1
infero-postero-lateral                  1
anterior ?                              1
postero-lateral                         1
antero-septal                           1
Name: Former infarction (localization), dtype: int64

### Smoker

In [96]:
## Number of patients per smoker status
df['Smoker'].value_counts()

no         111
unknown     84
yes         73
Name: Smoker, dtype: int64

In [116]:
## Count the missing values in the Smoker column
df['Smoker'].isnull().sum()

22

In [117]:
## Label the missing smoker values with the existing 'unknown' category.
## The result is assigned back instead of using fillna(inplace=True) on the
## selected column, which does not update `df` under pandas copy-on-write.
df['Smoker'] = df['Smoker'].fillna('unknown')

In [118]:
## Bar plot of the number of patients per smoker status with plotly.
## The counts are computed once and passed as plain lists; axis labels are set
## explicitly so the figure is readable on its own.
smoker_counts = df['Smoker'].value_counts()
fig = px.bar(
    x=smoker_counts.index.tolist(),
    y=smoker_counts.tolist(),
    labels={'x': 'Smoker', 'y': 'count'},
)
fig.show()

### ECG date

In [97]:
## Number of records per ECG date
df['ECG date'].value_counts()

27/02/1997    6
23/03/1992    5
07/01/1993    5
14/08/1991    4
18/09/1991    4
             ..
08/12/1992    1
09/05/1996    1
20/09/1991    1
05/12/1990    1
14/10/1994    1
Name: ECG date, Length: 170, dtype: int64

In [98]:
## Count the missing values in the ECG date column
df['ECG date'].isnull().sum()

7

In [99]:
## Take one sample value (index label 50) to inspect the date format
x = df['ECG date'][50]
x

'03/12/1990'

In [100]:
## Parse the sample string (day/month/year) into a date object
k = datetime.strptime(x, "%d/%m/%Y").date()
## The date is now formatted as year-month-day and its parts (here the year) can be read
k.year


1990

In [101]:
## Apply the string-to-date conversion to every value; missing values become None
def _to_date_or_none(raw):
    """Return the date parsed from a "%d/%m/%Y" string, or None when the value is missing."""
    if pd.isnull(raw):
        return None
    return datetime.strptime(raw, "%d/%m/%Y").date()

df['ECG date'] = df['ECG date'].apply(_to_date_or_none)

In [102]:
## Create a new column holding the year of each ECG date (None when the date is missing)
def _year_or_none(day):
    """Return the year of a date as an int, or None when the value is missing."""
    if pd.isnull(day):
        return None
    return int(day.year)

df['ECG_year'] = df['ECG date'].apply(_year_or_none)

In [103]:
## Box plot of the ECG year (ECG_year column)
fig = px.box(df, y='ECG_year')
fig.show()

## We can see that most of the records were made between 1991 and 1996, with records ranging from 1990 to 1997.

## Useless Columns

In [142]:
## Table dimensions as (rows, columns)
df.shape

(290, 46)

In [141]:
## Count the missing (NaN) values in each column
df.isnull().sum()

age                                                   0
sex                                                   0
ECG date                                              7
Reason for admission                                 22
Acute infarction (localization)                      22
Former infarction (localization)                     22
Additional diagnoses                                 23
Smoker                                                0
Number of coronary vessels involved                  22
Infarction date (acute)                             163
Previous infarction (1) date                        245
Previous infarction (2) date                        285
Catheterization date                                177
Ventriculography                                    175
Chest X-ray                                         188
Peripheral blood Pressure (syst/diast)              196
Pulmonary artery pressure (at rest) (syst/diast)    224
Pulmonary artery pressure (at rest) (mean)      

In [147]:
## Keep only the columns that have at least 200 non-null values.
## NOTE: `thresh` is the required number of NON-missing entries, so with 290 rows
## this drops every column with more than 90 NaN values (not "more than 200 NaN").
df = df.dropna(axis=1, thresh=200)
df.head()

Unnamed: 0,age,sex,ECG date,Reason for admission,Acute infarction (localization),Former infarction (localization),Additional diagnoses,Smoker,Number of coronary vessels involved,idP,ECG_year
0,61,female,1997-06-05,Myocardial infarction,inferior,no,unknown,unknown,unknown,patient294,1997.0
1,52,male,1993-01-07,Cardiomyopathy,no,no,"Recurrent pulmonary oedema, Hypertrophic obstr...",yes,0,patient137,1993.0
2,63,male,1991-05-22,Myocardial infarction,anterior,no,"Arterial Hypertension, Hyperlipidemia, Hyperur...",no,unknown,patient128,1991.0
3,63,male,1990-10-18,Myocardial infarction,infero-postero-lateral,no,no,yes,1,patient003,1990.0
4,61,male,1997-03-06,Myocarditis,no,no,no,unknown,unknown,patient272,1997.0


In [None]:
## Save the cleaned table to a new CSV file, without writing the row index
df.to_csv('Record_info_clean.csv', index=False)

## Graphs

In [121]:
## Preview the table used for the graphs below
df.head()

Unnamed: 0,age,sex,ECG date,Reason for admission,Acute infarction (localization),Former infarction (localization),Additional diagnoses,Smoker,Number of coronary vessels involved,Infarction date (acute),...,Admission date,Medication pre admission,Start lysis therapy (hh.mm),Lytic agent,Dosage (lytic agent),Additional medication,In hospital medication,Medication after discharge,idP,ECG_year
0,61,female,1997-06-05,Myocardial infarction,inferior,no,unknown,unknown,unknown,24-May-97,...,,,,,,,,,patient294,1997.0
1,52,male,1993-01-07,Cardiomyopathy,no,no,"Recurrent pulmonary oedema, Hypertrophic obstr...",yes,0,,...,,,,,,,,,patient137,1993.0
2,63,male,1991-05-22,Myocardial infarction,anterior,no,"Arterial Hypertension, Hyperlipidemia, Hyperur...",no,unknown,10-May-91,...,,,,,,,,,patient128,1991.0
3,63,male,1990-10-18,Myocardial infarction,infero-postero-lateral,no,no,yes,1,02-Oct-90,...,02-Oct-90,-,,Gamma-TPA,30 mg,Furosemide,ASA Isosorbit-Mononitrate,ASA Isosorbit-Mononitrate,patient003,1990.0
4,61,male,1997-03-06,Myocarditis,no,no,no,unknown,unknown,,...,,,,,,,,,patient272,1997.0


In [113]:
## In this chart we can see that men are more represented than women in most of the cardiac pathologies

fig = px.histogram(df, x="Reason for admission",
             color='sex', barmode='group',
             height=400)
fig.show()

In [140]:
## Box plot of the age distribution for each sex
fig = px.box(df, x='sex', y='age')
fig.show()

In [115]:
## Box plot of the age distribution for each reason for admission
fig = px.box(df, x='Reason for admission', y='age')
fig.show()

In [137]:
## Select the rows admitted for myocardial infarction whose former infarction localization is "no"
is_infarction = df["Reason for admission"].eq("Myocardial infarction")
no_former_infarction = df["Former infarction (localization)"].eq("no")
df_infarction = df[is_infarction & no_former_infarction]

In [134]:
## Pie chart of the acute infarction localization for the rows selected in df_infarction
fig = px.pie(df_infarction, names='Acute infarction (localization)', title='Acute infarction (localization)')
fig.show()