In [39]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [40]:
df= pd.read_csv("/kaggle/input/heart-failure-prediction/heart.csv")

In [41]:
# Mostrar as primeiras linhas do dataset
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [42]:
# Verificar se há valores nulos
print(df.isnull().sum())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [43]:
# Verificar a distribuição das classes
df['HeartDisease'].value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

In [44]:
# Verificar o tipo das colunas
print(df.dtypes)

# Selecione colunas categóricas
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(categorical_cols)

# Aplicar One-Hot Encoding nas colunas categóricas
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Verificar as primeiras linhas do dataset processado
print(df_encoded.head())


Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object
Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')
   Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease  \
0   40        140          289          0    172      0.0             0   
1   49        160          180          0    156      1.0             1   
2   37        130          283          0     98      0.0             0   
3   48        138          214          0    108      1.5             1   
4   54        150          195          0    122      0.0             0   

   Sex_F  Sex_M  ChestPainType_ASY  ...  ChestPainType_NAP  ChestPainType_TA  \
0  False   True              False  ...            

In [45]:
# 2. Preparação dos dados
X = df_encoded.drop('HeartDisease', axis=1)# Todas as colunas exceto 'HeartDisease' são desnecessárias
y = df_encoded['HeartDisease']# A coluna 'HeartDisease' é a variável alvo

In [46]:
# Padronizar as features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [47]:
X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,False,True,False,True,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,True,False,False,False,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,False,True,False,True,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,True,False,True,False,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,False,True,False,False,True,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,False,True,False,False,False,True,False,True,False,True,False,False,True,False
914,68,144,193,1,141,3.4,False,True,True,False,False,False,False,True,False,True,False,False,True,False
915,57,130,131,0,115,1.2,False,True,True,False,False,False,False,True,False,False,True,False,True,False
916,57,130,236,0,174,0.0,True,False,False,True,False,False,True,False,False,True,False,False,True,False


In [48]:
# Dividindo os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
# Normalizando os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
# Convertendo para tensores PyTorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [51]:
# 3. Construir a rede neural
class HeartDiseaseNN(nn.Module):
    def __init__(self):
        super(HeartDiseaseNN, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x  

In [52]:
 #Instanciar o modelo
model =HeartDiseaseNN ()

In [53]:
# Definir a função de perda e o otimizador
criterion = nn.MSELoss()  # Usando MSE porque estamos tratando a qualidade como uma variável contínua
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [54]:
# 4. Treinamento do modelo
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.4991
Epoch [2/100], Loss: 0.4387
Epoch [3/100], Loss: 0.3856
Epoch [4/100], Loss: 0.3392
Epoch [5/100], Loss: 0.2988
Epoch [6/100], Loss: 0.2636
Epoch [7/100], Loss: 0.2329
Epoch [8/100], Loss: 0.2065
Epoch [9/100], Loss: 0.1844
Epoch [10/100], Loss: 0.1662
Epoch [11/100], Loss: 0.1518
Epoch [12/100], Loss: 0.1409
Epoch [13/100], Loss: 0.1331
Epoch [14/100], Loss: 0.1276
Epoch [15/100], Loss: 0.1235
Epoch [16/100], Loss: 0.1202
Epoch [17/100], Loss: 0.1172
Epoch [18/100], Loss: 0.1146
Epoch [19/100], Loss: 0.1124
Epoch [20/100], Loss: 0.1110
Epoch [21/100], Loss: 0.1103
Epoch [22/100], Loss: 0.1102
Epoch [23/100], Loss: 0.1103
Epoch [24/100], Loss: 0.1102
Epoch [25/100], Loss: 0.1097
Epoch [26/100], Loss: 0.1087
Epoch [27/100], Loss: 0.1072
Epoch [28/100], Loss: 0.1053
Epoch [29/100], Loss: 0.1033
Epoch [30/100], Loss: 0.1012
Epoch [31/100], Loss: 0.0993
Epoch [32/100], Loss: 0.0978
Epoch [33/100], Loss: 0.0966
Epoch [34/100], Loss: 0.0957
Epoch [35/100], Loss: 0

In [55]:
# 5. Avaliação do modelo
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    mse_loss = criterion(predictions, y_test_tensor)
    mae_loss = torch.mean(torch.abs(predictions - y_test_tensor))
    print(f'Mean Squared Error on Test Set: {mse_loss.item():.4f}')
    print(f'Mean Absolute Error on Test Set: {mae_loss.item():.4f}')

Mean Squared Error on Test Set: 0.0940
Mean Absolute Error on Test Set: 0.2222


**Implementar técnica de data augmentation SMOTE**

In [56]:
from imblearn.over_sampling import SMOTE

In [57]:
#Aplicar Smote para gerar novos exemplos sinteticos
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_final = np.vstack([X_train_smote, X])
y_final = np.hstack([y_train_smote, y])

In [58]:
# Converter os dados aumentados para tensores do pytorch
X_train_tensor_smote = torch.tensor(X_train_smote, dtype=torch.float32)
y_train_tensor_smote = torch.tensor(y_train_smote, dtype=torch.long)

In [59]:
from torch.utils.data import DataLoader, TensorDataset

# Criar um dataLoder com os daods aumentados
train_dataset_smote = TensorDataset(X_train_tensor_smote, y_train_tensor_smote)
train_loader_smote = DataLoader(dataset=train_dataset_smote, batch_size=32, shuffle=True)

In [61]:
# 4. Treinamento do modelo
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.0567
Epoch [2/100], Loss: 0.0564
Epoch [3/100], Loss: 0.0562
Epoch [4/100], Loss: 0.0559
Epoch [5/100], Loss: 0.0556
Epoch [6/100], Loss: 0.0553
Epoch [7/100], Loss: 0.0550
Epoch [8/100], Loss: 0.0547
Epoch [9/100], Loss: 0.0545
Epoch [10/100], Loss: 0.0542
Epoch [11/100], Loss: 0.0539
Epoch [12/100], Loss: 0.0536
Epoch [13/100], Loss: 0.0534
Epoch [14/100], Loss: 0.0531
Epoch [15/100], Loss: 0.0528
Epoch [16/100], Loss: 0.0525
Epoch [17/100], Loss: 0.0522
Epoch [18/100], Loss: 0.0519
Epoch [19/100], Loss: 0.0517
Epoch [20/100], Loss: 0.0514
Epoch [21/100], Loss: 0.0511
Epoch [22/100], Loss: 0.0508
Epoch [23/100], Loss: 0.0505
Epoch [24/100], Loss: 0.0503
Epoch [25/100], Loss: 0.0500
Epoch [26/100], Loss: 0.0497
Epoch [27/100], Loss: 0.0494
Epoch [28/100], Loss: 0.0491
Epoch [29/100], Loss: 0.0489
Epoch [30/100], Loss: 0.0486
Epoch [31/100], Loss: 0.0483
Epoch [32/100], Loss: 0.0480
Epoch [33/100], Loss: 0.0477
Epoch [34/100], Loss: 0.0475
Epoch [35/100], Loss: 0

In [62]:
# 5. Avaliação do modelo
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    mse_loss = criterion(predictions, y_test_tensor)
    mae_loss = torch.mean(torch.abs(predictions - y_test_tensor))
    print(f'Mean Squared Error on Test Set: {mse_loss.item():.4f}')
    print(f'Mean Absolute Error on Test Set: {mae_loss.item():.4f}')

Mean Squared Error on Test Set: 0.1136
Mean Absolute Error on Test Set: 0.2241


**Inserir o Mix-Up**

In [63]:
# Função MixUp para gerar mais instâncias
def mixup(X, y, alpha=0.2, n_samples=2):
    X_mix = []
    y_mix = []
    
    for _ in range(n_samples):
        indices = np.arange(len(X))
        np.random.shuffle(indices)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        
        lam = np.random.beta(alpha, alpha)
        
        X_mixed = lam * X + (1 - lam) * X_shuffled
        y_mixed = lam * y + (1 - lam) * y_shuffled
        
        X_mix.append(X_mixed)
        y_mix.append(y_mixed)
    
    return np.vstack(X_mix), np.hstack(y_mix)

In [64]:
# Aplicar MixUp dentro de cada grupo de qualidade
X_mix = []
y_mix = []

for quality in np.unique(y_final):
    X_group = X_final[y_final == quality]
    y_group = y_final[y_final == quality]
    
    X_mixed, y_mixed = mixup(X_group, y_group)
    
    X_mix.append(X_mixed)
    y_mix.append(y_mixed)

# Combinar as amostras mixadas de volta em um único conjunto de dados
X_combined = np.vstack(X_mix)
y_combined = np.hstack(y_mix)

In [65]:
# Aplicar MixUp dentro de cada grupo de qualidade
X_mix = []
y_mix = []

for quality in np.unique(y_final):
    X_group = X_final[y_final == quality]
    y_group = y_final[y_final == quality]
    
    X_mixed, y_mixed = mixup(X_group, y_group)
    
    X_mix.append(X_mixed)
    y_mix.append(y_mixed)

# Combinar as amostras mixadas de volta em um único conjunto de dados
X_combined = np.vstack(X_mix)
y_combined = np.hstack(y_mix)

In [66]:
X_combined

array([[-1.2450699541183463, -0.708989170107751, 0.3728024226715467, ...,
        3.780419547235286, -1.0332500345041449, -0.8479144462153526],
       [-1.886230578919349, -0.16628683324634222, 0.08614767388285095,
        ..., -0.26452002850644324, -1.0332500345041449,
        1.179355535378268],
       [-1.7792019783748512, -0.16591300887902563, 0.1054849555202002,
        ..., -0.26451920931281064, -1.03324373772015, 1.1793518830251717],
       ...,
       [67.99141626415458, 143.9879827698164, 193.0171674716908, ...,
        0.0, 1.0, 0.0],
       [57.00858373584542, 130.0103004830145, 131.11931392825127, ...,
        0.0, 0.9991416264154585, 0.0008583735845415497],
       [57.00085837358454, 130.00515024150727, 235.938197101913, ...,
        0.0, 1.0, 0.0]], dtype=object)

In [67]:
# Dividindo os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Normalizando os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convertendo para tensores PyTorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [68]:
# 4. Treinamento do modelo
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.1605
Epoch [2/100], Loss: 0.1418
Epoch [3/100], Loss: 0.1294
Epoch [4/100], Loss: 0.1245
Epoch [5/100], Loss: 0.1231
Epoch [6/100], Loss: 0.1203
Epoch [7/100], Loss: 0.1137
Epoch [8/100], Loss: 0.1062
Epoch [9/100], Loss: 0.1024
Epoch [10/100], Loss: 0.1025
Epoch [11/100], Loss: 0.1025
Epoch [12/100], Loss: 0.1004
Epoch [13/100], Loss: 0.0977
Epoch [14/100], Loss: 0.0965
Epoch [15/100], Loss: 0.0961
Epoch [16/100], Loss: 0.0952
Epoch [17/100], Loss: 0.0938
Epoch [18/100], Loss: 0.0926
Epoch [19/100], Loss: 0.0917
Epoch [20/100], Loss: 0.0912
Epoch [21/100], Loss: 0.0908
Epoch [22/100], Loss: 0.0902
Epoch [23/100], Loss: 0.0892
Epoch [24/100], Loss: 0.0879
Epoch [25/100], Loss: 0.0868
Epoch [26/100], Loss: 0.0863
Epoch [27/100], Loss: 0.0861
Epoch [28/100], Loss: 0.0858
Epoch [29/100], Loss: 0.0851
Epoch [30/100], Loss: 0.0844
Epoch [31/100], Loss: 0.0838
Epoch [32/100], Loss: 0.0834
Epoch [33/100], Loss: 0.0831
Epoch [34/100], Loss: 0.0828
Epoch [35/100], Loss: 0

In [69]:
# 5. Avaliação do modelo
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    mse_loss = criterion(predictions, y_test_tensor)
    mae_loss = torch.mean(torch.abs(predictions - y_test_tensor))
    print(f'Mean Squared Error on Test Set: {mse_loss.item():.4f}')
    print(f'Mean Absolute Error on Test Set: {mae_loss.item():.4f}')

Mean Squared Error on Test Set: 0.0708
Mean Absolute Error on Test Set: 0.1683


**Inserir ruído gaussiano**


In [70]:
# Aplicar ruído gaussiano segmentado pela classe quality
def add_gaussian_noise(X, mean=0, stddev=0.1):
    noise = np.random.normal(mean, stddev, X.shape)
    return X + noise

In [71]:
X_noisy = []
y_noisy = []

for quality in np.unique(y_combined):
    X_group = X_combined[y_combined == quality]
    y_group = y_combined[y_combined == quality]
    
    X_group_noisy = add_gaussian_noise(X_group)
    
    X_noisy.append(X_group_noisy)
    y_noisy.append(y_group)

# Combinar as amostras com ruído de volta em um único conjunto de dados
X_final_noisy = np.vstack([X_combined] + X_noisy)
y_final_noisy = np.hstack([y_combined] + y_noisy)

In [72]:
X_final_noisy

array([[-1.2450699541183463, -0.708989170107751, 0.3728024226715467, ...,
        3.780419547235286, -1.0332500345041449, -0.8479144462153526],
       [-1.886230578919349, -0.16628683324634222, 0.08614767388285095,
        ..., -0.26452002850644324, -1.0332500345041449,
        1.179355535378268],
       [-1.7792019783748512, -0.16591300887902563, 0.1054849555202002,
        ..., -0.26451920931281064, -1.03324373772015, 1.1793518830251717],
       ...,
       [67.97091303619074, 144.16128770527226, 193.0665426956353, ...,
        0.018131344364643018, 0.9852571375421757, 0.06067340180319114],
       [57.11848122625682, 129.90005714453403, 131.07102913290652, ...,
        -0.11978621130969908, 0.8977225403856794, 0.03680036018962461],
       [56.9394326109915, 130.1240576858959, 235.90392155366504, ...,
        0.03292824298733783, 1.0172538534259514, -0.09967178282552272]],
      dtype=object)

In [73]:
# Dividindo os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X_final_noisy, y_final_noisy, test_size=0.2, random_state=42)

# Normalizando os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convertendo para tensores PyTorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [75]:
# 4. Treinamento do modelo
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 0.0645
Epoch [2/100], Loss: 0.0644
Epoch [3/100], Loss: 0.0642
Epoch [4/100], Loss: 0.0641
Epoch [5/100], Loss: 0.0640
Epoch [6/100], Loss: 0.0639
Epoch [7/100], Loss: 0.0638
Epoch [8/100], Loss: 0.0636
Epoch [9/100], Loss: 0.0635
Epoch [10/100], Loss: 0.0634
Epoch [11/100], Loss: 0.0633
Epoch [12/100], Loss: 0.0632
Epoch [13/100], Loss: 0.0631
Epoch [14/100], Loss: 0.0630
Epoch [15/100], Loss: 0.0629
Epoch [16/100], Loss: 0.0627
Epoch [17/100], Loss: 0.0626
Epoch [18/100], Loss: 0.0625
Epoch [19/100], Loss: 0.0624
Epoch [20/100], Loss: 0.0623
Epoch [21/100], Loss: 0.0622
Epoch [22/100], Loss: 0.0621
Epoch [23/100], Loss: 0.0620
Epoch [24/100], Loss: 0.0619
Epoch [25/100], Loss: 0.0618
Epoch [26/100], Loss: 0.0617
Epoch [27/100], Loss: 0.0616
Epoch [28/100], Loss: 0.0615
Epoch [29/100], Loss: 0.0614
Epoch [30/100], Loss: 0.0613
Epoch [31/100], Loss: 0.0612
Epoch [32/100], Loss: 0.0611
Epoch [33/100], Loss: 0.0610
Epoch [34/100], Loss: 0.0609
Epoch [35/100], Loss: 0

In [76]:
# 5. Avaliação do modelo
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    mse_loss = criterion(predictions, y_test_tensor)
    mae_loss = torch.mean(torch.abs(predictions - y_test_tensor))
    print(f'Mean Squared Error on Test Set: {mse_loss.item():.4f}')
    print(f'Mean Absolute Error on Test Set: {mae_loss.item():.4f}')

Mean Squared Error on Test Set: 0.0595
Mean Absolute Error on Test Set: 0.1528
