# Dataset importing and refining

In [1]:
# Basic imports for data processing and model training
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split


In [5]:
DATASET_PATH = '/media/jeyanth-s/DevDrive/AI_Workspace/projects/Heart Disease Project Repository/Heart-Disease-Prediction---Cognitives/Datasets/dataset3.csv'
# Load dataset (example CSV)
data = pd.read_csv(DATASET_PATH)

# Quick overview
print(data.head())
print(data.info())


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   69    1   0       160   234    1        2      131      0      0.1      1   
1   69    0   0       140   239    0        0      151      0      1.8      0   
2   66    0   0       150   226    0        0      114      0      2.6      2   
3   65    1   0       138   282    1        2      174      0      1.4      1   
4   64    1   0       110   211    0        2      144      1      1.8      1   

   ca  thal  condition  
0   1     0          0  
1   2     0          0  
2   0     0          0  
3   1     0          1  
4   0     0          0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       29

In [6]:
# Separate target column
target = data['condition'].values
data = data.drop(columns=['condition'])

# Define numerical and categorical columns based on your dataset
num_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Extract numerical and categorical data
data_num = data[num_features].copy()
data_cat = data[cat_features].copy()

# Normalize numerical features (MinMaxScaler to [0,1])
scaler = MinMaxScaler()
data_num_scaled = scaler.fit_transform(data_num)

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False)
data_cat_encoded = encoder.fit_transform(data_cat)

# Combine numerical and categorical data
data_processed = np.hstack((data_num_scaled, data_cat_encoded))

print("Processed data shape:", data_processed.shape)


Processed data shape: (297, 28)


In [7]:
def introduce_missingness(data, missing_rate=0.1):
    data_missing = data.copy()
    n_samples, n_features = data.shape
    
    # Calculate total number of values to mask
    n_missing = int(np.floor(missing_rate * n_samples * n_features))
    
    # Randomly select indices to mask
    missing_indices = (
        np.random.randint(0, n_samples, n_missing),
        np.random.randint(0, n_features, n_missing)
    )
    
    data_missing[missing_indices] = np.nan
    return data_missing


In [8]:
# Introduce 10% missing values
data_with_missing = introduce_missingness(data_processed, missing_rate=0.1)

# Check how many missing values were introduced
print("Number of missing values introduced:", np.isnan(data_with_missing).sum())

# Preview some rows with missing values
print(data_with_missing[:5])


Number of missing values introduced: 794
[[0.83333333 0.62264151 0.24657534 0.45801527 0.01612903 0.
  1.         1.         0.         0.         0.         0.
  1.         0.         0.         1.         1.                nan
  0.         1.         0.         0.         1.         0.
  0.         1.         0.         0.        ]
 [0.83333333 0.43396226 0.25799087 0.61068702 0.29032258 1.
  0.         1.                nan 0.         0.         1.
  0.         1.         0.                nan 1.         0.
  1.         0.         0.         0.         0.         1.
  0.         1.         0.                nan]
 [0.77083333 0.52830189 0.2283105  0.32824427 0.41935484 1.
  0.         1.         0.         0.         0.         1.
  0.         1.         0.         0.         1.         0.
  0.         0.         1.         1.         0.         0.
  0.         1.         0.         0.        ]
 [0.75       0.41509434 0.35616438 0.78625954 0.22580645 0.
  1.         1.         0.    

In [9]:
def knn_impute(data_missing, k=5):
    imputer = KNNImputer(n_neighbors=k)
    data_imputed = imputer.fit_transform(data_missing)
    return data_imputed

# Example: impute with k=5
data_imputed = knn_impute(data_with_missing, k=5)

# Check that no missing values remain
print("Missing values after imputation:", np.isnan(data_imputed).sum())

# Preview imputed data
print(data_imputed[:5])


Missing values after imputation: 0
[[0.83333333 0.62264151 0.24657534 0.45801527 0.01612903 0.
  1.         1.         0.         0.         0.         0.
  1.         0.         0.         1.         1.         0.2
  0.         1.         0.         0.         1.         0.
  0.         1.         0.         0.        ]
 [0.83333333 0.43396226 0.25799087 0.61068702 0.29032258 1.
  0.         1.         0.4        0.         0.         1.
  0.         1.         0.         0.         1.         0.
  1.         0.         0.         0.         0.         1.
  0.         1.         0.         0.        ]
 [0.77083333 0.52830189 0.2283105  0.32824427 0.41935484 1.
  0.         1.         0.         0.         0.         1.
  0.         1.         0.         0.         1.         0.
  0.         0.         1.         1.         0.         0.
  0.         1.         0.         0.        ]
 [0.75       0.41509434 0.35616438 0.78625954 0.22580645 0.
  1.         1.         0.         0.      

In [10]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class INAAutoencoder(nn.Module):
    def __init__(self, input_dim):
        super(INAAutoencoder, self).__init__()
        hidden_dim = input_dim // 2  # undercomplete architecture
        
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()  # To constrain outputs between 0 and 1 (since input is normalized)
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize model
input_dim = data_processed.shape[1]
model = INAAutoencoder(input_dim).to(device)

print(model)


Using device: cpu
INAAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=28, out_features=14, bias=True)
    (1): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=14, out_features=28, bias=True)
    (1): Sigmoid()
  )
)


  return torch._C._cuda_getDeviceCount() > 0


In [4]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))


False


RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.