In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression as mutual_info
from sklearn.feature_selection import SequentialFeatureSelector

import random
import scipy.io

import warnings
warnings.filterwarnings('ignore')


### Load the data

In [8]:
X_train = pd.read_csv('data/data_labeled/X_train.csv')
X_test = pd.read_csv('data/data_labeled/X_test.csv')

# The y CSV files have no header row, so header=None is passed and the single
# column is named explicitly.
y_train = pd.read_csv('data/data_labeled/y_train.csv', header=None, names=['heart_failure_risk'])
y_test = pd.read_csv('data/data_labeled/y_test.csv', header=None, names=['heart_failure_risk'])

# Explore the data. DataFrame.info() prints its own report and returns None,
# so it is called directly instead of being wrapped in print().
X_train.info()
y_train.info()

# Drop incomplete rows, then keep only the row labels that survived in BOTH
# frames. Dropping NaNs from X and y independently would otherwise leave the
# features and the targets with different rows.
X_train = X_train.dropna()
y_train = y_train.dropna()
complete_rows = X_train.index.intersection(y_train.index)
X_train = X_train.loc[complete_rows]
y_train = y_train.loc[complete_rows]

X_train = X_train.drop(columns=['img_filename'])
X_test = X_test.drop(columns=['img_filename'])

# One-hot encode the profession column.
X_train = pd.get_dummies(X_train, columns=['profession'])
X_test = pd.get_dummies(X_test, columns=['profession'])

# The two frames are encoded separately, so force the test frame onto the
# training columns: categories missing from the test set become all-zero
# columns, categories unseen during training are dropped, and the column
# order matches the one every model is fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Strip whitespace from the three ordinal columns and map their labels to an
# integer scale (0 = 'Very low' ... 4 = 'Very high').
ordinal_levels = {'Very high': 4, 'High': 3, 'Moderate': 2, 'Low': 1, 'Very low': 0}
ordinal_features = ['sarsaparilla', 'smurfberry liquor', 'smurfin donuts']
for frame in (X_train, X_test):
    for feature in ordinal_features:
        frame[feature] = frame[feature].str.strip().map(ordinal_levels)

# Standardize only numerical features (not ordinal or one-hot encoded)
numerical_features = ['age', 'blood pressure', 'calcium', 'cholesterol', 'hemoglobin', 
                      'height', 'potassium', 'vitamin D', 'weight']

# Create a copy to avoid modifying original data
X_train_standardized = X_train.copy()
X_test_standardized = X_test.copy()

# Fit the scaler on the training set only, then apply the same transform to
# both sets so no test-set statistics leak into preprocessing.
scX = StandardScaler()
scX.fit(X_train[numerical_features])
X_train_standardized[numerical_features] = scX.transform(X_train[numerical_features])
X_test_standardized[numerical_features] = scX.transform(X_test[numerical_features])

# Update X_train and X_test
X_train = X_train_standardized
X_test = X_test_standardized

print("\nX_train columns and data types after preprocessing:")
print(X_train.dtypes)

print("\ny_train columns and data types after preprocessing:")
print(y_train.dtypes)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                1000 non-null   int64  
 1   blood pressure     1000 non-null   float64
 2   calcium            1000 non-null   float64
 3   cholesterol        1000 non-null   float64
 4   hemoglobin         1000 non-null   float64
 5   height             1000 non-null   float64
 6   potassium          1000 non-null   float64
 7   profession         1000 non-null   object 
 8   sarsaparilla       1000 non-null   object 
 9   smurfberry liquor  1000 non-null   object 
 10  smurfin donuts     1000 non-null   object 
 11  vitamin D          1000 non-null   float64
 12  weight             1000 non-null   float64
 13  img_filename       1000 non-null   object 
dtypes: float64(8), int64(1), object(5)
memory usage: 109.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 en

## KNN without PCA

### Without Feature Selection

In [4]:
# Compute the Root Mean Square Error
def compute_rmse(predict, target):
    """Return the root mean square error between predictions and targets.

    Both inputs are converted to flat float arrays before they are compared.
    This lets plain Python lists be passed, and it stops a column vector of
    shape (n, 1) from silently broadcasting against a flat vector of shape
    (n,) into an (n, n) matrix, which would yield a wrong score.

    Parameters
    ----------
    predict : array-like
        Predicted values.
    target : array-like
        Reference values, with the same number of elements as ``predict``.

    Returns
    -------
    numpy.float64
        The square root of the mean squared difference.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    predicted = np.asarray(predict, dtype=float).ravel()
    expected = np.asarray(target, dtype=float).ravel()

    if predicted.size != expected.size:
        raise ValueError("predict and target must have the same length")

    return np.sqrt(np.mean((predicted - expected) ** 2))

from sklearn.neighbors import KNeighborsRegressor

# Build a 10-nearest-neighbour regressor and train it on every feature.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train.values.ravel())

# Score the held-out test set.
y_pred_test = knn.predict(X_test)
test_rmse = compute_rmse(y_pred_test, y_test.values.ravel())

# Emit the report in one call; sep="\n" produces the same lines as issuing
# one print per line.
print(
    "\n" + "="*50,
    "MODEL PERFORMANCE",
    "="*50,
    f"\nTest Set:",
    f"  RMSE: {test_rmse:.4f}",
    sep="\n",
)





MODEL PERFORMANCE

Test Set:
  RMSE: 0.0562


### With Feature Selection

In [5]:
# Compute the Root Mean Square Error
def compute_rmse(predict, target):
    """Return the root mean square error between predictions and targets.

    Both inputs are converted to flat float arrays before they are compared.
    This lets plain Python lists be passed, and it stops a column vector of
    shape (n, 1) from silently broadcasting against a flat vector of shape
    (n,) into an (n, n) matrix, which would yield a wrong score.

    Parameters
    ----------
    predict : array-like
        Predicted values.
    target : array-like
        Reference values, with the same number of elements as ``predict``.

    Returns
    -------
    numpy.float64
        The square root of the mean squared difference.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    predicted = np.asarray(predict, dtype=float).ravel()
    expected = np.asarray(target, dtype=float).ravel()

    if predicted.size != expected.size:
        raise ValueError("predict and target must have the same length")

    return np.sqrt(np.mean((predicted - expected) ** 2))

from sklearn.neighbors import KNeighborsRegressor

# Restrict both splits to the hand-picked subset of features.
selected_features = [
    "blood pressure",
    "cholesterol",
    "smurfin donuts",
    "weight",
]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Build a 10-nearest-neighbour regressor and train it on the reduced frame.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsRegressor(n_neighbors=10).fit(X_train_selected, y_train.values.ravel())

# Score the held-out test set on the same subset of columns.
y_pred_test = knn.predict(X_test_selected)
test_rmse = compute_rmse(y_pred_test, y_test.values.ravel())

# Emit the report in one call; sep="\n" produces the same lines as issuing
# one print per line.
print(
    "\n" + "="*50,
    "MODEL PERFORMANCE",
    "="*50,
    f"\nTest Set:",
    f"  RMSE: {test_rmse:.4f}",
    sep="\n",
)





MODEL PERFORMANCE

Test Set:
  RMSE: 0.0587


## KNN With PCA

In [6]:
### KNN with PCA (5 dimensions)
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor

# Learn a 5-component projection from the training features only, then push
# the test features through that same projection.
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Summarise the dimensionality reduction.
print(
    f"Original feature dimensions: {X_train.shape[1]}",
    f"PCA reduced dimensions: {X_train_pca.shape[1]}",
    f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}",
    sep="\n",
)

# Build a 10-nearest-neighbour regressor and train it in the projected space.
# fit() returns the estimator itself, so construction and fitting are chained.
knn_pca = KNeighborsRegressor(n_neighbors=10).fit(X_train_pca, y_train.values.ravel())

# Score the projected test set.
y_pred_test_pca = knn_pca.predict(X_test_pca)
test_rmse_pca = compute_rmse(y_pred_test_pca, y_test.values.ravel())

# Emit the report in one call; sep="\n" produces the same lines as issuing
# one print per line.
print(
    "\n" + "="*50,
    "MODEL PERFORMANCE (KNN with PCA)",
    "="*50,
    f"\nTest Set:",
    f"  RMSE: {test_rmse_pca:.4f}",
    sep="\n",
)


Original feature dimensions: 18
PCA reduced dimensions: 5
Explained variance ratio: 0.5705

MODEL PERFORMANCE (KNN with PCA)

Test Set:
  RMSE: 0.0567


## Neural Net

In [None]:
### MLP with TensorFlow (3 hidden layers of size 20)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat on every Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

# The architecture is defined once here; both the model and the printed
# summary below are derived from these values so they cannot drift apart.
N_HIDDEN_LAYERS = 3
HIDDEN_UNITS = 20

# Work on copies so the preprocessed frames used by the other cells are untouched.
X_train_tf = X_train.copy()
X_test_tf = X_test.copy()

# Convert boolean (one-hot) columns to int (0/1); the column list is taken
# from the training frame and applied to both frames.
bool_cols = X_train_tf.select_dtypes(include=['bool']).columns
X_train_tf[bool_cols] = X_train_tf[bool_cols].astype(int)
X_test_tf[bool_cols] = X_test_tf[bool_cols].astype(int)

# Convert to float32 numpy arrays
X_train_tf = X_train_tf.values.astype(np.float32)
X_test_tf = X_test_tf.values.astype(np.float32)
y_train_tf = y_train.values.ravel().astype(np.float32)
y_test_tf = y_test.values.ravel().astype(np.float32)

# Create the MLP: an explicit input layer, N_HIDDEN_LAYERS ReLU layers of
# HIDDEN_UNITS neurons each, and a single linear output for regression.
model = keras.Sequential(
    [keras.Input(shape=(X_train_tf.shape[1],))]
    + [layers.Dense(HIDDEN_UNITS, activation='relu') for _ in range(N_HIDDEN_LAYERS)]
    + [layers.Dense(1)]  # Output layer for regression
)

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
print("Training MLP...")
history = model.fit(X_train_tf, y_train_tf, epochs=100, batch_size=32, verbose=0)

# Predict on test set; predict() returns one column per output neuron, so
# flatten it to match the 1-D target vector.
y_pred_test_tf = model.predict(X_test_tf, verbose=0).ravel()

# Calculate metrics
test_rmse_tf = compute_rmse(y_pred_test_tf, y_test_tf)

print("\n" + "="*50)
print("MODEL PERFORMANCE (TensorFlow MLP)")
print("="*50)
print(f"\nModel Architecture:")
print(f"  Input: {X_train_tf.shape[1]} features")
print(f"  {N_HIDDEN_LAYERS} Hidden layers: {HIDDEN_UNITS} neurons each (ReLU activation)")
print(f"  Output: 1 neuron (linear)")
print(f"\nTest Set:")
print(f"  RMSE: {test_rmse_tf:.4f}")


: 