In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

# Load your dataset
df = pd.read_csv('heart_disease_uci.csv')

# Split into features and target
X = df.drop(columns=['num'])  # Replace 'target' with actual target column
y = df['num']

# Ensure that all columns are numeric before performing Z-score outlier detection
X_numeric = X.select_dtypes(include=[np.number])

# Step 10: Outlier Detection

# Detect outliers using Z-score on numeric data only
z_scores = np.abs(stats.zscore(X_numeric))
X_zscore = X[(z_scores < 3).all(axis=1)]
y_zscore = y[(z_scores < 3).all(axis=1)]

print(f"\nDataset after Z-Score Outlier Removal: {X_zscore.shape}")

# Detect outliers using IQR
Q1 = X_numeric.quantile(0.25)
Q3 = X_numeric.quantile(0.75)
IQR = Q3 - Q1
X_iqr = X[~((X_numeric < (Q1 - 1.5 * IQR)) | (X_numeric > (Q3 + 1.5 * IQR))).any(axis=1)]
y_iqr = y[~((X_numeric < (Q1 - 1.5 * IQR)) | (X_numeric > (Q3 + 1.5 * IQR))).any(axis=1)]

print(f"Dataset after IQR Outlier Removal: {X_iqr.shape}")

# Step 11: Feature Scaling

# Feature scaling (Normalization and Standardization)
scaler_norm = MinMaxScaler()
scaler_std = StandardScaler()

# Apply scaling to the numeric data
X_normalized = scaler_norm.fit_transform(X_numeric)
X_standardized = scaler_std.fit_transform(X_numeric)

# Display first 5 rows of normalized and standardized data
print("\nFirst 5 rows of Normalized Data:")
print(np.round(X_normalized[:5], 3))

print("\nFirst 5 rows of Standardized Data:")
print(np.round(X_standardized[:5], 3))


Dataset after Z-Score Outlier Removal: (0, 15)
Dataset after IQR Outlier Removal: (688, 15)

First 5 rows of Normalized Data:
[[0.    0.714 0.725 0.386 0.634 0.557 0.   ]
 [0.001 0.796 0.8   0.474 0.338 0.466 1.   ]
 [0.002 0.796 0.6   0.38  0.486 0.591 0.667]
 [0.003 0.184 0.65  0.415 0.894 0.693 0.   ]
 [0.004 0.265 0.65  0.338 0.789 0.455 0.   ]]

First 5 rows of Standardized Data:
[[-1.73   1.007  0.675  0.306  0.481  1.303 -0.724]
 [-1.726  1.432  1.462  0.785 -1.14   0.57   2.487]
 [-1.723  1.432 -0.637  0.27  -0.33   1.578  1.417]
 [-1.719 -1.753 -0.112  0.459  1.909  2.403 -0.724]
 [-1.715 -1.328 -0.112  0.044  1.33   0.478 -0.724]]
