In [94]:
import pandas as pd
import numpy as np
import torch


In [95]:

# Load the raw data from cleveland.csv and list the available columns.
df = pd.read_csv('cleveland.csv')
print(df.columns)

# Binarise the target: any `num` greater than 0 becomes 1, everything else 0.
df['num'] = df['num'].apply(lambda severity: int(severity > 0))

# Quick look at one of the continuous columns.
print(df['oldpeak'])

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')
0      2.3
1      1.5
2      2.6
3      3.5
4      1.4
      ... 
298    1.2
299    3.4
300    1.2
301    0.0
302    0.0
Name: oldpeak, Length: 303, dtype: float64


For each continuous feature, discretize its values into a fixed number of bins so that information gain can be computed on it.

In [96]:
# Compute the information gain of each feature with respect to the target
import math 

def entropy(class_probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        class_probs: Iterable of class probabilities. Entries equal to zero
            are skipped, because log2(0) is undefined and such a class
            contributes nothing to the entropy.

    Returns:
        The negated sum of ``p * log2(p)`` over the non-zero probabilities.
    """
    log_weighted_terms = [p * math.log2(p) for p in class_probs if p != 0]
    return -sum(log_weighted_terms)

def information_gain(data, feature, target):
    """Compute the information gain of ``feature`` with respect to ``target``.

    The gain is the entropy of the target over the whole frame minus the
    size-weighted average entropy of the target inside each group of rows
    that share the same ``feature`` value.

    Args:
        data: DataFrame containing both columns.
        feature: Name of the column to split on. Rows whose value in this
            column is missing fall into no group and contribute nothing to
            the weighted entropy.
        target: Name of the class-label column.

    Returns:
        The information gain rounded to 3 decimal places.
    """
    entropy_before = entropy(data[target].value_counts(normalize=True))

    total_rows = len(data)
    weighted_entropy_after = 0.0
    # One pass over the groups instead of re-filtering the frame per value.
    # observed=True skips categorical bins that contain no rows.
    for _, target_subset in data.groupby(feature, observed=True)[target]:
        weight = len(target_subset) / total_rows
        subset_probs = target_subset.value_counts(normalize=True)
        weighted_entropy_after += weight * entropy(subset_probs)

    # Round only the final difference: rounding both entropies first can
    # shift the reported gain by a full unit in the last decimal place.
    return round(entropy_before - weighted_entropy_after, 3)

# Bins a continuous feature so that information gain can be computed on it
def discretize_feature(data, feature, num_bins=10):
    """Bin a continuous column into equal-width intervals.

    The binned values are written to a new column named ``<feature>_bin``
    on ``data`` itself, so the caller's frame is modified in place.

    Args:
        data: DataFrame holding the column to discretize.
        feature: Name of the continuous column.
        num_bins: Number of bins handed to ``pd.cut``.

    Returns:
        A ``(binned_column_name, data)`` tuple, where ``data`` is the same
        frame object that was passed in.
    """
    binned_name = feature + '_bin'
    binned_values = pd.cut(data[feature], bins=num_bins)
    data[binned_name] = binned_values
    return binned_name, data


# Calculate the information gain of every feature with respect to `num`.
# Columns listed in `discrete_features` are binned first and scored under
# their '<name>_bin' column; all other columns are scored as they are.
discrete_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
features_inf0_gain = {}
for feature in (column for column in df.columns if column != 'num'):
    if feature in discrete_features:
        scored_column, scored_frame = discretize_feature(df, feature)
    else:
        scored_column, scored_frame = feature, df
    features_inf0_gain[scored_column] = information_gain(scored_frame, scored_column, 'num')

# Re-order the results from highest to lowest gain and print them.
features_inf0_gain = dict(
    sorted(features_inf0_gain.items(), key=lambda pair: pair[1], reverse=True)
)
for name, gain in features_inf0_gain.items():
    print(name, gain)


thal 0.208
cp 0.205
ca 0.181
oldpeak_bin 0.163
thalach_bin 0.149
exang 0.139
slope 0.112
age_bin 0.073
sex 0.057
restecg 0.024
trestbps_bin 0.023
chol_bin 0.02
fbs 0.0


In [97]:
"""
Set up data for Preprocessing
"""

# Reload the raw file, binarise `num` (values above 0 become 1, all others 0)
# and store it as a pandas categorical.
df = pd.read_csv('cleveland.csv')
df['num'] = df['num'].apply(lambda value: int(value > 0)).astype('category')

# Column groups used by the preprocessing steps that follow.
categoricalFeatures = ['cp', 'ca', 'thal', 'exang']
continuousFeatures = ['thalach', 'oldpeak']
selectedFeatures = ['thal', 'cp', 'ca', 'oldpeak', 'thalach', 'exang']

# Keep only the selected predictors plus the target column.
df = df[[*selectedFeatures, 'num']]


In [98]:
"""
Prepare categorical features
"""

# Cast every categorical column to strings so its values can be looked up
# by the string keys of the tables below.
df = df.astype({column: 'str' for column in categoricalFeatures})

# Lookup tables that place each category code on the 0-1 range.
# ('slope' is not among the selected features, so its table stays disabled.)
feature_mappings = {
    'cp': {
        '1.0': 0,
        '2.0': 0.33,
        '3.0': 0.67,
        '4.0': 1,
    },
    'thal': {
        '3.0': 0,
        '6.0': 0.5,
        '7.0': 1,
    },
    'ca': {
        '0.0': 0,
        '1.0': 0.33,
        '2.0': 0.67,
        '3.0': 1,
    },
    'exang': {
        '0.0': 0,
        '1.0': 1,
    },
    # 'slope': {'1.0': 0, '2.0': 0.5, '3.0': 1}
}

# For each mapped column: discard rows holding the '?' placeholder, then
# replace the remaining string codes with their 0-1 values.
for feature, valid_values in feature_mappings.items():
    df = df[df[feature].ne('?')]
    df[feature] = df[feature].map(valid_values)

In [99]:
"""
Clean up Continuous Features
"""

# Coerce each continuous column to a numeric dtype and drop the rows whose
# value cannot be parsed. The converted values are stored back in the frame:
# filtering alone would leave an object-dtype column as strings, and the
# z-score arithmetic below would then fail.
for feature in continuousFeatures:
    print(f"Converting {feature} to numeric")
    numeric_values = pd.to_numeric(df[feature], errors='coerce')
    parsable = numeric_values.notna()
    df = df[parsable].copy()
    # Positional assignment: the filtered values line up row-for-row with df.
    df[feature] = numeric_values[parsable].to_numpy()

# Z-score normalization: centre each column on its mean and scale by its
# standard deviation (pandas' default sample standard deviation).
for feature in continuousFeatures:
    mean = df[feature].mean()
    std = df[feature].std()
    df[feature] = (df[feature] - mean) / std

Converting thalach to numeric
Converting oldpeak to numeric


In [100]:

# Downcast the selected feature columns to half precision (float16); the
# `num` target is not part of `selectedFeatures` and keeps its dtype.
# NOTE(review): float16 holds only about three significant decimal digits,
# so mapped values such as 0.33 / 0.67 end up as 0.330078 / 0.669922 in the
# printed head of the saved frame - confirm this loss of precision is intended.
df[selectedFeatures] = df[selectedFeatures].astype('float16')

In [101]:
""" Now lets save the data """
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# Persist the cleaned frame without the row index.
df.to_csv('cleveland_cleaned.csv', index=False)

# Show the first rows of what was saved.
print("\n\n DF HEAD")
print(df.head(5))


<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   thal     297 non-null    float16 
 1   cp       297 non-null    float16 
 2   ca       297 non-null    float16 
 3   oldpeak  297 non-null    float16 
 4   thalach  297 non-null    float16 
 5   exang    297 non-null    float16 
 6   num      297 non-null    category
dtypes: category(1), float16(6)
memory usage: 6.2 KB
None


 DF HEAD
   thal        cp        ca   oldpeak   thalach  exang num
0   0.5  0.000000  0.000000  1.067383  0.017471    0.0   0
1   0.0  1.000000  1.000000  0.381104 -1.813477    1.0   1
2   1.0  1.000000  0.669922  1.324219 -0.897949    1.0   1
3   0.0  0.669922  0.000000  2.095703  1.629883    0.0   0
4   0.0  0.330078  0.000000  0.295410  0.976562    0.0   0
