In [None]:
'''
Only 14 attributes used:
      1. #3  (age)       
      2. #4  (sex)       
      3. #9  (cp)        
      4. #10 (trestbps)  
      5. #12 (chol)      
      6. #16 (fbs)       
      7. #19 (restecg)   
      8. #32 (thalach)   
      9. #38 (exang)     
      10. #40 (oldpeak)   
      11. #41 (slope)     
      12. #44 (ca)        
      13. #51 (thal)      
      14. #58 (num)       (the predicted attribute)
      
Complete attribute documentation:
      1 id: patient identification number
      2 ccf: social security number (I replaced this with a dummy value of 0)
      3 age: age in years
      4 sex: sex (1 = male; 0 = female)
      5 painloc: chest pain location (1 = substernal; 0 = otherwise)
      6 painexer (1 = provoked by exertion; 0 = otherwise)
      7 relrest (1 = relieved after rest; 0 = otherwise)
      8 pncaden (sum of 5, 6, and 7)
      9 cp: chest pain type
        -- Value 1: typical angina
        -- Value 2: atypical angina
        -- Value 3: non-anginal pain
        -- Value 4: asymptomatic
     10 trestbps: resting blood pressure (in mm Hg on admission to the hospital)
     11 htn
     12 chol: serum cholestoral in mg/dl
     13 smoke: I believe this is 1 = yes; 0 = no (is or is not a smoker)
     14 cigs (cigarettes per day)
     15 years (number of years as a smoker)
     16 fbs: (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)
     17 dm (1 = history of diabetes; 0 = no such history)
     18 famhist: family history of coronary artery disease (1 = yes; 0 = no)
     19 restecg: resting electrocardiographic results
        -- Value 0: normal
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
        -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
     20 ekgmo (month of exercise ECG reading)
     21 ekgday(day of exercise ECG reading)
     22 ekgyr (year of exercise ECG reading)
     23 dig (digitalis used during exercise ECG: 1 = yes; 0 = no)
     24 prop (Beta blocker used during exercise ECG: 1 = yes; 0 = no)
     25 nitr (nitrates used during exercise ECG: 1 = yes; 0 = no)
     26 pro (calcium channel blocker used during exercise ECG: 1 = yes; 0 = no)
     27 diuretic (diuretic used during exercise ECG: 1 = yes; 0 = no)
     28 proto: exercise protocol
          1 = Bruce     
          2 = Kottus
          3 = McHenry
          4 = fast Balke
          5 = Balke
          6 = Noughton 
          7 = bike 150 kpa min/min  (Not sure if "kpa min/min" is what was written!)
          8 = bike 125 kpa min/min  
          9 = bike 100 kpa min/min
         10 = bike 75 kpa min/min
         11 = bike 50 kpa min/min
         12 = arm ergometer
     29 thaldur: duration of exercise test in minutes
     30 thaltime: time when ST measure depression was noted
     31 met: mets achieved
     32 thalach: maximum heart rate achieved
     33 thalrest: resting heart rate
     34 tpeakbps: peak exercise blood pressure (first of 2 parts)
     35 tpeakbpd: peak exercise blood pressure (second of 2 parts)
     36 dummy
     37 trestbpd: resting blood pressure
     38 exang: exercise induced angina (1 = yes; 0 = no)
     39 xhypo: (1 = yes; 0 = no)
     40 oldpeak = ST depression induced by exercise relative to rest
     41 slope: the slope of the peak exercise ST segment
        -- Value 1: upsloping
        -- Value 2: flat
        -- Value 3: downsloping
     42 rldv5: height at rest
     43 rldv5e: height at peak exercise
     44 ca: number of major vessels (0-3) colored by flourosopy
     45 restckm: irrelevant
     46 exerckm: irrelevant
     47 restef: rest raidonuclid (sp?) ejection fraction
     48 restwm: rest wall (sp?) motion abnormality
        0 = none
        1 = mild or moderate
        2 = moderate or severe
        3 = akinesis or dyskmem (sp?)
     49 exeref: exercise radinalid (sp?) ejection fraction
     50 exerwm: exercise wall (sp?) motion 
     51 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
     52 thalsev: not used
     53 thalpul: not used
     54 earlobe: not used
     55 cmo: month of cardiac cath (sp?)  (perhaps "call")
     56 cday: day of cardiac cath (sp?)
     57 cyr: year of cardiac cath (sp?)
     58 num: diagnosis of heart disease (angiographic disease status)
        -- Value 0: < 50% diameter narrowing
        -- Value 1: > 50% diameter narrowing
        (in any major vessel: attributes 59 through 68 are vessels)
     59 lmt
     60 ladprox
     61 laddist
     62 diag
     63 cxmain
     64 ramus
     65 om1
     66 om2
     67 rcaprox
     68 rcadist
     69 lvx1: not used
     70 lvx2: not used
     71 lvx3: not used
     72 lvx4: not used
     73 lvf: not used
     74 cathef: not used
     75 junk: not used
     76 name: last name of patient  (I replaced this with the dummy string "name")
'''

In [None]:
import pandas as pd

# Read 'processed.cleveland.data' into a DataFrame. header=None tells pandas
# the file has no header row, so the first record is kept as data and the
# columns receive integer labels.
raw_data_path = 'processed.cleveland.data'
data = pd.read_csv(raw_data_path, header=None)
data.head()

In [None]:
# Attach the 14 attribute names (13 features followed by the 'num' target)
# in place of the integer column labels.
column_names_updated = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]
data.columns = column_names_updated
data.head()

In [None]:
# Continue with an independent copy called 'heart_disease_data', so columns
# added later do not modify the originally loaded 'data' frame.
heart_disease_data = data.copy(deep=True)
heart_disease_data.head()

In [None]:
# Per-column count of null (NaN/None) entries. Values stored as non-numeric
# text are not counted here; they only become NaN once coerced to numbers.
heart_disease_data.isna().sum()

In [None]:
# Print the column dtypes and non-null counts, to spot columns that were not
# parsed as numeric.
heart_disease_data.info()

In [None]:
from sklearn.ensemble import IsolationForest
import numpy as np

# Build an all-numeric copy of the data for the detector.
# An existing 'Anomaly' label column is dropped first: if this cell is re-run,
# the string labels would otherwise be coerced to an all-NaN column that the
# mean-imputation below cannot fill.
# Entries that cannot be parsed as numbers become NaN (errors='coerce').
heart_disease_data_numeric = (
    heart_disease_data
    .drop(columns='Anomaly', errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
)

# Filling NaN values with column means
heart_disease_data_numeric = heart_disease_data_numeric.fillna(
    heart_disease_data_numeric.mean()
)

# Applying Isolation Forest.
# random_state is fixed so the same rows are flagged on every run; without it
# the set of anomalies (and every downstream table/plot) changes per execution.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
anomalies = iso_forest.fit_predict(heart_disease_data_numeric)

# Mapping the predicted values (1 / -1) to 'Normal' and 'Anomaly'
anomaly_map = {1: 'Normal', -1: 'Anomaly'}
anomalies = np.array([anomaly_map[flag] for flag in anomalies])

# Adding the anomaly labels to the dataset
heart_disease_data['Anomaly'] = anomalies

# Displaying the first few rows of the dataset with anomaly labels
heart_disease_data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Filter the dataset for anomalies
anomaly_data = heart_disease_data[heart_disease_data['Anomaly'] == 'Anomaly']

# Identifying numeric columns (the string 'Anomaly' label and any other
# non-numeric column are excluded by the dtype filter)
numeric_columns = heart_disease_data.select_dtypes(include=['float64', 'int64']).columns

# Plotting the distribution of numeric features for anomalies vs the entire dataset.
# squeeze=False + ravel() keeps `axes` a 1-D array even when there is only one
# numeric column, so the loop below never has to index a bare Axes object.
fig, axes = plt.subplots(
    nrows=len(numeric_columns), ncols=1, figsize=(10, 40), squeeze=False
)
axes = axes.ravel()

for ax, col in zip(axes, numeric_columns):
    # fill=True replaces the deprecated shade=True keyword of sns.kdeplot
    sns.kdeplot(heart_disease_data[col], ax=ax, label='Overall Distribution', fill=True)
    sns.kdeplot(anomaly_data[col], ax=ax, label='Anomaly Distribution', fill=True)
    ax.set_title(f'Distribution of {col}')
    ax.legend()

plt.tight_layout()
plt.show()

Age: The anomalies seem to have a slightly different distribution compared to the overall data, with a higher density around the age of 60.

Trestbps: The anomalies have a higher density around the 140-160 range.

Chol: The anomalies have a higher density in the 200-250 range.

Thalach: The distribution of anomalies is slightly shifted towards the lower values.

Oldpeak: The anomalies have a higher density around the 0-2 range.

Slope, Ca, Thal, Num: The distributions for these features are quite similar for both anomalies and the overall data.

In [None]:
# Display every row whose 'Anomaly' label is 'Anomaly'.
anomaly_data

The table above lists the patients that the Isolation Forest flagged as potential anomalies.

In [None]:
# Summary statistics (one row per numeric feature) for all records and for
# the flagged anomalies only.
overall_stats = heart_disease_data.describe().T
anomaly_stats = anomaly_data.describe().T

# Place both tables side by side under the top-level column keys
# 'Overall' and 'Anomaly' for comparison.
stats_comparison = pd.concat(
    [overall_stats, anomaly_stats],
    keys=['Overall', 'Anomaly'],
    axis=1,
)
stats_comparison[['Overall', 'Anomaly']]

exang (exercise induced angina): The mean and median values for anomalies are higher than the overall data, indicating that anomalies tend to have a higher occurrence of exercise-induced angina.

slope (slope of the peak exercise ST segment): The anomalies have a higher mean and median slope value compared to the overall data.

chol (serum cholesterol in mg/dl): The anomalies have a higher mean and median cholesterol level compared to the overall data.

fbs (fasting blood sugar > 120 mg/dl): fbs is a binary indicator (1 = true; 0 = false), so the higher mean for anomalies indicates that a larger share of the anomalies have a fasting blood sugar above 120 mg/dl.

num (diagnosis of heart disease): The anomalies have a significantly higher mean and median value for the diagnosis of heart disease.


Features like chol (serum cholesterol), fbs (fasting blood sugar), and num (diagnosis of heart disease) show noticeable differences in their distributions between normal data points and anomalies. The anomalies tend to have higher values for these features.

In [None]:
# Visualizing the distribution of each numeric feature, split by anomaly label,
# using box plots laid out in a two-column grid.
# The number of rows is derived from the number of numeric columns instead of
# being hard-coded, so the grid always has enough axes for every feature.
n_grid_cols = 2
n_grid_rows = (len(numeric_columns) + n_grid_cols - 1) // n_grid_cols  # ceiling division

# squeeze=False guarantees a 2-D axes array even for a single-row grid
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(15, 40), squeeze=False)
flat_axes = axes.ravel()  # row-major: feature i lands at (i // 2, i % 2)

for ax, col in zip(flat_axes, numeric_columns):
    sns.boxplot(x='Anomaly', y=col, data=heart_disease_data, ax=ax)
    ax.set_title(f'Box Plot of {col} by Anomaly')

# Hide the spare panel left over when the number of features is odd
for unused_ax in flat_axes[len(numeric_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

Features like chol (serum cholesterol), fbs (fasting blood sugar), and num (diagnosis of heart disease) show noticeable differences in their distributions between normal data points and anomalies. The anomalies tend to have higher values for these features.

Other features also show some differences in their distributions, but they might not be as pronounced.

In [None]:
# Number of flagged rows, and their share of the full dataset in percent
total_data_count = len(heart_disease_data)
anomaly_count = len(anomaly_data)
anomaly_percentage = anomaly_count / total_data_count * 100

anomaly_count, anomaly_percentage

In [None]:
# Five-number summary (min, quartiles, max) of each numeric feature,
# computed over the anomalous rows only
range_columns = ['min', '25%', '50%', '75%', 'max']
feature_ranges = anomaly_data.describe().T[range_columns]
feature_ranges

In [None]:
# Bar chart of how many anomalous rows fall into each value of the
# target variable 'num'
plt.figure(figsize=(10, 6))
num_axis = sns.countplot(data=anomaly_data, x='num')
num_axis.set_title('Distribution of Diagnosis of Heart Disease (num) within Anomalies')
num_axis.set_xlabel('Diagnosis of Heart Disease (num)')
num_axis.set_ylabel('Count')
plt.show()

From the plot, we can observe:

A significant number of anomalies have a num value of 3, indicating a higher likelihood of having heart disease.

There are also anomalies with num values of 0, 2, and 4, but the count for num = 3 is the most pronounced.

This suggests that many of the anomalies identified by the Isolation Forest algorithm are associated with a higher likelihood of heart disease.

In [None]:
# Continuous features (plus the 'num' target, used for colouring) to show
# in the pair plot
selected_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'num']

# Pairwise scatter plots for the anomalous rows, coloured by 'num', with
# KDE curves on the diagonal
pairplot_frame = anomaly_data.loc[:, selected_features]
sns.pairplot(pairplot_frame, hue='num', palette='viridis', diag_kind='kde')
plt.suptitle('Pairwise Relationships Among Anomalies', y=1.02)
plt.show()

Age vs. Chol: There seems to be a cluster of anomalies with higher cholesterol levels as age increases.

Trestbps vs. Thalach: Anomalies with higher resting blood pressure (trestbps) tend to have a lower maximum heart rate achieved (thalach).

Oldpeak vs. Thalach: Anomalies with a higher ST depression induced by exercise relative to rest (oldpeak) tend to have a lower maximum heart rate achieved (thalach).

## Further Analysis of Anomalies

In [None]:
# Descriptive statistics of the anomalous rows, one row per numeric feature
anomaly_summary = anomaly_data.describe().T
anomaly_summary

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix of the anomalies, restricted to numeric columns.
# anomaly_data also carries the string-valued 'Anomaly' label, which cannot be
# correlated; calling .corr() on the full frame raises a ValueError on recent
# pandas versions, so non-numeric columns are filtered out explicitly.
correlation_matrix = anomaly_data.select_dtypes(include='number').corr()

# Visualizing the correlation matrix using a heatmap
# (colour scale pinned to the full [-1, 1] correlation range)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Anomalies')
plt.show()

age and oldpeak have a negative correlation of -0.25.

thalach (maximum heart rate achieved) has a negative correlation with several features like age, oldpeak, and exang (exercise induced angina).

cp (chest pain type) and num (diagnosis of heart disease) have a positive correlation.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Features and target.
# The classifier below predicts the diagnosis 'num' using ONLY the anomalous
# rows, so the resulting importances describe which features drive 'num'
# within the anomalies -- not why a row was flagged as an anomaly.
#
# Feature columns are coerced to numeric first: columns that were not parsed as
# numbers would otherwise reach the model as text and can make fit() fail on
# unparseable entries. Gaps are filled with the column means of the full
# dataset, mirroring the imputation used before fitting the Isolation Forest.
all_features_numeric = (
    heart_disease_data
    .drop(['num', 'Anomaly'], axis=1)
    .apply(pd.to_numeric, errors='coerce')
)
X = all_features_numeric.loc[anomaly_data.index]
X = X.fillna(all_features_numeric.mean())
y = anomaly_data['num']

# Train a Random Forest classifier (fixed seed for reproducible importances)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# Extract feature importances (same order as X.columns)
feature_importances = clf.feature_importances_
features = X.columns

# Visualize feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x=feature_importances, y=features, palette='viridis')
plt.title('Feature Importance for Anomalies')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

From the chart, we can observe:

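thalach (maximum heart rate achieved) appears to be the most influential feature for predicting the diagnosis (num) among the flagged anomalies. Note that the classifier is trained only on the anomalous rows with num as its target, so these importances describe what drives the diagnosis within the anomalies rather than why a data point was flagged as an anomaly.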

oldpeak (ST depression induced by exercise relative to rest) and trestbps are also among the top influential features.

Features like sex, fbs (fasting blood sugar), and restecg (resting electrocardiographic results) have relatively lower importance scores, indicating that they contribute less to predicting the diagnosis (num) within the anomalous subset.