# Importing libraries

In [1]:
import pickle
import pandas as pd
import numpy as np
import math

# Importing datasets

In [2]:
# Single place to edit when the datasets live somewhere other than this Drive folder.
DATA_DIR = '/content/drive/MyDrive/Heart Disease Prediction/Datasets'

# Load the four site-level files (Cleveland, Hungarian, Switzerland, VA).
# Missing values are still raw text at this point and are handled further down.
cleveland_db = pd.read_csv(f'{DATA_DIR}/processed.cleveland.data')
# The reprocessed Hungarian file is read with a space separator instead of the default comma.
hungarian_db = pd.read_csv(f'{DATA_DIR}/reprocessed.hungarian.data', sep=" ")
switzerland_db = pd.read_csv(f'{DATA_DIR}/processed.switzerland.data')
valb_db = pd.read_csv(f'{DATA_DIR}/processed.va.data')

In [3]:
# Preview the first rows of the Switzerland data to see how values are encoded.
switzerland_db.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
1,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
2,35,1,4,?,0,?,0,130,1,?,?,?,7,3
3,36,1,4,110,0,?,0,125,1,1,2,?,6,1
4,38,0,4,105,0,?,0,166,0,2.8,1,?,?,2


Some values are missing; they are represented by question marks.

# Concatenating datasets

In [4]:
# Stack the four site frames row-wise; ignore_index=True discards the per-file
# row labels and gives the combined frame a fresh 0..n-1 index.
site_frames = [cleveland_db, hungarian_db, switzerland_db, valb_db]
concat_db = pd.concat(site_frames, axis=0, ignore_index=True)

In [5]:
# Summarize dtypes and non-null counts of the combined frame.
concat_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    float64
 1   sex       920 non-null    float64
 2   cp        920 non-null    float64
 3   trestbps  920 non-null    object 
 4   chol      920 non-null    object 
 5   fbs       920 non-null    object 
 6   restecg   920 non-null    object 
 7   thalach   920 non-null    object 
 8   exang     920 non-null    object 
 9   oldpeak   920 non-null    object 
 10  slope     920 non-null    object 
 11  ca        920 non-null    object 
 12  thal      920 non-null    object 
 13  num       920 non-null    float64
dtypes: float64(4), object(10)
memory usage: 100.9+ KB


* We convert all columns to numeric types, with question marks being replaced by 'NaN' values.

In [6]:
# Convert every column to a numeric dtype. errors='coerce' turns anything that
# cannot be parsed as a number (such as the '?' placeholders) into NaN.
concat_db = concat_db.apply(lambda column: pd.to_numeric(column, errors='coerce'))
concat_db.info()
concat_db.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    float64
 1   sex       920 non-null    float64
 2   cp        920 non-null    float64
 3   trestbps  862 non-null    float64
 4   chol      913 non-null    float64
 5   fbs       838 non-null    float64
 6   restecg   919 non-null    float64
 7   thalach   866 non-null    float64
 8   exang     866 non-null    float64
 9   oldpeak   858 non-null    float64
 10  slope     801 non-null    float64
 11  ca        600 non-null    float64
 12  thal      700 non-null    float64
 13  num       920 non-null    float64
dtypes: float64(14)
memory usage: 100.9 KB


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


# Replacing out-of-range values with NaN

In [8]:
# Columns that are screened for out-of-range (negative) values in the next cell.
oor_features = [
    "trestbps", "chol", "fbs",
    "thalach", "exang", "oldpeak",
    "slope", "ca", "thal",
]

In [9]:
# Treat negative readings in the screened columns as missing.
# A single boolean-mask .loc assignment writes into concat_db itself. The chained
# form concat_db[feature].loc[index] = ... may write to a temporary object
# (SettingWithCopyWarning) and does nothing under pandas Copy-on-Write.
# NOTE(review): only negatives are caught here; zeros such as the chol == 0 rows
# visible in the Switzerland preview are left untouched — confirm this is intended.
for feature in oor_features:
    concat_db.loc[concat_db[feature] < 0, feature] = np.nan

In [10]:
# Re-check non-null counts now that negative values have been replaced with NaN.
concat_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    float64
 1   sex       920 non-null    float64
 2   cp        920 non-null    float64
 3   trestbps  861 non-null    float64
 4   chol      890 non-null    float64
 5   fbs       830 non-null    float64
 6   restecg   919 non-null    float64
 7   thalach   865 non-null    float64
 8   exang     865 non-null    float64
 9   oldpeak   846 non-null    float64
 10  slope     611 non-null    float64
 11  ca        310 non-null    float64
 12  thal      434 non-null    float64
 13  num       920 non-null    float64
dtypes: float64(14)
memory usage: 100.9 KB


# Dealing with missing values

* An 'age_bracket' feature is added. It assigns each patient to a 10-year age bracket whose upper bound is inclusive (e.g. 5 for a 54-year-old, since 50 < 54 ≤ 60).

In [None]:
# Map each age to its right-closed 10-year bracket:
#   (20, 30] -> 2, (30, 40] -> 3, (40, 50] -> 4, (50, 60] -> 5, (60, 70] -> 6, (70, 80] -> 7
# pd.cut(..., labels=False) returns the 0-based bin position, or NaN when the age
# is missing or falls outside (20, 80]; adding 2 turns it into the bracket number.
# This always yields exactly one entry per row. Appending inside a loop skipped
# rows without a matching bracket, which made the list shorter than the frame and
# shifted every later row's bracket once it was assigned back by position.
age_bin_edges = [20, 30, 40, 50, 60, 70, 80]
age_bracket = (
    pd.cut(concat_db['age'], bins=age_bin_edges, labels=False, right=True) + 2
).tolist()

In [None]:
# pd.Series(age_bracket) gets a default 0..n-1 index and the assignment aligns on
# index labels, so rows only receive the right bracket if age_bracket holds exactly
# one entry per row of concat_db, in row order.
concat_db['age_bracket'] = pd.Series(age_bracket)

In [None]:
# Boolean row masks used by the imputation cells that follow:
#   m1..m6 -> age brackets 2..7 (any sex)
#   n1..n6 -> age brackets 2..7 with sex == 1
#   f1..f6 -> age brackets 2..7 with sex == 0
# All three families are generated from the same range so they cover the same six
# brackets. Written out by hand, f6 tested bracket 6 a second time, which left
# bracket 7 with sex == 0 without a mask.
bracket_ids = range(2, 8)
m1, m2, m3, m4, m5, m6 = [concat_db['age_bracket'] == b for b in bracket_ids]
n1, n2, n3, n4, n5, n6 = [(concat_db['age_bracket'] == b) & (concat_db['sex'] == 1) for b in bracket_ids]
f1, f2, f3, f4, f5, f6 = [(concat_db['age_bracket'] == b) & (concat_db['sex'] == 0) for b in bracket_ids]

In [None]:
# Remove the rows whose age is missing.
rows_without_age = concat_db.index[concat_db['age'].isna()]
concat_db = concat_db.drop(index=rows_without_age)

* **ca is deleted due to too many missing values** 

In [None]:
# Discard the 'ca' column entirely.
concat_db = concat_db.drop('ca', axis=1)

* **Missing ‘trestbps’ values are replaced with the mean for the age bracket; missing ‘chol’ values are replaced with the mean for the age bracket and sex**

In [None]:
# Fill missing 'trestbps' with the mean of the same age bracket, one bracket at a time.
for bracket_mask in (m1, m2, m3, m4, m5, m6):
    bracket_values = concat_db.loc[bracket_mask, 'trestbps']
    concat_db.loc[bracket_mask, 'trestbps'] = bracket_values.fillna(bracket_values.mean())

In [None]:
# Fill missing 'chol' for rows with sex == 1, using the mean of the same age bracket and sex.
for group_mask in (n1, n2, n3, n4, n5, n6):
    group_values = concat_db.loc[group_mask, 'chol']
    concat_db.loc[group_mask, 'chol'] = group_values.fillna(group_values.mean())

In [None]:
# Fill missing 'chol' for the rows selected by the f1..f6 masks, using each group's own mean.
for group_mask in (f1, f2, f3, f4, f5, f6):
    group_values = concat_db.loc[group_mask, 'chol']
    concat_db.loc[group_mask, 'chol'] = group_values.fillna(group_values.mean())

* **Missing ‘exang’, ‘fbs’, ‘oldpeak’, ‘slope’ and ‘thal’ values are replaced with the median value for the age bracket**

In [None]:
# Fill missing 'exang' with the median of the same age bracket.
for bracket_mask in (m1, m2, m3, m4, m5, m6):
    bracket_values = concat_db.loc[bracket_mask, 'exang']
    concat_db.loc[bracket_mask, 'exang'] = bracket_values.fillna(bracket_values.median())

In [None]:
# Fill missing 'fbs' with the median of the same age bracket.
for bracket_mask in (m1, m2, m3, m4, m5, m6):
    bracket_values = concat_db.loc[bracket_mask, 'fbs']
    concat_db.loc[bracket_mask, 'fbs'] = bracket_values.fillna(bracket_values.median())

In [None]:
# Fill missing 'oldpeak' with the median of the same age bracket.
for bracket_mask in (m1, m2, m3, m4, m5, m6):
    bracket_values = concat_db.loc[bracket_mask, 'oldpeak']
    concat_db.loc[bracket_mask, 'oldpeak'] = bracket_values.fillna(bracket_values.median())

In [None]:
# Fill missing 'slope' with the median of the same age bracket.
for bracket_mask in (m1, m2, m3, m4, m5, m6):
    bracket_values = concat_db.loc[bracket_mask, 'slope']
    concat_db.loc[bracket_mask, 'slope'] = bracket_values.fillna(bracket_values.median())

In [None]:
# Fill missing 'thal' with the median of the same age bracket.
for bracket_mask in (m1, m2, m3, m4, m5, m6):
    bracket_values = concat_db.loc[bracket_mask, 'thal']
    concat_db.loc[bracket_mask, 'thal'] = bracket_values.fillna(bracket_values.median())

* **Missing ‘thalach’ values are replaced with (220 – age) for men and (226 – age) for women**

In [None]:
# Estimate missing 'thalach' from age: 220 - age when sex == 1, 226 - age otherwise.
# The per-row version used `==` (a comparison, not an assignment) in the else-branch,
# so rows with sex != 1 were never filled, and it wrote through chained indexing,
# which may hit a temporary copy. One .loc assignment on concat_db fixes both.
indices = concat_db[concat_db['thalach'].isna()].index
# Per-row constant to subtract the age from, chosen by sex.
max_rate_base = np.where(concat_db.loc[indices, 'sex'] == 1, 220, 226)
concat_db.loc[indices, 'thalach'] = max_rate_base - concat_db.loc[indices, 'age']

* **Removing some rows with other missing values**

In [None]:
# Remove the rows whose 'restecg' is missing.
rows_without_restecg = concat_db.index[concat_db['restecg'].isna()]
concat_db = concat_db.drop(index=rows_without_restecg)

In [None]:
# Remove the rows that did not receive an age bracket.
rows_without_bracket = concat_db.index[concat_db['age_bracket'].isna()]
concat_db = concat_db.drop(index=rows_without_bracket)

In [None]:
# Check non-null counts and dtypes after imputation and row removal.
concat_db.info()

* **Reducing 'num' to two classes**

In [None]:
# Collapse 'num' into two classes: 0 stays 0, and 1 through 4 all become 1.
# Any value that is not a key of the mapping becomes NaN.
num_to_binary = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
concat_db['num'] = concat_db['num'].map(num_to_binary)

* **Exporting data**

In [None]:
# Serialize the cleaned frame to 'df_combined.pickle' in the current working directory.
with open('df_combined.pickle', 'wb') as pickle_file:
    pickle.dump(concat_db, pickle_file)