In [1]:
TRAIN_DATA_PATH = "../../data/interim/train_data.pkl"
PROC_TRAIN_DATA_PATH = "../../data/interim/1__analytics_preprocessed_data.pkl"

# Import packages


In [2]:
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings("ignore")

In [3]:
# Change some pandas display options
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.width", 1000)
pd.set_option("styler.format.precision", 10)

In [4]:
df_train = pd.read_pickle(TRAIN_DATA_PATH)

# Functions


In [5]:
def remove_duplicates(data: pd.DataFrame):
    """Remove duplicates values if exist"""
    print(f"Duplicates count before droping:{data.duplicated().sum()}")
    data.drop_duplicates(inplace=True)
    print(f"Duplicates count after droping:{data.duplicated().sum()}")
    print(f"Data dimension{data.shape}")

# Substitute strange values

In [6]:
# Substitute '?' with np.NAN
df_train.replace("?", np.nan, inplace=True)

# Duplicates


In [7]:
remove_duplicates(df_train)

Duplicates count before droping:0
Duplicates count after droping:0
Data dimension(271, 14)


# Numbers to String

Changing categorical features that contain numbers to be in string format

| Attribute                          | Updated Feature Values                                                   |
| :--------------------------------- | :----------------------------------------------------------------------- |
| **sex**                            | 0:female<br>1:male                                                       |
| **chest pain type**                | 1:typical angina<br>2:atypical angina<br>3:non-anginal<br>4:asymptomatic |
| **fasting blood sugar**            | 0:> 120 mg/dl<br>1:< 120 mg/dl                                           |
| **resting electrocardiographic**   | 0:normal<br>1:ST-T wave abnormality<br>2:ventricular hypertrophy         |
| **exercise induced angina**        | 0:no<br>1:yes                                                            |
| **slope peak exercise ST segment** | 1:upsloping<br>2:flat<br>3:downsloping                                   |
| **thallium stress result**         | 3:normal<br>6:fixed defect<br>7:reversible defect                        |
| **target**                         | 0:no disease<br>1:LAD<br>2:LCX<br>3:RCA<br>4:highest                     |


In [8]:
REPLACEMENTS = {
    "sex": {0: "female", 1: "male"},
    "chest_pain_type": {
        1: "typical angina",
        2: "atypical angina",
        3: "non-anginal",
        4: "asymptomatic",
    },
    "fasting_blood_sugar": {0: "> 120 mg/dl", 1: "< 120 mg/dl"},
    "resting_electrocardiographic": {
        0: "normal",
        1: "ST wave abnormality",
        2: "ventricular hypertrophy",
    },
    "exercise_induced_angina": {0: "no", 1: "yes"},
    "slope_peak_exercise_ST_segment": {1: "upsloping", 2: "flat", 3: "downsloping"},
    "thallium_stress_result": {3: "normal", 6: "fixed defect", 7: "reversible defect"},
    "target": {0: "no disease", 1: "LAD", 2: "LCX", 3: "RCA", 4: "highest"},
}

# Convert all columns to numeric
df_train = df_train.apply(pd.to_numeric)

for col, replacement in REPLACEMENTS.items():
    df_train[col] = df_train[col].map(replacement).astype(str)

In [9]:
# 'number of major vessels' has only 4 values
# Change its type to be object
df_train["number of major vessels"] = df_train["number of major vessels"].astype(
    "object"
)

KeyError: 'number of major vessels'

In [None]:
df_train.head(5)

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic,maximum heart rate,exercise induced angina,ST depression,slope peak exercise ST segment,number of major vessels,thallium stress result,target
155,51.0,male,asymptomatic,140.0,299.0,> 120 mg/dl,normal,173.0,yes,1.6,upsloping,0.0,reversible defect,LAD
10,56.0,female,atypical angina,140.0,294.0,> 120 mg/dl,ventricular hypertrophy,153.0,no,1.3,flat,0.0,normal,no disease
53,60.0,male,asymptomatic,130.0,253.0,> 120 mg/dl,normal,144.0,yes,1.4,upsloping,1.0,reversible defect,LAD
122,55.0,male,asymptomatic,140.0,217.0,> 120 mg/dl,normal,111.0,yes,5.6,downsloping,0.0,reversible defect,RCA
208,62.0,female,asymptomatic,150.0,244.0,> 120 mg/dl,normal,154.0,yes,1.4,flat,0.0,normal,LAD


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271 entries, 155 to 174
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             271 non-null    float64
 1   sex                             271 non-null    object 
 2   chest pain type                 271 non-null    object 
 3   resting blood pressure          271 non-null    float64
 4   serum cholestoral               271 non-null    float64
 5   fasting blood sugar             271 non-null    object 
 6   resting electrocardiographic    271 non-null    object 
 7   maximum heart rate              271 non-null    float64
 8   exercise induced angina         271 non-null    object 
 9   ST depression                   271 non-null    float64
 10  slope peak exercise ST segment  271 non-null    object 
 11  number of major vessels         267 non-null    object 
 12  thallium stress result          27

# Add columns

## Slice age column

In [None]:
min_val = df_train["age"].min()
max_val = df_train["age"].max()
df_train["age_encoded"] = pd.cut(
    df_train["age"],
    bins=[min_val, 41, 55, max_val],
    labels=["Adults", "Middle-Aged", "Senior"],
)
df_train["age_encoded"] = df_train["age_encoded"].astype("object")

# Save processed data


In [None]:
df_train.to_pickle(PROC_TRAIN_DATA_PATH)