In [39]:
TRAIN_DATA_PATH = "../../data/interim/cleveland_train.pkl"
PROC_TRAIN_DATA_PATH = "../../data/interim/1__analytics_preprocessed_df.pkl"


# Import packages


In [40]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')


In [41]:
# Change some pandas display options
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.width", 1000)
pd.set_option("styler.format.precision", 10)


In [42]:
df_train = pd.read_pickle(TRAIN_DATA_PATH)


# Functions


In [43]:

def renamed_cols(data:pd.DataFrame, output=True):    
    """Changes columns name for the passed dataset
    """

    # Obtain features before modifying names
    old_columns_names = data.columns.to_list()
    
   
    data.rename(
        columns={
            'cp'        :   'chest pain type',
            'trestbps'  :   'resting blood pressure',
            'chol'      :   'serum cholestoral',
            'fbs'       :   'fasting blood sugar',
            'restecg'   :   'resting electrocardiographic',
            'thalach'   :   'maximum heart rate',
            'exang'     :   'exercise induced angina',
            'oldpeak'   :   'ST depression',
            'slope'     :   'slope peak exercise ST segment',
            'ca'        :   'number of major vessels',
            'thal'      :   'thallium stress result'
        },
        inplace = True
    )
    
    
    # data.rename(
    #     columns={
    #         'resting bp s'  :   'resting blood pressure',
    #         'cholesterol'      :   'serum cholestoral',
    #         'resting ecg'   :   'resting electrocardiographic',
    #         'max heart rate'   :   'maximum heart rate',
    #         'exercise angina'     :   'exercise induced angina',
    #         'oldpeak'   :   'ST depression',
    #         'ST slope'     :   'slope peak exercise ST segment',
    #     },
    #     inplace = True
    # )
    
    if(output):
        new_columns_names = data.columns.to_list()
    
        # before_after_pairs = pd.DataFrame([old_columns_names, new_columns_names], columns=['before renaming', 'after renaming'])
        before_after_pairs = pd.DataFrame({'before renaming': old_columns_names, 'after renaming': new_columns_names})

        print(before_after_pairs)

    return None


def remove_duplicates(data: pd.DataFrame):
    """ Remove duplicates values if exist"""
    print(f"Duplicates count before droping:{data.duplicated().sum()}")
    data.drop_duplicates(inplace=True)
    print(f"Duplicates count after droping:{data.duplicated().sum()}")
    print(f"Data dimension{data.shape}")


# Substitute strange values

In [44]:
# Substitute '?' with np.NAN
df_train.replace('?', np.nan, inplace=True)

# Duplicates


In [45]:
remove_duplicates(df_train)

Duplicates count before droping:0
Duplicates count after droping:0
Data dimension(226, 14)


# Change columns name

In [46]:
renamed_cols(df_train)

   before renaming                  after renaming
0   age             age                           
1   sex             sex                           
2   cp              chest pain type               
3   trestbps        resting blood pressure        
4   chol            serum cholestoral             
5   fbs             fasting blood sugar           
6   restecg         resting electrocardiographic  
7   thalach         maximum heart rate            
8   exang           exercise induced angina       
9   oldpeak         ST depression                 
10  slope           slope peak exercise ST segment
11  ca              number of major vessels       
12  thal            thallium stress result        
13  target          target                        


# Numbers to String

Changing categorical features that contain numbers to be in string format

| Attribute                          | Updated Feature Values                                                   |
| :--------------------------------- | :----------------------------------------------------------------------- |
| **sex**                            | 0:female<br>1:male                                                       |
| **chest pain type**                | 1:typical angina<br>2:atypical angina<br>3:non-anginal<br>4:asymptomatic |
| **fasting blood sugar**            | 0:> 120 mg/dl<br>1:< 120 mg/dl                                           |
| **resting electrocardiographic**   | 0:normal<br>1:ST-T wave abnormality<br>2:ventricular hypertrophy         |
| **exercise induced angina**        | 0:no<br>1:yes                                                            |
| **slope peak exercise ST segment** | 1:upsloping<br>2:flat<br>3:downsloping                                   |
| **thallium stress result**         | 3:normal<br>6:fixed defect<br>7:reversible defect                        |
| **target**                         | 0:no disease<br>1:LAD<br>2:LCX<br>3:RCA<br>4:highest                     |


In [47]:
REPLACEMENTS = {
    "sex" : {0: "female", 1: "male"},
    "chest pain type" : {1: "typical angina", 2: "atypical angina", 3: "non-anginal", 4: "asymptomatic"},
    "fasting blood sugar" : {0: "> 120 mg/dl", 1: "< 120 mg/dl"},
    "resting electrocardiographic":{0: "normal", 1: "ST wave abnormality", 2: "ventricular hypertrophy"},
    "exercise induced angina" : {0: "no", 1: "yes"},
    "slope peak exercise ST segment" : {1: "upsloping", 2: "flat", 3: "downsloping"},
    "thallium stress result" : {3: "normal", 6: "fixed defect", 7: "reversible defect"},
    "target" : {0: "no disease", 1: "LAD", 2: "LCX", 3: "RCA", 4: "highest"}
    
}

In [48]:
# Convert all columns to numeric
df_train = df_train.apply(pd.to_numeric)

for col, replacement in REPLACEMENTS.items():
    df_train[col] = df_train[col].map(replacement).astype(str)

In [49]:

df_train.head(5)


Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic,maximum heart rate,exercise induced angina,ST depression,slope peak exercise ST segment,number of major vessels,thallium stress result,target
156,58.0,male,asymptomatic,125.0,300.0,> 120 mg/dl,ventricular hypertrophy,171.0,no,0.0,upsloping,2.0,reversible defect,LAD
118,65.0,male,asymptomatic,135.0,254.0,> 120 mg/dl,ventricular hypertrophy,127.0,no,2.8,flat,1.0,reversible defect,LCX
277,57.0,male,atypical angina,154.0,232.0,> 120 mg/dl,ventricular hypertrophy,164.0,no,0.0,upsloping,1.0,normal,LAD
142,64.0,male,non-anginal,125.0,309.0,> 120 mg/dl,normal,131.0,yes,1.8,flat,0.0,reversible defect,LAD
297,45.0,male,typical angina,110.0,264.0,> 120 mg/dl,normal,132.0,no,1.2,flat,0.0,reversible defect,LAD


In [51]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226 entries, 156 to 256
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             226 non-null    float64
 1   sex                             226 non-null    object 
 2   chest pain type                 226 non-null    object 
 3   resting blood pressure          226 non-null    float64
 4   serum cholestoral               226 non-null    float64
 5   fasting blood sugar             226 non-null    object 
 6   resting electrocardiographic    226 non-null    object 
 7   maximum heart rate              226 non-null    float64
 8   exercise induced angina         226 non-null    object 
 9   ST depression                   226 non-null    float64
 10  slope peak exercise ST segment  226 non-null    object 
 11  number of major vessels         222 non-null    float64
 12  thallium stress result          22

# Save processed data


In [50]:
df_train.to_pickle(PROC_TRAIN_DATA_PATH)