In [123]:
TRAIN_DATA_PATH = "../../data/raw/heart_statlog_cleveland_hungary_final - Copy.csv"
PROC_TRAIN_DATA_PATH = "../../data/interim/1__analytics_preprocessed_df.pkl"


# Import packages


In [124]:
import pandas as pd
import numpy as np


In [125]:
# Change some pandas display options
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.width", 1000)
pd.set_option("styler.format.precision", 10)


In [126]:
df_train = pd.read_csv(TRAIN_DATA_PATH)


# Functions


In [127]:

# Get the values count for each feature in the DataFrame
def get_values_count(data: pd.DataFrame, cols: list) -> None:
    """
    Calculate the values count for each feature in the DataFrame
    
    Parameters
    ----------
    data : DataFrame
    cols : list of features
    
    Returns
    -------
    values_count : DataFrame contains values count for each feature
    """
    for col in cols:
        print(f"{col} :{data[col].value_counts(dropna=False).to_dict()}")


def get_unique_values(data: pd.DataFrame):
    """
    Get unique values in each feature
    
    Parameters
    ----------
    data : DataFrame
    
    Returns
    -------
    unique_values : DataFrame contains unique values for each feature
    """

    uniques = pd.DataFrame(data={"feature": [], "uniques": []})

    for col in data.columns:
        unique_values = data[col].unique()
        if len(unique_values) > 1000:
            continue

        uniques.loc[len(uniques)] = [col, unique_values]

    uniques.index = uniques["feature"]
    uniques.drop(columns="feature", inplace=True)
    return uniques


def get_strange_values(data: pd.DataFrame) -> pd.DataFrame:
    """
    Get strange values in each feature
    
    Parameters
    ----------
    data : DataFrame
    
    Returns
    -------
    unique_values : DataFrame contains unique values for each feature
    """

    data_copy = data.copy()
    uniques = pd.DataFrame(data={"feature": [], "indices-values": []})

    for col in data_copy.columns:
        # Change the column type to be string
        data_copy[col] = data_copy[col].astype(str)

        # The pattern for checking the presence of strange values
        pattern = r"([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)|[a-zA-Z]+)"

        # Get mask matrix that refers to strange values ([~]: for not contain)
        strange_values_mask = ~data_copy[col].str.contains(
            pat=pattern, na=True, regex=True, case=False
        )

        # Get index of the strange values
        strange_values_ind = data_copy[strange_values_mask].index.to_list()

        if not strange_values_ind:
            continue

        # dictionary = {
        #     col : {ind:list(data_copy.loc[ind,col]) for ind in strange_values_ind}
        # }

        # reform = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.items() for innerKey, values in innerDict.items()}
        # reform = pd.DataFrame.from_dict(reform, orient='index').transpose()

        uniques.loc[len(uniques)] = [
            col,
            [(ind, data_copy.loc[ind, col]) for ind in strange_values_ind],
        ]
        
        print(uniques)

    uniques.set_index(keys=["feature"], inplace=True)

    if not uniques.empty:
        return uniques

    return "No strange values found!"



In [128]:
get_strange_values(df_train)


  strange_values_mask = ~data_copy[col].str.contains(


'No strange values found!'

In [129]:
# Replace all '?' values with NaN
df_train.replace('?', np.nan, inplace=True)


## Missing Values

Since **number of major vessels** & **thallium stress result** are categorical features, the missing values could be replaced by most mode value.


In [130]:
# Remove null values if exist
print(f"--Missing values count--\n{df_train.isnull().sum().sort_values()}")


--Missing values count--
age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64


In [131]:
# # Replace null values from training set
# mode_imputer = SimpleImputer(strategy="most_frequent")
# train_data[
#     ["thallium stress result", "number of major vessels"]
# ] = mode_imputer.fit_transform(
#     train_data[["thallium stress result", "number of major vessels"]]
# )
# test_data[
#     ["thallium stress result", "number of major vessels"]
# ] = mode_imputer.transform(
#     test_data[["thallium stress result", "number of major vessels"]]
# )


## Duplicates


In [132]:
def remove_duplicates(data: pd.DataFrame):
    data_copy = data.copy()
    """ Remove duplicates values if exist"""
    print(f"Duplicates count before droping:{data_copy.duplicated().sum()}")
    data_copy.drop_duplicates(inplace=True)
    print(f"Duplicates count after droping:{data_copy.duplicated().sum()}")
    print(f"Data dimension{data.shape}")


remove_duplicates(df_train)


Duplicates count before droping:272
Duplicates count after droping:0
Data dimension(1190, 12)


## Balancing

Check if the training data is well balanced because one of the major issues when dealing with unbalanced datasets relates to the metrics used to evaluate a model. Using simpler metrics like accuracy_score can be misleading. In a dataset with highly unbalanced classes, if the classifier always "predicts" the most common class without performing any analysis of the features, it will still have a high accuracy rate, obviously illusory.

Depending on the obtained result, the data is well balanced and no need to resample it.


In [133]:
# def check_balancing(data, target_name):
#     """
#     Check if the target's classes are balanced between each other
#     """
#     # return data[target_name].value_counts(normalize=normalize)

#     # Target Class count
#     plt.figure(figsize=(8, 8))
#     plt.pie(
#         data[target_name].value_counts(),
#         labels=["no disease", "LAD", "LCX", "RCA", "highest"],
#         autopct="%1.2f%%",
#         explode=[0, 0.2, 0.2, 0.2, 0.2],
#         shadow=True,
#     )

#     my_circle = plt.Circle((0, 0), 0.4, color="white")
#     p = plt.gcf()
#     p.gca().add_artist(my_circle)
#     plt.title("Target Class Count")


# check_balancing(train_data, "target")



# Change the cols name

In [134]:
df_train.columns

Index(['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target'], dtype='object')

In [135]:
# new_cols_names = [
#     "sex",
#     "chest pain type",
#     "fasting blood sugar",
#     "resting electrocardiographic",
#     "exercise induced angina",
#     "slope peak exercise ST segment",
#     "number of major vessels",
#     "thallium stress result",
# ]


df_train.rename(
        columns={
            'resting bp s'  :   'resting blood pressure',
            'cholesterol'      :   'serum cholestoral',
            'resting ecg'   :   'resting electrocardiographic',
            'max heart rate'   :   'maximum heart rate',
            'exercise angina'     :   'exercise induced angina',
            'oldpeak'   :   'ST depression',
            'ST slope'     :   'slope peak exercise ST segment',
        },
        inplace = True
    )


# Numbers to String

Changing categorical features that contain numbers to be in string format

| Attribute                          | Updated Feature Values                                                   |
| :--------------------------------- | :----------------------------------------------------------------------- |
| **sex**                            | 0:female<br>1:male                                                       |
| **chest pain type**                | 1:typical angina<br>2:atypical angina<br>3:non-anginal<br>4:asymptomatic |
| **fasting blood sugar**            | 0:> 120 mg/dl<br>1:< 120 mg/dl                                           |
| **resting electrocardiographic**   | 0:normal<br>1:ST-T wave abnormality<br>2:ventricular hypertrophy         |
| **exercise induced angina**        | 0:no<br>1:yes                                                            |
| **slope peak exercise ST segment** | 1:upsloping<br>2:flat<br>3:downsloping                                   |
| **thallium stress result**         | 3:normal<br>6:fixed defect<br>7:reversible defect                        |
| **target**                         | 0:no disease<br>1:LAD<br>2:LCX<br>3:RCA<br>4:highest                     |


In [136]:
df_train.head()

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic,maximum heart rate,exercise induced angina,ST depression,slope peak exercise ST segment,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [137]:
# Convert all columns to numeric
df_train = df_train.apply(pd.to_numeric)

df_train["sex"] = df_train["sex"].map({0: "female", 1: "male"})
df_train["chest pain type"] = df_train["chest pain type"].map(
    {1: "typical angina", 2: "atypical angina", 3: "non-anginal", 4: "asymptomatic"}
)
df_train["fasting blood sugar"] = df_train["fasting blood sugar"].map(
    {0: "> 120 mg/dl", 1: "< 120 mg/dl"}
)
df_train["resting electrocardiographic"] = df_train[
    "resting electrocardiographic"
].map({0: "normal", 1: "ST wave abnormality", 2: "ventricular hypertrophy"})

df_train["exercise induced angina"] = df_train[
    "exercise induced angina"
].map({0: "no", 1: "yes"})

df_train["slope peak exercise ST segment"] = df_train[
    "slope peak exercise ST segment"
].map({1: "upsloping", 2: "flat", 3: "downsloping"})

# df_train["thallium stress result"] = df_train[
#     "thallium stress result"
# ].map({3: "normal", 6: "fixed defect", 7: "reversible defect"})

df_train["target"] = df_train["target"].map(
    {0: "no disease", 1: "LAD", 2: "LCX", 3: "RCA", 4: "highest"}
)

df_train.head(5)


Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic,maximum heart rate,exercise induced angina,ST depression,slope peak exercise ST segment,target
0,40,male,atypical angina,140,289,> 120 mg/dl,normal,172,no,0.0,upsloping,no disease
1,49,female,non-anginal,160,180,> 120 mg/dl,normal,156,no,1.0,flat,LAD
2,37,male,atypical angina,130,283,> 120 mg/dl,ST wave abnormality,98,no,0.0,upsloping,no disease
3,48,female,asymptomatic,138,214,> 120 mg/dl,normal,108,yes,1.5,flat,LAD
4,54,male,non-anginal,150,195,> 120 mg/dl,normal,122,no,0.0,upsloping,no disease


# Save processed data


In [138]:
df_train.to_pickle(PROC_TRAIN_DATA_PATH)