In [1]:
# Input: raw training CSV, loaded with pd.read_csv further down
TRAIN_DATA_PATH = "../data/raw/train.csv"
# Output: destination handed to DataFrame.to_pickle in the final cell
PROC_TRAIN_DATA_PATH = "../data/interim/1__analytics_preprocessed_df.pkl"

# Import packages

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Raise the pandas display limits used when frames are rendered in this notebook
# NOTE(review): pandas documents None as the "no limit" value for display.max_colwidth — confirm 0 is intended
_display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.max_colwidth': 0,
    'display.width': 1000,
    "styler.format.precision": 10,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
raw_df_train = pd.read_csv(TRAIN_DATA_PATH)

# The cells below operate on `train_data`; derive it from the raw frame so the
# notebook runs on a fresh kernel and the raw data stays untouched.
train_data = raw_df_train.copy()

# Functions

In [None]:
# Feature names, presumably the categorical columns ("c" for categorical — TODO confirm).
# NOTE(review): not referenced by any other visible cell.
c_cols = ['sex', 'chest pain type', 'fasting blood sugar', 'resting electrocardiographic', 'exercise induced angina'
    , 'slope peak exercise ST segment', 'number of major vessels','thallium stress result']

# Get the values count for each feature in the DataFrame
def get_values_count(data:pd.DataFrame, cols:list) -> None:
    """
    Print the value counts of the selected features, one line per feature

    Missing values are counted as their own category (dropna=False).

    Parameters
    ----------
    data : DataFrame
    cols : list of features to summarise

    Returns
    -------
    None : the counts are printed as "<feature> :<dict of value -> count>"
    """
    for feature in cols:
        counts = data[feature].value_counts(dropna=False).to_dict()
        print(f"{feature} :{counts}")

def get_unique_values(data:pd.DataFrame, max_unique:int=1000) -> pd.DataFrame:
    """
    Get unique values in each feature

    Parameters
    ----------
    data : DataFrame
    max_unique : features with more than this many distinct values are left
        out of the result (default 1000)

    Returns
    -------
    unique_values : DataFrame indexed by 'feature' with a single 'uniques'
        column holding the array of distinct values of that feature
    """
    features = []
    values = []

    for col in data.columns:
        unique_values = data[col].unique()
        # Skip high-cardinality features: listing them would flood the output
        if len(unique_values) > max_unique:
            continue

        features.append(col)
        values.append(unique_values)

    # Build the result in one go instead of appending row by row;
    # dtype=object keeps every array as a single cell
    return pd.Series(
        values,
        index=pd.Index(features, name='feature'),
        name='uniques',
        dtype=object,
    ).to_frame()

def get_strange_values(data:pd.DataFrame) -> "pd.DataFrame | str":
    """
    Get strange values in each feature

    A value is "strange" when its string form contains neither a number nor
    a letter (for example a '?' placeholder). The input frame is not modified.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    strange_values : DataFrame indexed by 'feature' with one 'indices-values'
        column holding a list of (row index, value as string) tuples, or the
        string "No strange values found!" when nothing is flagged
    """
    # Same pattern as before but with non-capturing groups, so pandas does not
    # emit its "pattern has match groups" warning for every column
    pattern = r"(?:[+-]?(?:[0-9]+(?:[.][0-9]*)?|[.][0-9]+)|[a-zA-Z]+)"

    features = []
    findings = []

    for col in data.columns:
        # Compare on the string form so the regex applies to every dtype
        as_text = data[col].astype(str)

        # True where the value contains no number and no letter ([~]: for not contain)
        is_strange = ~as_text.str.contains(pat=pattern, na=True, regex=True, case=False)

        if not is_strange.any():
            continue

        features.append(col)
        findings.append(list(as_text[is_strange].items()))

    if not features:
        return "No strange values found!"

    # dtype=object keeps each list of tuples as a single cell
    return pd.Series(
        findings,
        index=pd.Index(features, name='feature'),
        name='indices-values',
        dtype=object,
    ).to_frame()
    
# Show every training cell that contains neither a number nor a letter (e.g. a '?' placeholder)
get_strange_values(train_data)


In [None]:
# Turn every '?' placeholder into NaN so pandas treats it as a missing value
# NOTE(review): `test_data` is not created in any visible cell — confirm where it is loaded
train_data.replace(to_replace='?', value=np.nan, inplace=True)
test_data.replace(to_replace='?', value=np.nan, inplace=True)

## Missing Values
Since **number of major vessels** & **thallium stress result** are categorical features, their missing values can be imputed with the mode (the most frequent value).

In [None]:
# Report the number of missing values per column, sorted in ascending order (nothing is removed here)
print(f"--Missing values count--\n{train_data.isnull().sum().sort_values()}")

In [None]:
# Impute the two categorical features with their most frequent value.
# The imputer is fitted on the training set only and then applied unchanged to the test set.
# NOTE(review): `SimpleImputer` (presumably sklearn.impute.SimpleImputer) is not imported in the
# visible import cell, and `test_data` is not created in any visible cell — confirm both.
mode_imputed_cols = ['thallium stress result','number of major vessels']
mode_imputer = SimpleImputer(strategy='most_frequent')
train_data[mode_imputed_cols] = mode_imputer.fit_transform(train_data[mode_imputed_cols])
test_data[mode_imputed_cols] = mode_imputer.transform(test_data[mode_imputed_cols])


## Duplicates

In [None]:
def remove_duplicates(data: pd.DataFrame) -> pd.DataFrame:
    """
    Remove duplicated rows if they exist

    Prints the number of duplicated rows before and after dropping, and the
    shape of the de-duplicated frame. The input frame is left untouched.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    deduplicated : copy of `data` without its duplicated rows
    """
    print(f"Duplicates count before droping:{data.duplicated().sum()}")
    deduplicated = data.drop_duplicates()
    print(f"Duplicates count after droping:{deduplicated.duplicated().sum()}")
    # Report the shape of the result, not of the untouched input
    print(f"Data dimension{deduplicated.shape}")
    return deduplicated

# Keep the de-duplicated frame; without the assignment the call has no effect on train_data
train_data = remove_duplicates(train_data)

## Balancing
Check whether the training data is well balanced. One of the major issues with unbalanced datasets is the choice of evaluation metric: simple metrics such as `accuracy_score` can be misleading. In a dataset with highly unbalanced classes, a classifier that always "predicts" the most common class, without analysing the features at all, still achieves a high accuracy, which is obviously illusory.

Based on the result obtained, the data is well balanced and there is no need to resample it.

In [None]:
def check_balancing(data, target_name):
    """
    Check if the target's classes are balanced between each other

    Draws a donut chart with the share of every class found in
    `data[target_name]`. Uses `plt` (matplotlib.pyplot) from the notebook
    namespace.
    NOTE(review): `plt` is not imported in the visible import cell — confirm.

    Parameters
    ----------
    data : DataFrame
    target_name : name of the target column

    Returns
    -------
    None : the chart is drawn on a new 8x8 figure
    """
    class_labels = {0: 'no disease', 1: 'LAD', 2: 'LCX', 3: 'RCA', 4: 'highest'}

    # value_counts() orders by frequency; sort by class code so that every
    # wedge is paired with the label of its own class
    class_counts = data[target_name].value_counts().sort_index()
    labels = [class_labels.get(code, str(code)) for code in class_counts.index]

    # First wedge stays in place, the others are pulled out; sized to the
    # classes actually present so a missing class does not break the plot
    explode = [0 if position == 0 else 0.2 for position in range(len(class_counts))]

    #Target Class count
    plt.figure(figsize=(8,8))
    plt.pie(class_counts, labels=labels, autopct='%1.2f%%', explode=explode, shadow=True)

    # White disc in the centre turns the pie into a donut
    my_circle = plt.Circle( (0,0), 0.4, color='white')
    plt.gcf().gca().add_artist(my_circle)
    plt.title('Target Class Count')

# Draw the class distribution of the 'target' column of the training data
check_balancing(train_data, 'target')

## Numbers to String
Convert the numerically encoded categorical features to descriptive string labels.

| Attribute   | Updated Feature Values |
| :-- | :-- |
|**sex** |0:female<br>1:male|
|**chest pain type** | 1:typical angina<br>2:atypical angina<br>3:non-anginal<br>4:asymptomatic|
|**fasting blood sugar** |0:< 120 mg/dl<br>1:> 120 mg/dl|
|**resting electrocardiographic** |0:normal<br>1:ST-T wave abnormality<br>2:ventricular hypertrophy|
|**exercise induced angina** |0:no<br>1:yes|
|**slope peak exercise ST segment** |1:upsloping<br>2:flat<br>3:downsloping|
|**thallium stress result** |3:normal<br>6:fixed defect<br>7:reversible defect|
|**target** |0:no disease<br>1:LAD<br>2:LCX<br>3:RCA<br>4:highest|

In [None]:
# Convert all columns to numeric
train_data = train_data.apply(pd.to_numeric)

# Code -> label lookup for every numerically encoded categorical feature
category_labels = {
    'sex': {0:'female', 1:'male'},
    'chest pain type': {
        1:'typical angina', 2:'atypical angina',
        3:'non-anginal',    4:'asymptomatic'},
    # 1 flags a fasting blood sugar above 120 mg/dl, following the UCI heart-disease `fbs` attribute
    # NOTE(review): confirm that train.csv uses the same encoding
    'fasting blood sugar': {0:'< 120 mg/dl', 1:'> 120 mg/dl'},
    'resting electrocardiographic': {
        0:'normal', 1:'ST wave abnormality', 2:'ventricular hypertrophy'},
    'exercise induced angina': {0:'no', 1:'yes'},
    'slope peak exercise ST segment': {1:'upsloping', 2:'flat', 3:'downsloping'},
    'thallium stress result': {3:'normal', 6:'fixed defect', 7:'reversible defect'},
    'target': {0:'no disease', 1:'LAD', 2:'LCX', 3:'RCA', 4:'highest'},
}

# Work on a copy so the numeric frame stays available;
# codes without an entry in the lookup become NaN after .map
train_w_cat_data = train_data.copy()
for feature, labels in category_labels.items():
    train_w_cat_data[feature] = train_w_cat_data[feature].map(labels)

train_w_cat_data.head(5)


# Save processed data

In [None]:
# NOTE(review): `df_train` is not assigned in any visible cell (the frames built above are
# `train_data` and `train_w_cat_data`) — confirm which frame is meant to be pickled.
df_train.to_pickle(PROC_TRAIN_DATA_PATH)