In [1]:
# Input/output locations (relative paths): the raw CSV that is read, and the
# pickle files the train/test splits are written to.
RAW_DATA_PATH = "../../data/raw/cleveland.csv"
TRAIN_DATA_PATH = "../../data/interim/train_data.pkl"
TEST_DATA_PATH = "../../data/interim/test_data.pkl"

# Import packages


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Raise pandas' display limits (rows, columns, width) and the Styler precision
display_settings = {
    "display.max_rows": 10000,
    "display.max_columns": 500,
    "display.max_colwidth": 0,
    "display.width": 1000,
    "styler.format.precision": 10,
}
for option_key, option_value in display_settings.items():
    pd.set_option(option_key, option_value)

# Functions


In [4]:
def rename_cols(data: pd.DataFrame, output=False):
    """Replace the abbreviated column names of ``data`` with descriptive ones.

    The frame is modified in place. Columns without an entry in the mapping
    keep their current name.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are renamed in place.
    output : bool, default False
        When truthy, return a summary of the renaming.

    Returns
    -------
    pd.DataFrame or None
        With ``output`` set, a two-column frame ("before renaming",
        "after renaming") listing every column name before and after the
        call; otherwise ``None``.
    """
    descriptive_names = {
        "cp": "chest_pain_type",
        "trestbps": "resting_blood_pressure",
        "chol": "serum_cholestoral",
        "fbs": "fasting_blood_sugar",
        "restecg": "resting_electrocardiographic",
        "thalach": "maximum_heart_rate",
        "exang": "exercise_induced_angina",
        "oldpeak": "ST_depression",
        "slope": "slope_peak_exercise_ST_segment",
        "ca": "number_of_major_vessels",
        "thal": "thallium_stress_result",
    }

    # Snapshot the names first: the rename below mutates ``data`` itself
    names_before = data.columns.tolist()
    data.rename(columns=descriptive_names, inplace=True)

    if not output:
        return None

    names_after = data.columns.tolist()
    return pd.DataFrame(
        {"before renaming": names_before, "after renaming": names_after}
    )

# Read data


In [5]:
# Read the raw CSV into a DataFrame
# NOTE(review): the sample row displayed later shows "?" in the `ca` column with
# dtype object — missing values appear to be encoded as "?" and loaded as strings.
df_raw = pd.read_csv(RAW_DATA_PATH)

In [6]:
# Display the raw data shape (rows x columns)
row_count, feature_count = df_raw.shape
print(f"Num of rows: {row_count}")
print(f"Num of features: {feature_count}")

Num of rows: 302
Num of features: 14


In [7]:
# Display the first rows of the raw data (not yet split into train/test)
df_raw.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


In [8]:
# Display one randomly chosen row as a Series; seeding the draw keeps the
# displayed row the same on every "Restart & Run All"
df_raw.sample(n=1, random_state=44).iloc[0]

age         43.0 
sex         1.0  
cp          4.0  
trestbps    132.0
chol        247.0
fbs         1.0  
restecg     2.0  
thalach     143.0
exang       1.0  
oldpeak     0.1  
slope       2.0  
ca          ?    
thal        7.0  
target      1    
Name: 191, dtype: object

# Renamed columns


In [9]:
# Rename the columns of df_raw in place and display the before/after name pairs
rename_cols(df_raw, output=True)

Unnamed: 0,before renaming,after renaming
0,age,age
1,sex,sex
2,cp,chest_pain_type
3,trestbps,resting_blood_pressure
4,chol,serum_cholestoral
5,fbs,fasting_blood_sugar
6,restecg,resting_electrocardiographic
7,thalach,maximum_heart_rate
8,exang,exercise_induced_angina
9,oldpeak,ST_depression


# Splitting data

The raw data is split with stratification on the target column, so that the train and test sets keep similar class proportions.


In [10]:
# Hold out 10% of the rows as the test set, stratifying on the target column;
# the fixed random_state makes the split repeatable
stratify_labels = df_raw["target"]
df_train, df_test = train_test_split(
    df_raw,
    test_size=0.1,
    random_state=44,
    stratify=stratify_labels,
)

In [11]:
# Share of each target class in the training split
df_train["target"].value_counts(normalize=True)

0    0.538745
1    0.180812
3    0.118081
2    0.118081
4    0.044280
Name: target, dtype: float64

In [12]:
# Share of each target class in the test split
df_test["target"].value_counts(normalize=True)

0    0.548387
1    0.193548
2    0.129032
3    0.096774
4    0.032258
Name: target, dtype: float64

In [13]:
# Persist each split as a pickle at its configured path
for split_frame, split_path in (
    (df_train, TRAIN_DATA_PATH),
    (df_test, TEST_DATA_PATH),
):
    split_frame.to_pickle(split_path)