In [6]:
# Relative path of the raw CSV that is read into `df_train` further down;
# it is resolved against the kernel's current working directory.
TRAIN_DATA_PATH = "../../data/raw/heart_statlog_cleveland_hungary_final - Copy.csv"

# Import packages

In [2]:
import pandas as pd

In [4]:
# Pandas display/formatting options used throughout the notebook,
# applied one by one in the order listed.
for option_name, option_value in (
    ('display.max_rows', 10000),
    ('display.max_columns', 500),
    ('display.max_colwidth', 0),
    ('display.width', 1000),
    ("styler.format.precision", 10),
):
    pd.set_option(option_name, option_value)

# Data Acquisition
The Cleveland data comprises 13 features (numerical and categorical) with a multivariate output.

In [6]:
def combine_data(paths: list, output_path='./data/all_data.csv') -> pd.DataFrame:
    """
    Combine data from csv files

    Every file in ``paths`` is read with ``pd.read_csv`` and the rows are
    stacked in the order the paths are given. The combined table is then
    written to ``output_path`` without its index.

    Parameters
    ----------
    paths : list of paths to csv files
    output_path : csv file the combined data is written to
        (default './data/all_data.csv'); pass None to skip writing

    Returns
    -------
    data : DataFrame contains all data. Each file keeps its own row labels,
        so index values can repeat across files. An empty ``paths`` list
        yields an empty DataFrame.
    """
    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and growing a frame inside a loop copies it on every step.
    frames = [pd.read_csv(path) for path in paths]

    # pd.concat raises on an empty sequence, so fall back to an empty frame.
    data = pd.concat(frames) if frames else pd.DataFrame()

    if output_path is not None:
        data.to_csv(output_path, index=False)
    return data

def change_features_name(data:pd.DataFrame, output=True):
    """
    Changes columns name for the passed dataset

    The abbreviated column names listed in the mapping below are replaced by
    descriptive ones; columns that are not in the mapping keep their name.
    The passed DataFrame itself is not modified.

    Parameters
    ----------
    data : DataFrame whose columns should be renamed
    output : when True (default) also return a table comparing the column
        names before and after renaming

    Returns
    -------
    data_copy : renamed DataFrame, returned alone when ``output`` is False
    data_copy, comp_cols_name : returned as a tuple when ``output`` is True;
        ``comp_cols_name`` has the columns 'before renaming' and
        'after renaming', one row per column of ``data``
    """
    # rename() hands back a new frame, so the caller's object stays untouched
    # without an explicit copy + inplace=True.
    data_copy = data.rename(
        columns={
            'cp'        :   'chest pain type',
            'trestbps'  :   'resting blood pressure',
            'chol'      :   'serum cholestoral',
            'fbs'       :   'fasting blood sugar',
            'restecg'   :   'resting electrocardiographic',
            'thalach'   :   'maximum heart rate',
            'exang'     :   'exercise induced angina',
            'oldpeak'   :   'ST depression',
            'slope'     :   'slope peak exercise ST segment',
            'ca'        :   'number of major vessels',
            'thal'      :   'thallium stress result'
        }
    )

    if not output:
        return data_copy

    # Build the comparison table with pandas only and only when it is requested;
    # the earlier version relied on `np`, which the import cell does not provide.
    comp_cols_name = pd.DataFrame({
        'before renaming': data.columns.to_list(),
        'after renaming': data_copy.columns.to_list(),
    })

    return data_copy, comp_cols_name


def load_data(path:str, split=False) -> "pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]":
    """
    Load data from csv file and rename its columns with change_features_name

    Parameters
    ----------
    path : path to the csv file
    split : when True, the rows are split into a training and a testing part
        (test_size=0.3, random_state=33) and both shapes are printed

    Returns
    -------
    data : DataFrame contains the whole file, returned when ``split`` is False
    train_data : DataFrame contains training set
    test_data : DataFrame contains testing set
        (``train_data, test_data`` are returned as a tuple when ``split`` is True)
    """
    data = pd.read_csv(path)

    if not split:
        return change_features_name(data, output=False)

    # NOTE(review): train_test_split is not imported in the visible cells,
    # presumably sklearn.model_selection.train_test_split; confirm it is in
    # scope before calling with split=True.
    train_data, test_data = train_test_split(data, random_state=33, test_size=0.3)
    print ('Train set shape:  ', train_data.shape)
    print ('Test set shape:   ', test_data.shape)

    # Only the renamed frames are needed here, so skip the comparison tables.
    train_data = change_features_name(train_data, output=False)
    test_data = change_features_name(test_data, output=False)

    return train_data, test_data


# paths = [
#     './data/cleveland.csv',
#     './data/hungarian.csv',
#     './data/switzerland.csv',
#     './data/VA Long Beach.csv',
# ]
# call the combine_data function, then load the data
# data = combine_data(paths)
# train_data, test_data = load_data('./data/cleveland.csv', split=True)

    

In [None]:
# all_data = load_data('./data/all_data.csv').iloc[:,:5]
# cleveland_df = load_data('./data/cleveland.csv').iloc[:,:5]
# hungarian_df = load_data('./data/hungarian.csv').iloc[:,:5]
# switzerland_df = load_data('./data/switzerland.csv').iloc[:,:5]
# VA_df = load_data('./data/VA Long Beach.csv').iloc[:,:5]

In [None]:
# cols = ['age', 'sex', 'chest pain type', 'resting blood pressure', 'serum cholestoral']
# hungarian_df.merge(cleveland_df, how='inner', left_index=True, right_on=cols, indicator=False)


In [7]:
# Read the raw CSV located at TRAIN_DATA_PATH into the `df_train` DataFrame
df_train = pd.read_csv(TRAIN_DATA_PATH)

In [11]:
# Display raw train data shape
# NOTE: shape[1] counts every column, so the "features" figure also includes
# the `target` column.
print(f"Num of rows: {df_train.shape[0]}")
print(f"Num of features: {df_train.shape[1]}")

Num of rows: 1190
Num of features: 12


In [12]:
# Display the first rows of the raw train data (head() shows 5 rows by default)
df_train.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [14]:
# Display a random sample: one row, shown as a Series.
# No random_state is passed, so a different row can appear on every run.
df_train.sample(1).iloc[0]

age                    45.0 
sex                    0.0  
chest pain type        2.0  
resting bp s           112.0
cholesterol            160.0
fasting blood sugar    0.0  
resting ecg            0.0  
max heart rate         138.0
exercise angina        0.0  
oldpeak                0.0  
ST slope               2.0  
target                 0.0  
Name: 1056, dtype: float64

In [15]:
# Summarise the columns: non-null counts, dtypes and memory usage
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
