In [2]:
# Path of the raw training CSV; it is relative, so it resolves against the
# notebook's current working directory.
TRAIN_DATA_PATH = "../data/raw/train.csv"

# Import packages

In [3]:
import pandas as pd

In [11]:
# Apply the notebook-wide pandas display settings in one pass
for option_name, option_value in (
    ('display.max_rows', 10000),
    ('display.max_columns', 500),
    ('display.max_colwidth', 0),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
# pd.set_option("precision", 10)

# Data Acquisition
The Cleveland dataset comprises 13 features (numerical and categorical) and a multi-class output.

In [6]:
def combine_data(paths: list, output_path: str = './data/all_data.csv') -> pd.DataFrame:
    """
    Combine data from csv files

    Reads every csv file in ``paths``, stacks their rows in the order the
    paths are given, writes the combined table to ``output_path`` (without
    the index column) and returns it.

    Parameters
    ----------
    paths : list of paths to csv files
    output_path : csv file the combined data is written to; pass None to
        skip writing. Defaults to './data/all_data.csv'.

    Returns
    -------
    data : DataFrame contains all data. Row labels are kept from the
        individual files (they are not renumbered), so the index can
        contain duplicates. Empty when ``paths`` is empty.
    """
    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and growing a frame inside a loop copies it on every step.
    frames = [pd.read_csv(path) for path in paths]

    # pd.concat raises on an empty list, so fall back to an empty frame.
    data = pd.concat(frames) if frames else pd.DataFrame()

    if output_path is not None:
        data.to_csv(output_path, index=False)
    return data

def change_features_name(data:pd.DataFrame, output=True):
    """
    Changes columns name for the passed dataset

    The abbreviated column names listed below are replaced with descriptive
    ones; columns that are not listed keep their name. The passed DataFrame
    is not modified - a renamed copy is returned.

    Parameters
    ----------
    data : DataFrame whose columns should be renamed
    output : when True (default) also return a table comparing the column
        names before and after renaming

    Returns
    -------
    DataFrame
        The renamed copy, when ``output`` is False.
    (DataFrame, DataFrame)
        The renamed copy and a two-column table ('before renaming',
        'after renaming') with one row per column, when ``output`` is True.
    """
    descriptive_names = {
        'cp'        :   'chest pain type',
        'trestbps'  :   'resting blood pressure',
        'chol'      :   'serum cholestoral',
        'fbs'       :   'fasting blood sugar',
        'restecg'   :   'resting electrocardiographic',
        'thalach'   :   'maximum heart rate',
        'exang'     :   'exercise induced angina',
        'oldpeak'   :   'ST depression',
        'slope'     :   'slope peak exercise ST segment',
        'ca'        :   'number of major vessels',
        'thal'      :   'thallium stress result'
    }

    # rename() returns a new frame, so the caller's DataFrame stays untouched
    renamed = data.rename(columns=descriptive_names)

    if not output:
        return renamed

    # Built with pandas only (no numpy needed) and only when it is requested
    comp_cols_name = pd.DataFrame({
        'before renaming': data.columns.to_list(),
        'after renaming': renamed.columns.to_list(),
    })

    return renamed, comp_cols_name


def load_data(path:str, split=False, test_size=0.3, random_state=33) -> "pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]":
    """
    Load data from csv file and give its columns descriptive names

    Parameters
    ----------
    path : path to the csv file
    split : when True, split the rows into a training and a testing set
    test_size : share of the rows put in the testing set (only used when
        ``split`` is True)
    random_state : seed of the split (only used when ``split`` is True)

    Returns
    -------
    data : DataFrame contains the whole renamed dataset, when ``split`` is False
    train_data : DataFrame contains training set  (when ``split`` is True)
    test_data : DataFrame contains testing set    (when ``split`` is True)
    """
    data = pd.read_csv(path)

    if not split:
        return change_features_name(data, output=False)

    # NOTE(review): train_test_split is not imported in the cells shown here
    # (presumably sklearn.model_selection) - confirm it is in scope before
    # calling with split=True.
    train_data, test_data = train_test_split(data, random_state=random_state, test_size=test_size)
    print ('Train set shape:  ', train_data.shape)
    print ('Test set shape:   ', test_data.shape)

    # Only the renamed frames are needed, so skip the before/after name table
    train_data = change_features_name(train_data, output=False)
    test_data = change_features_name(test_data, output=False)

    return train_data, test_data


# paths = [
#     './data/cleveland.csv',
#     './data/hungarian.csv',
#     './data/switzerland.csv',
#     './data/VA Long Beach.csv',
# ]
# Run combine_data to build ./data/all_data.csv, then load the data
# data = combine_data(paths)
# train_data, test_data = load_data('./data/cleveland.csv', split=True)

    

In [None]:
# Load each dataset and keep only its first five columns
all_data, cleveland_df, hungarian_df, switzerland_df, VA_df = (
    load_data(csv_path).iloc[:, :5]
    for csv_path in (
        './data/all_data.csv',
        './data/cleveland.csv',
        './data/hungarian.csv',
        './data/switzerland.csv',
        './data/VA Long Beach.csv',
    )
)

In [None]:
# Columns both frames are matched on (names as produced by load_data's renaming)
cols = ['age', 'sex', 'chest pain type', 'resting blood pressure', 'serum cholestoral']
# Inner join on the shared columns: keeps the rows whose values in `cols`
# appear in both frames. Matching the left frame's index against five
# right-hand columns (left_index=True, right_on=cols) raises a ValueError,
# because the index has a single level.
hungarian_df.merge(cleveland_df, how='inner', on=cols)


In [None]:
# Read the raw training data from TRAIN_DATA_PATH into a DataFrame
df_train = pd.read_csv(TRAIN_DATA_PATH)

In [None]:
# Display raw train data shape as (number of rows, number of columns)
df_train.shape

In [None]:
# Display the first rows of the raw train data (head() shows 5 rows by default)
df_train.head()

In [None]:
# Display a random sample: one row, shown as a Series.
# Seeded so that re-running the notebook shows the same row every time.
df_train.sample(1, random_state=33).iloc[0]

In [None]:
# Summarise the raw train data: column dtypes, non-null counts and memory usage
df_train.info()