In [13]:
# Location of the pickled training set; the path is relative, so it is resolved
# against the working directory the notebook is run from.
TRAIN_DATA_PATH = "../../data/interim/train_data.pkl"

# Import packages

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas deprecation notices — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [15]:
# Adjust pandas' display limits (rows, columns, column width, line width)
# and the Styler float precision in one place
_DISPLAY_OPTIONS = {
    "display.max_rows": 10000,
    "display.max_columns": 500,
    "display.max_colwidth": 0,
    "display.width": 1000,
    "styler.format.precision": 10,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

# Functions

In [16]:
# Print how often each value occurs in the selected columns
def get_values_count(data: pd.DataFrame, cols: list) -> None:
    """Print the value frequencies of each requested column.

    One line is printed per column in the form ``<col> :{value: count, ...}``.
    Missing values are counted as well (``dropna=False``).

    Args:
        data (pd.DataFrame): frame holding the columns to summarise
        cols (list): names of the columns to report on
    """
    for name in cols:
        frequencies = data[name].value_counts(dropna=False).to_dict()
        print(f"{name} :{frequencies}")


def get_unique_values(data: pd.DataFrame, max_uniques: int = 1000) -> pd.DataFrame:
    """Collect the distinct values of every low-cardinality feature.

    Features with more than ``max_uniques`` distinct values are left out of
    the result, since listing them is not informative.

    Args:
        data (pd.DataFrame): the data to get uniques from
        max_uniques (int): features with more distinct values than this are
            skipped. Defaults to 1000.

    Returns:
        pd.DataFrame: one row per retained feature, indexed by feature name
        (index named ``feature``), with a single ``uniques`` column holding
        the array returned by ``Series.unique()`` for that feature
    """
    features = []
    unique_arrays = []

    for col in data.columns:
        unique_values = data[col].unique()
        if len(unique_values) > max_uniques:
            continue

        features.append(col)
        unique_arrays.append(unique_values)

    # Build the result once instead of growing a frame row by row.
    # dtype=object keeps each array as a single cell, even when all arrays
    # happen to have the same length.
    return pd.Series(
        unique_arrays,
        index=pd.Index(features, name="feature", tupleize_cols=False),
        dtype=object,
        name="uniques",
    ).to_frame()


def get_strange_values(data: pd.DataFrame) -> pd.DataFrame:
    """Locate values that contain neither a digit nor a letter.

    Every feature is inspected through its string representation; a value is
    reported as "strange" when that text contains no digit and no letter
    (for example ``"?"`` or ``"--"``). ``data`` itself is not modified.

    Args:
        data (pd.DataFrame): the data to scan for strange values

    Returns:
        pd.DataFrame: one row per feature that has at least one strange
        value, indexed by feature name (index named ``feature``), with a
        single ``indices-values`` column holding a list of
        ``(row label, value as text)`` tuples
    """
    # A value counts as regular as soon as it contains one digit or letter.
    # A plain character class has no capture groups, so ``str.contains`` does
    # not emit pandas' "pattern has match groups" warning.
    pattern = r"[0-9a-zA-Z]"

    features = []
    strange_entries = []

    for col in data.columns:
        # Cast one column at a time rather than copying the whole frame
        as_text = data[col].astype(str)

        # na=True treats any value that is still missing after the cast as regular
        has_regular_char = as_text.str.contains(
            pat=pattern, na=True, regex=True, case=False
        )
        strange = as_text[~has_regular_char]

        if strange.empty:
            continue

        features.append(col)
        # Pair labels and values positionally, so duplicated row labels still
        # produce one scalar value per entry
        strange_entries.append(list(zip(strange.index, strange)))

    # Build the result once instead of growing a frame row by row;
    # dtype=object keeps each list of tuples as a single cell
    return pd.Series(
        strange_entries,
        index=pd.Index(features, name="feature", tupleize_cols=False),
        dtype=object,
        name="indices-values",
    ).to_frame()



# Data Acquisition

In [17]:
# Read data
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
df_train = pd.read_pickle(TRAIN_DATA_PATH)

In [18]:
# Report the dimensions of the raw training frame
n_rows, n_features = df_train.shape
print(f"Num of rows: {n_rows}")
print(f"Num of features: {n_features}")

Num of rows: 271
Num of features: 14


In [19]:
# Preview the first rows of the raw training data
df_train.head()

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic,maximum heart rate,exercise induced angina,ST depression,slope peak exercise ST segment,number of major vessels,thallium stress result,target
155,51.0,1.0,4.0,140.0,299.0,0.0,0.0,173.0,1.0,1.6,1.0,0.0,7.0,1
10,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
53,60.0,1.0,4.0,130.0,253.0,0.0,0.0,144.0,1.0,1.4,1.0,1.0,7.0,1
122,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3
208,62.0,0.0,4.0,150.0,244.0,0.0,0.0,154.0,1.0,1.4,2.0,0.0,3.0,1


In [20]:
# Display a random sample (a single row, shown as a Series)
# NOTE(review): no random_state is passed, so a different row is shown on every run
df_train.sample(1).iloc[0]

age                               41.0 
sex                               1.0  
chest pain type                   3.0  
resting blood pressure            130.0
serum cholestoral                 214.0
fasting blood sugar               0.0  
resting electrocardiographic      2.0  
maximum heart rate                168.0
exercise induced angina           0.0  
ST depression                     2.0  
slope peak exercise ST segment    2.0  
number of major vessels           0.0  
thallium stress result            3.0  
target                            0    
Name: 211, dtype: object

In [21]:
# Summarise the column dtypes and non-null counts
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271 entries, 155 to 174
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             271 non-null    float64
 1   sex                             271 non-null    float64
 2   chest pain type                 271 non-null    float64
 3   resting blood pressure          271 non-null    float64
 4   serum cholestoral               271 non-null    float64
 5   fasting blood sugar             271 non-null    float64
 6   resting electrocardiographic    271 non-null    float64
 7   maximum heart rate              271 non-null    float64
 8   exercise induced angina         271 non-null    float64
 9   ST depression                   271 non-null    float64
 10  slope peak exercise ST segment  271 non-null    float64
 11  number of major vessels         271 non-null    object 
 12  thallium stress result          27

**Summary**:
- **number of major vessels** (`ca`) and **thallium stress result** (`thal`) are stored as `object` dtype instead of a numeric dtype

# Uniques

In [22]:
# List the distinct values of each feature (features with more than 1000 uniques are skipped)
get_unique_values(df_train)

Unnamed: 0_level_0,uniques
feature,Unnamed: 1_level_1
age,"[51.0, 56.0, 60.0, 55.0, 62.0, 39.0, 59.0, 66.0, 65.0, 64.0, 61.0, 47.0, 52.0, 43.0, 57.0, 44.0, 49.0, 50.0, 42.0, 58.0, 54.0, 41.0, 35.0, 53.0, 45.0, 70.0, 46.0, 40.0, 68.0, 76.0, 67.0, 69.0, 48.0, 63.0, 34.0, 77.0, 29.0, 38.0, 37.0, 74.0, 71.0]"
sex,"[1.0, 0.0]"
chest pain type,"[4.0, 2.0, 3.0, 1.0]"
resting blood pressure,"[140.0, 130.0, 150.0, 94.0, 135.0, 160.0, 145.0, 117.0, 120.0, 112.0, 136.0, 132.0, 110.0, 108.0, 146.0, 155.0, 118.0, 124.0, 129.0, 105.0, 142.0, 192.0, 115.0, 164.0, 100.0, 138.0, 134.0, 170.0, 152.0, 128.0, 126.0, 148.0, 158.0, 123.0, 122.0, 125.0, 102.0, 106.0, 174.0, 200.0, 114.0, 104.0, 178.0, 180.0, 154.0, 156.0, 101.0, 144.0, 172.0]"
serum cholestoral,"[299.0, 294.0, 253.0, 217.0, 244.0, 199.0, 234.0, 246.0, 360.0, 212.0, 230.0, 281.0, 307.0, 204.0, 196.0, 298.0, 193.0, 341.0, 201.0, 221.0, 141.0, 295.0, 278.0, 269.0, 149.0, 239.0, 261.0, 315.0, 240.0, 233.0, 206.0, 184.0, 258.0, 266.0, 273.0, 288.0, 214.0, 284.0, 198.0, 226.0, 283.0, 260.0, 192.0, 264.0, 176.0, 174.0, 222.0, 311.0, 177.0, 220.0, 303.0, 211.0, 263.0, 326.0, 209.0, 223.0, 305.0, 197.0, 229.0, 166.0, 268.0, 237.0, 235.0, 306.0, 167.0, 275.0, 207.0, 232.0, 255.0, 186.0, 271.0, 227.0, 218.0, 417.0, 270.0, 319.0, 254.0, 126.0, 276.0, 282.0, 164.0, 259.0, 219.0, 340.0, 160.0, 205.0, 225.0, 236.0, 188.0, 242.0, 256.0, 213.0, 309.0, 243.0, 203.0, 216.0, 228.0, 250.0, 187.0, 245.0, ...]"
fasting blood sugar,"[0.0, 1.0]"
resting electrocardiographic,"[0.0, 2.0, 1.0]"
maximum heart rate,"[173.0, 153.0, 144.0, 111.0, 154.0, 179.0, 161.0, 120.0, 151.0, 132.0, 160.0, 103.0, 146.0, 143.0, 169.0, 122.0, 162.0, 136.0, 126.0, 163.0, 175.0, 157.0, 152.0, 148.0, 142.0, 141.0, 125.0, 108.0, 105.0, 109.0, 159.0, 168.0, 130.0, 195.0, 185.0, 90.0, 178.0, 145.0, 181.0, 140.0, 115.0, 97.0, 116.0, 150.0, 71.0, 131.0, 114.0, 118.0, 138.0, 165.0, 139.0, 171.0, 190.0, 194.0, 112.0, 96.0, 172.0, 113.0, 180.0, 95.0, 149.0, 147.0, 137.0, 128.0, 129.0, 166.0, 174.0, 156.0, 177.0, 182.0, 164.0, 192.0, 167.0, 202.0, 188.0, 123.0, 133.0, 158.0, 155.0, 127.0, 170.0, 121.0, 187.0, 117.0, 106.0, 186.0, 134.0, 184.0, 99.0, 88.0]"
exercise induced angina,"[1.0, 0.0]"
ST depression,"[1.6, 1.3, 1.4, 5.6, 0.0, 0.5, 0.8, 2.0, 1.0, 0.1, 4.2, 1.9, 3.0, 1.5, 0.6, 1.2, 0.3, 1.8, 2.1, 2.6, 2.2, 0.4, 3.6, 3.4, 1.1, 0.2, 2.8, 6.2, 0.9, 2.3, 4.0, 2.4, 0.7, 3.8, 4.4, 3.2, 3.5, 2.5, 2.9]"


# Strange values


In [23]:

# Report values that contain neither digits nor letters, together with their row labels
get_strange_values(df_train)


Unnamed: 0_level_0,indices-values
feature,Unnamed: 1_level_1
number of major vessels,"[(301, ?), (191, ?), (165, ?), (286, ?)]"
thallium stress result,"[(86, ?)]"


# Missing values

In [24]:
# Count the missing values per feature (nothing is removed in this cell)
# NOTE: isnull() only detects real nulls; the "?" placeholders found in the
# strange-values check above are plain strings and are therefore not counted here.
print(f"--Missing values count--\n{df_train.isnull().sum().sort_values()}")

--Missing values count--
age                               0
sex                               0
chest pain type                   0
resting blood pressure            0
serum cholestoral                 0
fasting blood sugar               0
resting electrocardiographic      0
maximum heart rate                0
exercise induced angina           0
ST depression                     0
slope peak exercise ST segment    0
number of major vessels           0
thallium stress result            0
target                            0
dtype: int64
