In [19]:
# Relative path to the pickled training data that this notebook loads
TRAIN_DATA_PATH = "../../data/interim/cleveland_train.pkl"

# Import packages

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation and usage warnings; consider narrowing it to a category.
warnings.filterwarnings('ignore')


In [37]:
# Change some pandas display options (row/column limits, column width,
# console width and Styler precision) in a single call
pd.set_option(
    "display.max_rows", 10000,
    "display.max_columns", 500,
    "display.max_colwidth", 0,
    "display.width", 1000,
    "styler.format.precision", 10,
)

# Functions

In [38]:
# Get the values count for each feature in the DataFrame
def get_values_count(data: pd.DataFrame, cols: list) -> None:
    """Print how often each value occurs in the requested features.

    One line is printed per feature, in the order given by ``cols``.
    Missing values are counted as well (``dropna=False``).

    Args:
        data (pd.DataFrame): the data to get count from
        cols (list): list of features
    """
    for feature in cols:
        # Map every distinct value of the feature to its number of occurrences
        occurrences = data[feature].value_counts(dropna=False).to_dict()
        print(f"{feature} :{occurrences}")


def get_unique_values(data: pd.DataFrame, max_unique: int = 1000) -> pd.DataFrame:
    """Get unique values in each feature

    Features with more than ``max_unique`` distinct values are left out of
    the result, which keeps very high-cardinality columns from flooding it.

    Args:
        data (pd.DataFrame): the data to get uniques from
        max_unique (int, optional): largest number of distinct values a
            feature may have and still be reported. Defaults to 1000.

    Returns:
        pd.DataFrame : a df indexed by ``feature`` whose single ``uniques``
            column holds the unique values of each reported feature
    """
    features = []
    unique_values_per_feature = []

    for col in data.columns:
        unique_values = data[col].unique()
        if len(unique_values) > max_unique:
            continue

        features.append(col)
        unique_values_per_feature.append(unique_values)

    # Build the frame once instead of growing it row by row with ``.loc``
    uniques = pd.DataFrame(
        {"feature": features, "uniques": unique_values_per_feature}
    )
    return uniques.set_index("feature")


def get_strange_values(data: pd.DataFrame) -> pd.DataFrame:
    """Get strange values in each feature

    A value is considered strange when its text form contains neither a
    number nor an ASCII letter (for example a lone ``?`` or ``-``).

    Args:
        data (pd.DataFrame): the data to scan; it is not modified

    Returns:
        pd.DataFrame: a df indexed by ``feature`` whose single
            ``indices-values`` column holds, for each feature with at least
            one strange value, a list of ``(row label, value as text)``
            tuples. Features without strange values are omitted.
    """
    # The pattern for checking the presence of normal (numeric or alphabetic)
    # content. Groups are non-capturing: ``str.contains`` only needs a yes/no
    # answer, and capturing groups make pandas emit a "has match groups"
    # UserWarning.
    pattern = r"(?:[+-]?(?:[0-9]+(?:[.][0-9]*)?|[.][0-9]+)|[a-zA-Z]+)"

    features = []
    indices_values = []

    for col in data.columns:
        # Work on a text rendering of this column only; the input is untouched
        as_text = data[col].astype(str)

        # ``na=True`` treats entries that are still missing after the
        # conversion as normal, so they are never reported as strange
        looks_normal = as_text.str.contains(
            pat=pattern, na=True, regex=True, case=False
        )

        # Select by mask so each hit stays a scalar even with repeated labels
        strange = as_text[~looks_normal]
        if strange.empty:
            continue

        features.append(col)
        indices_values.append(list(strange.items()))

    # Build the frame once instead of growing it row by row with ``.loc``
    result = pd.DataFrame({"feature": features, "indices-values": indices_values})
    return result.set_index("feature")



# Data Acquisition

In [39]:
# Read data
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df_train = pd.read_pickle(TRAIN_DATA_PATH)

In [40]:
# Report the dimensions of the raw train data: rows first, then features
print(
    f"Num of rows: {df_train.shape[0]}",
    f"Num of features: {df_train.shape[1]}",
    sep="\n",
)

Num of rows: 226
Num of features: 14


In [41]:
# Preview the first rows of the raw train data (head() returns five rows by default)
df_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
156,58.0,1.0,4.0,125.0,300.0,0.0,2.0,171.0,0.0,0.0,1.0,2.0,7.0,1
118,65.0,1.0,4.0,135.0,254.0,0.0,2.0,127.0,0.0,2.8,2.0,1.0,7.0,2
277,57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1
142,64.0,1.0,3.0,125.0,309.0,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0,1
297,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1


In [42]:
# Display a random sample; the fixed random_state keeps the displayed row the
# same across "Restart Kernel -> Run All" executions
df_train.sample(n=1, random_state=42).iloc[0]

age          60.0
sex           1.0
cp            4.0
trestbps    145.0
chol        282.0
fbs           0.0
restecg       2.0
thalach     142.0
exang         1.0
oldpeak       2.8
slope         2.0
ca            2.0
thal          7.0
target          2
Name: 64, dtype: object

In [43]:
# Summarise each column's dtype and non-null count
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226 entries, 156 to 256
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       226 non-null    float64
 1   sex       226 non-null    float64
 2   cp        226 non-null    float64
 3   trestbps  226 non-null    float64
 4   chol      226 non-null    float64
 5   fbs       226 non-null    float64
 6   restecg   226 non-null    float64
 7   thalach   226 non-null    float64
 8   exang     226 non-null    float64
 9   oldpeak   226 non-null    float64
 10  slope     226 non-null    float64
 11  ca        226 non-null    object 
 12  thal      226 non-null    object 
 13  target    226 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 26.5+ KB


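**Summary:**
- **ca** and **thal** are stored as `object` dtype, whereas every other feature is numeric, which suggests they contain non-numeric entries.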

# Uniques

In [36]:
# List the distinct values observed in each feature
get_unique_values(df_train)

Unnamed: 0_level_0,uniques
feature,Unnamed: 1_level_1
age,"[58.0, 65.0, 57.0, 64.0, 45.0, 52.0, 63.0, 62.0, 46.0, 59.0, 56.0, 51.0, 70.0, 44.0, 41.0, 39.0, 49.0, 48.0, 54.0, 61.0, 66.0, 38.0, 43.0, 67.0, 60.0, 50.0, 53.0, 40.0, 34.0, 77.0, 35.0, 69.0, 42.0, 68.0, 55.0, 71.0, 37.0, 47.0, 74.0, 76.0]"
sex,"[1.0, 0.0]"
cp,"[4.0, 2.0, 3.0, 1.0]"
trestbps,"[125.0, 135.0, 154.0, 110.0, 172.0, 130.0, 138.0, 160.0, 134.0, 132.0, 150.0, 94.0, 156.0, 120.0, 104.0, 128.0, 108.0, 140.0, 124.0, 142.0, 114.0, 112.0, 200.0, 146.0, 106.0, 145.0, 144.0, 152.0, 118.0, 192.0, 105.0, 155.0, 123.0, 102.0, 126.0, 136.0, 100.0, 122.0, 180.0, 174.0, 178.0, 148.0, 129.0, 117.0, 115.0, 101.0, 170.0]"
chol,"[300.0, 254.0, 232.0, 309.0, 264.0, 199.0, 212.0, 263.0, 243.0, 273.0, 409.0, 184.0, 223.0, 283.0, 227.0, 245.0, 220.0, 214.0, 208.0, 193.0, 205.0, 269.0, 313.0, 126.0, 197.0, 187.0, 266.0, 204.0, 177.0, 321.0, 209.0, 281.0, 330.0, 318.0, 229.0, 282.0, 295.0, 407.0, 175.0, 247.0, 288.0, 218.0, 188.0, 172.0, 275.0, 246.0, 268.0, 258.0, 267.0, 168.0, 169.0, 308.0, 225.0, 200.0, 298.0, 224.0, 221.0, 167.0, 207.0, 249.0, 256.0, 340.0, 230.0, 210.0, 304.0, 198.0, 219.0, 244.0, 248.0, 417.0, 234.0, 240.0, 307.0, 274.0, 270.0, 211.0, 206.0, 335.0, 217.0, 239.0, 236.0, 182.0, 394.0, 149.0, 226.0, 265.0, 325.0, 354.0, 277.0, 250.0, 201.0, 353.0, 216.0, 284.0, 293.0, 306.0, 315.0, 255.0, 302.0, 241.0, ...]"
fbs,"[0.0, 1.0]"
restecg,"[2.0, 0.0, 1.0]"
thalach,"[171.0, 127.0, 164.0, 131.0, 132.0, 162.0, 147.0, 168.0, 97.0, 152.0, 125.0, 150.0, 105.0, 169.0, 154.0, 143.0, 170.0, 148.0, 184.0, 133.0, 173.0, 136.0, 144.0, 160.0, 182.0, 180.0, 156.0, 163.0, 103.0, 158.0, 140.0, 174.0, 157.0, 165.0, 113.0, 139.0, 142.0, 96.0, 172.0, 167.0, 114.0, 126.0, 122.0, 99.0, 181.0, 192.0, 130.0, 195.0, 194.0, 146.0, 88.0, 111.0, 115.0, 166.0, 108.0, 95.0, 178.0, 151.0, 137.0, 187.0, 161.0, 175.0, 149.0, 123.0, 159.0, 112.0, 120.0, 118.0, 186.0, 141.0, 71.0, 138.0, 134.0, 109.0, 190.0, 145.0, 153.0, 121.0, 155.0, 129.0, 179.0, 188.0, 116.0]"
exang,"[0.0, 1.0]"
oldpeak,"[0.0, 2.8, 1.8, 1.2, 0.5, 1.4, 1.0, 1.9, 2.1, 2.0, 3.0, 0.2, 4.0, 0.6, 0.8, 1.6, 4.4, 0.1, 0.3, 2.2, 0.4, 0.9, 4.2, 3.2, 0.7, 1.1, 1.5, 2.4, 5.6, 3.5, 2.5, 3.6, 2.6, 2.3, 3.4, 1.3, 3.8]"


# Strange values


In [45]:

# Report entries whose text contains neither digits nor letters, per feature
get_strange_values(df_train)


Unnamed: 0_level_0,indices-values
feature,Unnamed: 1_level_1
ca,"[(165, ?), (301, ?), (191, ?), (286, ?)]"
thal,"[(265, ?), (86, ?)]"


# Missing values

In [46]:
# Count null values per column (nothing is removed here).
# NOTE: placeholder strings such as "?" are not null, so isnull() does not count them.
print(f"--Missing values count--\n{df_train.isnull().sum().sort_values()}")

--Missing values count--
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
