In [54]:
# Input locations, given as relative paths (resolved against the kernel's working directory).
# Only the training-split path is active here; the raw-CSV and test-split paths
# are kept commented out for reference.
# DATA_PATH = "../../data/raw/cleveland.csv"
TRAIN_DATA_PATH = "../../data/interim/cleveland_train.pkl"
# TEST_DATA_PATH = "../../data/interim/cleveland_test.pkl"

# Import packages

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


In [56]:
# Configure pandas output: row/column limits, cell and line width, Styler precision
pd.options.display.max_rows = 10000
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 0
pd.options.display.width = 1000
pd.options.styler.format.precision = 10

# Functions

In [57]:
# Print how often each value occurs in the selected features
def get_values_count(data: pd.DataFrame, cols: list) -> None:
    """
    Print the value counts of each requested feature, one line per feature

    Parameters
    ----------
    data : DataFrame
    cols : list of feature (column) names to summarise

    Returns
    -------
    None : nothing is returned; each feature is printed as
           "<feature> :<dict of value -> count>", with missing values
           counted as well (dropna=False)
    """
    for feature in cols:
        counts = data[feature].value_counts(dropna=False)
        print(f"{feature} :{counts.to_dict()}")


def get_unique_values(data: pd.DataFrame, max_unique: int = 1000) -> pd.DataFrame:
    """
    Get unique values in each feature

    Features with more than ``max_unique`` distinct values are left out of
    the result, so high-cardinality columns do not flood the display.

    Parameters
    ----------
    data : DataFrame
    max_unique : int, default 1000
        Largest number of distinct values a feature may have and still be
        reported

    Returns
    -------
    unique_values : DataFrame indexed by "feature" with a single "uniques"
        column; each cell holds the array returned by ``Series.unique()``
        for that feature
    """

    # Collect all rows first and build the frame once, instead of enlarging
    # a DataFrame one row at a time inside the loop.
    rows = []

    for col in data.columns:
        unique_values = data[col].unique()
        if len(unique_values) > max_unique:
            continue

        rows.append((col, unique_values))

    return pd.DataFrame(rows, columns=["feature", "uniques"]).set_index("feature")


def get_strange_values(data: pd.DataFrame) -> "pd.DataFrame | str":
    """
    Get strange values in each feature

    A value is considered strange when its string form contains neither a
    number (optionally signed, integer or decimal) nor a letter a-z
    (case-insensitive) -- for example "?" or "-". The input frame is not
    modified.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    strange_values : DataFrame indexed by "feature" with a single
        "indices-values" column; each cell is a list of
        (index label, value as string) tuples. Features without strange
        values are omitted. When no feature has any strange value, the
        string "No strange values found!" is returned instead.
    """

    # The pattern for checking the presence of ordinary (numeric or alphabetic)
    # content. Non-capturing groups are used because capturing groups make
    # ``str.contains`` emit a "has match groups" UserWarning; the set of
    # matching strings is unchanged. Defined once, outside the loop.
    pattern = r"(?:[+-]?(?:[0-9]+(?:[.][0-9]*)?|[.][0-9]+)|[a-zA-Z]+)"

    rows = []

    for col in data.columns:
        # Work on a string view of this column only (no copy of the whole frame)
        as_text = data[col].astype(str)

        # True where the value contains a number or letters
        looks_valid = as_text.str.contains(
            pat=pattern, na=True, regex=True, case=False
        )

        # Select by mask rather than by label, so repeated index labels still
        # yield one (label, value) pair per offending row
        strange = as_text[~looks_valid]

        if strange.empty:
            continue

        rows.append((col, list(zip(strange.index.to_list(), strange.to_list()))))

    if not rows:
        return "No strange values found!"

    # Build the result once instead of enlarging a DataFrame row by row
    return pd.DataFrame(rows, columns=["feature", "indices-values"]).set_index(
        "feature"
    )


# Data Acquisition

In [58]:
# Read the training split from the pickle at TRAIN_DATA_PATH
# NOTE(review): unpickling can execute code embedded in the file -- only load
# pickles that come from a trusted source.
df_train = pd.read_pickle(TRAIN_DATA_PATH)

In [59]:
# Report the size of the training frame: row count, then feature (column) count
print(f"Num of rows: {len(df_train)}")
print(f"Num of features: {len(df_train.columns)}")

Num of rows: 226
Num of features: 14


In [60]:
# Preview the first rows of the training data
df_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
156,58.0,1.0,4.0,125.0,300.0,0.0,2.0,171.0,0.0,0.0,1.0,2.0,7.0,1
118,65.0,1.0,4.0,135.0,254.0,0.0,2.0,127.0,0.0,2.8,2.0,1.0,7.0,2
277,57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1
142,64.0,1.0,3.0,125.0,309.0,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0,1
297,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1


In [61]:
# Display one sampled row as a Series; the fixed random_state keeps the
# displayed row the same on every Restart & Run All
df_train.sample(n=1, random_state=42).iloc[0]

age         50.0 
sex         0.0  
cp          2.0  
trestbps    120.0
chol        244.0
fbs         0.0  
restecg     0.0  
thalach     162.0
exang       0.0  
oldpeak     1.1  
slope       1.0  
ca          0.0  
thal        3.0  
target      0    
Name: 197, dtype: object

In [62]:
# Summarise each column's dtype and non-null count
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226 entries, 156 to 256
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       226 non-null    float64
 1   sex       226 non-null    float64
 2   cp        226 non-null    float64
 3   trestbps  226 non-null    float64
 4   chol      226 non-null    float64
 5   fbs       226 non-null    float64
 6   restecg   226 non-null    float64
 7   thalach   226 non-null    float64
 8   exang     226 non-null    float64
 9   oldpeak   226 non-null    float64
 10  slope     226 non-null    float64
 11  ca        226 non-null    object 
 12  thal      226 non-null    object 
 13  target    226 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 26.5+ KB


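**Summary**:
- **ca** and **thal** are stored as `object` dtype, while every other feature is numeric; this suggests they contain non-numeric entries.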

# Strange values


In [63]:

# List, per feature, the entries whose text contains neither a number nor letters
get_strange_values(df_train)


Unnamed: 0_level_0,indices-values
feature,Unnamed: 1_level_1
ca,"[(165, ?), (301, ?), (191, ?), (286, ?)]"
thal,"[(265, ?), (86, ?)]"


# Missing values

In [64]:
# Count missing (NaN/None) entries per feature; nothing is removed here.
# NOTE: placeholder strings such as "?" are not NaN, so they are not counted.
print(f"--Missing values count--\n{df_train.isna().sum().sort_values()}")

--Missing values count--
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
