In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load data set
data = pd.read_csv("heart.csv")

#check if data set is loaded successfully
print(data.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  


In [3]:
# check what we are working with
# inspect how big our data set is
size_of_dataset = data.shape
print(size_of_dataset)

(303, 14)


In [4]:
# inspect data types
print(data.dtypes)

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object


In [5]:
# Data cleaning and pre-processing
# 1. check for missing values / empty cells
empty_cells = data.isnull().sum()
print(empty_cells)



age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [6]:
# 2. Check for duplicates
duplicates = data.duplicated().sum()
print(duplicates)

1


In [7]:
# know which exact row has this dupicate
duplicated_rows = data[data.duplicated(keep=False)]
print(duplicated_rows)

# get the duplicated rows index
print(duplicated_rows.index)



     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
163   38    1   2       138   175    0        1      173      0      0.0   
164   38    1   2       138   175    0        1      173      0      0.0   

     slope  ca  thal  target  
163      2   4     2       1  
164      2   4     2       1  
Index([163, 164], dtype='int64')


In [8]:
# Eliminate the duplicate
eliminate_duplicates = data.drop_duplicates(keep="first", inplace=True)

print(eliminate_duplicates)

#check for duplicates again
new_duplicate_status = data.duplicated().sum()
print(f"Number of duplicates:{new_duplicate_status}")



None
Number of duplicates:0


In [9]:
# 3. check for wrong data format (different data types) e.g 1/7/2024, 172024 or 1.4, 1.45, 1.5, 155
# columns with wrong data types will always have an object data type except for strings because their correct data type is object.

# check for data types
print(data.dtypes)

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object


In [12]:
# 4. check for wrong data (outliers) using the z-score, z = (x - mean)/ standard deviation

def detect_outliers(df):
    outliers_dict = {}
    threshold = 4  # Define the z-score threshold for identifying outliers

    # Iterate through each column in the DataFrame
    for column in df.columns:
        # Check if the column is numeric
        if pd.api.types.is_numeric_dtype(df[column]):
            # Calculate the mean and standard deviation of the column
            mean = np.mean(df[column])
            std = np.std(df[column])
            
            # Initialize a list to hold the outliers for the current column
            outliers = []

            # Iterate through each value in the column
            for value in df[column]:
                # Calculate the z-score for the value
                z_score = (value - mean) / std
                
                # Check if the z-score is greater than the threshold
                if np.abs(z_score) > threshold:
                    outliers.append(value)
            
            # Add the outliers to the dictionary with the column name as the key
            outliers_dict[column] = outliers
    
    return outliers_dict

# Call the function and print the result
outliers = detect_outliers(data)
print(outliers)


{'age': [], 'sex': [], 'cp': [], 'trestbps': [], 'chol': [564], 'fbs': [], 'restecg': [], 'thalach': [], 'exang': [], 'oldpeak': [6.2], 'slope': [], 'ca': [], 'thal': [], 'target': []}


In [13]:
# handle outliers
median_trestbps = data['chol'].median()
data.loc[data['chol'] > 450, 'chol'] = median_trestbps
print(outliers)

{'age': [], 'sex': [], 'cp': [], 'trestbps': [], 'chol': [564], 'fbs': [], 'restecg': [], 'thalach': [], 'exang': [], 'oldpeak': [6.2], 'slope': [], 'ca': [], 'thal': [], 'target': []}


  data.loc[data['chol'] > 450, 'chol'] = median_trestbps
