In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import  f1_score, confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# reading csv file
df = pd.read_csv('US_Heart_Patients.csv')

In [None]:
# print first 10 rows:-
print (df.head(10))

In [None]:
# print 5-point summary:-
print('5-point summary below as \n')
print (df.describe().loc[['min', '25%', '50%', '75%', 'max']])

In [None]:
# print the information regarding columns data type:-
print ('data type of each column \n')
print(df.dtypes)

In [None]:
# print the number of missing values in each column:-
print ('number of missing values in each column \n')
print(df.isnull().sum())

In [None]:
# visualize the correlations using heatmap:-

correlation_matrix = df.corr(numeric_only=True)
plt.subplots(figsize=(15,10))
sns.heatmap(correlation_matrix, xticklabels=correlation_matrix.columns,yticklabels=correlation_matrix.columns, annot=True, )

In [None]:
# visualize the data distributions using histogram:-

df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()

In [9]:
# helper methods for detecting the outliers for non categorical and non binary columns:-
def get_non_categorical_columns():
    return df.select_dtypes(include=['float64', 'int64']).columns.tolist()

def iqr_outlier(data):
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    min = q1 - 1.5 * iqr
    max = q3 + 1.5 * iqr
    result = pd.Series([0] * len(data))
    result[((data < min) | (data > max))] = 1
    return sum(result)

def check_column_is_binary(col):
    if df[col].dropna().isin([0, 1]).all() and df[col].nunique() == 2:
        return True
    else:
        return False


In [None]:
# printing the number of outliers for non categorical and non binary columns:-
outlier_columns = []
non_categorical_columns = get_non_categorical_columns()

def visualize_outliers():
    plt.figure(figsize=(10,8))
    i = 0
    for col in non_categorical_columns:
        if not check_column_is_binary(col):
            plt.subplot(3, 4, i+1)
            sns.boxplot(y=df[col])
            i +=1
    plt.tight_layout()
    plt.show()
    
def print_and_visualize_outliers():
    total_outliers = 0
    for col in non_categorical_columns:
        if not check_column_is_binary(col):
            outliers =iqr_outlier(df[col])
            if outliers ==0:
                # print the columns where no outliers are presents
                print(f'there is no outliers found in this column = {col}')
            else:
                print(f'total outliers found in this column = {col} is {outliers}')
                outlier_columns.append(col)
                total_outliers += outliers
    print(f'total outliers are {total_outliers}')
    visualize_outliers()
print_and_visualize_outliers()

In [None]:
#  Data preprocessing steps:-

# separate the categorical and non categorical columns to fill the missing value
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
non_categorical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# using mode to impute the categorical columns
# using median to impute non categorical columns

def imput_missing_values():
     for col in categorical_columns:
        df[col].fillna(df[col].mode().iloc[0], inplace=True)
     for col in non_categorical_columns:
          df[col].fillna(df[col].median(), inplace=True)
            
# treatment for the outliers
def treat_outliers():
    for column in outlier_columns:
        median = df[column].median()
        std_dev = df[column].std()
        lower_limit = median - 2 * std_dev
        upper_limit = median + 2 * std_dev
        df[column] = np.where(df[column]> upper_limit, upper_limit, df[column])
        df[column] = np.where(df[column]< lower_limit,lower_limit, df[column])
    # print and visualize outliers:-
    print_and_visualize_outliers()
    
# encode categorical columns using OneHotEncoder
def encode():
      #Initialize OneHotEncoder instance
      encoder = OneHotEncoder(sparse_output=False, drop='first')
      # Apply one-hot encoding to the categorical columns
      one_hot_encoded = encoder.fit_transform((df[categorical_columns]))
      one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
      # Concatenate the one-hot encoded dataframe with the original dataframe
      newdf = pd.concat([df, one_hot_df], axis=1)
       # Drop the original categorical columns
      newdf = newdf.drop(categorical_columns, axis=1)
      return newdf
    
def data_processing():
     # Impute the missing values
     imput_missing_values()
     # outliers treatment
     treat_outliers()
     # Encode categorical features
     return encode()

df = data_processing()

In [None]:
# extra processing to remove outliers for the gulcose column
lower_limit = df['glucose'].quantile(0.25)
upper_limit = df['glucose'].quantile(0.75)
df['glucose'] = np.where(df['glucose'] > upper_limit, upper_limit, df['glucose'])
df['glucose'] = np.where(df['glucose'] < lower_limit, lower_limit, df['glucose'])

print_and_visualize_outliers()

In [13]:
# helper method to evalute F1 score, Confusion Matrix and Classification report:-

def evalute_model(actual, predict):
    score = accuracy_score(actual, predict) 
    f1 = f1_score(actual, predict)
    report = classification_report(actual, predict)
    matrix = confusion_matrix(actual, predict)
    print(f'accuracy score:{score} \n F1 Score: {f1} \n Classification report:\n {report} \n Confusion matrix \n {matrix}')
    

In [14]:
X = df.drop(columns=['Heart-Att']) # feature columns
y= df['Heart-Att'] # target column

# helper method to split dataset into training and test data 
def split_dataset(test_size):
    return train_test_split(X, y, test_size=test_size, random_state=42)

X_train, X_test, y_train, y_test = split_dataset(.2)

In [None]:
# train the model using DecisionTreeClassifier algorithm
model=DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=5, min_samples_leaf=5)
model.fit(X_train,y_train)

# evalute the model for test and training data:
print ('model evalute results for test data \n')
evalute_model(y_test, model.predict(X_test))

print ('\n model evalute results for training data \n')
evalute_model(y_train, model.predict(X_train))


In [None]:
# train the model using naive bayes(MultinomialNB) algorithm:-
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# evalute the model for test and training data:
print ('model evalute results for test data \n')
evalute_model(y_test, mnb.predict(X_test))

print ('\n model evalute results for training data')
evalute_model(y_train, mnb.predict(X_train))

In [None]:
Form above, DecisionTreeClassifier is best as comapared to naive bayes because 
accuracy for tree for test data is 84 % while for naive bayes is 77 %
and for training data tree accuracy is 85 % while for naive bayes is 78%

confusion matrix represent the false positive, true positive