<a href="https://colab.research.google.com/github/Mihishi/sca-supervised-prediction-model/blob/main/sudden_cardiac_arrest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Step 1: Import Libraries***

In [5]:
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# ***Step 2: Read Dataset***

In [6]:
# Load the sudden-cardiac-arrest records from the CSV file
# (the path is relative to the notebook's working directory).
dataset_path = 'sudden_cardiac_arrest_dataset.csv'
sca_df = pd.read_csv(dataset_path)

# (rows, columns) of the freshly loaded frame
sca_df.shape

(1025, 15)

In [7]:
# Display the loaded frame as the cell output (the notebook renders a
# truncated preview of the first and last rows rather than all of them).
sca_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,corona_vaccine_name
0,32,1,0,125,212,0,1,168,0,1.0,2,2,3,0,Sinovac
1,35,1,0,140,203,1,0,155,1,3.1,0,0,3,0,Pfizer
2,20,1,0,145,174,0,1,125,1,2.6,0,0,3,0,Johnson & Johnson
3,33,1,0,148,203,0,1,161,0,0.0,2,1,3,0,AstraZeneca
4,30,0,0,138,294,1,1,106,0,1.9,1,3,2,0,AstraZeneca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,22,1,1,140,221,0,1,164,1,0.0,2,0,2,1,AstraZeneca
1021,18,1,0,125,258,0,0,141,1,2.8,1,1,3,0,Pfizer
1022,26,1,0,110,275,0,0,118,1,1.0,1,1,2,0,Sinovac
1023,32,0,0,110,254,0,0,159,0,0.0,2,0,2,1,Moderna


# ***Step 3: Dataset Overview***

### ***Step 3.1: Dataset Basic Information***

In [8]:
# Print a structural summary: index range, each column's non-null count and dtype.
sca_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1025 non-null   int64  
 1   sex                  1025 non-null   int64  
 2   cp                   1025 non-null   int64  
 3   trestbps             1025 non-null   int64  
 4   chol                 1025 non-null   int64  
 5   fbs                  1025 non-null   int64  
 6   restecg              1025 non-null   int64  
 7   thalach              1025 non-null   int64  
 8   exang                1025 non-null   int64  
 9   oldpeak              1025 non-null   float64
 10  slope                1025 non-null   int64  
 11  ca                   1025 non-null   int64  
 12  thal                 1025 non-null   int64  
 13  target               1025 non-null   int64  
 14  corona_vaccine_name  1025 non-null   object 
dtypes: float64(1), int64(13), object(1)
me

In [9]:
# Preview the first five rows to sanity-check the parsed values.
sca_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,corona_vaccine_name
0,32,1,0,125,212,0,1,168,0,1.0,2,2,3,0,Sinovac
1,35,1,0,140,203,1,0,155,1,3.1,0,0,3,0,Pfizer
2,20,1,0,145,174,0,1,125,1,2.6,0,0,3,0,Johnson & Johnson
3,33,1,0,148,203,0,1,161,0,0.0,2,1,3,0,AstraZeneca
4,30,0,0,138,294,1,1,106,0,1.9,1,3,2,0,AstraZeneca


In [10]:
# List the column labels available in the frame.
sca_df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
       'corona_vaccine_name'],
      dtype='object')

In [11]:
# Report the frame's dimensions as a (rows, columns) tuple.
# NOTE(review): the label says "Training Dataset", but no train/test split is
# visible before this cell — confirm the wording is intended.
dataset_shape = sca_df.shape
print("Number of rows and columns of Training Dataset :", dataset_shape)

Number of rows and columns of Training Dataset : (1025, 15)


In [12]:
# Columns that stay numeric; every other column is cast to object below.
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Every remaining column, in its original order, is selected for conversion.
is_continuous = sca_df.columns.isin(continuous_features)
features_to_convert = sca_df.columns[~is_continuous].tolist()

# Cast the selected columns to object dtype, writing them back into the same
# frame so that describe(include='object') picks them up.
sca_df[features_to_convert] = sca_df.loc[:, features_to_convert].astype('object')

# Show the resulting dtype of every column to confirm the conversion.
sca_df.dtypes

Unnamed: 0,0
age,int64
sex,object
cp,object
trestbps,int64
chol,int64
fbs,object
restecg,object
thalach,int64
exang,object
oldpeak,float64


### ***Step 3.2: Summary Statistics for Numerical Variables***

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, transposed so that each feature occupies one row.
numeric_summary = sca_df.describe()
numeric_summary.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1025.0,26.49561,5.165638,18.0,22.0,27.0,31.0,35.0
trestbps,1025.0,131.611707,17.516718,94.0,120.0,130.0,140.0,200.0
chol,1025.0,246.0,51.59251,126.0,211.0,240.0,275.0,564.0
thalach,1025.0,149.114146,23.005724,71.0,132.0,152.0,166.0,202.0
oldpeak,1025.0,1.071512,1.175053,0.0,0.0,0.8,1.8,6.2


### ***Step 3.3: Summary Statistics for Categorical Variables***

In [14]:
# Summarise the object-dtype columns: count, number of unique values,
# most frequent value (top) and its frequency (freq).
sca_df.describe(include=['object'])

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,ca,thal,target,corona_vaccine_name
count,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025
unique,2,4,2,3,2,3,5,4,2,5
top,1,0,0,1,0,1,0,2,1,AstraZeneca
freq,713,497,872,513,680,482,578,544,526,219
