# Naive Bayes Classifier

## Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, average_precision_score

## Load Dataset

In [2]:
# Read the stroke dataset from the working directory; the first row of the
# CSV is used as the column header.
df = pd.read_csv("healthcare-dataset-stroke-data.csv", header=0)

# Report the raw row count, then preview the first five records.
print("rows of original dataset:", df.shape[0])
print(df.head())

rows of original dataset: 5110
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


## Preprocessing

In [3]:
# Data cleaning: remove every row that contains a missing value (the preview
# above shows NaN in `bmi`) and drop the `id` column so it is not carried
# into the feature matrix.
# The drop is chained and uses errors='ignore' so that re-running this cell
# after `id` has already been removed does not raise a KeyError.
df = df.dropna().drop(columns=['id'], errors='ignore')
print("Rows after dropping missing values:", len(df))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

Rows after dropping missing values: 4909
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4909 non-null   object 
 1   age                4909 non-null   float64
 2   hypertension       4909 non-null   int64  
 3   heart_disease      4909 non-null   int64  
 4   ever_married       4909 non-null   object 
 5   work_type          4909 non-null   object 
 6   Residence_type     4909 non-null   object 
 7   avg_glucose_level  4909 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     4909 non-null   object 
 10  stroke             4909 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.2+ KB
None


In [4]:
# Collect the names of the categorical (object-dtype) columns
categories = [name for name in df.columns if df[name].dtype == 'object']

# Fit a one-hot encoder on the categorical columns
encoder = OneHotEncoder().fit(df[categories])

# Encode, densify the sparse result and wrap it in a DataFrame whose columns
# carry the generated "<feature>_<category>" names. The new frame gets a
# fresh 0..n-1 index.
df_encoded = pd.DataFrame(
    encoder.transform(df[categories]).toarray(),
    columns=encoder.get_feature_names_out(),
)

# Remaining (non-categorical) columns of df, re-indexed 0..n-1 so the rows
# line up with df_encoded when concatenated
df_dropped = df.drop(columns=categories).reset_index(drop=True)

# Place the one-hot columns and the remaining columns side by side;
# `stroke` stays the last column
df_filtered = pd.concat([df_encoded, df_dropped], axis=1)

# Count stroke and non-stroke records
stroke_num = df_filtered['stroke'].eq(1).sum()
nonstroke_num = df_filtered['stroke'].eq(0).sum()

# Show the class counts and a preview of the encoded frame
print(f"Number of stroke: {stroke_num}, number of non-stroke: {nonstroke_num}")
print(df_filtered.head())

Number of stroke: 209, number of non-stroke: 4700
   gender_Female  gender_Male  gender_Other  ever_married_No  \
0            0.0          1.0           0.0              0.0   
1            0.0          1.0           0.0              0.0   
2            1.0          0.0           0.0              0.0   
3            1.0          0.0           0.0              0.0   
4            0.0          1.0           0.0              0.0   

   ever_married_Yes  work_type_Govt_job  work_type_Never_worked  \
0               1.0                 0.0                     0.0   
1               1.0                 0.0                     0.0   
2               1.0                 0.0                     0.0   
3               1.0                 0.0                     0.0   
4               1.0                 0.0                     0.0   

   work_type_Private  work_type_Self-employed  work_type_children  ...  \
0                1.0                      0.0                 0.0  ...   
1             

## Scale & Resample data

In [5]:
# Split the DataFrame into features (X) and target (y).
# The target is selected by name rather than by position so the split does
# not depend on `stroke` being the last column.
X = df_filtered.drop(columns=['stroke'])
y = df_filtered['stroke']

# Scale the features only. Fitting the scaler on the whole frame would also
# standardise the `stroke` target and mix it into the scaled matrix.
# The result is kept as a DataFrame so column names and index are preserved.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Set up the resampling strategy: both classes are reduced to the size of the
# smaller class. The size is derived from y itself so this cell does not
# depend on counters that it overwrites further down.
minority_size = int(y.value_counts().min())
sampling_strategy = {label: minority_size for label in [0, 1]}

# Apply RandomUnderSampler to the *scaled* features so that the scaling step
# actually reaches the model; after resampling each class makes up 50% of
# the rows.
# NOTE(review): scaling and under-sampling both happen before the train/test
# split in the next section, so the held-out set is balanced and scaled with
# statistics from the full data - consider splitting first.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_scaled, y)

# Count stroke and non-stroke records after resampling
stroke_num = (y_resampled == 1).sum()
nonstroke_num = (y_resampled == 0).sum()

# Display the size and class balance of the resampled data
print("Rows after resampled data:", len(X_resampled))
print(f"Number of stroke: {stroke_num}, number of non-stroke attacks: {nonstroke_num}")

Rows after resampled data: 418
Number of stroke: 209, number of non-stroke attacks: 209


## Train-Test Split

In [6]:
# Hold out 20% of the resampled rows for evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

## Running Gaussian Naive Bayes

In [7]:
# Create the Gaussian Naive Bayes classifier and train it on the training
# split (fit() returns the fitted estimator itself)
nb_classifier = GaussianNB().fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test split
y_pred = nb_classifier.predict(X_test)
y_pred_proba = nb_classifier.predict_proba(X_test)

# Evaluation metrics. Average precision is computed from the second
# probability column, i.e. the score for the positive class (stroke = 1).
ap = average_precision_score(y_test, y_pred_proba[:, 1])
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results
print("Accuracy:", accuracy)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", report)
print("Average Precision:", ap)

Accuracy: 0.6190476190476191
Confusion Matrix: [[12 30]
 [ 2 40]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.29      0.43        42
           1       0.57      0.95      0.71        42

    accuracy                           0.62        84
   macro avg       0.71      0.62      0.57        84
weighted avg       0.71      0.62      0.57        84

Average Precision: 0.6605634547120212
