## Sử dụng thuật toán

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_style('darkgrid')

In [None]:
data = pd.read_csv("/content/sample_data/heart_failure_clinical_records_dataset.csv")
data.head(9)

In [None]:
#normalize
data = data.apply(lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)))

In [None]:
data['DEATH_EVENT'].astype(int)

In [None]:
corr = data.iloc[:,:-1].corr(method="pearson")
cmap = sns.diverging_palette(250,354,80,60,center='dark',as_cmap=True)
sns.heatmap(corr, vmax=1, vmin=-.5, cmap=cmap, square=True, linewidths=.2)

In [None]:
def calculate_prior(data, Y):
    classes = sorted(list(data[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(data[data[Y]==i])/len(data))
    return prior

In [None]:
def calculate_likelihood_gaussian(data, feat_name, feat_val, Y, label):
    feat = list(data.columns)
    data = data[data[Y]==label]
    mean, std = data[feat_name].mean(), data[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
    return p_x_given_y

In [None]:
def naive_bayes_gaussian(data, X, Y):
    # get feature names
    features = list(data.columns)[:-1]

    # calculate prior
    prior = calculate_prior(data, Y)

    Y_pred = []
    # loop over every data sample
    for x in X:
        # calculate likelihood
        labels = sorted(list(data[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_gaussian(data, features[i], x[i], Y, labels[j])

        # calculate posterior probability (numerator only)
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred) 

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=.3, random_state=41)

In [None]:
X_test = test.iloc[:,:-1].values
Y_test = test.iloc[:,-1].values
Y_pred = naive_bayes_gaussian(train, X=X_test, Y="DEATH_EVENT")
print(Y_test)

## Sử dụng thư viện

In [65]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [66]:
df = pd.read_csv('/content/sample_data/heart_failure_clinical_records_dataset.csv')

In [3]:
df.shape

(299, 13)

In [67]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [68]:
# apply normalization techniques
for column in df.columns:
    df[column] = df[column]  / df[column].abs().max()
      
# view normalized data
display(df)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,0.789474,0.0,0.074036,0.0,0.2500,1.0,0.311765,0.202128,0.878378,1.0,0.0,0.014035,1.0
1,0.578947,0.0,1.000000,0.0,0.4750,0.0,0.309833,0.117021,0.918919,1.0,0.0,0.021053,1.0
2,0.684211,0.0,0.018573,0.0,0.2500,0.0,0.190588,0.138298,0.871622,1.0,1.0,0.024561,1.0
3,0.526316,1.0,0.014120,0.0,0.2500,0.0,0.247059,0.202128,0.925676,1.0,0.0,0.024561,1.0
4,0.684211,1.0,0.020354,1.0,0.2500,0.0,0.384706,0.287234,0.783784,0.0,0.0,0.028070,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,0.652632,0.0,0.007760,1.0,0.4750,1.0,0.182353,0.117021,0.966216,1.0,1.0,0.947368,0.0
295,0.578947,0.0,0.231523,0.0,0.4750,0.0,0.317647,0.127660,0.939189,0.0,0.0,0.950877,0.0
296,0.473684,0.0,0.262053,1.0,0.7500,0.0,0.872941,0.085106,0.932432,0.0,0.0,0.975439,0.0
297,0.473684,0.0,0.306958,0.0,0.4750,0.0,0.164706,0.148936,0.945946,1.0,1.0,0.982456,0.0


In [69]:
df.DEATH_EVENT.astype(int)

0      1
1      1
2      1
3      1
4      1
      ..
294    0
295    0
296    0
297    0
298    0
Name: DEATH_EVENT, Length: 299, dtype: int64

In [71]:
#df_max_scaled
x = df[['age', 'anaemia', 'diabetes', 'sex','creatinine_phosphokinase', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'smoking', 'time' ]]
target = df['DEATH_EVENT']

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(x,target,test_size= 0.2)

In [73]:
y_test.shape

(60,)

In [74]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [75]:
model.fit(X_train, y_train)


MultinomialNB()

In [76]:
nr_correct = (y_test == model.predict(X_test)).sum()

In [77]:
print(f'{nr_correct} documents classified correctly')

46 documents classified correctly


In [78]:
nr_incorrect = y_test.size - nr_correct

In [79]:
print(f'{nr_incorrect} documents classified incorrectly')

14 documents classified incorrectly


In [80]:
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The testing accuracy of the model is {1-fraction_wrong:.2%}') 

The testing accuracy of the model is 76.67%


In [81]:
model.score(X_test, y_test)

0.7666666666666667

In [82]:
from sklearn.naive_bayes import GaussianNB
classifer = GaussianNB()

In [83]:
classifer.fit(X_train, y_train)

GaussianNB()

In [84]:
correct = (y_test == classifer.predict(X_test)).sum()

In [85]:
print(f'{correct} documents classified correctly')

50 documents classified correctly


In [86]:
incorrect = y_test.size - correct

In [87]:
print(f'{incorrect} documents classified incorrectly')

10 documents classified incorrectly


In [88]:
fraction_wr = incorrect / (correct + incorrect)
print(f'The testing accuracy of the model is {1-fraction_wr:.2%}') 

The testing accuracy of the model is 83.33%


In [89]:
recall_score(y_test, classifer.predict(X_test))

0.5333333333333333

In [90]:
precision_score(y_test, classifer.predict(X_test))

0.7272727272727273

In [91]:
f1_score(y_test, classifer.predict(X_test))

0.6153846153846153