In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_csv('transfusion.csv')
dataset

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [3]:
print("\nMissing values:")
missing_values = dataset.isnull().sum()
print(missing_values)


Missing values:
Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64


In [4]:
print("\nSummary statistics:")
print(dataset.describe())


Summary statistics:
       Recency (months)  Frequency (times)  Monetary (c.c. blood)  \
count        748.000000         748.000000             748.000000   
mean           9.506684           5.514706            1378.676471   
std            8.095396           5.839307            1459.826781   
min            0.000000           1.000000             250.000000   
25%            2.750000           2.000000             500.000000   
50%            7.000000           4.000000            1000.000000   
75%           14.000000           7.000000            1750.000000   
max           74.000000          50.000000           12500.000000   

       Time (months)  whether he/she donated blood in March 2007  
count     748.000000                                  748.000000  
mean       34.282086                                    0.237968  
std        24.376714                                    0.426124  
min         2.000000                                    0.000000  
25%        16.000000  

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [9]:
X = dataset.drop(['whether he/she donated blood in March 2007'], axis=1) 
y = dataset['whether he/she donated blood in March 2007']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(598, 4)
(150, 4)
(598,)
(150,)


In [10]:
accuracy = {}

model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
acc = model1.score(X_test,y_test)
accuracy['Logistic Regression'] = acc
print(f"Accuracy : {acc}")

print('Classification Report')
print(classification_report(y_test, y_pred))
print()

print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.78
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.98      0.87       116
           1       0.60      0.09      0.15        34

    accuracy                           0.78       150
   macro avg       0.69      0.54      0.51       150
weighted avg       0.74      0.78      0.71       150


Confusion Matrix
[[114   2]
 [ 31   3]]


In [11]:
model2 = GaussianNB()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
acc = model2.score(X_test,y_test)
accuracy['Gaussian Naive Bayes'] = acc
print(f"Accuracy : {acc}")

print('Classification Report')
print(classification_report(y_test, y_pred))
print()

print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.7266666666666667
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.91      0.84       116
           1       0.23      0.09      0.13        34

    accuracy                           0.73       150
   macro avg       0.50      0.50      0.48       150
weighted avg       0.65      0.73      0.68       150


Confusion Matrix
[[106  10]
 [ 31   3]]


In [12]:
model3 = DecisionTreeClassifier()
model3.fit(X_train, y_train)
y_pred = model3.predict(X_test)
acc = model3.score(X_test,y_test)
accuracy['Decision Tree'] = acc
print(f"Accuracy : {acc}")

print('Classification Report')
print(classification_report(y_test, y_pred))
print()

print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.7
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.80      0.81       116
           1       0.34      0.35      0.35        34

    accuracy                           0.70       150
   macro avg       0.58      0.58      0.58       150
weighted avg       0.70      0.70      0.70       150


Confusion Matrix
[[93 23]
 [22 12]]


In [13]:
model4 = RandomForestClassifier()
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
acc = model4.score(X_test,y_test)
accuracy['Random Forest'] = acc
print(f"Accuracy : {acc}")

print('Classification Report')
print(classification_report(y_test, y_pred))
print()

print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.7133333333333334
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       116
           1       0.34      0.29      0.32        34

    accuracy                           0.71       150
   macro avg       0.57      0.57      0.57       150
weighted avg       0.70      0.71      0.70       150


Confusion Matrix
[[97 19]
 [24 10]]


In [14]:
model5 = SVC()
model5.fit(X_train, y_train)
y_pred = model5.predict(X_test)
acc = model5.score(X_test,y_test)
accuracy['Support Vector Machine'] = acc
print(f"Accuracy : {acc}")

print('Classification Report')
print(classification_report(y_test, y_pred))
print()

print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.7533333333333333
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.97      0.86       116
           1       0.00      0.00      0.00        34

    accuracy                           0.75       150
   macro avg       0.38      0.49      0.43       150
weighted avg       0.59      0.75      0.66       150


Confusion Matrix
[[113   3]
 [ 34   0]]


In [15]:
model6 = KNeighborsClassifier()
model6.fit(X_train, y_train)
y_pred = model6.predict(X_test)
acc = model6.score(X_test,y_test)
accuracy['K-Nearest Neighbors'] = acc
print(f"Accuracy : {acc}")

print('Classification Report')
print(classification_report(y_test, y_pred))
print()

print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.7266666666666667
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       116
           1       0.36      0.26      0.31        34

    accuracy                           0.73       150
   macro avg       0.58      0.56      0.57       150
weighted avg       0.70      0.73      0.71       150


Confusion Matrix
[[100  16]
 [ 25   9]]


In [16]:
for name, acc in accuracy.items():
    print(f"{name}: Accuracy = {acc*100:.2f}%")

Logistic Regression: Accuracy = 78.00%
Gaussian Naive Bayes: Accuracy = 72.67%
Decision Tree: Accuracy = 70.00%
Random Forest: Accuracy = 71.33%
Support Vector Machine: Accuracy = 75.33%
K-Nearest Neighbors: Accuracy = 72.67%
