## Predict whether or not a donor will give blood the next time the vehicle comes to campus.

In [1]:
#importing library
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the blood-donation records from the local datasets folder into a DataFrame.
transfusion = pd.read_csv('./datasets/transfusion.data')

In [3]:
# Preview the loaded records.
transfusion

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [4]:
# Inspect column dtypes and non-null counts to check for missing values.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [5]:
# Give the verbose target column a shorter, easier-to-type name (renamed in place).
target_rename = {'whether he/she donated blood in March 2007': 'Donated in March 2007'}
transfusion.rename(columns=target_rename, inplace=True)

In [6]:
# Confirm that the target column now carries the shorter name.
transfusion

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Donated in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [7]:
# normalize=True reports each class as a fraction of all rows instead of a raw count.
transfusion['Donated in March 2007'].value_counts(normalize=True)

0    0.762032
1    0.237968
Name: Donated in March 2007, dtype: float64

In [8]:
# Feature matrix: every column except the target.
X = transfusion.drop(columns=['Donated in March 2007'])

In [9]:
# Target vector: 1 if the donor gave blood in March 2007, otherwise 0.
Y = transfusion.loc[:, 'Donated in March 2007']

In [10]:
# Hold out 25% of the rows for testing. Stratifying on Y keeps the class
# proportions of the train and test sets in line with the full dataset.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
    stratify=Y,
)

In [11]:
# Class proportions in the training split (compare with the full dataset to verify stratification).
y_train.value_counts(normalize=True)

0    0.761141
1    0.238859
Name: Donated in March 2007, dtype: float64

In [12]:
# Class proportions in the test split (compare with the full dataset to verify stratification).
y_test.value_counts(normalize=True)

0    0.764706
1    0.235294
Name: Donated in March 2007, dtype: float64

In [13]:
from tpot import TPOTClassifier



In [14]:
from sklearn.metrics import roc_auc_score

In [17]:
# Settings for the TPOT pipeline search, gathered in one place so they are easy to tune.
# Candidate pipelines are scored by ROC AUC and the search is seeded for reproducibility.
tpot_settings = {
    'generations': 5,
    'population_size': 20,
    'verbosity': 1,
    'scoring': 'roc_auc',
    'random_state': 42,
    'disable_update_check': True,
    'config_dict': 'TPOT light',
}
tpot = TPOTClassifier(**tpot_settings)

In [18]:
# Run the pipeline search using the training split only; the test split stays unseen.
tpot.fit(x_train , y_train)

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5, population_size=20, random_state=42,
               scoring='roc_auc', verbosity=1)

In [20]:
# ROC AUC of the best TPOT pipeline on the held-out test set, computed from
# the predicted probability of class 1 (second column of predict_proba).
tpot_test_proba = tpot.predict_proba(x_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7637


In [21]:
# List the steps of the winning pipeline in the order they are applied.
print('\nBest pipeline steps:')
pipeline_steps = tpot.fitted_pipeline_.steps
for idx, (_step_name, transform) in enumerate(pipeline_steps, start=1):
    # Only the fitted estimator/transformer is shown; the step name is not needed.
    print(f'{idx}. {transform}')


Best pipeline steps:
1. Normalizer()
2. MultinomialNB(alpha=0.001)


In [23]:
# Variance measures the spread of each feature. 'Monetary (c.c. blood)' has a far
# larger variance than the other features, so a model may weight it more heavily
# than the rest; it should be normalized before fitting.
x_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [24]:
# Normalize the variance by log-transforming the high-variance 'Monetary (c.c. blood)' feature

In [25]:
# Name of the high-variance feature that will be log-transformed.
col_to_normalize = 'Monetary (c.c. blood)'

# Work on copies so the original train/test feature frames stay untouched.
x_train_normed = x_train.copy()
x_test_normed = x_test.copy()

In [28]:
# Replace the high-variance column with its natural log.
#
# The frames are rebuilt from the untouched x_train / x_test rather than mutated
# in place, so this cell can be re-run safely. The previous in-place version
# raised a KeyError on a second run because the source column had already been
# dropped.
#
# The new column is appended last and the original column is removed, giving the
# same column order as before. The existing column name 'monetart_log' is kept
# unchanged so downstream cells and recorded outputs still match.
x_train_normed, x_test_normed = (
    df_.assign(monetart_log=np.log(df_[col_to_normalize])).drop(columns=col_to_normalize)
    for df_ in (x_train, x_test)
)

In [30]:
# Re-check the variances: the log-transformed column should no longer dominate the others.
x_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetart_log           0.837
dtype: float64

## MultinomialNB Model

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
# Multinomial naive Bayes classifier, constructed with no arguments (library defaults).
mnb = MultinomialNB()

In [33]:
# Fit on the log-transformed training features.
mnb.fit(x_train_normed,y_train)

MultinomialNB()

In [38]:
# Predict on the log-transformed test features, i.e. the same feature set the
# model was fitted on (x_train_normed). The raw x_test still has the
# 'Monetary (c.c. blood)' column in place of 'monetart_log', so its columns
# do not match what the model was trained with.
mnb_pred = mnb.predict(x_test_normed)

In [39]:
# ROC AUC of the MultinomialNB model on the held-out test set, computed from
# the predicted probability of class 1 (second column of predict_proba).
mnb_test_proba = mnb.predict_proba(x_test_normed)[:, 1]
mnb_auc_score = roc_auc_score(y_test, mnb_test_proba)
print(f'\nAUC score: {mnb_auc_score:.4f}')


AUC score: 0.7638


## Logistic Regression Model

In [41]:
from sklearn import linear_model

# Fit a logistic-regression model on the log-transformed training features.
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=42)
logreg.fit(x_train_normed, y_train)

# ROC AUC of the logistic-regression model on the held-out test set, computed
# from the predicted probability of class 1 (second column of predict_proba).
logreg_test_proba = logreg.predict_proba(x_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7890


In [43]:
from operator import itemgetter

# Collect each model's test-set ROC AUC under a short label...
model_auc_scores = {
    'tpot': tpot_auc_score,
    'logreg': logreg_auc_score,
    'mnb': mnb_auc_score,
}

# ...and rank the (label, score) pairs from the highest AUC to the lowest.
sorted(model_auc_scores.items(), key=itemgetter(1), reverse=True)

[('logreg', 0.7890178003814368),
 ('mnb', 0.7638270820089001),
 ('tpot', 0.7637476160203432)]

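## As shown above, MultinomialNB trained on the log-normalized data scores marginally higher (AUC 0.7638) than the TPOT pipeline (AUC 0.7637), while logistic regression scores highest (AUC 0.7890).
## A possible next step is to add the Normalizer step to the MultinomialNB model and check whether its AUC can surpass that of the logistic regression model.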

# Thank you