In [2]:
# Print out the first 5 lines (the header row plus 4 records) of the raw
# transfusion.data file to inspect its layout before loading it.
# The leading `!` hands the line to the system shell, so `head` must be available there.
!head -n5 datasets/transfusion.data

Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007"
2 ,50,12500,98 ,1
0 ,13,3250,28 ,1
1 ,16,4000,35 ,1
2 ,20,5000,45 ,1


In [1]:
#Import pandas
import pandas as pd

#Read in the dataset and assign it to a transfusion variable.
#The path is relative to the notebook's working directory (the same location the
#`!head` preview of this file reads from) instead of a machine-specific absolute
#path, so the notebook can be re-run on another machine.
transfusion = pd.read_csv('datasets/transfusion.data')

#Display the first few rows
transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [2]:
#Print a concise summary of the transfusion dataframe
#(index range, column names, non-null counts, dtypes and memory usage)
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [3]:
#Rename the target column to 'target' for brevity.
#The result of rename is bound back to the name instead of using inplace = True,
#so the frame object loaded earlier is never mutated and re-running the cell is harmless
#(rename ignores labels that are no longer present).
transfusion = transfusion.rename(
    columns = {'whether he/she donated blood in March 2007': 'target'}
)

#Print out the first 2 rows to confirm the change was applied
transfusion.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1


In [4]:
#Target incidence: share of rows per class in the 'target' column, rounded to 3 decimal places
transfusion['target'].value_counts(normalize = True).round(decimals = 3)

0    0.762
1    0.238
Name: target, dtype: float64

In [5]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Split the transfusion dataframe into X_train, X_test, y_train and y_test datasets:
#the features are every column except 'target', 25% of the rows are held out for testing,
#and the split is stratified on the 'target' column with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    transfusion.drop('target', axis = 'columns'),
    transfusion['target'],
    stratify = transfusion['target'],
    random_state = 42,
    test_size = 0.25
)

#Show the first 2 rows of X_train
X_train.head(n = 2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26


In [7]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.11.6.post1-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 9.7 kB/s ta 0:00:01
Collecting deap>=1.2
  Downloading deap-1.3.1-cp38-cp38-macosx_10_13_x86_64.whl (109 kB)
[K     |████████████████████████████████| 109 kB 23 kB/s eta 0:00:01
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25ldone
[?25h  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11956 sha256=652394ee58bdf5938be8f335ba9fc4505df9efb3afc481e17242ff5e62d56698
  Stored in directory: /Users/olaide/Library/Caches/pip/wheels/a8/bb/8f/6b9328d23c2dcedbfeb8498b9f650d55d463089e3b8fc0bfb2
Successfully built stopit
Installing collected packages: deap, update-checker, stopit, tpot
Successfully installed deap-1.3.1 stopit-1.1.2 tpot-0.11.6.post1 u

In [10]:
#Import TPOTClassifier and roc_auc_score
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

#Set up the TPOT search: 5 generations with a population of 20,
#candidates scored by ROC AUC, using the 'TPOT light' configuration and a fixed seed
tpot = TPOTClassifier(
    config_dict = 'TPOT light',
    scoring = 'roc_auc',
    generations = 5,
    population_size = 20,
    random_state = 42,
    verbosity = 2,
    disable_update_check = True
)

#Run the search on the training data, then export the result to a .py file
#NOTE(review): the export file name says 'digits' although this notebook works on the
#transfusion data — consider renaming the file
tpot.fit(X_train, y_train)
tpot.export('tpot_digits_pipeline.py')

#AUC score for tpot model, computed on the test set from the second column of predict_proba
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')

#Print best pipeline steps
print('\nBest pipeline steps:')

#Each fitted step is a (name, transform) pair; only its position and the transform are printed
fitted_steps = tpot.fitted_pipeline_.steps
for idx, (step_name, transform) in enumerate(fitted_steps, start = 1):
    print(f'{idx}.{transform}')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)

AUC score: 0.7637

Best pipeline steps:
1.Normalizer()
2.MultinomialNB(alpha=0.001)


In [13]:
#Per-column variance of the training features, rounded to 3 decimal places
X_train.var().round(decimals = 3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [15]:
#Import numpy
import numpy as np

#Specify which column to normalize
col_to_normalize = 'Monetary (c.c. blood)'

#Log normalization: derive X_train_normed and X_test_normed from X_train and X_test by
#appending the natural log of the column as 'monetary_log' and dropping the original column.
#assign and drop both return new frames, so X_train and X_test are left unchanged.
X_train_normed, X_test_normed = [
    df_.assign(monetary_log = np.log(df_[col_to_normalize])).drop(columns = col_to_normalize)
    for df_ in (X_train, X_test)
]

#Check the variance for X_train_normed
X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

In [18]:
#Importing modules
from sklearn import linear_model

#Instantiate LogisticRegression (liblinear solver, fixed seed) and train it on the
#log-normalized training set; fit returns the estimator itself, so the call is chained
logreg = linear_model.LogisticRegression(
    random_state = 42,
    solver = 'liblinear'
).fit(X_train_normed, y_train)

#AUC score for the logistic regression model, computed on the log-normalized test set
#from the second column of predict_proba
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891


In [19]:
#Importing itemgetter
from operator import itemgetter

#Collect each model's test AUC under its name
model_auc_scores = {'tpot': tpot_auc_score, 'logreg': logreg_auc_score}

#Rank the (name, score) pairs by AUC score, highest first
sorted(model_auc_scores.items(), key = itemgetter(1), reverse = True)

[('logreg', 0.7890972663699937), ('tpot', 0.7637476160203432)]