In [1]:
import pandas as pd
import numpy as np

# Load the comma-separated transfusion dataset and display the resulting frame.
df = pd.read_csv("transfusion.data", sep=",")
df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
5,4,4,1000,4,0
6,2,7,1750,14,1
7,1,12,3000,35,0
8,2,9,2250,22,1
9,5,46,11500,98,1


In [3]:
# Count missing values per column.
df.isna().sum()

Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64

In [5]:
# Shorten the verbose label column name to 'target'. The renamed frame is
# reassigned instead of mutating the existing object in place, so the step
# reads as an explicit transformation of `df`.
df = df.rename(columns={'whether he/she donated blood in March 2007': 'target'})
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [6]:
from sklearn import linear_model

In [10]:
# Relative frequency of each class in the target column.
df['target'].value_counts(normalize=True)

0    0.762032
1    0.237968
Name: target, dtype: float64

In [11]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation, for both features and target.
# The split is stratified on the target, and the fixed random_state makes the
# random split identical on every run of this script.
features = df.drop(columns=['target'])
labels = df.target

train_X, val_X, train_y, val_y = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)
print(sklearn.__version__)

0.24.1


In [12]:
# Dimensions (rows, columns) of the training feature frame.
train_X.shape

(561, 4)

In [13]:
# Preview the first five rows of the training features.
train_X.head(n=5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26
116,2,7,1750,46
661,16,2,500,16
154,2,1,250,2


In [14]:
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

# Configure the TPOT pipeline search: 7 generations with a population of 20,
# scored by ROC AUC, using the 'TPOT light' configuration and a fixed seed.
tpot_settings = dict(
    generations=7,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light',
)
tpot = TPOTClassifier(**tpot_settings)
tpot.fit(train_X, train_y)

# Score the fitted search result on the held-out validation split, using the
# predicted probability in column 1.
val_proba = tpot.predict_proba(val_X)[:, 1]
auc_score = roc_auc_score(val_y, val_proba)
print("\nAUC SCORE: {:.4f}".format(auc_score))

# List every step of the best fitted pipeline, numbered from 1.
print("\nBEST PIPELINE STEPS:", end="\n")
for position, (_step_name, step) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print("{}. {}".format(position, step))

Optimization Progress:   0%|          | 0/160 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Generation 6 - Current best internal CV score: 0.7464101394881147

Generation 7 - Current best internal CV score: 0.7464101394881147

Best pipeline: LogisticRegression(CombineDFs(Normalizer(StandardScaler(input_matrix), norm=l2), StandardScaler(input_matrix)), C=25.0, dual=False, penalty=l2)

AUC SCORE: 0.7920

BEST PIPELINE STEPS:
1. FeatureUnion(transformer_list=[('pipeline',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('normalizer', Normalizer())])),
                               ('standardscaler', StandardScaler())

In [15]:
# Per-column variance of the training features, rounded to three decimals.
train_X.var().round(decimals=3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [16]:
# Column to log-transform.
norm_column = 'Monetary (c.c. blood)'


def log_normalize(frame, column, new_column='monetary_log'):
    """Return a new frame with `column` replaced by its natural logarithm.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input features; left unmodified.
    column : str
        Name of the column to transform and then drop.
    new_column : str
        Name of the appended column holding ``np.log(frame[column])``.

    Returns
    -------
    pandas.DataFrame
        Copy of `frame` without `column` and with `new_column` appended last.

    Raises
    ------
    ValueError
        If `column` contains zero or negative values, whose logarithm would
        silently become -inf / NaN and corrupt downstream model fitting.
    """
    if (frame[column] <= 0).any():
        raise ValueError(
            "Cannot log-normalize {!r}: it contains non-positive values".format(column)
        )
    # assign() appends the new column at the end; drop() then removes the source.
    return frame.assign(**{new_column: np.log(frame[column])}).drop(columns=column)


# Apply the same transformation to the training and validation features.
train_X_norm = log_normalize(train_X, norm_column)
val_X_norm = log_normalize(val_X, norm_column)

# Variance of train_X_norm
print(train_X_norm.var().round(3))

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64


In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver and a fixed seed.
log_reg = LogisticRegression(random_state=42, solver='liblinear')
# Fit on the log-normalised training features.
log_reg.fit(train_X_norm, train_y)

LogisticRegression(random_state=42, solver='liblinear')

In [18]:
# Validation ROC AUC, computed from the predicted probability in column 1.
val_scores = log_reg.predict_proba(val_X_norm)[:, 1]
log_reg_auc = roc_auc_score(val_y, val_scores)
print("\nAUC SCORE: {:.4f}".format(log_reg_auc))


AUC SCORE: 0.7891
