In [1]:
#Lets Import the Pandas for some data pre-processing.
import pandas as pd


In [2]:
#lets load the dataset with the variable name of "transfusion"
#Using ".read_csv()" to read the dataset because our given dataset is CSV.
transfusion = pd.read_csv("transfusion.csv")
transfusion.head(5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [3]:
#lets get the statistical figure of our dataset
#By using Pandas function ".describe()"
transfusion.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [4]:
# get the info of the dataset using pandas ".info()" function. which brings the complete info of our dataset
transfusion.info()
# See here every data is number Or Integer ,we can pass Machine learning activity

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [5]:
#check the null values, using Pandas ".isnull.sum()" function.
transfusion.isnull().sum()
#zero null values in the dataset.

Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64

In [6]:
# Lets assume the target in the dataset which is "whether he/she donated blood in March 2007"
# I am gonna change the name of the column into "target"
transfusion.rename(columns={"whether he/she donated blood in March 2007":"target"},inplace=True)
transfusion.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1


In [7]:
# here we are counting the "0" & "1" values of the target column by using ".value_counts()"
# and rounding the output upto 3 decimals by ".round(n)" function.
transfusion.target.value_counts(normalize=True).round(3)

0    0.762
1    0.238
Name: target, dtype: float64

In [8]:
# creating a ML model by importing the "train_test_spilt" library from "sklearn.model_selection"
from sklearn.model_selection import train_test_split

# passing the train & test parameters of x,y
# here we are giving 25% of data for test and rest 75% for training and droping the "target" column
x_train,x_test,y_train,y_test = train_test_split(
    transfusion.drop(columns='target'),
    transfusion.target,
    test_size=0.25,
    random_state=42,
    stratify=transfusion.target)

In [9]:
#Lets have a glance on the x_train data.
x_train.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26


In [10]:
# lets import the powerful "tpot" library , which automates the ML module and finds the best pipeline for the dataset.
# Also "roc_auc_score" from sklearn
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

#create a new variable with the name "tpot", by creating no:of Generations, dividing population_size in 20.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light')



In [11]:
tpot.fit(x_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5, population_size=20, random_state=42,
               scoring='roc_auc', verbosity=2)

In [12]:
# AUC score for tpot model
# Here tpot module predicts the x_test values by using ".predict_proba()" command
# Print the nAUC score.
tpot_auc_score = roc_auc_score(y_test, tpot.predict_proba(x_test)[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')

# Here we are Printing the best pipeline steps
print('\nBest pipeline steps:', end='\n')
for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{idx}. {transform}')


AUC score: 0.7637

Best pipeline steps:
1. Normalizer()
2. MultinomialNB(alpha=0.001)


In [13]:
# x_train output with rounding upto 3 decimal places. Here i am putting the result in some dataframe using ".reset_index()"
x_train.var().round(3).reset_index()

Unnamed: 0,index,0
0,Recency (months),66.929
1,Frequency (times),33.83
2,Monetary (c.c. blood),2114363.7
3,Time (months),611.147


In [14]:
# Lets import numpy for some mathematical calculations
import numpy as np

# I am Copying x_train and x_test into x_train_normed and x_test_normed
x_train_normed,x_test_normed = x_train.copy(), x_test.copy()

# Here we are Specifying the which column to normalize.In our condition we normalize "Monetary(c.c.blood)"
col_to_normalize = "Monetary (c.c. blood)"

# Log normalization
for df_ in [x_train_normed, x_test_normed]:
    # Adding the log to normalized column
    df_['monetary_log'] = np.log(df_[col_to_normalize])
    # Droping the original column
    df_.drop(columns=col_to_normalize, inplace=True)

# Lets Check the variance for x_train_normed and rounding upto 3 decimals
x_test_normed.var().round(3)

Recency (months)      61.692
Frequency (times)     34.887
Time (months)        533.939
monetary_log           0.828
dtype: float64

In [15]:
# Import the linear_model from sklearn
from sklearn import linear_model

# Instantiate Logistic Regression, by creating the variable logreg.
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=42)

# Here we are training the model, by using the function "logreg.fit()" putting the (x-train, y_train) values.
logreg.fit(x_train_normed, y_train)

# Predicting the AUC score for tpot model
logreg_auc_score = roc_auc_score(y_test, logreg.predict_proba(x_test_normed)[:, 1])
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891


In [16]:
# Importing "itemgetter"
from operator import itemgetter

# Sorting the models based on their AUC score in Descending Order (high to low)
# I am using the "Sorted" Function by passing (tpot, with auc_scaore) & (logreg, with auc_score)
# "reverse=True" is a statement which enables us to arrange values in descending.
sorted([('tpot', tpot_auc_score), ('logreg', logreg_auc_score)],
       key=itemgetter(1),
    reverse = True)

[('logreg', 0.7890972663699937), ('tpot', 0.7637476160203432)]