In [49]:
import pandas as pd

# Location of the blood-transfusion donor table on the UCI ML archive.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data'

# Parse the remote comma-separated file into a DataFrame (needs network access).
df = pd.read_csv(DATA_URL)

# Show the first rows as this cell's rich output.
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [50]:
from sklearn.model_selection import train_test_split

# Label column: every other column of `df` is treated as a feature.
target_col = 'whether he/she donated blood in March 2007'

# Plain ndarray view of the whole frame.
# NOTE(review): `values` is not referenced by any cell visible here - confirm
# whether it is still needed.
values = df.to_numpy()

# Feature matrix (all columns except the label) and label vector.
X = df.drop(columns=[target_col])
y = df[target_col]

# Shuffled 75% / 25% train/test split; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)



In [51]:
import numpy as np
from sklearn.metrics import accuracy_score

# Relative frequency of each label in the training split.
print(y_train.value_counts(normalize=True))

# Baseline: always predict the most frequent training label.
majority_class = y_train.mode().iloc[0]
prediction = np.full(y_train.shape, majority_class)

# Training accuracy of the constant predictor (displayed as the cell output).
accuracy_score(y_train, prediction)

0    0.768271
1    0.231729
Name: whether he/she donated blood in March 2007, dtype: float64


0.768270944741533

In [88]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

# Three-stage model: scale the features, keep the top-scoring ones according
# to the f_classif score function, then fit a logistic regression.
# make_pipeline names each step after its lowercased class name.
pipeline = make_pipeline(
    RobustScaler(),
    SelectKBest(score_func=f_classif),
    LogisticRegression(solver='lbfgs'),
)

In [98]:
from sklearn.model_selection import GridSearchCV
# Search space, keyed as '<pipeline step>__<parameter>':
#   - number of features kept by the selector,
#   - whether the classifier reweights classes,
#   - regularisation parameter C over nine decades (1e-4 .. 1e4).
# 4 * 2 * 9 = 72 candidate settings.
param_grid = {
    'selectkbest__k': [1, 2, 3, 4],
    'logisticregression__class_weight': [None, 'balanced'],
    'logisticregression__C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
}

# Exhaustive 5-fold cross-validated search, ranked by accuracy.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# Fit on the training split only; the fitted search object is the cell output.
clf.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('robustscaler', RobustScaler()),
                                       ('selectkbest', SelectKBest()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': [0.0001, 0.001, 0.01, 0.1,
                                                   1.0, 10.0, 100.0, 1000.0,
                                                   10000.0],
                         'logisticregression__class_weight': [None, 'balanced'],
                         'selectkbest__k': [1, 2, 3, 4]},
             scoring='accuracy', verbose=1)

In [99]:
# Tabulate the per-candidate cross-validation results (one row per parameter
# setting) for display.
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__class_weight,param_selectkbest__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011801,0.002786,0.002600,0.000490,0.0001,,1,"{'logisticregression__C': 0.0001, 'logisticreg...",0.769912,0.767857,0.767857,0.767857,0.767857,0.768268,0.000822,22
1,0.008201,0.000401,0.001601,0.000489,0.0001,,2,"{'logisticregression__C': 0.0001, 'logisticreg...",0.769912,0.767857,0.767857,0.767857,0.767857,0.768268,0.000822,22
2,0.009602,0.002728,0.001601,0.000490,0.0001,,3,"{'logisticregression__C': 0.0001, 'logisticreg...",0.769912,0.767857,0.767857,0.767857,0.767857,0.768268,0.000822,22
3,0.010200,0.001471,0.002404,0.001022,0.0001,,4,"{'logisticregression__C': 0.0001, 'logisticreg...",0.769912,0.767857,0.767857,0.767857,0.767857,0.768268,0.000822,22
4,0.007803,0.000401,0.001798,0.000399,0.0001,balanced,1,"{'logisticregression__C': 0.0001, 'logisticreg...",0.646018,0.580357,0.651786,0.642857,0.625000,0.629204,0.026011,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.009998,0.002444,0.002002,0.000637,10000.0,,4,"{'logisticregression__C': 10000.0, 'logisticre...",0.769912,0.741071,0.803571,0.776786,0.803571,0.778982,0.023382,2
68,0.006998,0.000004,0.001404,0.000491,10000.0,balanced,1,"{'logisticregression__C': 10000.0, 'logisticre...",0.654867,0.580357,0.669643,0.642857,0.625000,0.634545,0.030794,52
69,0.007799,0.001168,0.001599,0.000496,10000.0,balanced,2,"{'logisticregression__C': 10000.0, 'logisticre...",0.654867,0.562500,0.642857,0.616071,0.625000,0.620259,0.031889,61
70,0.007799,0.000748,0.001400,0.000492,10000.0,balanced,3,"{'logisticregression__C': 10000.0, 'logisticre...",0.654867,0.562500,0.642857,0.616071,0.625000,0.620259,0.031889,61


In [100]:
# Summary of the winning candidate from the grid search.
print('Cross Validation Score:', clf.best_score_)
print('Best Parameters:', clf.best_params_)

# Fitted feature-selection step of the refit best pipeline.
selector = clf.best_estimator_.named_steps['selectkbest']

# Mask over the training columns, used below to index (and, negated, to
# complement) the column names.
support_mask = selector.get_support()
all_names = X_train.columns

# Split the column names into those the selector kept and those it dropped.
selected_features = all_names[support_mask]
unselected_features = all_names[~support_mask]

print(selected_features)
print(unselected_features)

Cross Validation Score: 0.7807522123893806
Best Parameters: {'logisticregression__C': 1.0, 'logisticregression__class_weight': None, 'selectkbest__k': 4}
Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)'],
      dtype='object')
Index([], dtype='object')


In [101]:
# Score the tuned model on the held-out test split.
pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

0.7540106951871658
