# Preprocessor Tuning

In [38]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import set_config; set_config(display='diagram')

## (0) The Tumors Dataset

* The following dataset describes tumors that are either <font color=red>malignant</font> or <font color=green>benign</font>. 
* The task is to detect as many malignant tumors as possible.

In [3]:
# Read the labelled tumors table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../data/workflow/tumors_dataset.csv")
data.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,malignant
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,,0.205,0.4,0.1625,0.2364,0.07678,1


In [4]:
# Summary statistics per numeric column; a `count` below the number of rows means missing values.
data.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,malignant
count,568.0,569.0,567.0,568.0,568.0,569.0,567.0,567.0,569.0,569.0,...,566.0,566.0,567.0,564.0,564.0,565.0,566.0,566.0,567.0,569.0
mean,14.12565,19.289649,91.884568,655.126937,0.096319,0.104341,0.088756,0.048845,0.181162,0.062798,...,25.662792,107.370583,881.842504,0.132332,0.254284,0.27164,0.114679,0.290069,0.083956,0.372583
std,3.526937,4.301036,24.273297,352.17855,0.014042,0.052813,0.079754,0.038846,0.027414,0.00706,...,6.158413,33.651145,569.96564,0.022906,0.157231,0.208576,0.065735,0.061893,0.018091,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.6975,16.17,75.1,420.175,0.08629,0.06492,0.02952,0.02031,0.1619,0.0577,...,21.065,84.1225,514.65,0.1165,0.147575,0.1144,0.064943,0.250425,0.071365,0.0
50%,13.355,18.84,86.18,551.4,0.095865,0.09263,0.06154,0.03341,0.1792,0.06154,...,25.37,97.665,686.6,0.1313,0.213,0.2267,0.100015,0.28225,0.08004,0.0
75%,15.7975,21.8,103.95,784.15,0.1053,0.1304,0.13,0.07382,0.1957,0.06612,...,29.705,125.775,1086.0,0.146025,0.3393,0.3829,0.161375,0.317825,0.092085,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


In [7]:
# Dtype and non-null count for every column — shows which features will need imputing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              568 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           567 non-null    float64
 3   mean area                568 non-null    float64
 4   mean smoothness          568 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           567 non-null    float64
 7   mean concave points      567 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            567 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [9]:
# Class balance of the target: proportions are printed, raw counts are the cell's output.
print(data["malignant"].value_counts(normalize=True))
data["malignant"].value_counts(normalize=False)

malignant
0    0.627417
1    0.372583
Name: proportion, dtype: float64


malignant
0    357
1    212
Name: count, dtype: int64

## (1) Building a Pipeline

In [18]:
# Chain the preprocessing and the classifier so they are always fitted and applied together:
#   1. fill missing values from the nearest neighbours,
#   2. rescale features with min-max scaling,
#   3. classify with a logistic regression.
pipeline = Pipeline(
    steps=[
        ("imputer", KNNImputer()),
        ("scaling", MinMaxScaler()),
        ("model", LogisticRegression()),
    ]
)

pipeline

0,1,2
,steps,"[('imputer', ...), ('scaling', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


## (2) Optimizing a pipelined model

In [None]:
# Separate features from the target, then hold out a stratified split
# (seeded with random_state=0 so the split is reproducible).
X = data.drop(columns="malignant")
y = data["malignant"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Tune only the imputer's number of neighbours (1 through 15) with 10-fold CV.
# Recall is the metric because the stated goal is to catch as many malignant tumors as possible.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid={"imputer__n_neighbors": list(range(1, 16))},
    scoring="recall",
    cv=10,
)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'imputer__n_neighbors': 2}

In [29]:
# Same search over the imputer's number of neighbours (1 through 15), now with 5 folds
# and fitted on the complete dataset.
# NOTE(review): this overwrites the `grid_search` fitted in the previous cell and tunes on
# every row, including the ones `train_test_split` set aside as X_test — confirm this is intended.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid={"imputer__n_neighbors": list(range(1, 16))},
    scoring="recall",
    cv=5,
)
grid_search.fit(data.drop(columns="malignant"), data["malignant"])
grid_search.best_params_

{'imputer__n_neighbors': 3}

In [30]:
# Keep the number of neighbours selected for the KNN imputer by the most recent grid search.
n_best = grid_search.best_params_["imputer__n_neighbors"]
n_best

3

## (3) Evaluating a pipeline

In [39]:
# Rebuild the stratified, seeded split so this cell can run on its own.
X = data.drop(columns="malignant")
y = data["malignant"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Mean recall of the tuned pipeline over 5 cross-validation folds of the training split.
# NOTE(review): X_test / y_test are not used by any visible cell — consider also reporting
# recall on that held-out split.
cv_score = cross_val_score(
    estimator=grid_search.best_estimator_,
    X=X_train,
    y=y_train,
    scoring="recall",
    cv=5,
).mean()
cv_score

np.float64(0.9056451612903226)

## (4) Predicting using a fitted and pipelined model

In [41]:
# Load the unlabelled observation to classify and preview it.
new_tumor = pd.read_csv(filepath_or_buffer="../data/workflow/new_tumor.csv")
new_tumor.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [None]:
# Take the best pipeline (imputer + scaler + model) found by the most recent grid search.
# Predicting needs neither the feature matrix nor the target, so they are not rebuilt here.
estimator = grid_search.best_estimator_

# Make a prediction with pipeline (best estimator)
estimator.predict(new_tumor)

array([1])

In [43]:
# FYI, the fitted pipeline also exposes the predicted probabilities, one column per class
# (presumably ordered [class 0, class 1], i.e. [benign, malignant] — confirm via `classes_`).
grid_search.best_estimator_.predict_proba(new_tumor)

array([[0.03194086, 0.96805914]])