In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer

breast_cancer_data = load_breast_cancer()

In [3]:
print(breast_cancer_data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [4]:
print(breast_cancer_data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [5]:
breast_cancer_df = pd.DataFrame(breast_cancer_data.data, columns=breast_cancer_data.feature_names)
breast_cancer_df["target"] = breast_cancer_data.target
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
X = breast_cancer_df.iloc[:, :-1].values
y = breast_cancer_df.iloc[:, -1].values

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


pipe = make_pipeline(StandardScaler(), SVC())
clf_gridSearch = GridSearchCV(pipe, param_grid={'svc__gamma': [0.001, 0.01, 0.1, 1, 10], 'svc__C': [0.1, 1, 10, 100]}, scoring='f1_macro')

In [19]:
clf_gridSearch.fit(X_train, y_train)

In [20]:
pd.DataFrame(clf_gridSearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063565,0.025147,0.040031,0.018715,0.1,0.001,"{'svc__C': 0.1, 'svc__gamma': 0.001}",0.48461,0.595693,0.419855,0.490575,0.594148,0.516976,0.068307,12
1,0.026902,0.013229,0.022075,0.017006,0.1,0.01,"{'svc__C': 0.1, 'svc__gamma': 0.01}",0.916143,0.945909,0.931869,0.943732,0.930274,0.933585,0.010705,10
2,0.057466,0.021815,0.024356,0.007444,0.1,0.1,"{'svc__C': 0.1, 'svc__gamma': 0.1}",0.945055,0.945909,0.921875,0.929021,0.903943,0.929161,0.015634,11
3,0.062225,0.012619,0.026998,0.009614,0.1,1.0,"{'svc__C': 0.1, 'svc__gamma': 1}",0.384615,0.384615,0.384615,0.387597,0.382812,0.384851,0.00154,15
4,0.04428,0.007636,0.013861,0.002691,0.1,10.0,"{'svc__C': 0.1, 'svc__gamma': 10}",0.384615,0.384615,0.384615,0.387597,0.382812,0.384851,0.00154,15
5,0.021889,0.001819,0.011572,0.001974,1.0,0.001,"{'svc__C': 1, 'svc__gamma': 0.001}",0.945055,0.959121,0.931869,0.914006,0.930274,0.936065,0.015167,9
6,0.010845,0.00157,0.00787,0.003701,1.0,0.01,"{'svc__C': 1, 'svc__gamma': 0.01}",0.959725,0.959725,0.986575,0.943732,0.930274,0.956006,0.018841,5
7,0.020996,0.002814,0.011193,0.001625,1.0,0.1,"{'svc__C': 1, 'svc__gamma': 0.1}",0.973333,0.934544,0.947334,0.958833,0.946259,0.952061,0.013124,6
8,0.036829,0.004621,0.015828,0.001574,1.0,1.0,"{'svc__C': 1, 'svc__gamma': 1}",0.384615,0.384615,0.384615,0.387597,0.382812,0.384851,0.00154,15
9,0.039311,0.004255,0.016003,0.002069,1.0,10.0,"{'svc__C': 1, 'svc__gamma': 10}",0.384615,0.384615,0.384615,0.387597,0.382812,0.384851,0.00154,15


In [17]:
predict = clf_gridSearch.best_estimator_.predict(X_test)

In [10]:
predict = pipe.predict(X_test)

In [37]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        63
           1       0.98      0.98      0.98       108

    accuracy                           0.98       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171



In [31]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()
lg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
predict_lg = lg.predict(X_test)

In [33]:
print(classification_report(y_test, predict_lg))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171

