In [104]:
from sklearn.datasets import load_breast_cancer
df = load_breast_cancer()

In [105]:
df.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [106]:
X = df.data
y = df.target

In [20]:
print(df.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [107]:
#ANNOVA
from sklearn.feature_selection import f_classif
import numpy as np
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

In [121]:
svalue, pvalue = f_classif(X,y)
print(svalue)

[646.981021 118.096059 697.235272 573.060747 83.651123 313.233079
 533.793126 861.676020 69.527444 0.093459 268.840327 0.039095 253.897392
 243.651586 2.557968 53.247339 39.014482 113.262760 0.024117 3.468275
 860.781707 149.596905 897.944219 661.600206 122.472880 304.341063
 436.691939 964.385393 118.860232 66.443961]


In [122]:
max_val = svalue>100
print(max_val)
np.count_nonzero(max_val ==1)

[ True  True  True  True False  True  True  True False False  True False
  True  True False False False  True False False  True  True  True  True
  True  True  True  True  True False]


20

In [None]:
#Numpy Extraction of features
X_new=X[:,[0,1,2,3,5,6,7,10,11,12,17,20,21,22,23,24,25,26,27,28]]


In [212]:
print(X_new.shape)

(714, 20)


In [209]:
#SMOTE :- Synthetic Minority Oversampling Technique, used for make balanced data for every class.

from imblearn.over_sampling import SMOTE
smt = SMOTE()
X, y = smt.fit_resample(X_new,y)


In [125]:
print(X.shape)
print(y.shape)

(714, 20)
(714,)


In [208]:
# Feature Scaling

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_new=scaler.fit_transform(X)


In [200]:
# Train-Test-Split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_new,y,random_state=10)


In [204]:
#Finding best parameter for model.

from sklearn.model_selection import GridSearchCV
gsv = GridSearchCV(model,param_grid={'n_neighbors':[3,4],'metric':['minkowski'],'p':[2,3,4,5,6,7]},cv=5)
gsv.fit(X_new,y)
print(gsv.best_score_)
print(gsv.best_params_)


0.9747857776026791
{'metric': 'minkowski', 'n_neighbors': 1, 'p': 5}


In [205]:
#Model Selection - KNN

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors = 3,metric = 'minkowski',p=5)
model.fit(X_train,y_train)


In [210]:
print(model.score(X_train,y_train))

print(model.score(X_test,y_test))

1.0
0.9664804469273743
