In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the Wisconsin breast-cancer diagnostic dataset bundled with scikit-learn.
# The returned object exposes .data, .target, .feature_names, .target_names and .DESCR.
cancer = load_breast_cancer()

In [4]:
# Peek at the raw numeric feature matrix (one row per sample, one column per feature).
cancer.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [5]:
# Names of the 30 feature columns, in the same order as the columns of cancer.data.
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [6]:
# Class names; presumably the position is the integer label
# (0 = malignant, 1 = benign) -- confirm against cancer.target.
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [7]:
# Print the dataset's own description (sample count, attribute definitions).
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [8]:
# Build the feature table with named columns in a single step, so the frame is
# never left with anonymous integer column labels and this cell is
# self-contained when re-run on its own.
df=pd.DataFrame(cancer.data,columns=cancer.feature_names)

In [9]:
# Label the columns with the dataset's feature names.
df.columns=cancer.feature_names

In [10]:
# Sanity-check the first five rows of the feature table.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [11]:
# Append the integer class label as a 'target' column (mutates df in place).
df['target']=cancer.target

In [12]:
# Column dtypes and non-null counts -- used here to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [13]:
# (rows, columns) of the table, label column included.
df.shape

(569, 31)

In [14]:
# Summary statistics per column; the features live on very different scales,
# which motivates standardising them before modelling.
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


## Normality check (skewness)

In [15]:
# Sample skewness of every column (0 = symmetric, > 0 = long right tail).
# Note: this also includes the binary 'target' column if it has been added.
df.skew()

mean radius                0.942380
mean texture               0.650450
mean perimeter             0.990650
mean area                  1.645732
mean smoothness            0.456324
mean compactness           1.190123
mean concavity             1.401180
mean concave points        1.171180
mean symmetry              0.725609
mean fractal dimension     1.304489
radius error               3.088612
texture error              1.646444
perimeter error            3.443615
area error                 5.447186
smoothness error           2.314450
compactness error          1.902221
concavity error            5.110463
concave points error       1.444678
symmetry error             2.195133
fractal dimension error    3.923969
worst radius               1.103115
worst texture              0.498321
worst perimeter            1.128164
worst area                 1.859373
worst smoothness           0.415426
worst compactness          1.473555
worst concavity            1.150237
worst concave points       0

In [16]:
# List the strongly skewed *features*.
# - The label column is excluded: skewness of a 0/1 class label is not meaningful here.
# - The absolute value is used so that heavy left skew is flagged as well as right skew.
# - 1.96 is kept as the rule-of-thumb cutoff used by this notebook; it is a heuristic
#   on raw skewness, not a formal normality test.
feature_skew=df.drop(columns='target',errors='ignore').skew()
feature_skew.index[feature_skew.abs()>1.96]

Index(['radius error', 'perimeter error', 'area error', 'smoothness error',
       'concavity error', 'symmetry error', 'fractal dimension error'],
      dtype='object')

## HeatMap

In [17]:
# fig=plt.figure(figsize=(20,20))
# sns.heatmap(df.corr(),annot=True,annot_kws={'size':9.5})

## Split the Data

In [18]:
# Separate the predictors (every column except the label) from the label itself.
x=df.drop(columns='target')
y=df['target']

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

## Scaling

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Standardiser (zero mean, unit variance per feature); fitted in the next cell.
sc=StandardScaler()

In [23]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
sc.fit(x_train)
x_train1=sc.transform(x_train)
x_test1=sc.transform(x_test)

In [24]:
# Wrap the scaled arrays back into DataFrames.
# The index and columns are taken from the unscaled splits so that every row keeps
# the same label as its entry in y_train / y_test. Without this the frames get a
# fresh 0..n-1 index that no longer matches the shuffled target index, and any
# index-aligned operation would silently pair rows with the wrong label.
x_train_sc=pd.DataFrame(x_train1,index=x_train.index,columns=x_train.columns)
x_test_sc=pd.DataFrame(x_test1,index=x_test.index,columns=x_test.columns)

## RFE

In [25]:
from sklearn.feature_selection import RFE

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
# Default logistic regression; used below as the base estimator that ranks features for RFE.
lgr=LogisticRegression()

In [28]:
# Recursive feature elimination driven by the logistic-regression coefficients.
# With no explicit target count, RFE keeps half of the features
# (15 of 30 -- see the shape printed a few cells below).
rfe=RFE(lgr).fit(x_train_sc,y_train)

In [29]:
# Keep only the columns that RFE marked as selected (rfe.support_ is a boolean mask
# with one entry per feature column).
x_train_sc_rfe=x_train_sc.loc[:,rfe.support_]

In [30]:
# Inspect the RFE-selected, standardised training features.
x_train_sc_rfe.head()

Unnamed: 0,mean area,mean compactness,mean concavity,mean concave points,radius error,area error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst concavity,worst concave points,worst symmetry
0,-0.208616,-0.77317,-0.762312,-0.933241,-0.533593,-0.391775,-0.685487,-0.19762,-0.506748,-0.30791,-0.273576,-1.507424,-0.572239,-0.840822,-0.856362
1,-0.296503,-0.587616,-0.091985,-0.542684,-0.830401,-0.531261,0.107689,-0.422917,-0.458495,-0.465287,-0.438127,-1.273017,0.318045,-0.377067,-1.341582
2,-0.010242,1.570006,0.73232,0.386583,0.487478,0.181325,0.624386,0.036022,-1.192272,0.203869,-0.127445,-0.024877,0.272612,-0.047627,-0.089971
3,-0.400014,-0.97065,-0.634704,-0.654992,-0.725949,-0.508935,-0.619011,-0.502189,-0.583287,-0.501,-0.493386,-0.959895,-0.470142,-0.493515,0.226547
4,-0.241538,-0.70063,-0.750349,-0.637469,-0.202659,-0.191661,-0.456221,-0.195534,0.596414,-0.296107,-0.266734,-0.442374,-0.835138,-0.659802,-0.387208


In [31]:
# Confirm the reduced training matrix size (rows, selected features).
x_train_sc_rfe.shape

(398, 15)

## Statsmodels

In [32]:
import statsmodels.api as sm

In [33]:
# Add an explicit intercept column ('const') to the design matrix; the GLM fitted
# below only estimates an intercept if the design matrix carries one.
x_train_sc_rfe_sm=sm.add_constant(x_train_sc_rfe)

In [34]:
# Working list of design-matrix columns (intercept + selected features).
# It is trimmed one label at a time in the elimination steps that follow.
col=x_train_sc_rfe_sm.columns

In [35]:
# Check that the intercept column was added in front of the features.
x_train_sc_rfe_sm.head()

Unnamed: 0,const,mean area,mean compactness,mean concavity,mean concave points,radius error,area error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst concavity,worst concave points,worst symmetry
0,1.0,-0.208616,-0.77317,-0.762312,-0.933241,-0.533593,-0.391775,-0.685487,-0.19762,-0.506748,-0.30791,-0.273576,-1.507424,-0.572239,-0.840822,-0.856362
1,1.0,-0.296503,-0.587616,-0.091985,-0.542684,-0.830401,-0.531261,0.107689,-0.422917,-0.458495,-0.465287,-0.438127,-1.273017,0.318045,-0.377067,-1.341582
2,1.0,-0.010242,1.570006,0.73232,0.386583,0.487478,0.181325,0.624386,0.036022,-1.192272,0.203869,-0.127445,-0.024877,0.272612,-0.047627,-0.089971
3,1.0,-0.400014,-0.97065,-0.634704,-0.654992,-0.725949,-0.508935,-0.619011,-0.502189,-0.583287,-0.501,-0.493386,-0.959895,-0.470142,-0.493515,0.226547
4,1.0,-0.241538,-0.70063,-0.750349,-0.637469,-0.202659,-0.191661,-0.456221,-0.195534,0.596414,-0.296107,-0.266734,-0.442374,-0.835138,-0.659802,-0.387208


In [36]:
# Fit a binomial GLM (i.e. logistic regression) on the full RFE-selected design matrix.
# list(y_train) hands the labels over as a plain positional sequence, so no
# index alignment with the design matrix is attempted.
logm=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [37]:
# Coefficient table with standard errors and p-values; used to pick the next column to drop.
logm.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,382.0
Model Family:,Binomial,Df Model:,15.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.489
Date:,"Fri, 13 Sep 2019",Deviance:,32.979
Time:,17:27:50,Pearson chi2:,65.8
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.7629,3.349,-0.825,0.409,-9.326,3.800
mean area,5.0470,6.624,0.762,0.446,-7.935,18.029
mean compactness,7.3399,3.389,2.166,0.030,0.697,13.983
mean concavity,3.5723,4.685,0.763,0.446,-5.609,12.754
mean concave points,-13.4517,6.561,-2.050,0.040,-26.312,-0.592
radius error,-0.4936,7.881,-0.063,0.950,-15.941,14.954
area error,-9.7839,14.320,-0.683,0.494,-37.851,18.284
fractal dimension error,0.7074,0.962,0.736,0.462,-1.178,2.592
worst radius,7.4901,27.240,0.275,0.783,-45.899,60.879


In [38]:
# Elimination step 1: remove 'worst perimeter' from the working column list.
# NOTE(review): its row is cut off in the summary shown above, so the reason
# (presumably a high p-value) cannot be confirmed from the visible output.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('worst perimeter')

In [39]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [40]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm1=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [41]:
# Review p-values after step 1.
logm1.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,383.0
Model Family:,Binomial,Df Model:,14.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.491
Date:,"Fri, 13 Sep 2019",Deviance:,32.981
Time:,17:27:51,Pearson chi2:,66.0
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.7785,3.334,-0.833,0.405,-9.314,3.757
mean area,5.0345,6.623,0.760,0.447,-7.946,18.015
mean compactness,7.2812,3.158,2.305,0.021,1.091,13.472
mean concavity,3.5946,4.687,0.767,0.443,-5.591,12.781
mean concave points,-13.4648,6.572,-2.049,0.040,-26.345,-0.585
radius error,-0.3780,7.508,-0.050,0.960,-15.093,14.337
area error,-9.9731,13.781,-0.724,0.469,-36.983,17.037
fractal dimension error,0.7093,0.963,0.736,0.462,-1.179,2.598
worst radius,7.1102,26.111,0.272,0.785,-44.066,58.286


In [42]:
# Elimination step 2: 'radius error' has the largest p-value among the rows shown
# in the logm1 summary (0.960), so it is removed next.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('radius error')

In [43]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [44]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm2=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [45]:
# Review p-values after step 2.
logm2.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,384.0
Model Family:,Binomial,Df Model:,13.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.492
Date:,"Fri, 13 Sep 2019",Deviance:,32.984
Time:,17:27:51,Pearson chi2:,66.7
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.8662,2.844,-1.008,0.313,-8.439,2.707
mean area,5.2282,5.391,0.970,0.332,-5.337,15.794
mean compactness,7.2673,3.132,2.320,0.020,1.129,13.406
mean concavity,3.5638,4.633,0.769,0.442,-5.517,12.645
mean concave points,-13.4988,6.536,-2.065,0.039,-26.308,-0.689
area error,-10.6355,4.228,-2.516,0.012,-18.921,-2.350
fractal dimension error,0.7030,0.954,0.737,0.461,-1.167,2.573
worst radius,7.3128,25.696,0.285,0.776,-43.051,57.677
worst texture,-3.1703,0.949,-3.340,0.001,-5.031,-1.310


In [46]:
# Elimination step 3: remove 'worst concave points'.
# NOTE(review): its row is cut off in the summary shown above, so the reason
# (presumably a high p-value) cannot be confirmed from the visible output.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('worst concave points')

In [47]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [48]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm3=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [49]:
# Review p-values after step 3.
logm3.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,385.0
Model Family:,Binomial,Df Model:,12.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.496
Date:,"Fri, 13 Sep 2019",Deviance:,32.991
Time:,17:27:52,Pearson chi2:,65.7
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.9033,2.807,-1.034,0.301,-8.405,2.599
mean area,5.2976,5.342,0.992,0.321,-5.173,15.768
mean compactness,7.2355,3.100,2.334,0.020,1.160,13.311
mean concavity,3.7167,4.263,0.872,0.383,-4.639,12.072
mean concave points,-13.7370,5.926,-2.318,0.020,-25.352,-2.122
area error,-10.5854,4.207,-2.516,0.012,-18.830,-2.341
fractal dimension error,0.6821,0.918,0.743,0.457,-1.117,2.481
worst radius,7.4893,25.581,0.293,0.770,-42.648,57.626
worst texture,-3.1621,0.942,-3.355,0.001,-5.009,-1.315


In [50]:
# Elimination step 4: 'worst radius' has the largest p-value among the rows shown
# in the logm3 summary (0.770), so it is removed next.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('worst radius')

In [51]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [52]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm4=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [53]:
# Review p-values after step 4.
logm4.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,386.0
Model Family:,Binomial,Df Model:,11.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.538
Date:,"Fri, 13 Sep 2019",Deviance:,33.076
Time:,17:27:52,Pearson chi2:,71.3
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.1627,1.163,-1.859,0.063,-4.443,0.118
mean area,5.8454,5.086,1.149,0.250,-4.123,15.813
mean compactness,7.2601,3.113,2.332,0.020,1.159,13.362
mean concavity,2.9824,3.327,0.896,0.370,-3.539,9.504
mean concave points,-13.0305,5.246,-2.484,0.013,-23.313,-2.748
area error,-10.7485,4.317,-2.490,0.013,-19.210,-2.287
fractal dimension error,0.7509,0.879,0.854,0.393,-0.973,2.475
worst texture,-3.1494,0.935,-3.369,0.001,-4.981,-1.317
worst area,-9.5648,7.090,-1.349,0.177,-23.462,4.332


In [54]:
# Elimination step 5: remove 'worst smoothness'.
# NOTE(review): its row is cut off in the summary shown above, so the reason
# (presumably a high p-value) cannot be confirmed from the visible output.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('worst smoothness')

In [55]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [56]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm5=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [57]:
# Review p-values after step 5.
logm5.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,387.0
Model Family:,Binomial,Df Model:,10.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.805
Date:,"Fri, 13 Sep 2019",Deviance:,33.609
Time:,17:27:53,Pearson chi2:,56.9
No. Iterations:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.1283,1.106,-1.924,0.054,-4.296,0.040
mean area,7.0452,4.625,1.523,0.128,-2.019,16.109
mean compactness,7.2465,3.096,2.341,0.019,1.179,13.314
mean concavity,3.3033,3.224,1.024,0.306,-3.017,9.623
mean concave points,-13.8342,5.206,-2.657,0.008,-24.038,-3.631
area error,-10.3661,3.979,-2.605,0.009,-18.165,-2.567
fractal dimension error,0.7001,0.876,0.800,0.424,-1.016,2.416
worst texture,-3.1469,0.926,-3.399,0.001,-4.961,-1.333
worst area,-10.6865,6.625,-1.613,0.107,-23.671,2.298


In [58]:
# Elimination step 6: 'fractal dimension error' has the largest p-value among the
# rows shown in the logm5 summary (0.424), so it is removed next.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('fractal dimension error')

In [59]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [60]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm6=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [61]:
# Review p-values after step 6.
logm6.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,388.0
Model Family:,Binomial,Df Model:,9.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-17.13
Date:,"Fri, 13 Sep 2019",Deviance:,34.26
Time:,17:27:53,Pearson chi2:,56.6
No. Iterations:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.8752,0.986,-1.901,0.057,-3.808,0.058
mean area,6.4310,4.414,1.457,0.145,-2.221,15.083
mean compactness,7.1223,2.880,2.473,0.013,1.477,12.767
mean concavity,5.1442,2.219,2.318,0.020,0.794,9.494
mean concave points,-14.6725,4.965,-2.955,0.003,-24.403,-4.942
area error,-9.6832,3.244,-2.985,0.003,-16.040,-3.326
worst texture,-3.0845,0.862,-3.578,0.000,-4.774,-1.395
worst area,-10.1413,6.249,-1.623,0.105,-22.389,2.107
worst concavity,-6.0999,2.493,-2.447,0.014,-10.986,-1.214


In [62]:
# Elimination step 7: 'mean area' has the largest p-value among the rows shown
# in the logm6 summary (0.145), so it is removed next.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('mean area')

In [63]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [64]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm7=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [65]:
# Review p-values after step 7.
logm7.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,389.0
Model Family:,Binomial,Df Model:,8.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-18.358
Date:,"Fri, 13 Sep 2019",Deviance:,36.716
Time:,17:27:54,Pearson chi2:,69.3
No. Iterations:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.1147,0.710,-1.571,0.116,-2.505,0.276
mean compactness,6.9216,2.537,2.728,0.006,1.949,11.894
mean concavity,5.7331,2.381,2.408,0.016,1.067,10.400
mean concave points,-14.0047,4.488,-3.120,0.002,-22.802,-5.208
area error,-10.3186,3.770,-2.737,0.006,-17.708,-2.929
worst texture,-2.8926,0.786,-3.681,0.000,-4.433,-1.352
worst area,-1.3718,1.903,-0.721,0.471,-5.102,2.358
worst concavity,-6.4398,2.343,-2.748,0.006,-11.032,-1.847
worst symmetry,-2.1574,0.828,-2.605,0.009,-3.780,-0.534


In [66]:
# Elimination step 8: 'worst area' has the largest p-value in the logm7 summary
# (0.471), so it is removed next.
# Not idempotent: running this cell twice raises KeyError.
col=col.drop('worst area')

In [67]:
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]

In [68]:
# Refit the binomial GLM on the reduced design matrix (labels passed positionally).
logm8=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()

In [69]:
# Review p-values after step 8; every remaining feature is now below 0.05.
logm8.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,390.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-18.63
Date:,"Fri, 13 Sep 2019",Deviance:,37.259
Time:,17:27:55,Pearson chi2:,62.5
No. Iterations:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2643,0.699,-1.810,0.070,-2.634,0.105
mean compactness,7.6671,2.454,3.124,0.002,2.857,12.477
mean concavity,6.7404,2.039,3.306,0.001,2.745,10.736
mean concave points,-15.6253,4.213,-3.709,0.000,-23.882,-7.368
area error,-11.9897,3.225,-3.718,0.000,-18.310,-5.669
worst texture,-3.0245,0.798,-3.790,0.000,-4.589,-1.460
worst concavity,-7.2609,2.176,-3.336,0.001,-11.527,-2.995
worst symmetry,-2.2841,0.784,-2.914,0.004,-3.821,-0.748


In [70]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor for every column of the current design matrix
# (intercept included), rounded to two decimals and listed worst-first, to
# spot multicollinearity that the p-values alone do not reveal.
vif=pd.DataFrame({
    'Features':x_train_sc_rfe_sm.columns,
    'VIF':[variance_inflation_factor(x_train_sc_rfe_sm.values,idx)
           for idx in range(x_train_sc_rfe_sm.shape[1])],
})
vif=vif.assign(VIF=vif['VIF'].round(2)).sort_values(by='VIF',ascending=False)
vif

Unnamed: 0,Features,VIF
2,mean concavity,17.66
3,mean concave points,8.46
6,worst concavity,7.52
1,mean compactness,4.98
4,area error,2.25
7,worst symmetry,1.64
5,worst texture,1.21
0,const,1.0


In [71]:
# Elimination step 9 (multicollinearity): 'mean concavity' has by far the largest
# VIF in the table above (17.66), so it is dropped even though its p-value is small.
# Not idempotent: running this cell twice raises KeyError on the drop.
col=col.drop('mean concavity')
# Shrink the design matrix to the surviving columns (overwrites the previous matrix).
x_train_sc_rfe_sm=x_train_sc_rfe_sm[col]
# Final model: binomial GLM on intercept + the six remaining features.
logm9=sm.GLM(list(y_train),x_train_sc_rfe_sm,family=sm.families.Binomial()).fit()
logm9.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,398.0
Model:,GLM,Df Residuals:,391.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-29.231
Date:,"Fri, 13 Sep 2019",Deviance:,58.461
Time:,17:27:55,Pearson chi2:,763.0
No. Iterations:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.5826,0.529,-1.101,0.271,-1.620,0.454
mean compactness,4.5661,1.303,3.503,0.000,2.011,7.121
mean concave points,-8.0581,1.876,-4.296,0.000,-11.735,-4.381
area error,-6.7786,1.849,-3.667,0.000,-10.402,-3.155
worst texture,-2.2001,0.495,-4.445,0.000,-3.170,-1.230
worst concavity,-1.6697,0.829,-2.015,0.044,-3.294,-0.045
worst symmetry,-2.1677,0.625,-3.466,0.001,-3.394,-0.942


In [72]:
# Recompute the variance inflation factors on the current design matrix to
# check the multicollinearity that remains (rounded, listed worst-first).
# Relies on variance_inflation_factor having been imported in an earlier cell.
vif=pd.DataFrame({
    'Features':x_train_sc_rfe_sm.columns,
    'VIF':[variance_inflation_factor(x_train_sc_rfe_sm.values,idx)
           for idx in range(x_train_sc_rfe_sm.shape[1])],
})
vif=vif.assign(VIF=vif['VIF'].round(2)).sort_values(by='VIF',ascending=False)
vif

Unnamed: 0,Features,VIF
2,mean concave points,5.37
1,mean compactness,4.43
5,worst concavity,3.97
3,area error,2.1
6,worst symmetry,1.49
4,worst texture,1.19
0,const,1.0


## Prediction

In [73]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [74]:
# lgr.fit(x_train_sc_rfe_sm,y_train)

In [75]:
# Model output of the final GLM on the training design matrix. These are continuous
# scores (predicted probabilities of class 1), turned into 0/1 labels by rounding below.
y_pred_sm_train=logm9.predict(x_train_sc_rfe_sm)

In [76]:
# Training accuracy, with probabilities rounded to the nearest class (0.5 cutoff).
accuracy_score(y_train,y_pred_sm_train.round())

0.9748743718592965

In [78]:
# Training confusion matrix (rows = true class, columns = predicted class).
confusion_matrix(y_train,y_pred_sm_train.round())

array([[142,   7],
       [  3, 246]], dtype=int64)

In [93]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train,y_pred_sm_train.round()))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       149
           1       0.97      0.99      0.98       249

    accuracy                           0.97       398
   macro avg       0.98      0.97      0.97       398
weighted avg       0.97      0.97      0.97       398



In [94]:
# Test design matrix: the scaled test features plus an intercept column.
# NOTE: despite its name this variable holds model *inputs*, not predictions;
# the name is kept because the following cell reads it.
# has_constant='add' forces the 'const' column to be created. The default
# ('skip') silently omits it when any existing column happens to be constant,
# which would make the later selection of 'const' fail with a KeyError.
y_pred_lgr_test=sm.add_constant(x_test_sc,has_constant='add')

In [95]:
# Keep exactly the columns of the final training design matrix, in the same order,
# so the test matrix lines up with the coefficients of logm9.
# (y_pred_lgr_test is the test design matrix with intercept, not a prediction.)
x_test_sc_sm=y_pred_lgr_test[x_train_sc_rfe_sm.columns]

In [96]:
# Check that the test design matrix has the intercept and the selected features.
x_test_sc_sm.head()

Unnamed: 0,const,mean compactness,mean concave points,area error,worst texture,worst concavity,worst symmetry
0,1.0,0.048819,-0.271603,-0.21112,-0.144019,-0.007133,0.21048
1,1.0,-0.007883,0.824931,1.135809,0.178777,0.001034,-0.533417
2,1.0,0.371436,0.86104,0.157975,0.082271,0.564609,-0.073904
3,1.0,0.553274,-0.542949,-0.51549,-0.431872,-0.143943,-0.525384
4,1.0,0.170045,-0.597378,-0.40427,-0.96931,-0.453296,-0.890102


In [97]:
# Model output of the final GLM on the held-out test rows (continuous scores,
# rounded to 0/1 labels in the next cell).
y_pred_sm_test=logm9.predict(x_test_sc_sm)

In [98]:
# Held-out test accuracy with a 0.5 cutoff; compare with the training accuracy above.
accuracy_score(y_test,y_pred_sm_test.round())

0.9473684210526315

In [99]:
# Baseline for comparison: k-nearest-neighbours with default settings, trained on
# all 30 scaled features (no feature selection).
from sklearn.neighbors import KNeighborsClassifier

knn=KNeighborsClassifier()

# fit() returns the estimator itself, so leaving it as the last expression also
# displays the fitted model's parameters.
knn.fit(x_train_sc,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [100]:
# KNN class predictions on the *training* rows (the same data the model was fitted on).
pred=knn.predict(x_train_sc)

In [102]:
# Training accuracy of the KNN baseline. This is not an estimate of generalisation;
# NOTE(review): evaluate on x_test_sc / y_test before comparing with the GLM.
accuracy_score(y_train,pred)

0.9773869346733668