In [269]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split

In [270]:
df = pd.read_csv('./breast_cancer.csv')


In [271]:
df.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [272]:

y = df['Class']
x = df.drop(['Class'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.3)


In [273]:
#Librerias especificas de regresion logistica
from ISLP.models import (ModelSpec as MS, summarize)
import statsmodels.api as sm

In [274]:
positive = y_train == 4
diseño = MS(x_train)
X = diseño.fit_transform(x_train)
Y = positive
glm = sm.GLM(Y,X,
             family=sm.families.Binomial())
resultados = glm.fit()
summarize(resultados)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-13.5619,2.41,-5.628,0.0
Clump Thickness,0.8664,0.24,3.611,0.0
Uniformity of Cell Size,-0.1225,0.392,-0.313,0.755
Uniformity of Cell Shape,0.3396,0.434,0.783,0.434
Marginal Adhesion,0.4104,0.171,2.403,0.016
Single Epithelial Cell Size,0.1673,0.197,0.849,0.396
Bare Nuclei,0.3622,0.128,2.829,0.005
Bland Chromatin,0.7378,0.278,2.651,0.008
Normal Nucleoli,0.2938,0.153,1.921,0.055
Mitoses,0.8531,0.574,1.487,0.137


In [275]:
#coeficientes
resultados.params

intercept                     -13.561939
Clump Thickness                 0.866371
Uniformity of Cell Size        -0.122461
Uniformity of Cell Shape        0.339618
Marginal Adhesion               0.410387
Single Epithelial Cell Size     0.167340
Bare Nuclei                     0.362160
Bland Chromatin                 0.737772
Normal Nucleoli                 0.293774
Mitoses                         0.853131
dtype: float64

In [276]:
#p-values
# H0 : beta sub i = 0
# H1 : beta sub i sig
resultados.pvalues


intercept                      1.819549e-08
Clump Thickness                3.047289e-04
Uniformity of Cell Size        7.545193e-01
Uniformity of Cell Shape       4.335510e-01
Marginal Adhesion              1.624639e-02
Single Epithelial Cell Size    3.959329e-01
Bare Nuclei                    4.667235e-03
Bland Chromatin                8.023201e-03
Normal Nucleoli                5.472521e-02
Mitoses                        1.370135e-01
dtype: float64

In [277]:
#coef significativos
resultados.params[resultados.pvalues < 0.05]

intercept           -13.561939
Clump Thickness       0.866371
Marginal Adhesion     0.410387
Bare Nuclei           0.362160
Bland Chromatin       0.737772
dtype: float64

In [278]:
#coef no sginificativos
resultados.params[resultados.pvalues > 0.05]

Uniformity of Cell Size       -0.122461
Uniformity of Cell Shape       0.339618
Single Epithelial Cell Size    0.167340
Normal Nucleoli                0.293774
Mitoses                        0.853131
dtype: float64

In [279]:
aux = resultados.predict(MS(x_test).fit_transform(x_test))


In [280]:
y_pred = np.array([4]*len(aux))
y_pred[aux<0.5] = 2
y_pred = pd.DataFrame(data=y_pred)

print(y_test.head())


542    2
392    2
594    2
393    2
292    4
Name: Class, dtype: int64


In [281]:
from ISLP import confusion_table 
cf = confusion_table(y_pred, y_test)
cf

Truth,2,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
2,120,5
4,5,75


In [282]:
TP = cf.iloc[0,0]
FP = cf.iloc[0,1]
FN = cf.iloc[1,0]
TN = cf.iloc[1,1]
N = TP+FP+FN+TN

In [283]:
Accuracy = (TP+TN)/N
Precision = TP /(TP+FP)
Recall = TP / (TP + FN)
F_1 = (2*Precision*Recall)/(Precision+Recall)

In [284]:
print(f""" 
Accuracy: {round(Accuracy,4)} 
Precision: {round(Precision,4)} 
Recall: {round(Recall,4)} 
F_1: {round(F_1,4)}
""")

 
Accuracy: 0.9512 
Precision: 0.96 
Recall: 0.96 
F_1: 0.96



#### Modelo con las varibles significativas

In [285]:
summarize(resultados)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-13.5619,2.41,-5.628,0.0
Clump Thickness,0.8664,0.24,3.611,0.0
Uniformity of Cell Size,-0.1225,0.392,-0.313,0.755
Uniformity of Cell Shape,0.3396,0.434,0.783,0.434
Marginal Adhesion,0.4104,0.171,2.403,0.016
Single Epithelial Cell Size,0.1673,0.197,0.849,0.396
Bare Nuclei,0.3622,0.128,2.829,0.005
Bland Chromatin,0.7378,0.278,2.651,0.008
Normal Nucleoli,0.2938,0.153,1.921,0.055
Mitoses,0.8531,0.574,1.487,0.137


In [286]:
list(resultados.params[resultados.pvalues > 0.05].index)

['Uniformity of Cell Size',
 'Uniformity of Cell Shape',
 'Single Epithelial Cell Size',
 'Normal Nucleoli',
 'Mitoses']

In [287]:
df_2 = df.drop(list(resultados.params[resultados.pvalues > 0.05].index), axis=1)


In [288]:
from sklearn.model_selection import train_test_split
# en este caso se va a predecir la varible class como clase 2 o clase 4
y_2 = df_2['Class']
x_2 = df_2.drop(['Class'], axis=1)

x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_2, y_2, test_size= 0.3)

In [289]:
from ISLP.models import (ModelSpec as MS, summarize)
import statsmodels.api as sm
positive = y_train_2 == 4

In [290]:
diseño = MS(x_train_2)
X_2 = diseño.fit_transform(x_train_2)
Y_2 = positive
glm = sm.GLM(Y_2,X_2,
             family=sm.families.Binomial())
resultados_2 = glm.fit()

In [291]:
summarize(resultados_2)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.6019,1.582,-7.333,0.0
Clump Thickness,0.9015,0.175,5.15,0.0
Marginal Adhesion,0.5186,0.162,3.198,0.001
Bare Nuclei,0.5883,0.127,4.647,0.0
Bland Chromatin,0.7764,0.196,3.969,0.0


In [292]:
aux = resultados_2.predict(MS(x_test_2).fit_transform(x_test_2))
y_pred_2 = np.array([4]*len(aux))
y_pred_2[aux<0.5] = 2
y_pred_2 = pd.DataFrame(data=y_pred_2)

In [293]:
from ISLP import confusion_table 
cf_2 = confusion_table(y_pred_2, y_test_2)
cf_2

Truth,2,4
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
2,120,7
4,3,75


In [294]:
TP_2 = cf_2.iloc[0,0]
FP_2 = cf_2.iloc[0,1]
FN_2 = cf_2.iloc[1,0]
TN_2 = cf_2.iloc[1,1]
N_2 = TP_2+FP_2+FN_2+TN_2

In [295]:
Accuracy_2 = (TP_2+TN_2)/N_2
Precision_2 = TP /(TP_2+FP_2)
Recall_2 = TP_2 / (TP_2 + FN_2)
F_1_2 = (2*Precision_2*Recall_2)/(Precision_2+Recall_2)

In [296]:
print(f""" 
Accuracy: {round(Accuracy,4)}\tAccuracy_2:\t{round(Accuracy_2,4)}
Precision: {round(Precision,4)}\tPrecision_2:\t{round(Precision_2,4)}
Recall: {round(Recall,4)}\t\tRecall_2:\t{round(Recall_2,4)}
F_1: {round(F_1,4)}\t\tF_1_2:\t\t{round(F_1_2,4)}
""")

 
Accuracy: 0.9512	Accuracy_2:	0.9512
Precision: 0.96	Precision_2:	0.9449
Recall: 0.96		Recall_2:	0.9756
F_1: 0.96		F_1_2:		0.96

