In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import warnings
# from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings("ignore")

# Reading the dataset
dataset = pd.read_csv("wdbc.data", sep=",").values

# Forming a data frame 
attributes = ["ID","Diagnosis","mean_radius","mean_texture","mean_perimeter","mean_area","mean_smoothness","mean_compactness","mean_concavity","mean_concave_points","mean_symmetry","mean_fractal_dimension","se_radius","se_texture","se_perimeter","se_area","se_smoothness","se_compactness","se_concavity","se_concave_points","se_symmetry","se_fractal_dimension",
            "worst_radius","worst_texture","worst_perimeter","worst_area","worst_smoothness","worst_compactness","worst_concavity","worst_concave_points","worst_symmetry","worst_fractal_dimension"]
data = pd.DataFrame(data=dataset,columns=attributes)
data.set_index("ID", inplace=True)

# Displaying the diagnosis(M = malignant, B = benign) and the 30 real-valued attributes
display(data.head())

Unnamed: 0_level_0,Diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [2]:
# Label-1 indicates malignant and Label-0 indicates benign 
data["Diagnosis"] = data["Diagnosis"].replace("M",1)
data["Diagnosis"] = data["Diagnosis"].replace("B",0)
data = data.apply(pd.to_numeric, errors="ignore")

# X contains only the first 10 features i.e only the mean values of the attributes not their standard error or maximum values
X = dataset[:,2:12]
X = X.astype("float64")
X = np.concatenate([np.ones((X.shape[0],1)),X],axis=1)
labels = data["Diagnosis"].values.reshape(-1,1)
data.drop("Diagnosis", axis = 1, inplace = True)
final_data = np.concatenate([X,labels],axis=1)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
m,n = final_data.shape
for i in range(5):
    np.random.seed(0)
    np.random.shuffle(final_data)
    x_train = final_data[0:int(0.8*m),0:n-1]
    y_train = final_data[0:int(0.8*m),n-1]
    x_test  = final_data[int(0.8*m):,0:n-1]
    y_test  = final_data[int(0.8*m):,n-1]
    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train,y_train)
    predictions = logisticRegr.predict(x_test)
    accuracy = (sum([1 if y_test[i]==predictions[i] else 0 for i in range(y_test.shape[0])])/y_test.shape[0])*100
    print("\nAccuracy using sklearn: {:.2f}%\n".format(accuracy))
    from sklearn.metrics import classification_report
    print(classification_report(y_test,predictions))
    cm = metrics.confusion_matrix(y_test, predictions)
    print("Confusion matrix:\n",cm)


Accuracy using sklearn: 92.11%

              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94        75
         1.0       0.89      0.87      0.88        39

    accuracy                           0.92       114
   macro avg       0.91      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114

Confusion matrix:
 [[71  4]
 [ 5 34]]

Accuracy using sklearn: 91.23%

              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93        69
         1.0       0.93      0.84      0.88        45

    accuracy                           0.91       114
   macro avg       0.92      0.90      0.91       114
weighted avg       0.91      0.91      0.91       114

Confusion matrix:
 [[66  3]
 [ 7 38]]

Accuracy using sklearn: 91.23%

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93        69
         1.0       0.89      0.89      0.89        45

    acc

In [4]:
data1 = np.array([[92.11,91.23,91.23,88.60,86.84,92.11],
                 [0.91, 0.92, 0.91, 0.87, 0.86, 0.92],
                 [0.91, 0.90, 0.91, 0.88, 0.86, 0.91],
                 [0.91, 0.91, 0.91, 0.87, 0.86, 0.91 ]])
columns1 = ["Validation1","Validation2","Validation3","Validation4","Validation5","Maximum Value"]
index1 = ["Accuracy", "Pricision", "Recall", "F1 Score"]
df1 = pd.DataFrame(data = data1, columns = columns1, index = index1)
display(df1)

Unnamed: 0,Validation1,Validation2,Validation3,Validation4,Validation5,Maximum Value
Accuracy,92.11,91.23,91.23,88.6,86.84,92.11
Pricision,0.91,0.92,0.91,0.87,0.86,0.92
Recall,0.91,0.9,0.91,0.88,0.86,0.91
F1 Score,0.91,0.91,0.91,0.87,0.86,0.91
