# Breast Cancer Wisconsin (Diagnostic) Data Set

We will predict whether each patient's cancer diagnosis is benign or malignant.

Ten real-valued features are computed for each cell nucleus:

        - Radius (mean of distances from center to points on the perimeter)
        - Texture (standard deviation of gray-scale values)
        - Perimeter
        - Area
        - Smoothness (local variation in radius lengths)
        - Compactness (perimeter^2 / area - 1.0)
        - Concavity (severity of concave portions of the contour)
        - Concave points (number of concave portions of the contour)
        - Symmetry 
        - Fractal dimension ("coastline approximation" - 1)
        
The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

All feature values are recorded with four significant digits.

Missing attribute values: none

Class distribution: 357 benign, 212 malignant

Data Source: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data?select=data.csv

# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category to keep the notebook output clean.
# NOTE(review): this also hides deprecation notices raised by the libraries above.
warnings.filterwarnings('ignore')

# Importing the dataset

In [None]:
# Read the CSV that sits next to the notebook into a DataFrame.
ds = pd.read_csv(filepath_or_buffer='data.csv')
# Preview the first five rows.
ds.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
ds.info()

# Visualising the dataset

In [None]:
# Show the first rows again as a reference for the plots that follow.
ds.head()

In [None]:
# Pairwise scatter plots of five mean-value features, colored by diagnosis.
pairplot_features = [
    'radius_mean',
    'texture_mean',
    'area_mean',
    'perimeter_mean',
    'smoothness_mean',
]
sns.pairplot(data=ds, vars=pairplot_features, hue='diagnosis')

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer plotted
# as the x variable.
sns.countplot(x='diagnosis', data=ds)

In [None]:
# Set the default figure size (this persists for later plots as well).
plt.rcParams.update({'figure.figsize': (8, 5)})
# Mean area against mean smoothness, colored by diagnosis.
sns.scatterplot(
    x='area_mean',
    y='smoothness_mean',
    hue='diagnosis',
    data=ds,
)

In [None]:
# Enlarge the default figure so the annotated correlation matrix stays legible
# (this setting persists for later plots as well).
plt.rcParams['figure.figsize'] = (30,15)
# Correlate numeric columns only: the string-valued `diagnosis` column makes
# DataFrame.corr() raise on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently.
sns.heatmap(ds.select_dtypes(include='number').corr(), annot=True)
plt.show()

# Taking care of missing data

In [None]:
# Plot the boolean missing-value mask of the DataFrame as a heatmap,
# with row tick labels and the color bar switched off.
sns.heatmap(ds.isnull(), yticklabels = False, cbar = False, cmap = 'Blues')

In [None]:
# Show the first two rows to check the column layout before slicing by position.
ds.head(2)

In [None]:
# Features: every column from position 2 up to (but excluding) the last one.
# Target: the column at position 1.
# NOTE(review): presumably column 0 is an id, column 1 is `diagnosis`, and the
# last column is an empty trailing column in the Kaggle CSV - confirm against data.csv.
X, y = ds.iloc[:, 2:-1].values, ds.iloc[:, 1].values

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the target array.
y.shape

# Encoding Categorical Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class labels first, then map every label to its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the encoded target.
# We observe that 1 corresponds to M (Malignant) and 0 corresponds to B (Benign).
y

# Splitting the dataset into the training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

# Fitting the SVM to the dataset

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default hyperparameters,
# trained on the scaled training set.
svc = SVC()
svc.fit(X=X_train, y=y_train)

In [None]:
# Use the fitted classifier to label the held-out test samples.
y_pred = svc.predict(X=X_test)

# Model Evaluation - Confusion Matrix and K-Fold Cross Validation

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training set.
accuracy = cross_val_score(svc, X_train, y_train, cv=10)
mean_accuracy_pct = round(accuracy.mean()*100, 1)
fold_std = accuracy.std()
print("Mean accuracy = ", mean_accuracy_pct, '%')
print("Mean std is = ", fold_std)

In [None]:
# Print the text classification report for the test-set predictions.
print(classification_report(y_test, y_pred))

# Model Improvements - Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values shared by several kernel configurations.
c_grid = [1, 10, 100, 1000]
gamma_grid = [0.5, 0.1, 0.01, 0.001]

# One sub-grid per kernel, each listing only the hyperparameters searched for it.
parameters = [
    {'C': c_grid, 'kernel': ['linear']},
    {'C': c_grid, 'kernel': ['rbf'], 'gamma': gamma_grid},
    {'C': c_grid, 'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': gamma_grid},
]

# Exhaustive search scored by accuracy with 10-fold CV, using all available cores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

In [None]:
# Report the best cross-validated accuracy as a percentage, then the
# hyperparameter combination that achieved it.
best_accuracy_pct = round(best_accuracy*100, 1)
print('Optimal accuracy is:', best_accuracy_pct, '%')

print(best_parameters)

# Using the improved model

In [None]:
# Refit an SVC with the optimal hyperparameters obtained from the grid search.
# The parameters are taken from `best_parameters` rather than typed in by hand,
# so this model always matches the search result even when the data, the split
# or the parameter grid changes.
svc_new = SVC(**best_parameters)
svc_new.fit(X_train, y_train)

# Predict the held-out test set with the tuned model.
y_pred = svc_new.predict(X_test)

In [None]:
# Confusion matrix of the tuned model's test-set predictions.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Classification report of the tuned model's test-set predictions.
print(classification_report(y_test, y_pred))