In [None]:
### Load necessary libraries
    
    #importing the libraries
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    


### Dataset

The Breast Cancer dataset, first obtained from Dr. William H. Wolberg at the University of
Wisconsin Hospitals, Madison, is composed of 30 continuous variables and 569 observations. The
dataset is based on ten original features describing cancerous cell nuclei derived from a digitized image
of a fine needle aspirate of a breast mass. For each of these ten features, the mean, standard error and
the "worst" value (defined as the mean of the three largest values) have been calculated, resulting in a
total of 30 continuous features. The original variable "area", for example, has been split into three separate
features, area_mean, area_SE and area_worst. The dataset reported only these derived features, not
the original variables. The response variable is a categorical variable indicating whether the tumour is
malignant (M) or benign (B). The dataset contains 357 benign and 212 malignant examples. The distribution of
all variables with respect to the response variable is shown in the violin plot below.
![](VIOLIN.PNG)

Further details of the dataset can be viewed at the [UCI machine learning repo](https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names). We have downloaded this for you as a CSV file: `data.csv`.

In [None]:

### Import data.csv as a pandas DataFrame. Split the dataset to create X (all features) and Y (target variable)

# Load the breast-cancer CSV and take a first look at its size and contents.
dataset = pd.read_csv('data.csv')
print("Cancer data set dimensions : {}".format(dataset.shape))
print(dataset.head())

# Feature matrix: every column from index 2 onwards (columns 0 and 1 are
# skipped; column 1 is used as the target below).
# Some exports of this CSV end with a completely empty column, others do not.
# Dropping all-NaN columns handles both layouts, whereas a fixed `2:-1` slice
# would silently discard a genuine feature when no empty column is present.
X = dataset.iloc[:, 2:].dropna(axis='columns', how='all').values

# Target vector: the label stored in column 1 (expected to be `diagnosis`).
Y = dataset.iloc[:, 1].values

In [None]:
### Find the dimensions of the data set using the pandas DataFrame `shape` attribute.

# Report the DataFrame's shape attribute (a tuple of row and column counts).
print("Cancer data set shape : " + str(dataset.shape))

In [None]:
### Identify "Malignant" and "Benign" cases in the dataset

# Count how many observations fall into each value of the `diagnosis` column.
# (A stray bare `diagnosis` name that followed this statement was removed: it
# is not a defined variable and raised NameError on a fresh kernel.)
print(dataset.groupby('diagnosis').size())

### Visualize the dataset, showing distributions of all features with respect to both target classes

# Draw one grid of histograms per diagnosis class so the distributions of the
# columns can be compared between the two groups.
diagnosis_groups = dataset.groupby('diagnosis')
diagnosis_groups.hist(figsize=(12, 12))

In [None]:
### Encode "Benign" and "Malignant" in Y as 0/1 (labels are encoded in sorted order, so B → 0 and M → 1)

# Turn the string class labels in Y into integer codes.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels first, then apply it; the fitted encoder is
# kept so the integer codes can be mapped back to the original labels later.
labelencoder_Y = LabelEncoder().fit(Y)
Y = labelencoder_Y.transform(Y)
Y

In [None]:
### Perform an 80/20 train/test split on the X and Y arrays

# Hold out 20% of the observations as a test set. random_state is fixed so the
# same split is produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
### Apply StandardScaler() to all features in X_train and X_test

# Standardise the features.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits so the test data does not influence the scaling.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [None]:
### Fit the Naive Bayes Classifier

# Train a Gaussian Naive Bayes model on the scaled training data.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
classifier = GaussianNB().fit(X_train_scaled, Y_train)

# Show the class prior probabilities stored on the fitted model.
classifier.class_prior_

In [None]:
### Make predictions from trained classifier

# Predict a class for every row of the scaled test set and display the result.
Y_pred = classifier.predict(X=X_test_scaled)
Y_pred

In [None]:

# Score the predictions against the held-out test labels with scikit-learn's
# accuracy_score.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=Y_test, y_pred=Y_pred)