# BREAST CANCER DATA SET 

In [1]:
from sklearn.datasets import load_breast_cancer

In [2]:
bc = load_breast_cancer()

## Description  of dataset

In [3]:
print(bc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

## Separating training and testing dataset 

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train ,X_test,y_train ,y_test = train_test_split(bc.data,bc.target,random_state = 42)

## Importing necessary modules

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

## List having test scores of diffrent algs 

In [7]:
scores = []
names = ['Linear Regression model','K Nearest Neighbors','Decision Tree Regressor','Random Forest Regressor','Support Vector Machines','Gaussian Naive Bayes']

##  1. Linear Regression model 

In [8]:
lr = LinearRegression().fit(X_train,y_train)
print('\nLinear Regression model \n')
print('train set score : '+ str(lr.score(X_train,y_train)))
print('test set score : ' + str(lr.score(X_test,y_test)))
scores.append(str(lr.score(X_test,y_test)))


Linear Regression model 

train set score : 0.7751179832806713
test set score : 0.7324078874665129


## 2. K Nearest Neighbors

In [9]:
knn = KNeighborsRegressor(n_neighbors = 3).fit(X_train,y_train)
print('\nK Nearest Neigbors\n')
print('train set score : '+ str(knn.score(X_train,y_train)))
print('test set score : ' + str(knn.score(X_test,y_test)))
scores.append(str(knn.score(X_test,y_test)))


K Nearest Neigbors

train set score : 0.8680962277221487
test set score : 0.8380034216488648


## 3. Decision Tree Regressor

In [10]:
dt = DecisionTreeRegressor().fit(X_train,y_train)
print('\nDecision Tree Regressor\n')
print('train set score : '+ str(dt.score(X_train,y_train)))
print('test set score : ' + str(dt.score(X_test,y_test)))
scores.append(str(dt.score(X_test,y_test)))


Decision Tree Regressor

train set score : 1.0
test set score : 0.7619642114024137


## 4. Random Forest Regressor

In [11]:
rf = RandomForestRegressor(n_estimators= 100).fit(X_train,y_train)
print('\nRandom forest regressor\n')
print('train set score : '+ str(rf.score(X_train,y_train)))
print('test set score : ' + str(rf.score(X_test,y_test)))
scores.append(str(rf.score(X_test,y_test)))


Random forest regressor

train set score : 0.9778337946344229
test set score : 0.87006518934665


## 5. Support Vector Machines

In [12]:
sv = SVC(kernel='linear').fit(X_train,y_train)
print('\nSVM\n')
print('train set score : '+ str(sv.score(X_train,y_train)))
print('test set score : ' + str(sv.score(X_test,y_test)))
scores.append(str(sv.score(X_test,y_test)))


SVM

train set score : 0.9694835680751174
test set score : 0.958041958041958


## 6. Gaussian Naive Bayes

In [13]:
gnb = GaussianNB().fit(X_train,y_train)
print('\nGaussian Naive Bayes\n')
print('train set score : '+ str(gnb.score(X_train,y_train)))
print('test set score : ' + str(gnb.score(X_test,y_test)))
scores.append(str(gnb.score(X_test,y_test)))


Gaussian Naive Bayes

train set score : 0.9366197183098591
test set score : 0.958041958041958


## Comparison of the algorithms described above

In [14]:
list(zip(names , scores))

[('Linear Regression model', '0.7324078874665129'),
 ('K Nearest Neighbors', '0.8380034216488648'),
 ('Decision Tree Regressor', '0.7619642114024137'),
 ('Random Forest Regressor', '0.87006518934665'),
 ('Support Vector Machines', '0.958041958041958'),
 ('Gaussian Naive Bayes', '0.958041958041958')]

==================================================
# Conclusion :
==================================================
## The best to worst score is of - 
## Gaussian Naive Bayes == Support Vector Machines
## Random Forest Regressor
## K Nearest Neighbors
## Decision Tree Regressor
## Linear Regression model
