# Breast Cancer Dataset

In [None]:
# Open questions:
# - Three datasets are present; the first one has fewer features.
# - What about a separate test set?

In [None]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

# Load the three Wisconsin breast cancer files. None of them has a header
# row, so columns are addressed by position.
dataset_1 = pd.read_csv('breast-cancer-wisconsin.data', header=None)
dataset_2 = pd.read_csv('wdbc.data', header=None)
dataset_3 = pd.read_csv('wpbc.data', header=None)

# read_csv already returns DataFrames, so the data_* names are plain aliases.
data_1 = dataset_1
data_2 = dataset_2
data_3 = dataset_3

# The three files do not share a column layout (per the UCI descriptions --
# NOTE(review): confirm against the accompanying .names files):
#   breast-cancer-wisconsin.data: id, 9 cytology attributes, class (last)
#   wdbc.data:                    id, diagnosis, 30 real-valued features
#   wpbc.data:                    id, outcome, time, then the features
# Open-ended slices are used so no trailing feature column is dropped
# (a fixed 2:31 slice silently loses the last column of wdbc.data).
X_1 = data_1.iloc[:, 1:-1].values
Y_1 = data_1.iloc[:, -1].values
X_2 = data_2.iloc[:, 2:].values
Y_2 = data_2.iloc[:, 1].values
# Column 2 (time) is kept in X_3, as before.
X_3 = data_3.iloc[:, 2:].values
Y_3 = data_3.iloc[:, 1].values


# Quick visual check of the raw frames and of the feature/label shapes.
for frame in (data_1, data_2, data_3):
    print(frame)
for features, labels in ((X_1, Y_1), (X_2, Y_2), (X_3, Y_3)):
    print(features.shape)
    print(labels.shape)

In [None]:
# Summary of the first dataset's columns (dtypes and non-null counts).
dataset_1.info()
# Raw feature matrix taken from wdbc.data, before any scaling is applied.
print(X_2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the class labels of the second dataset with integer codes.
# The fitted encoder is kept so the codes can be mapped back later.
labelencoder_Y = LabelEncoder().fit(Y_2)
Y_2 = labelencoder_Y.transform(Y_2)
print(Y_2)

In [None]:
# Imputing missing values: needed only if the data has missing values.
# from sklearn.impute import SimpleImputer
# imp = SimpleImputer(missing_values='?', strategy='mean')
# X_2 = imp.fit_transform(X_2)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the second dataset for testing; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_2,
    Y_2,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Feature scaling: the scaler's statistics are learned from the training
# split only and then reused unchanged on the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Model comparison: tune every classifier with a small grid search on the
# training split, then score the refitted best estimator on the test split.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

# Seed handed to every estimator with internal randomness. Seeding the
# global NumPy RNG alone is not enough: the grid search runs its fits in
# separate worker processes (n_jobs=2), which do not share this process's
# RNG state.
RANDOM_SEED = 0

# `names` and `models` are parallel lists; `names` also keys the grids below.
names = ['KNeighborsClassifier','SVC', 'DecisionTreeClassifier', 'RandomForestClassifier','AdaBoostClassifier','LogisticRegression','GaussianNB','MLPClassifier']
models = [KNeighborsClassifier(),
          SVC(),
          DecisionTreeClassifier(random_state=RANDOM_SEED),
          RandomForestClassifier(random_state=RANDOM_SEED),
          AdaBoostClassifier(random_state=RANDOM_SEED),
          LogisticRegression(random_state=RANDOM_SEED),
          GaussianNB(),
          MLPClassifier(random_state=RANDOM_SEED)]

# Hyper-parameter grid searched exhaustively for each model.
param_distributions = {
    'KNeighborsClassifier': {'n_neighbors': [5,11], 'metric':['minkowski','euclidean']},
    'SVC': {'kernel':['rbf']},
    'DecisionTreeClassifier': {'criterion':['gini','entropy'], 'max_depth': [1,10,20,30,50]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32], 'learning_rate':[0.8,1]},
    'LogisticRegression': {'max_iter':[100,130,140],'C': np.logspace(-1, 1, 3), 'solver':['lbfgs','liblinear']},
    'GaussianNB': {},
    'MLPClassifier': {'hidden_layer_sizes': [(100,),(50,30)],'activation':['tanh', 'relu'], 'max_iter': [100,200]}
}

# One score per model, sized from the model list rather than a literal.
accuracy = np.zeros(len(models))
for counter, (name, model) in enumerate(zip(names, models)):
    # Kept for anything that still draws from the global RNG in this process.
    np.random.seed(0)
    gridcv = GridSearchCV(model, param_distributions[name], n_jobs=2, cv=3)
    gridcv.fit(X_train, Y_train)
    # best_estimator_ is the winning configuration refitted on all of X_train.
    Y_pred = gridcv.best_estimator_.predict(X_test)
    print(gridcv.best_params_)
    # NOTE: micro-averaged F1 on single-label targets is the same number as
    # plain accuracy, which is why the array is called `accuracy`.
    accuracy[counter] = round(f1_score(Y_test, Y_pred, average='micro'), 3)
    print("f1_score for " + name + ":", accuracy[counter])

In [None]:
# Comparison graph between all models: one bar per classifier, with the
# bar height taken from the score stored in `accuracy`.
import seaborn as sns

y_pos = np.arange(len(names))
# One height per model name, without spelling out every index by hand.
heights = list(accuracy[:len(names)])

fig, ax = plt.subplots(1, 1, figsize=(12, 6))

sns.barplot(x=names, y=heights, ax=ax)
# Rotate the model names after drawing so the rotation applies to the labels
# that end up on the axis. The rotation must be a number (or 'vertical'),
# not the string '90'.
plt.xticks(rotation=90)
plt.ylabel('f1 score')
plt.title('Breast dataset models accuracy')