# Breast Cancer Wisconsin

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Column labels applied to the raw file. The file is read as header-less
# (header=None), so its first line is treated as data, not as column names.
headers = [
    "ID",
    "CT",
    "UCSize",
    "UCShape",
    "MA",
    "SECSize",
    "BN",
    "BC",
    "NN",
    "Mitoses",
    "Diagnosis",
]
data = pd.read_csv("breast-cancer-wisconsin.data", header=None, names=headers)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Print each column's dtype and non-null count to spot columns that did not
# parse as numbers.
data.info()

In [None]:
# "BN" still holds the "?" placeholder for missing readings at this point, so
# a plain pd.to_numeric() raises ValueError. Coerce the placeholders to NaN,
# fill them with 0 (the value this notebook substitutes for "?"), and store
# the column as integers.
data["BN"] = pd.to_numeric(data["BN"], errors="coerce").fillna(0).astype(int)
data.info()

In [None]:
# Swap the "?" missing-value placeholder for 0, then give "BN" a real numeric
# dtype: replace() on its own leaves the column as object dtype holding a mix
# of digit strings and the integer 0.
data = data.replace("?", 0)
data["BN"] = pd.to_numeric(data["BN"])

In [None]:
# Spot-check the "BN" value stored at row label 23 (single label-based
# lookup instead of chained indexing).
data.loc[23, "BN"]

In [None]:
# Feature matrix: every column except the sample "ID" (an identifier, not a
# measurement, so it must not be fed to the models) and the "Diagnosis"
# target. Selecting by name keeps this correct if the column order changes.
X = data.drop(columns=["ID", "Diagnosis"]).to_numpy()
# Target vector: the diagnosis label of each sample.
y = data["Diagnosis"].to_numpy()

In [None]:
# Show the feature-matrix and target shapes (row counts should match).
for _arr in (X, y):
    print(_arr.shape)

In [None]:
# Hold out 10% of the samples for testing. A fixed random_state makes the
# split (and every score computed from it) repeatable across runs, and
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transform to both splits so no test-set
# statistics leak into training.
feature_scaling = StandardScaler().fit(X_train)
X_train = feature_scaling.transform(X_train)
X_test = feature_scaling.transform(X_test)

In [None]:
# Random forest of 1000 trees, each split considering a single randomly
# chosen feature. random_state makes the fitted model repeatable; n_jobs=-1
# builds the trees on all available cores instead of one after another.
rfc = RandomForestClassifier(
    n_estimators=1000,
    max_features=1,
    n_jobs=-1,
    random_state=42,
)
rfc.fit(X_train, y_train)

In [None]:
# Predict a label for every held-out test sample with the fitted forest.
y_pred = rfc.predict(X_test)

In [None]:
# Fraction of test samples the random forest labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
import seaborn as sns

sns.set()

# Pin the label order explicitly (sorted union of true and predicted labels)
# so the matrix rows/columns and the heatmap tick labels are guaranteed to
# agree.
labels = np.unique(np.concatenate([y_test, y_pred]))
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(cm)
sns.heatmap(
    cm,
    square=True,
    annot=True,
    fmt="d",  # counts are integers; the default ".2g" abbreviates values >= 100
    cbar=False,
    xticklabels=labels,  # show the actual class labels instead of 0/1 positions
    yticklabels=labels,
)
plt.xlabel('predicted value')
# Trailing semicolon suppresses the notebook's echo of the returned Text object.
plt.ylabel('true value');

In [None]:
# Per-class precision, recall, F1 and support for the random forest.
print(classification_report(y_true=y_test, y_pred=y_pred))

# Decision Tree

In [None]:
from sklearn import tree

# Single decision tree with default hyper-parameters, fitted on the scaled
# training split. random_state fixes the tie-breaking between equally good
# splits so the fitted tree (and its score) is repeatable.
clf = tree.DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Predict a label for every held-out test sample with the fitted tree.
tree_pred = clf.predict(X_test)

In [None]:
# Fraction of test samples the decision tree labelled correctly.
accuracy_score(y_true=y_test, y_pred=tree_pred)

# AdaBoost

In [None]:
# AdaBoost ensemble of 50 boosting rounds over the default base estimator.
# random_state makes the fitted ensemble repeatable across runs.
ada = AdaBoostClassifier(n_estimators=50, random_state=42)
ada.fit(X_train, y_train)

# Predict a label for every held-out test sample with the fitted ensemble.
ada_pred = ada.predict(X_test)


In [None]:
# Fraction of test samples the AdaBoost ensemble labelled correctly.
accuracy_score(y_true=y_test, y_pred=ada_pred)