In [1]:
# Description: This machine learning model detects breast cancer, based off of data, 
# thus, providing correct diagnosis of BC and classification of patients into malignant or benign groups

# Created on Wed Nov 11 23:07:20 2020 by Levy Naibei

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# import dataset on spyder
df = pd.read_csv('breastcancer.csv')

#use required features
cdf = df[['diagnosis', 'radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean','concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']]
# df.head(10)

In [3]:
# df.info()

In [4]:
cdf.shape

(569, 11)

In [5]:
# inspect dataset for empty values from each column - NaN, na, NAN
# find missing or null data points
# df.isna().sum()

In [6]:
# remove empty values
cdf=cdf.dropna(axis=1)

In [7]:
cdf.shape

(569, 11)

In [8]:
# df.head()

In [9]:
# count number of B's and M's
cdf['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [10]:
cdf.head(10)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883
5,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613
6,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742
7,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451
8,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389
9,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243


In [11]:
# visualize B and M count
# sns.countplot(df['diagnosis'], label='Sum')

In [12]:
X = cdf.iloc[:, 1:11].values
Y = cdf.iloc[:, 0:1].values

In [13]:
# encode catagorical data values - diagnosis
le = LabelEncoder()
Y_L = le.fit_transform(Y.ravel())

In [14]:
cdf["diagnosis_encoded"]=Y_L

In [15]:
# create a pairplot/scatterplot
# sns.pairplot(df, hue='diagnosis')

In [16]:
# df.info()

In [17]:
# split dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [18]:
# cdf.info()

In [19]:
# feature scaling
sc= StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [20]:
# 2. Using RandomForestClassification to enable class to use Random Forest Classification Algorithm
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
forest.fit(X_train, Y_train.ravel())

# percentage accuracy of the model
Y_pred = forest.predict(X_test)
forest_score = round(accuracy_score(Y_test, Y_pred) * 100, 2)
forest_score

94.74

In [21]:
# save the model
pickle.dump(forest, open('bcmodel.pkl', 'wb'))