In [1]:
# Description: This machine learning model detects breast cancer, based off of data, 
# thus, providing correct diagnosis of BC and classification of patients into malignant or benign groups

# Created on Wed Nov 11 10:35:40 2020 by Levy Naibei

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pickle

# import dataset on spyder
df = pd.read_csv('breastcancer.csv')

# df.head(10)

In [3]:
# df.info()

In [4]:
df.shape

(569, 33)

In [5]:
# inspect dataset for empty values from each column - NaN, na, NAN
# find missing or null data points
# df.isna().sum()

In [6]:
# remove empty values
df=df.dropna(axis=1)

In [7]:
df.shape

(569, 32)

In [8]:
# df.head()

In [9]:
# count number of B's and M's
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [10]:
# visualize B and M count
# sns.countplot(df['diagnosis'], label='Sum')

In [11]:
X = df.iloc[:, 2:31].values
Y = df.iloc[:, 1].values

In [12]:
# encode catagorical data values - diagnosis
labelencoder_Y = LabelEncoder()
Y_L = labelencoder_Y.fit_transform(Y)

In [13]:
df["Diagnosis_encoded"]=Y_L

In [14]:
df.sample(10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Diagnosis_encoded
301,892604,B,12.46,19.89,80.43,471.3,0.08451,0.1014,0.0683,0.03099,...,23.07,88.13,551.3,0.105,0.2158,0.1904,0.07625,0.2685,0.07764,0
289,89143601,B,11.37,18.89,72.17,396.0,0.08713,0.05008,0.02399,0.02173,...,26.14,79.29,459.3,0.1118,0.09708,0.07529,0.06203,0.3267,0.06994,0
523,917896,B,13.71,18.68,88.73,571.0,0.09916,0.107,0.05385,0.03783,...,25.63,99.43,701.9,0.1425,0.2566,0.1935,0.1284,0.2849,0.09031,0
351,899667,M,15.75,19.22,107.1,758.6,0.1243,0.2364,0.2914,0.1242,...,24.17,119.4,915.3,0.155,0.5046,0.6872,0.2135,0.4245,0.105,1
84,8612080,B,12.0,15.65,76.95,443.3,0.09723,0.07165,0.04151,0.01863,...,24.9,87.78,567.9,0.1377,0.2003,0.2267,0.07632,0.3379,0.07924,0
212,8810703,M,28.11,18.47,188.5,2499.0,0.1142,0.1516,0.3201,0.1595,...,18.47,188.5,2499.0,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525,1
253,8860702,M,17.3,17.08,113.0,928.2,0.1008,0.1041,0.1266,0.08353,...,25.09,130.9,1222.0,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113,1
265,88995002,M,20.73,31.12,135.7,1419.0,0.09469,0.1143,0.1367,0.08646,...,47.16,214.0,3432.0,0.1401,0.2644,0.3442,0.1659,0.2868,0.08218,1
19,8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,...,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259,0
31,853612,M,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,...,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402,1


In [15]:
df.diagnosis.unique()

array(['M', 'B'], dtype=object)

In [16]:
# create a pairplot/scatterplot
# sns.pairplot(df, hue='diagnosis')

In [17]:
# df.info()

In [18]:
# split dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [19]:
# feature scaling
sc= StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

X_train

array([[-1.15036482, -0.39064196, -1.12855021, ..., -0.81232053,
        -0.75798367, -0.01614761],
       [-0.93798972,  0.68051405, -0.94820146, ..., -0.37504806,
        -0.60687023,  0.09669004],
       [ 0.574121  , -1.03333557,  0.51394098, ..., -0.18298917,
        -0.02371948, -0.20050207],
       ...,
       [-1.32422924, -0.20048168, -1.31754581, ..., -0.76769066,
        -0.97974953, -0.71542314],
       [-1.24380987, -0.2245526 , -1.28007609, ..., -1.34136004,
        -1.75401433, -1.58157125],
       [-0.73694129,  1.14989702, -0.71226578, ...,  0.47893704,
        -0.27460457, -1.25895095]])

In [20]:
# 2. Using RandomForestClassification to enable class to use Random Forest Classification Algorithm
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
forest.fit(X_train, Y_train)

# percentage accuracy of the model
Y_pred = forest.predict(X_test)
forest_score = round(accuracy_score(Y_test, Y_pred) * 100, 2)
forest_score

97.37

In [21]:
# save the model
pickle.dump(forest, open('bcmodel.pkl', 'wb'))