In [1]:
# Description: This machine learning model detects breast cancer, based off of data, 
# thus, providing correct diagnosis of BC and classification of patients into malignant or benign groups

# Created on Wed Nov 11 10:35:40 2020 by Levy Naibei

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# import dataset on spyder
df = pd.read_csv('breastcancer.csv')

# df.head(10)

In [3]:
# df.info()

In [4]:
df.shape

(569, 33)

In [5]:
# inspect dataset for empty values from each column - NaN, na, NAN
# find missing or null data points
# df.isna().sum()

In [6]:
# remove empty values
df=df.dropna(axis=1)

In [7]:
df.shape

(569, 32)

In [8]:
# df.head()

In [9]:
# count number of B's and M's
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [10]:
# visualize B and M count
# sns.countplot(df['diagnosis'], label='Sum')

In [11]:
X = df.iloc[:, 2:31].values
Y = df.iloc[:, 1].values

In [12]:
# encode catagorical data values - diagnosis
labelencoder_Y = LabelEncoder()
Y_L = labelencoder_Y.fit_transform(Y)

In [13]:
df["Diagnosis_encoded"]=Y_L

In [14]:
df.sample(10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Diagnosis_encoded
439,909410,B,14.02,15.66,89.59,606.5,0.07966,0.05581,0.02087,0.02652,...,19.31,96.53,688.9,0.1034,0.1017,0.0626,0.08216,0.2136,0.0671,0
227,88147102,B,15.0,15.51,97.45,684.5,0.08371,0.1096,0.06505,0.0378,...,19.31,114.2,808.2,0.1136,0.3627,0.3402,0.1379,0.2954,0.08362,0
204,87930,B,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,...,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875,0
449,911157302,M,21.1,20.52,138.1,1384.0,0.09684,0.1175,0.1572,0.1155,...,32.07,168.2,2022.0,0.1368,0.3101,0.4399,0.228,0.2268,0.07425,1
115,864685,B,11.93,21.53,76.53,438.6,0.09768,0.07849,0.03328,0.02008,...,26.15,87.54,583.0,0.15,0.2399,0.1503,0.07247,0.2438,0.08541,0
191,875093,B,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,...,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871,0
209,8810436,B,15.27,12.91,98.17,725.5,0.08182,0.0623,0.05892,0.03157,...,15.92,113.7,932.7,0.1222,0.2186,0.2962,0.1035,0.232,0.07474,0
43,856106,M,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,...,28.0,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027,1
107,863270,B,12.36,18.54,79.01,466.7,0.08477,0.06815,0.02643,0.01921,...,27.49,85.56,544.1,0.1184,0.1963,0.1937,0.08442,0.2983,0.07185,0
390,90317302,B,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.01968,...,15.65,73.23,394.5,0.1343,0.165,0.08615,0.06696,0.2937,0.07722,0


In [15]:
df.diagnosis.unique()

array(['M', 'B'], dtype=object)

In [16]:
# create a pairplot/scatterplot
# sns.pairplot(df, hue='diagnosis')

In [17]:
# df.info()

In [18]:
# split dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [19]:
# feature scaling
sc= StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

X_train

array([[-1.15036482, -0.39064196, -1.12855021, ..., -0.81232053,
        -0.75798367, -0.01614761],
       [-0.93798972,  0.68051405, -0.94820146, ..., -0.37504806,
        -0.60687023,  0.09669004],
       [ 0.574121  , -1.03333557,  0.51394098, ..., -0.18298917,
        -0.02371948, -0.20050207],
       ...,
       [-1.32422924, -0.20048168, -1.31754581, ..., -0.76769066,
        -0.97974953, -0.71542314],
       [-1.24380987, -0.2245526 , -1.28007609, ..., -1.34136004,
        -1.75401433, -1.58157125],
       [-0.73694129,  1.14989702, -0.71226578, ...,  0.47893704,
        -0.27460457, -1.25895095]])

In [20]:
# 2. Using RandomForestClassification to enable class to use Random Forest Classification Algorithm
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
forest.fit(X_train, Y_train)

# percentage accuracy of the model
Y_pred = forest.predict(X_test)
forest_score = round(accuracy_score(Y_test, Y_pred) * 100, 2)
forest_score

97.37