# Predicting Breast Cancer Types using Logistic Regression and Decision Tree

# Import Data Set

In [None]:
# Importing the Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
import hvplot.pandas
import itertools
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.offline as py
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset from a CSV file given by a relative path.
BC = pd.read_csv("breast_cancer_prediction.csv")

In [None]:
# Preview the first rows of the loaded frame.
BC.head()

# Preprocessing

In [None]:
# Encode the target column numerically: 'M' -> 1, 'B' -> 0
# (presumably malignant / benign -- confirm against the data dictionary).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection acts on an intermediate object and leaves BC untouched
# when pandas copy-on-write is active. Any label other than 'M'/'B' becomes NaN,
# which makes unexpected values visible in the null checks.
BC['diagnosis'] = BC['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Report the (rows, columns) shape of the frame.
print(f"Cancer data set dimensions : {BC.shape}")


***Checking Null and Missing Values***

In [None]:
# Per-column count of null entries, reported through both isnull() and isna().
null_counts = BC.isnull().sum()
na_counts = BC.isna().sum()
print("\nNull Values:\n", null_counts)
print("\nMissing Values:\n", na_counts)

***Information of dataset***

In [None]:
# Print the frame's column summary.
BC.info()

After checking the null-value counts, the missing-value counts, and the `info()` summary, the dataset needs no cleaning: it contains no null or missing values.

***Statistical Description of Data***

In [None]:
# Summary statistics for the columns of BC.
BC.describe()

***Extracting Mean, Standard Error, and Worst Features***

In [None]:
# Split the frame into three views selected purely by column position. Each
# view keeps column 0 (expected to be `diagnosis`, which later cells read from
# every view) plus one group of feature columns: 1-10, 11-20, and 21 onwards.
all_cols = BC.columns
BC_mean = BC[all_cols[:11]]
BC_se = BC[all_cols[:1].append(all_cols[11:21])]
BC_worst = BC[all_cols[:1].append(all_cols[21:])]

In [None]:
# Display the first feature-group view (columns 0-10 of BC).
BC_mean

In [None]:
# Display the last feature-group view (column 0 plus columns 21 onwards of BC).
BC_worst

# Exploratory Data Analysis (EDA)

***Count Plot of Diagnosis***

In [None]:
# Visualize the class balance of the target column.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the `x` variable.
sns.countplot(x='diagnosis', data=BC)
print(BC.diagnosis.value_counts())

***Pie chart of Diagnosis***

In [None]:
import plotly.express as px


# Count each diagnosis class once and plot the counts directly. BC is not
# passed as the data frame because the counts hold one entry per class, not
# one entry per row of BC.
diagnosis_counts = BC['diagnosis'].value_counts()
fig = px.pie(values=diagnosis_counts.values, names=diagnosis_counts.index)
fig.show()

***Heat Map***

In [None]:

# Annotated correlation heat map over the columns of BC.
f, ax = plt.subplots(figsize=(32, 20))
sns.heatmap(
    BC.corr(),
    ax=ax,
    annot=True,
    fmt=".4f",
    linewidths=0.5,
    linecolor="black",
)
plt.title("Correlation Between Features")
plt.show()


In [None]:
# Create correlation matrix
corr_mat = BC.corr()

# Create mask hiding the strict upper triangle so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
mask = np.zeros_like(corr_mat, dtype=bool)
mask[np.triu_indices_from(mask, k=1)] = True

# Plot heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_mat, annot=True, fmt='.1f',
            cmap='RdBu_r', vmin=-1, vmax=1,
            mask=mask)

In [None]:
# Heat map restricted to coefficients above 0.8: the boolean selection turns
# every other cell into NaN, and `mask` additionally hides the upper triangle.
strong_corr = corr_mat[corr_mat > 0.8]
plt.figure(figsize=(15, 10))
sns.heatmap(
    strong_corr,
    mask=mask,
    annot=True,
    fmt='.1f',
    cmap=sns.cubehelix_palette(200),
)

***Correlation with Diagnosis:
Correlation of Mean Features with Diagnosis:***

In [None]:
# Bar chart of each column in BC_mean (label excluded) correlated with the label.
mean_corr = BC_mean.drop('diagnosis', axis=1).corrwith(BC_mean.diagnosis)
plt.figure(figsize=(20, 8))
mean_corr.plot(
    kind='bar',
    grid=True,
    title="Correlation of Mean Features with Diagnosis",
    color="cornflowerblue",
);


***Correlation of Standard Error Features with Diagnosis:***

In [None]:
# Bar chart of each column in BC_se (label excluded) correlated with the label.
# The chart title names the group "Standard Error" (the `se` in BC_se) rather
# than the incorrect "Squared Error".
plt.figure(figsize=(20, 8))
BC_se.drop('diagnosis', axis=1).corrwith(BC_se.diagnosis).plot(kind='bar', grid=True,
title="Correlation of Standard Error Features with Diagnosis", color="cornflowerblue");

***Correlation of Worst Features with Diagnosis:***

In [None]:
# Bar chart of each column in BC_worst (label excluded) correlated with the
# label. The title says "Worst Features"; the stray word "Error" did not
# describe this group.
plt.figure(figsize=(20, 8))
BC_worst.drop('diagnosis', axis=1).corrwith(BC_worst.diagnosis).plot(kind='bar',
grid=True, title="Correlation of Worst Features with Diagnosis", color="cornflowerblue");

***Violinplot***

In [None]:
 y includes diagnosis column with M or B values
y = BC.diagnosis
# drop the column 'id' as it is does not convey any useful info
# drop diagnosis since we are separating labels and features 
list = ['diagnosis']
# X incudes our features
X = BC.drop(list,axis = 1)
# get the first ten features
data_dia = y
data = X
data_std = (data - data.mean() / (data.std() ))# standardization
# get the first 10 features
data = pd.concat([y,data_std.iloc[:,0:10]],axis=1)
data = pd.melt(data,id_vars='diagnosis',
 var_name='features',
 value_name='value')
# make a violin plot
plt.figure(figsize=(10,10))
sns.violinplot(x='features', y='value', hue='diagnosis', data=data,split=True, inner='quart')
plt.xticks(rotation=90)#

In [None]:
# Pairwise plot grid over the columns of BC, colored by the diagnosis label.
sns.pairplot(BC, hue ='diagnosis')

# Feature Selection

In [None]:
# Separate the feature columns (X) from the label column (Y).
X = BC.drop(columns=['diagnosis'])
Y = BC['diagnosis']

In [None]:
# Preview the feature frame.
X.head()

In [None]:
# Preview the label series.
Y.head()

# Train-Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state seeds the split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .30, random_state= 8)

# Model Building

In [None]:
# Feature scaling: the scaler is fitted on the training split only and the
# fitted scaler is then applied to the test split. Both names are rebound to
# the transformed arrays.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Display the scaled test features.
X_test

In [None]:
# Fit a logistic regression with default settings on the scaled training split.
from sklearn.linear_model import LogisticRegression
logistic_regressor = LogisticRegression()
logistic_regressor.fit(X_train, Y_train)

In [None]:
# Predicted labels for the test split.
y_pred = logistic_regressor.predict(X_test)


In [None]:
# First five predicted labels.
y_pred[0:5]

In [None]:
# Side-by-side table of true and predicted test labels.
# Note: this rebinds the name `data`, which the violin-plot cell also assigns.
data = pd.DataFrame({'Actual': Y_test, 'Predicted': y_pred})
data

In [None]:
# Per-class predicted probabilities for the test split.
y_prob = logistic_regressor.predict_proba(X_test)

In [None]:
# First five rows of predicted probabilities.
y_prob[0:5]

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the logistic regression model.
acc_score1 = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(acc_score1)

In [None]:
# Accuracy on the training split, scaled by 100.
print("Training Score: ",logistic_regressor.score(X_train,Y_train)*100)

In [None]:
# Accuracy on the test split, scaled by 100.
print("Testing Score: ",logistic_regressor.score(X_test,Y_test)*100)

In [None]:
# Confusion matrix of true vs. predicted test labels for the logistic model;
# the metrics cell further down reads it as Cm1.
from sklearn.metrics import confusion_matrix
Cm1 = confusion_matrix(Y_test,y_pred)
Cm1

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted test labels, drawn as a heat map.
conf_matrix1 = confusion_matrix(Y_test, y_pred)
dataframe_conf_matrix1 = conf_matrix1
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates counts of three or more digits into scientific notation.
sns.heatmap(dataframe_conf_matrix1, annot=True, fmt='d')

In [None]:
# Text report of per-class metrics for the logistic model's test predictions.
from sklearn.metrics import classification_report
class_report1 = classification_report(Y_test, y_pred)
print(class_report1)

In [None]:
# scikit-learn's confusion matrix is indexed [true label][predicted label].
# With labels 0 and 1 that is [[TN, FP], [FN, TP]], class 1 being the positive
# class.
tn, fp = Cm1[0][0], Cm1[0][1]
fn, tp = Cm1[1][0], Cm1[1][1]
total = tn + fp + fn + tp
Accuracy = (tp + tn) / total
print("Accuracy",Accuracy)
Error_rate = (fp + fn) / total
print("Error_rate",Error_rate)
# Sensitivity (recall): share of actual positives that were predicted positive.
Sensitivity = tp / (tp + fn)
print("Sensitivity",Sensitivity)
# Specificity: share of actual negatives that were predicted negative.
Specificity = tn / (tn + fp)
print("Specificity",Specificity)


In [None]:
def doLogisticRegression(X, Y, test_size = 0.20, random_state = 42, penalty='l2', solver='lbfgs'):
    """Fit a logistic regression on a fresh split and return its test accuracy.

    X and Y are divided with train_test_split using ``test_size`` and
    ``random_state``; ``penalty`` and ``solver`` are forwarded to
    LogisticRegression. The return value is accuracy_score on the held-out rows.
    """
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    model = LogisticRegression(penalty=penalty, solver=solver).fit(features_fit, labels_fit)
    return accuracy_score(labels_eval, model.predict(features_eval))

In [None]:
# Manual grid over split size, seed, penalty and solver; one line is printed
# per fitted combination.
# NOTE(review): newer scikit-learn releases spell the unpenalized option as
# None rather than 'none' -- confirm against the installed version.
penalties = ['none', 'l2']
test_size = [0.30, 0.25, 0.20]
random_states = [10, 25, 55]
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
for t_size in test_size:
    for r_state in random_states:
        for penalty in penalties:
            for solver in solvers:
                # liblinear cannot fit an unpenalized model, so skip that pairing
                # instead of letting the whole loop fail.
                if solver == 'liblinear' and penalty == 'none':
                    continue
                # Forward the solver as well; without it every run uses the
                # helper's default solver while the printed line claims otherwise.
                accuracy = doLogisticRegression(X, Y, t_size, r_state, penalty, solver)
                print("Test: {} | Random State: {} | Penalty: {} | Solver: {} | Accuracy : {}".format(t_size, r_state, penalty, solver, accuracy))

In [None]:
# Empty results table for the logistic-regression grid; one column per varied
# setting plus the resulting accuracy.
BC_1= pd.DataFrame(columns = ['Test Size', 'Random States', 'Penalty', 'Solvers', 'Accuracy'])
BC_1

In [None]:
# Same manual grid as the printing cell, but the results are collected into
# the BC_1 table.
# NOTE(review): newer scikit-learn releases spell the unpenalized option as
# None rather than 'none' -- confirm against the installed version.
penalties = ['none', 'l2']
test_size = [0.30, 0.25, 0.20]
random_states = [10, 25, 55]
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
evaluation_rows = []
for t_size in test_size:
    for r_state in random_states:
        for penalty in penalties:
            for solver in solvers:
                # liblinear cannot fit an unpenalized model, so skip that pairing.
                if solver == 'liblinear' and penalty == 'none':
                    continue
                # Forward the solver so the 'Solvers' column reflects the model
                # that was actually fitted.
                accuracy = doLogisticRegression(X, Y, t_size, r_state, penalty, solver)
                evaluation_rows.append({
                    'Test Size': t_size,
                    'Random States': r_state,
                    'Penalty': penalty,
                    'Solvers': solver,
                    'Accuracy': accuracy,
                })
# Build the table once from the collected rows: DataFrame.append no longer
# exists in pandas 2.x, and re-copying the frame on every iteration is wasteful.
BC_1 = pd.DataFrame(
    evaluation_rows,
    columns=['Test Size', 'Random States', 'Penalty', 'Solvers', 'Accuracy'],
)

In [None]:
# Display the collected logistic-regression grid results.
BC_1

In [None]:
# Distribution of the accuracies recorded in BC_1.
sns.displot(x = 'Accuracy', data = BC_1)

# Decision Tree

In [None]:
# Fit a decision tree classifier with default settings on the training set
# (X_train was rebound to the scaled array by the feature-scaling cell).
from sklearn.tree import DecisionTreeClassifier
classifier= DecisionTreeClassifier()
classifier.fit(X_train, Y_train)

In [None]:
# Predict the test set with the decision tree.
# Note: this rebinds `y_pred`, which previously held the logistic predictions.
y_pred= classifier.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted test labels for the decision tree,
# drawn as a heat map.
conf_matrix = confusion_matrix(Y_test, y_pred)
dataframe_conf_matrix = conf_matrix
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates counts of three or more digits into scientific notation.
sns.heatmap(dataframe_conf_matrix, annot=True, fmt='d')

In [None]:
# Confusion matrix of true vs. predicted test labels for the decision tree;
# the metrics cell further down reads it as cm.
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
cm

In [None]:
# Test-set accuracy of the decision tree; kept as acc_score3 for the summary tables.
from sklearn.metrics import accuracy_score
acc_score3 = accuracy_score(Y_test, y_pred)
print(acc_score3)

In [None]:
# Accuracy of the tree on the training split, scaled by 100.
print("Training Score: ",classifier.score(X_train,Y_train)*100)

In [None]:
# Accuracy of the tree on the test split, scaled by 100.
print("Testing Score: ",classifier.score(X_test,Y_test)*100)

In [None]:
# Text report of per-class metrics for the decision tree's test predictions.
from sklearn.metrics import classification_report
class_report = classification_report(Y_test, y_pred)
print(class_report)

In [None]:
# scikit-learn's confusion matrix is indexed [true label][predicted label].
# With labels 0 and 1 that is [[TN, FP], [FN, TP]], class 1 being the positive
# class.
tn, fp = cm[0][0], cm[0][1]
fn, tp = cm[1][0], cm[1][1]
total = tn + fp + fn + tp
Accuracy = (tp + tn) / total
print("Accuracy",Accuracy)
Error_rate = (fp + fn) / total
print("Error_rate",Error_rate)
# Sensitivity (recall): share of actual positives that were predicted positive.
Sensitivity = tp / (tp + fn)
print("Sensitivity",Sensitivity)
# Specificity: share of actual negatives that were predicted negative.
Specificity = tn / (tn + fp)
print("Specificity",Specificity)


In [None]:

def doDecisionTreeClassifier(X, Y, test_size = 0.20, random_state = 42,criterion='gini', max_leaf_nodes=6, min_samples_split=4):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size)
    classifier= DecisionTreeClassifier()
    classifier.fit(X_train, Y_train)
    y_pred= classifier.predict(X_test)
    acc_score = accuracy_score(Y_test, y_pred)
    return acc_score

In [None]:
BC_3= pd.DataFrame(columns = ['Test Size', 'Random States','Criterion' ,'Max_leaf_node', 'Min_samples_split'])
BC_3

In [None]:
test_size = [0.30, 0.25, 0.20]
random_states = [10, 25, 55]
criterion=['gini','entropy']
max_leaf_nodes= [2,4,6,10,15]
min_samples_split=[2,3, 4]
for t_size in test_size:
    for r_state in random_states:
        for crt in criterion:
            for max_leaf in max_leaf_nodes:
                for min_sample in min_samples_split:
                    accuracy = doDecisionTreeClassifier(X, Y, t_size, r_state, max_leaf, min_sample)
                    print("Test: {} | Random State: {} |Criterion:{}|Max_leaf_nodes:{}|min_sample_split:{}| Accuracy : {}".format(t_size, r_state,crt, max_leaf, min_sample, accuracy))

In [None]:
test_size = [0.30, 0.25, 0.20]
random_states = [10, 25, 55]
criterion=['gini','entropy']
max_leaf_nodes= [2,4,6,10,15]
min_samples_split=[2,3, 4]

for t_size in test_size:
    for r_state in random_states:
        for crt in criterion:
            for max_leaf in max_leaf_nodes:
                for min_sample in min_samples_split:
                    accuracy = doDecisionTreeClassifier(X, Y, t_size, r_state,crt, max_leaf, min_sample)
                    
                    BCEvaluation = {}
                    BCEvaluation['Test Size'] = t_size
                    BCEvaluation['Random States'] = r_state
                    BCEvaluation['Criterion'] = crt
                    BCEvaluation['Max_leaf_node'] = max_leaf
                    BCEvaluation['Min_samples_split'] = min_sample
                    BCEvaluation['Accuracy'] = accuracy
                    BC_3= BC_3.append(BCEvaluation, ignore_index = True)
                
        
       # accuracy = doDecisionTreeClassifier(X, Y, t_size, r_state, max_leaf, min_sample)
       #accuracy = doDecisionTreeClassifier(X, Y, t_size, r_state)
                    #print("Test: {} | Random State: {} | Penalty: {} | Solver: {} | Accuracy : {}".format(t_size, r_state, crt, max_leaf, min_sample,accuracy))

        


In [None]:
# Display the collected decision-tree grid results.
BC_3

In [None]:
# Distribution of the accuracies recorded in BC_3.
sns.displot(x = 'Accuracy', data = BC_3)

# Accuracy Score of Models

In [None]:
# Summary table of the test-set accuracy of both models.
prediction_columns = ["NAME OF MODEL", "ACCURACY SCORE"]
# The dict keys match prediction_columns exactly (no trailing space in
# "ACCURACY SCORE"), so the list can drive the column order.
df_pred = {"NAME OF MODEL" : ["LOGISTIC REGRESSION","DECISION TREE"],
           "ACCURACY SCORE" : [acc_score1, acc_score3]}
df_predictions = pd.DataFrame(df_pred, columns=prediction_columns)
df_predictions

In [None]:
# NOTE(review): LabelEncoder and the `oHe` instance are not used by any code
# visible in this notebook -- confirm whether this cell is still needed.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
oHe = OneHotEncoder()

In [None]:
# 10-fold grid search for the logistic regression.
# The grid is split in two so that only fittable penalty/solver pairs are
# tried: of the listed solvers, l1 is supported by liblinear and saga only.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
parameters = [
    {'penalty': ['l1', 'l2'], 'C': c_values, 'solver': ['liblinear', 'saga']},
    {'penalty': ['l2'], 'C': c_values, 'solver': ['newton-cg', 'lbfgs', 'sag']},
]
grid_search = GridSearchCV(estimator = logistic_regressor,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, Y_train)
# best_score_ is the mean cross-validated accuracy on the training split, not
# an accuracy measured on X_test.
best_accuracy_log = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_log)
print(best_parameters)

In [None]:
# 10-fold grid search over the decision tree's criterion, leaf limit and
# split threshold, scored by accuracy on the training split.
parameters = [{
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': [2, 4, 6, 10, 15],
    'min_samples_split': [2, 3, 4],
}]
oHe = OneHotEncoder()
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)
best_accuracy_dtc, best_parameters = grid_search.best_score_, grid_search.best_params_
print(best_accuracy_dtc)
print(best_parameters)

# Comparing Models Before and After Hyperparameter Tuning

In [None]:
# Comparison table: test-set accuracy of each model next to the best
# cross-validated accuracy found by its grid search.
prediction_columns = ["NAME OF MODEL", "ACCURACY SCORE", "BEST ACCURACY (AFTER HYPER-PARAMETER TUNING)"]
# The dict keys match prediction_columns exactly (no trailing space in
# "ACCURACY SCORE"), so the list can drive the column order.
df_pred = {"NAME OF MODEL" : ["LOGISTIC REGRESSION", "DECISION TREE" ],
           "ACCURACY SCORE" : [acc_score1,acc_score3],
           "BEST ACCURACY (AFTER HYPER-PARAMETER TUNING)" : [best_accuracy_log,best_accuracy_dtc]}
df_predictions = pd.DataFrame(df_pred, columns=prediction_columns)
df_predictions