In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the raw dataset from the CSV file shipped next to the notebook.
DATA_PATH = './dataset/data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick look at the first rows of the dataset.
df.head(n=5)

In [None]:
# Shuffle the rows because the records in the file are stored in series.
# A fixed random_state keeps the shuffled order identical on every
# "Restart Kernel -> Run All", so downstream results are reproducible.
df=df.sample(frac=1,random_state=42)

In [None]:
# Confirm the rows are shuffled now: show the first ten.
df.head(10)

In [None]:
# Drop the 'Unnamed: 32' and 'id' columns; they carry no useful information.
# errors='ignore' makes the cell idempotent: because it overwrites `df`,
# running it a second time would otherwise raise KeyError for the columns
# that are already gone.
df=df.drop(columns=['Unnamed: 32','id'],errors='ignore')

In [None]:
# Number of rows and columns.
# Expected: 569 rows and 31 columns, i.e. 30 features plus one target class.
df.shape

In [None]:
# Inspect the column data types.
# Every column is numeric except the target label 'diagnosis'.
df.dtypes

In [None]:
# Summary statistics for the dataset.
df.describe()

In [None]:
# True if at least one value anywhere in the dataset is missing.
df.isna().values.any()

In [None]:
# Count how many samples belong to each class label.
diagnosis_column = df['diagnosis']
diagnosis_column.value_counts()

In [None]:
# Data visualization: one histogram per column, 50 bins each.
df.hist(figsize=(15, 15), bins=50)
plt.show()

In [None]:
# Scatter matrix to inspect the relationship between every pair of
# attributes, coloured by the 'diagnosis' label.
sns.pairplot(data=df, hue='diagnosis')

In [None]:
# Bar chart with the number of samples per label.
ax = sns.countplot(data=df, y='diagnosis', palette='Set2')

In [None]:
# Correlation heatmap of the features.
# `df` still contains the string label column 'diagnosis' at this point.
# Correlation is only defined for numeric data, and pandas >= 2.0 raises
# instead of silently skipping non-numeric columns, so the numeric columns
# are selected explicitly.
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(df.select_dtypes(include='number').corr(),ax=ax)

In [None]:
# Box plots to check for outliers in each feature.

# Wrapped in a function so it can be called again later.
def boxPlot(dff):
    """Draw one box plot per column of ``dff``, skipping 'diagnosis'.

    Each column is plotted on the x-axis of its own 5x2 inch figure.
    """
    features = dff.drop(columns=['diagnosis'])
    for name in features.columns:
        plt.figure(figsize=(5, 2))
        sns.boxplot(data=features, x=name, palette="colorblind")
boxPlot(df)

In [None]:
# Quartiles and inter-quartile range (IQR) of every feature.
# `df` still contains the string label column 'diagnosis'; calling
# quantile() on the mixed-dtype frame raises TypeError on pandas >= 2.0,
# so the quartiles are computed on the numeric columns only.
numeric_features=df.select_dtypes(include='number')
Q1=numeric_features.quantile(0.25)
Q3=numeric_features.quantile(0.75)
IQR=Q3-Q1

##---quartiles and IQR

#print("Quartile 1:\n",Q1)
#print("\nQuartile 3:\n",Q3)
#print("\nIQR :\n",IQR)

#--display outlier
#print((df<(Q1-1.5*IQR))|(df>(Q3+1.5*IQR)))

In [None]:
# Remove every row that contains at least one outlier, i.e. a value
#   < Q1-1.5*IQR  or  > Q3+1.5*IQR
# Only the columns that actually have quartiles are compared, so the string
# label column is never compared against numeric bounds.
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)
feature_values = df[Q1.index]
is_outlier_row = ((feature_values < lower_bound) | (feature_values > upper_bound)).any(axis=1)

df_out = df[~is_outlier_row]
# Shapes before and after the outlier removal.
df.shape,df_out.shape

In [None]:
#-- visualize the box plots again after outlier removal
#boxPlot(df_out)

In [None]:
# Outliers are handled; separate the labels from the features.
y = df_out['diagnosis']
X = df_out.drop(columns='diagnosis')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as unique integers.
lbl = LabelEncoder()
y = lbl.fit(y).transform(y)

# In the output below, M is encoded as 1 and B as 0.
y[:20]

In [None]:
# Split the data into train and test sets (20% held out for testing).
from sklearn.model_selection import train_test_split

# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the class ratio the same in the train and test parts.
X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
# Sanity-check the sizes of the four splits before training.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
#cross validation function
from sklearn.metrics import confusion_matrix,make_scorer
from sklearn.model_selection import cross_validate


# Scorer helpers, one per confusion-matrix cell.
# labels=[0, 1] (the LabelEncoder output: B -> 0, M -> 1) forces a 2x2
# matrix; without it the matrix collapses to 1x1 whenever a fold only
# contains a single class, and indexing [1, 1] raises IndexError.
def tn(y_true, y_pred):
    """Return the number of true negatives (actual 0, predicted 0)."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 0]


def fp(y_true, y_pred):
    """Return the number of false positives (actual 0, predicted 1)."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 1]


def fn(y_true, y_pred):
    """Return the number of false negatives (actual 1, predicted 0)."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 0]


def tp(y_true, y_pred):
    """Return the number of true positives (actual 1, predicted 1)."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 1]


def acc(y_true,y_pred):
    """Return the rounded accuracy; thin wrapper around ``accuracy``."""
    return accuracy(y_true,y_pred)

#custom accuracy
def accuracy(y_true,y_pred):
    """Return (TP + TN) / N from the confusion matrix, rounded to 2 decimals.

    y_true and y_pred are the actual and predicted binary labels (0 / 1).
    """
    # labels=[0, 1] guarantees a 2x2 matrix even when only one class occurs
    # in y_true / y_pred, so the [1, 1] lookup below cannot go out of bounds.
    cnf = confusion_matrix(y_true, y_pred, labels=[0, 1])
    total = cnf.sum()
    correct = cnf[1, 1] + cnf[0, 0]
    return round(correct / total, 2)
    
# Scorers used by cross_validate: the four confusion-matrix cells plus the
# rounded accuracy. Results are read back under the 'test_<key>' names.
# (An earlier assignment that referenced accuracy_score was removed: that
# name is not imported yet at this point, so it raised NameError on a fresh
# kernel, and its value was overwritten immediately anyway.)
scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),
           'fp': make_scorer(fp), 'fn': make_scorer(fn),
          'acc': make_scorer(acc) }

# Helper called after the cross-validation of each model.
def display_result(result):
    """Print the TP, TN, FN, FP and accuracy entries of ``result``.

    ``result`` must provide the keys 'test_tp', 'test_tn', 'test_fn',
    'test_fp' and 'test_acc'; they are printed in that order, one per line.
    """
    rows = (
        ("TP: ", 'test_tp'),
        ("TN: ", 'test_tn'),
        ("FN: ", 'test_fn'),
        ("FP: ", 'test_fp'),
        ("Accuracy: ", 'test_acc'),
    )
    for label, key in rows:
        print(label, result[key])

In [None]:
# The acc and roc lists collect the hold-out scores of each algorithm.
# NOTE: `acc` rebinds the name of the scorer helper function `acc`. The
# `scoring` dict built earlier keeps its own reference to the function, but
# re-running that cell and this one out of order clobbers one or the other.
acc=[]
roc=[]

#import library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score

#Logistic Regression
LR=LogisticRegression()
LR.fit(X_train,y_train)
y_pred=LR.predict(X_test)

#find accuracy
ac=accuracy_score(y_test,y_pred)
acc.append(ac)

# ROC AUC score
# more: https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5
# NOTE(review): roc_auc_score receives hard class predictions here rather
# than scores/probabilities -- confirm this is intended.
rc=roc_auc_score(y_test,y_pred)
roc.append(rc)
print("Accuracy {0} ROC {1}".format(ac,rc))

#-- the hold-out accuracy alone is a single estimate; cross validation
#-- gives a clearer idea of how accurate the model really is
#--cross validation
# Validate the logistic-regression model itself. The original passed `clf`,
# which is not defined until the SVM cell further down (NameError on a
# fresh kernel, or a stale model from a previous run).
result=cross_validate(LR,X_train,y_train,scoring=scoring,cv=10)
display_result(result)

In [None]:
# Support Vector Machine
from sklearn.svm import SVC

# Linear-kernel SVM fitted on the training split.
clf = SVC(kernel='linear', gamma='auto').fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Hold-out metrics, recorded for the final comparison plots.
ac = accuracy_score(y_test, y_pred)
rc = roc_auc_score(y_test, y_pred)
acc.append(ac)
roc.append(rc)
print("Accuracy {0} ROC {1}".format(ac,rc))

# 10-fold cross validation on the training split.
result = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring)
display_result(result)

In [None]:
# K-Nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier

# 9-neighbour classifier fitted on the training split.
clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=9).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Hold-out metrics, recorded for the final comparison plots.
ac = accuracy_score(y_test, y_pred)
rc = roc_auc_score(y_test, y_pred)
acc.append(ac)
roc.append(rc)
print("Accuracy {0} ROC {1}".format(ac,rc))

# 10-fold cross validation on the training split.
result = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring)
display_result(result)

In [None]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier fitted on the training split.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Hold-out metrics, recorded for the final comparison plots.
ac = accuracy_score(y_test, y_pred)
rc = roc_auc_score(y_test, y_pred)
acc.append(ac)
roc.append(rc)
print("Accuracy {0} ROC {1}".format(ac,rc))

# 10-fold cross validation on the training split.
result = cross_validate(clf, X_train, y_train, cv=10, scoring=scoring)
display_result(result)

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature sub-sampling, so the
# forest (and every score derived from it) is identical on each re-run.
clf=RandomForestClassifier(n_estimators=20,max_depth=10,random_state=42)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

# Hold-out metrics, recorded for the final comparison plots.
ac=accuracy_score(y_test,y_pred)
acc.append(ac)
rc=roc_auc_score(y_test,y_pred)
roc.append(rc)
print("Accuracy {0} ROC {1}".format(ac,rc))

#cross validation
result=cross_validate(clf,X_train,y_train,scoring=scoring,cv=10)
display_result(result)

In [None]:
# Bar charts comparing the algorithms.
# The names and colours are listed in the order in which the models
# appended their scores to `acc` and `roc`.
model_names = ['Logistic Regression','SVM','KNN','Naivye Bayes','Random Forest']
bar_colors = ['salmon','r','g','b','orange']

# Accuracy score per algorithm.
plt.figure(figsize=(8, 5))
plt.bar(model_names, acc, color=bar_colors, label='Accuracy')
plt.ylabel('Accuracy Score')
plt.xlabel('Algortihms')

# ROC AUC per algorithm.
plt.figure(figsize=(8, 5))
plt.bar(model_names, roc, color=bar_colors, label='ROC AUC')
plt.ylabel('ROC AUC')
plt.xlabel('Algortihms')
plt.show()