In [None]:
def sig(z):
    """Apply the logistic sigmoid, 1 / (1 + e^(-z)), to ``z``.

    The exponential is evaluated with ``np.exp``, so ``z`` may be a scalar or
    a NumPy array; for an array the result has one value per element.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
def gradient(Y,a,X,m):
    """Return the partial derivative of the cost with respect to the weights.

    Evaluates ``dw = -(1/m) * X . (Y - a)^T``, where ``a`` is the sigmoid of
    the basis function.

    Y -- true labels
    a -- sigmoid activations, same layout as ``Y``
    X -- feature matrix that is dotted with the transposed residual
    m -- number of samples used to average the gradient
    """
    residual_column = np.transpose(Y - a)
    scale = -(1/m)
    return scale * np.dot(X, residual_column)

In [None]:
def gradient_bias(M,a,m):
    """Return the partial derivative of the cost with respect to the bias.

    Evaluates ``-(1/m) * sum(M - a)``: the residual between the labels ``M``
    and the activations ``a``, summed over every element and averaged over
    ``m`` samples.
    """
    total_residual = np.sum(M - a)
    return -(1/m) * total_residual

In [None]:
def model_accuracy(Y,predicted_val):
    """Return the fraction of correctly classified samples.

    Y             -- 2-D ndarray of labels; only its first row is used, after
                     casting to int
    predicted_val -- sequence of predictions convertible with ``int`` (the
                     callers in this file pass the strings '0' and '1')

    Accuracy is (TP + TN) / (TP + TN + FP + FN). A matching 0/1 pair is a
    true positive or true negative; a mismatching 0/1 pair is a false
    positive or false negative. Pairs with any other value are not counted.
    """
    labels = Y.astype(int)[0]
    guesses = [int(p) for p in predicted_val]

    hits = 0    # TP + TN
    misses = 0  # FP + FN
    for idx, label in enumerate(labels):
        if label != 0 and label != 1:
            continue
        guess = guesses[idx]
        if guess == label:
            hits += 1
        elif guess in (0, 1):
            misses += 1

    return hits / (hits + misses)

In [None]:
def cost_func(a, Y, m):
    """Return the binary cross-entropy cost of the model.

    cost = -(1/m) * sum(Y*log(a) + (1-Y)*log(1-a))

    a -- sigmoid activations (predicted probabilities)
    Y -- true labels, same layout as ``a``
    m -- number of samples used to average the cost

    The activations are clipped to [eps, 1 - eps] before the logarithm. A
    saturated sigmoid returns exactly 0.0 or 1.0, and ``log(0)`` would
    otherwise turn the cost into ``inf`` or ``nan`` (via ``0 * -inf``).
    """
    eps = 1e-15
    a_safe = np.clip(a, eps, 1 - eps)
    cost = -(1/m)*(np.sum(Y*np.log(a_safe)+(1-Y)*np.log(1-a_safe)))
    return cost

In [None]:
def model_precision(Y,predicted_val):
    """Return the precision, TP / (TP + FP), of the predictions.

    Y             -- 2-D ndarray of labels; only its first row is used, after
                     casting to int
    predicted_val -- sequence of predictions convertible with ``int``

    Returns 0.0 when no sample is predicted positive (TP + FP == 0) instead
    of raising ZeroDivisionError, so a model that predicts a single class
    can still be scored.
    """
    labels = Y.astype(int)[0]
    guesses = [int(p) for p in predicted_val]

    true_pos = 0
    false_pos = 0
    for idx, label in enumerate(labels):
        if guesses[idx] != 1:
            continue  # only positive predictions contribute to precision
        if label == 1:
            true_pos += 1
        elif label == 0:
            false_pos += 1

    predicted_pos = true_pos + false_pos
    if predicted_pos == 0:
        return 0.0
    return true_pos / predicted_pos

In [None]:
def model_recall(Y,predicted_val):
    """Return the recall, TP / (TP + FN), of the predictions.

    Y             -- 2-D ndarray of labels; only its first row is used, after
                     casting to int
    predicted_val -- sequence of predictions convertible with ``int``

    Returns 0.0 when the labels contain no positive sample (TP + FN == 0)
    instead of raising ZeroDivisionError.
    """
    labels = Y.astype(int)[0]
    guesses = [int(p) for p in predicted_val]

    true_pos = 0
    false_neg = 0
    for idx, label in enumerate(labels):
        if label != 1:
            continue  # only positive labels contribute to recall
        guess = guesses[idx]
        if guess == 1:
            true_pos += 1
        elif guess == 0:
            false_neg += 1

    actual_pos = true_pos + false_neg
    if actual_pos == 0:
        return 0.0
    return true_pos / actual_pos

In [None]:
def f1_score1(Y,predicted_val):
    """Return the F1 score: the harmonic mean of precision and recall.

    F1 = 2 / ((1/precision) + (1/recall)), with precision and recall taken
    from ``model_precision`` and ``model_recall`` for the same inputs.

    Returns 0.0 when either precision or recall is 0 (no true positives).
    The reciprocals are undefined there, and the harmonic mean tends to 0.
    """
    prec_val = model_precision(Y,predicted_val)
    recall_val = model_recall(Y,predicted_val)
    if prec_val == 0 or recall_val == 0:
        return 0.0
    return 2/((1/prec_val)+(1/recall_val))

In [None]:
def test_data_eval(test,w_final,b_final):
    """Score the trained model on the test set and print the key metrics.

    test    -- DataFrame with one row per sample; the labels are read from
               the column named 1 and the first column is dropped before
               prediction
    w_final -- trained weight column vector
    b_final -- trained bias

    Prints accuracy, precision, recall and F1 score. Returns nothing.
    The caller's DataFrame is left untouched, so the function can be called
    repeatedly on the same ``test`` object.
    """
    # labels as a single row; -1 lets the row length follow the size of the test set
    test_label = np.array(test[1]).reshape((1, -1))

    # drop the label column on a copy (no inplace mutation) and put samples in columns
    features = test.drop(test.columns[0],axis=1).T

    # basis equation followed by the sigmoid
    z_test=np.array(np.dot(w_final.T,features)+b_final)
    a_test=sig(z_test)

    # threshold the activations with the same helper the training loop uses
    predicted_val_test=predict(a_test)

    # accuracy, precision, recall and F1 of the test set
    accuracy_obtained_test = model_accuracy(test_label,predicted_val_test)
    print("Applying key metrics on test data")
    print("Accuracy of test data: ",accuracy_obtained_test)
    precision_test = model_precision(test_label,predicted_val_test)
    print("Precision of test data: ",precision_test)
    recall_test = model_recall(test_label,predicted_val_test)
    print("Recall of test data: ",recall_test)
    f1_score_value= f1_score1(test_label,predicted_val_test)
    print("F1 score of test data: ",f1_score_value)


In [None]:
def predict(a):
    """Turn sigmoid activations into class predictions.

    Every element of ``a`` (visited with ``np.nditer``) maps to the string
    '1' when it is at least 0.5 and to '0' otherwise. The result is a flat
    list of those strings.
    """
    return ['1' if activation >= 0.5 else '0' for activation in np.nditer(a)]

In [None]:
import pandas as pd
import sklearn
import numpy as np
import math
%matplotlib inline
import matplotlib.pyplot as plt

# Encoding applied to the label column: "B" -> 0, "M" -> 1.
cancer_types = {"B":0,"M":1}

#step 1 : read the csv file; header=None means the columns are labelled 0, 1, 2, ...

my_dataset=pd.read_csv("ml_dataset.csv",header=None)

#step 2 : drop the first column, then encode the label column (now the first one) as 0/1

my_dataset1=pd.DataFrame(my_dataset)
my_dataset1.drop(my_dataset1.columns[0],axis=1, inplace=True)
my_dataset1.replace({my_dataset1.columns[0]:cancer_types},inplace=True)

#step 3 : min-max normalization, column by column: (x - min) / (max - min)

norm=my_dataset1.copy()
for features in my_dataset1.columns:
    max_value=my_dataset1[features].max()
    min_value=my_dataset1[features].min()
    norm[features]=(my_dataset1[features]-min_value)/(max_value-min_value)

#step 4 : shuffle the rows, then split them 80% / 10% / 10% into training, validation and test sets.
# Positional iloc slices replace np.split, which relies on the deprecated DataFrame.swapaxes.
# Each part is copied so later column drops act on an independent frame.

shuffled = norm.sample(frac=1)
train_end = int(.8*len(norm))
validate_end = int(.9*len(norm))
X = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()
#for a 569-row, 31-column dataset: X = (455,31), validate = (57,31), test = (57,31)


#step 5 : extract the training labels (column 1 of X) as a single-row array

Y = np.array(X[1]) # Y is of the data type - numpy.ndarray
shape=(1, len(X)) # one row, one entry per training sample (no hard-coded sample count)
Y = Y.reshape(shape)
#Y.shape = (1, number of training samples)


#step 6 : extract the validation labels the same way

validate_label=np.array(validate[1])
shape1=(1, len(validate))
validate_label=validate_label.reshape(shape1)
#validate_label.shape = (1, number of validation samples)


#step 7 : drop the label column from the training set and transpose so each column is one sample
# (non-inplace drop: X may be a slice of the shuffled frame)
X = X.drop(X.columns[0],axis=1).T
#X.shape = (number of features, number of training samples)

#step 8 : same for the validation set
validate = validate.drop(validate.columns[0],axis=1).T
#validate.shape = (number of features, number of validation samples)


#step 9 : set up the sample counts, the learning rate and the trainable parameters

m = X.shape[1]         # number of columns of X, i.e. training samples
u = validate.shape[1]  # number of columns of validate, i.e. validation samples
n = 0.1                # learning rate
b = 0                  # bias starts at zero
w = np.zeros(shape=(X.shape[0], 1))  # one zero weight per row of X, stored as a column vector


#step 10 : training loop - 10000 epochs of gradient descent, each using all of X.
# Four per-epoch histories are recorded for the plots further down:
# training/validation accuracy and training/validation cost.
training_accuracy = []
loss=[]
loss_validate=[]
validate_accuracy=[]
for epoch in range(10000):
    
    #basis function w^T.X + b for the training and the validation set
    z1=np.array(np.dot(w.T,X)+b)
    z_validate=np.array(np.dot(w.T,validate)+b)
    

    #sigmoid activations for both sets
    a=sig(z1)
    a_validate=sig(z_validate)
    
    #class predictions ('0'/'1' strings) obtained by thresholding the activations
    predicted_val=predict(a)
    predicted_val_data=predict(a_validate)
           
    #accuracy on the training and validation data with the current parameters
    accuracy_obtained = model_accuracy(Y,predicted_val)
    accuracy_validate = model_accuracy(validate_label,predicted_val_data)
    
    #keep the accuracies for plotting
    training_accuracy.append(accuracy_obtained)
    validate_accuracy.append(accuracy_validate)
    
    #cost of both sets, evaluated BEFORE this epoch's parameter update
    cost2=cost_func(a,Y,m) #cost func of X
    cost_validate = cost_func(a_validate,validate_label,u) #cost func of validate
    
    #gradient-descent step; both gradients use the activations computed above
    w=w-n*gradient(Y,a,X,m) #weight update: w = w - n * dCost/dw
    b=b-n*gradient_bias(Y,a,m) #bias update: b = b - n * dCost/db
    
    #keep the costs for plotting
    loss.append(cost2) #list of X cost
    loss_validate.append(cost_validate) #list of validate cost
    

#parameters after the last epoch, used to evaluate the test data
w_final=w
b_final=b

#x-axis values for the plots: epochs numbered 1..N, where N is the number of recorded
#history entries, so the axis always has the same length as the curves plotted against it
epochs = list(range(1, len(training_accuracy) + 1))
    
#one figure per recorded history, in display order:
#(values, line colour, y-axis label, figure title)
learning_curves = [
    (training_accuracy, 'r', 'Training accuracy',
     'Variation of training accuracy with respect to epochs'),
    (validate_accuracy, 'b', 'Validation accuracy',
     'Variation of validation accuracy with respect to epochs'),
    (loss, 'r', 'Cost of training data',
     'Variation of cost with respect to epochs'),
    (loss_validate, 'b', 'Cost of validation data',
     'Variation of cost with respect to epochs'),
]

for curve_values, line_colour, y_label, figure_title in learning_curves:
    plt.plot(epochs,curve_values,line_colour)
    plt.xlabel('Number of epochs')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.show()


#final step: score the trained weights and bias on the test split;
#test_data_eval prints the accuracy, precision, recall and F1 score of the test data
test_data_eval(test,w_final,b_final)