In [1]:
import numpy as np
import pandas as pd
import processorData as process

In [2]:
#opening the file that contains the data set and removing the missing values
file = open("breast-cancer.data", "r")
file_data = file.readlines()
data = []
for line in file_data:
    if '?' in line:    #Excluding the missing values in the data set
        continue
    data.append(line.strip("\n").split(','))

In [3]:
data2D = data

In [None]:
len(data2D)

In [4]:
process = process.processor(data2D)

In [5]:
normalizedData = process.normalizedData()

In [6]:
np.random.shuffle(normalizedData)

In [7]:
training_data = normalizedData[:221]
test_data = normalizedData[221:260]
validation_data = normalizedData[260:]

In [8]:
def h(x, θ): # Regression function
    return 1/(1+np.exp(-np.dot(x, θ)))

In [9]:
#Logistic regression with regularazation
def logistic_regression(training_data):
    x_values = np.array(training_data)[:,1:]
    y_values = np.array(training_data)[:,0]
    X = np.hstack([np.ones(x_values.shape[0])[np.newaxis].T, x_values[:,1:]]) #the design matrix
    α = 1e-4 # define our learning rate
    λ = 1e-2
    θ = np.ones(X.shape[1]) # initialize our parameters
    θ_old = np.zeros(X.shape[1]) # initialize the old parameter values (must be different from the parameter values so we enter the while loop below)
    while np.sqrt(np.sum(np.power(θ - θ_old, 2))) > 0.0005: # while euclidean norm > 0.0005 (so ϵ = 0.0005) 
        θ_old = θ # set old parameter values to parameter values before they are updated
        for i in range(0,X.shape[0]): # loop over each row of the design matrix (each data point)
            θ1 = θ - α*( (h(X[i], θ) - y_values[i]) * X[i] ) # update the parameters using the update rule
            θ = θ - α*( (h(X[i], θ) - y_values[i]) * X[i] + λ*θ)
            θ[0] = θ1[0] 
            
    return θ

In [10]:
θ= logistic_regression(training_data)

In [11]:
θ

array([0.2250108 , 0.20714064, 0.45732607, 0.75844611, 0.16267763,
       1.25892994, 0.46625225, 0.5289725 , 0.30524507])

In [12]:
def countClass(data):
    counter1 = 0 # for no-occurrence-events
    counter2 = 0 # for occurence-events
    for i in range(0,len(data)):
        if(data[i][0] == 0):
            counter1 = counter1 + 1
        if (data[i][0] == 1):
            counter2 = counter2 + 1
    return counter1,counter2

In [13]:
def confusion_matrix(true_class,app_class):
    TP = 0
    TN = 0 
    FP = 0
    FN = 0
    for i in range(0,len(app_class)):
        if (app_class[i] == 0):
            if (app_class[i] == true_class[i]):
                TP = TP +1
            else :
                FP = FP+1
        if (app_class[i] == 1):
            if (app_class[i] == true_class[i]):
                TN = TN +1
            else :
                FN = FN+1
    return TP,FP,TN,FN

In [14]:
def apply_LR(data,θ):
    x_values = np.array(data)[:,1:]
    actual_no_occurrence_events = 0 #actual no-occurence-events indicated by 0
    actual_occurrence_events = 0   #actual occurence-events indicated by 1  
    actual_no_occurrence_events, actual_occurrence_events = countClass(data)
    arr = []
    for i in x_values:
        tmp = h(i,θ)
        if tmp < 0.5:
            arr.append(0) # classify as no-occurence-events
        elif tmp >= 0.5:
            arr.append(1) # classify as occurence-events
    approximated_no_occurrence_events,  approximated_occurrence_events = countClass(np.array(arr)[np.newaxis].T)
    TP,FP,TN,FN = confusion_matrix(np.array(data)[:,0].astype(int).tolist(),arr)  
    accuracy = 'accuracy is: '+str(round(((TP+TN)/(TP+TN+FP+FN))*100))+'%'
    d = {'approxiamted no-occurence-events': [TP,FP], 'approximated occurence-events': [FN,TN]}
    df = pd.DataFrame(data=d,index=['actual no-occurence-events','actual occurence-events'])
    return df,accuracy

In [15]:
d,a= apply_LR(test_data,θ)

In [16]:
d.head()

Unnamed: 0,approxiamted no-occurence-events,approximated occurence-events
actual no-occurence-events,21,0
actual occurence-events,11,7


In [17]:
a

'accuracy is: 72%'