In [32]:
import csv
import os

import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler

In [33]:
def plotDataHistogram(x, variableName, bins=10):
    """Show a histogram of `x` titled 'Histogram of <variableName>'.

    Args:
        x: values to bin; passed straight to `plt.hist`.
        variableName: string appended to the plot title.
        bins: bin specification forwarded to `plt.hist`. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        None. The figure is displayed via `plt.show()`.
    """
    # The counts / edges / patches returned by plt.hist are not needed here.
    plt.hist(x, bins)
    plt.title('Histogram of ' + variableName)
    plt.show()

In [34]:
def loadDataMoreInputs(fileName, inputVariabNames, outputVariabName):
    """Load feature rows and binary labels from a comma-delimited CSV file.

    The first row of the file is taken as the column names; every following
    non-empty row is treated as one sample.

    Args:
        fileName: path of the CSV file.
        inputVariabNames: sequence of column names to use as features. Any
            number of names is accepted; each sample's features are returned
            in the same order as the names.
        outputVariabName: name of the column holding the class label.

    Returns:
        A tuple (inputs, outputs). `inputs` is a list with one list of floats
        per sample. `outputs` is a list holding 1 where the label cell equals
        'M' and 0 otherwise.

    Raises:
        ValueError: if a requested column name is missing from the header or
            a feature cell cannot be converted to float.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(fileName, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        dataNames = next(csv_reader, [])
        # Blank lines are yielded as empty rows; indexing them would fail.
        data = [row for row in csv_reader if row]

    inputColumns = [dataNames.index(name) for name in inputVariabNames]
    outputColumn = dataNames.index(outputVariabName)

    inputs = [[float(row[col]) for col in inputColumns] for row in data]
    outputs = [1 if row[outputColumn] == 'M' else 0 for row in data]

    return inputs, outputs

In [71]:
from MyLogisticRegressor import LogisticRegressorBinary

# The dataset is looked up relative to the current working directory.
crtDir = os.getcwd()
filePath = os.path.join(crtDir, 'data', 'wdbc.csv')

inputs, outputs = loadDataMoreInputs(filePath, ['Radius', 'Texture'], 'Diagnosis')

# split data into training data (80%) and testing data (20%)
# The fixed seed makes np.random.choice draw the same training sample on every run.
np.random.seed(5)
indexes = list(range(len(inputs)))
trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)
# `i in <ndarray>` scans the whole array for each index; a set gives the
# same complement with constant-time lookups.
trainIndexSet = set(trainSample.tolist())
testSample = [i for i in indexes if i not in trainIndexSet]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [inputs[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]

# normalization: the scaler is fitted on the training inputs only and then
# applied to both the training and the testing inputs
scaler = StandardScaler()
isUnivariate = not isinstance(trainInputs[0], list)

if isUnivariate:
    # wrap every scalar sample in its own one-element row before scaling
    trainInputs = [[value] for value in trainInputs]
    testInputs = [[value] for value in testInputs]

scaler.fit(trainInputs)
trainInputs = scaler.transform(trainInputs)
testInputs = scaler.transform(testInputs)

if isUnivariate:
    # unwrap the single column back into a flat list of scaled values
    trainInputs = [row[0] for row in trainInputs]
    testInputs = [row[0] for row in testInputs]

modelType = 'tool'
# training step: 'tool' selects sklearn's LogisticRegression, any other value
# selects LogisticRegressorBinary imported from MyLogisticRegressor
if modelType != "tool":
    model = LogisticRegressorBinary()
    # this branch hands the labels to fit() as a NumPy array
    trainOutputs = np.array(trainOutputs)
    model.fit(trainInputs, trainOutputs)
    w0 = model.intercept_
    w1, w2 = model.coef_[0], model.coef_[1]
    print('the learnt model: f(x) = ', w0, ' + ', w1, ' * x1 + ', w2, ' * x2')
else:
    model = LogisticRegression()
    model.fit(trainInputs, trainOutputs)
    # here w0 and w1 are indexed when printed: w0[0] is the intercept,
    # w1[0] and w1[1] the two feature weights
    w0 = model.intercept_
    w1 = model.coef_[0]
    print('the learnt model: f(x) = ', w0[0], ' + ', w1[0], ' * x1 + ', w1[1], ' * x2')

# Predict labels for the held-out samples and report the classification metrics.
computedTestOutputs = model.predict(testInputs)

for metricLabel, metricFn in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
):
    print(metricLabel, metricFn(testOutputs, computedTestOutputs))

# Classify one new sample given as raw [Radius, Texture] values; it goes
# through the same fitted scaler before prediction.
normalized_inputs = scaler.transform([[18, 10]])
prediction = model.predict(np.array(normalized_inputs))
# Label 0 is reported as benign, any other label as malign.
print("Result: benign." if prediction[0] == 0 else "Result: malign.")




the learnt model: f(x) =  -0.9122440356107672  +  3.714265538441941  * x1 +  0.9215248354552286  * x2
Accuracy:  0.7982456140350878
Precision:  0.8461538461538461
Recall:  0.66
Result: malign.
