<a href="https://colab.research.google.com/github/MartinKahabka/ClIntKahabka/blob/main/01_linear_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
from sklearn import datasets

In [3]:
# we are going to classify a popular breast cancer dataset
data, target = datasets.load_breast_cancer(return_X_y=True)

In [4]:
# data is a matrix of shape (569, 30)
# the first number refers to the number of cases or samples while the second
# number refers to the number of features from which we try to predict
# breast cancer
#
# please see the documentation @ https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
# for more details
data.shape

(569, 30)

In [5]:
# target is a binary vector of size (569,) in which each entry is the label of
# each case, either malignant (0) or benign (1).
target[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [6]:
# for simplicity we cut the number of features from 30 to just two features
data = data[:, :2]

In [9]:
# Excercise 1.1
# Try to find a linear classifier by hand that can classify the dataset
# as good as possible. Let the classification rule be w1 * f1 + w2 * f2 + w3 > 0
# where w1, w2 and w3 are the weights to be learnt and f1, f2 are the features.
# What accuracy can you reach?
best_accuracy = 0
best_preds = None

for w1 in np.arange(-1, 2, step=0.1):
    for w2 in np.arange(-1, 2, step=0.1):
        for w3 in np.arange(-40, 40, 1):
          preds = (w1 * data[:, 0] + w2 * data[:, 1] + w3 > 0).astype(int)
          accuracy = np.mean(preds != target)

          if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_preds = (w1, w2, w3)


w1, w2 ,w3 = best_preds
print("Best weights are")
print(best_preds)
print(f"Accuracy: {accuracy}")

plt.scatter(data[:, 0], data[:, 1], c=target)
x_vals = np.linspace(data[:, 0].min(), data[:, 0].max(), 100)
y_vals = - (w1 * x_vals + w3) / w2
plt.plot(x_vals, y_vals)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Linear Classifier w. Decision Boundary")
plt.show()



TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
# Excercise 1.2
# Learn the optimal parameters automatically using gradient descent. For the
# loss function use a simple squared loss (t - tp)**2 where t is the target
# label (either 0 or 1) and tp = w1 * f1 + w2 * f2 + w3 is the predicted label.
# What accuracy can you reach? Why is this loss function problematic? Is there
# a better alternative?