<a href="https://colab.research.google.com/github/MarinCervinschi/DeepLearning/blob/main/1b_LogReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
import csv

import gdown
#from googledrivedownloader import GoogleDriveDownloader
#import GoogleDriveDownloader
import zipfile

# Small positive constant used to keep logarithms away from log(0).
# NOTE(review): Python `float` is passed here, so this is presumably the
# float64 machine epsilon rather than the float32 one - confirm this is intended.
eps_torch = torch.finfo(float).eps

# Seed every random number generator the notebook relies on.
# The dataset shuffle / train-test split use NumPy's global RNG, while the
# weight initialisation uses torch's, so both must be seeded for the results
# to be reproducible after "Restart & Run All".
np.random.seed(191090)
torch.manual_seed(191090)

<torch._C.Generator at 0x7d558934c490>

# Core goals of the lab
1) Learn how to use torch.optim instead of manual parameter updates
2) Implement logistic regression (useful for classification)


In [None]:
# Fetch the zipped dataset from Google Drive into the working directory.
gdown.download(
    url="https://drive.google.com/uc?id=1SagLh5XNSV4znhlnkLRkV7zHPSDbOAqv",
    output="./got.zip",
    quiet=False,
)

# Unpack the archive in place (presumably this yields the 'got.csv' file that
# is loaded further down - verify against the archive content).
with zipfile.ZipFile("got.zip", mode="r") as archive:
    archive.extractall()

Downloading...
From: https://drive.google.com/uc?id=1SagLh5XNSV4znhlnkLRkV7zHPSDbOAqv
To: /content/got.zip
100%|██████████| 84.6k/84.6k [00:00<00:00, 57.3MB/s]


In [None]:
def load_got_dataset(path, train_split=0.8, verbose=True):
    """
    Loads the Game of Thrones dataset and splits it into train and test sets.

    The csv file must contain a header row followed by one row per character:
    the first column is the character name, the last column is the target
    (compared against 0 and 1) and every column in between is a numeric feature.

    Features are min-max normalized to [0, 1] column by column and a constant
    column of ones (the bias) is appended as last feature.
    Test examples are drawn without replacement; train examples are drawn
    *with* replacement among the remaining rows. Both draws use sampling
    probabilities that give the two classes the same total weight.

    Shuffling and sampling use NumPy's global random generator: seed it with
    np.random.seed(...) before calling this function to get a reproducible split.

    Parameters
    ----------
    path: str
        the relative path of the csv file.
    train_split: float
        percentage of training examples in [0, 1].
    verbose: bool
        whether or not to print the dataset shape, the column names and
        a preview of the first rows.

    Returns
    -------
    tuple
        x_train: np.array
            training characters. shape=(n_train_examples, n_features)
        y_train: np.array
            training labels. shape=(n_train_examples,)
        train_names: np.array
            training names. shape=(n_train_examples,)
        x_test: np.array
            test characters. shape=(n_test_examples, n_features)
        y_test: np.array
            test labels. shape=(n_test_examples,)
        test_names: np.array
            test names. shape=(n_test_examples,)
        feature_names: np.array
            an array explaining each feature (bias included). shape=(n_features,)
    """

    # read file into string ndarray
    # (newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are handled by the reader itself)
    with open(path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        data = np.array(list(reader))

    if verbose:
        print(f"\nLoaded dataset from {path}")
        print(f"Shape: {data.shape[0]} rows × {data.shape[1]} columns")

        # print header
        header = data[0]
        print("Columns:", ", ".join(header))

        # print a preview of first 5 rows
        print("\nSample rows:")
        for row in data[1:6]:
            print("  ", row)

    # extract feature names (drop the name column and the target column)
    feature_names = data[0, 1:-1]

    # drop the header and shuffle the rows in place
    data = data[1:]
    np.random.shuffle(data)

    # extract character names
    character_names = data[:, 0]

    # extract features X and targets Y
    X = data[:, 1:-1].astype(np.float32)
    Y = data[:, -1].astype(np.float32)

    # normalize X to [0, 1], feature by feature.
    # A constant feature has a zero range: dividing by it would fill the whole
    # column with NaN, so its range is replaced by 1 (the column stays at 0).
    X -= np.min(X, axis=0)
    feature_range = np.max(X, axis=0)
    feature_range[feature_range == 0] = 1
    X /= feature_range

    # add bias to X (np.ones is float64, so X is promoted to float64 here)
    X = np.concatenate((X, np.ones(shape=(X.shape[0], 1))), axis=1)
    feature_names = np.concatenate((feature_names, np.array(['bias'])), axis=-1)

    # sampling probabilities: every example weighs 1 / (size of its class),
    # so that both classes have the same overall chance of being picked
    total_characters = X.shape[0]
    test_sampling_probs = np.ones(shape=total_characters)
    test_sampling_probs[Y == 1] /= float(np.sum(Y == 1))
    test_sampling_probs[Y == 0] /= float(np.sum(Y == 0))
    test_sampling_probs /= np.sum(test_sampling_probs)

    # sample test people without replacement
    n_test_characters = int(total_characters * (1 - train_split))
    test_idx = np.random.choice(np.arange(0, total_characters), size=(n_test_characters,),
                                replace=False, p=test_sampling_probs)
    x_test = X[test_idx]
    y_test = Y[test_idx]
    test_names = character_names[test_idx]

    # sample train people: test rows get probability 0 so they can never
    # leak into the training set
    train_sampling_probs = test_sampling_probs.copy()
    train_sampling_probs[test_idx] = 0
    train_sampling_probs /= np.sum(train_sampling_probs)

    # drawn with replacement: the same character may appear more than once
    n_train_characters = int(total_characters * train_split)
    train_idx = np.random.choice(np.arange(0, total_characters), size=(n_train_characters,),
                                 replace=True, p=train_sampling_probs)
    x_train = X[train_idx]
    y_train = Y[train_idx]
    train_names = character_names[train_idx]

    return x_train, y_train, train_names, x_test, y_test, test_names, feature_names

In [None]:
# Load the csv and split it into a training part (80%) and a test part.
(x_train, y_train, train_names,
 x_test, y_test, test_names,
 feature_names) = load_got_dataset(path='got.csv', train_split=0.8)

# The loader returns NumPy arrays: turn features and labels into float32 tensors.
x_train, x_test = (torch.from_numpy(arr).float() for arr in (x_train, x_test))
y_train, y_test = (torch.from_numpy(arr).float() for arr in (y_train, y_test))


Loaded dataset from got.csv
Shape: 1947 rows × 27 columns
Columns: name, male, numDeadRelations, book1, book2, book3, book4, book5, bookCount, isMarried, isPopular, witnessed_wins, witnessed_losses, hadMoreWinsThanLosses, wasAttackerCommander, wasDefenderCommander, wasCommander, witnessed_own_attacker_size_mean, witnessed_opponent_attacker_size_mean, witnessed_own_defender_size_mean, witnessed_opponent_defender_size_mean, witnessed_major_deaths, witnessed_major_capture, battleCountAsAttackerCommander, battleCountAsDefenderCommander, battleCountAsCommander, isAlive

Sample rows:
   ['Viserys II Targaryen' '1' '11' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
   ['Walder Frey' '1' '1' '1' '1' '1' '1' '1' '5' '1' '1' '3' '0' '1' '1' '0'
 '1' '3166' '0' '0' '1166' '1' '2' '3' '0' '3' '1']
   ['Addison Hill' '1' '0' '0' '0' '0' '1' '0' '1' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1']
   ['Aemma Arryn' '0' '0'

In [None]:
def sigmoid(x):
    """
    Element-wise sigmoid function, sigmoid(x) = 1 / (1 + exp(-x)).
    See https://en.wikipedia.org/wiki/Sigmoid_function

    Parameters
    ----------
    x: torch.tensor
        a torch tensor of any shape

    Returns
    -------
    torch.tensor
        a tensor having the same shape of x, with values in [0, 1].
    """

    # The textbook form exp(x) / (1 + exp(x)) overflows for large positive x
    # (exp(x) becomes inf and inf / inf is NaN). torch.sigmoid computes the
    # same function without that failure mode.
    return torch.sigmoid(x)

In [None]:
class LogisticRegression:
    """
    Models a logistic regression classifier.

    The model is a single weight vector w and the probability of the positive
    class is sigmoid(X @ w). No separate intercept is learned: if one is
    wanted, X must carry a constant column (bias feature).
    """

    def __init__(self):
        """Constructor method: creates an untrained model (see fit_sgd)."""

        # weights placeholder, becomes a tensor of shape (n_features,) in fit_sgd
        self._w = None

        # optimizer placeholder
        self.optim = None

        # loss placeholder
        self.loss = None

    def fit_sgd(self, X, Y, n_epochs, learning_rate, verbose=False):
        """
        Implements the gradient descent training procedure.

        Every epoch performs one update computed on the whole dataset
        (X is not split into mini-batches). Weights are re-initialized
        randomly at every call.

        Parameters
        ----------
        X: torch.tensor
            data. shape=(n_examples, n_features)
        Y: torch.tensor
            labels in {0, 1}. shape=(n_examples,)
        n_epochs: int
            number of gradient updates.
        learning_rate: float
            step towards the descent.
        verbose: bool
            whether or not to print the value of cost function
            (printed once every 500 epochs).
        """

        _, n_features = X.shape

        # weight initialization
        self._w = torch.randn(n_features, requires_grad=True)

        # optimizer initialization
        self.optim = torch.optim.SGD([self._w], learning_rate)

        # loss initialization: binary cross-entropy on probabilities
        self.loss = torch.nn.BCELoss()

        for e in range(n_epochs):

            # Empty the optimizer gradient buffer
            self.optim.zero_grad()

            # Compute predictions (probabilities of the positive class).
            # torch.sigmoid does not overflow on large logits, unlike the
            # naive exp(x) / (1 + exp(x)) which yields NaN there.
            preds = torch.sigmoid(X @ self._w)

            # Compute the loss between Y and the predictions.
            # BCELoss needs targets with the same floating dtype as preds.
            loss = self.loss(preds, Y.to(preds.dtype))

            if verbose and e % 500 == 0:
                print(f'Epoch {e:4d}: loss={loss}')

            # Gradient backpropagation
            loss.backward()

            # Parameters update
            self.optim.step()

    def predict(self, X):
        """
        Function that predicts.

        Computes the dot product between X and w, applies the sigmoid
        (this way, y in [0, 1]) and rounds the output (this way, y in {0, 1}).
        Must be called after fit_sgd.

        Parameters
        ----------
        X: torch.tensor
            data to be predicted. shape=(n_test_examples, n_features)

        Returns
        -------
        prediction: torch.tensor
            prediction in {0, 1}.
            Shape is (n_test_examples,)
        """

        # no gradient tracking is needed at inference time
        with torch.no_grad():
            return torch.round(torch.sigmoid(X @ self._w))

In [None]:
# Main: train the classifier on the training split and score it on the test split.

logistic_reg = LogisticRegression()

# train
logistic_reg.fit_sgd(
    x_train,
    y_train,
    n_epochs=10000,
    learning_rate=0.01,
    verbose=True,
)

# test: accuracy is the fraction of test examples whose predicted label
# matches the true one
predictions = logistic_reg.predict(x_test)

accuracy = float((predictions == y_test).sum()) / len(y_test)
print(f'Test accuracy: {accuracy}')


# plot_boundar(x_train, y_train, logistic_reg, title='Training Set')

Epoch    0: loss=1.1644622087478638
Epoch  500: loss=0.8126418590545654
Epoch 1000: loss=0.7340261340141296
Epoch 1500: loss=0.6853218078613281
Epoch 2000: loss=0.6546803712844849
Epoch 2500: loss=0.6351860165596008
Epoch 3000: loss=0.6225240230560303
Epoch 3500: loss=0.6140925288200378
Epoch 4000: loss=0.6083299517631531
Epoch 4500: loss=0.6042888164520264
Epoch 5000: loss=0.6013832688331604
Epoch 5500: loss=0.5992438197135925
Epoch 6000: loss=0.5976317524909973
Epoch 6500: loss=0.5963902473449707
Epoch 7000: loss=0.5954136252403259
Epoch 7500: loss=0.5946295857429504
Epoch 8000: loss=0.5939877033233643
Epoch 8500: loss=0.5934524536132812
Epoch 9000: loss=0.592998206615448
Epoch 9500: loss=0.5926061272621155
Test accuracy: 0.6452442159383034
