### Fish classifier

Author: Łukasz Szarecki 

Dataset: https://www.kaggle.com/aungpyaeap/fish-market

1. Uploading data to pandas dataframes

In [576]:
import pandas as pd
import numpy as np
fish_df = pd.read_csv("Fish.csv")
print(fish_df.head())

  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


2. We have to mix data because there are sorted.
3. Preapering training and testing data.

In [577]:
#sorting dataset
fish_df = fish_df.sample(frac=1)
#corect indexing
fish_df = fish_df.reset_index(drop=True)
print(fish_df.head())
print(f'There are {len(fish_df)} samples')

  Species  Weight  Length1  Length2  Length3  Height   Width
0   Smelt    19.9     13.8     15.0     16.2  2.9322  1.8792
1   Perch   100.0     16.2     18.0     19.2  5.2224  3.3216
2   Smelt    10.0     11.3     11.8     13.1  2.2139  1.2838
3   Perch   265.0     25.4     27.5     28.9  7.0516  4.3350
4  Parkki   170.0     19.0     20.7     23.2  9.3960  3.4104
There are 159 samples


3. Input and output data

In [578]:
#input
X = np.array(fish_df.iloc[:, 1::])
print(X.shape)
#output
Y = (fish_df.iloc[:,0])
print(Y.shape)

(159, 6)
(159,)


### Encoder class

3. Each fish will have its own identifier
* fit() - create connection between identifiers and fish names
* transform() - fish name (str) to id (int)
* inverse_transform() - id (int) to fish name (str)

In [579]:
class Encoder:
    def __init__(self):
        self.names = [] #to not store duplicated names
        self.num_classes = 0 #number of all classes
    def fit(self, y):
        for sample in y:
            if sample not in self.names:
                self.names.append(sample)
        self.num_classes = len(self.names)
    def transform(self, y):
        encoded_samples = np.zeros(len(y))
        for index,sample in enumerate(y):
            encoded_samples[index] = self.names.index(sample)
        return encoded_samples
    def inverse_transform(self, encoded_y):
        samples = []
        for sample in encoded_y:
            samples.append(self.names[int(sample)])
        return samples

In [580]:
#tests
encoder = Encoder()
encoder.fit(Y)
print(encoder.names)
print(encoder.num_classes) 

test_list = ['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']
encoded_num_test = encoder.transform(test_list)
print(test_list)
print(encoder.inverse_transform(encoded_num_test))

['Smelt', 'Perch', 'Parkki', 'Pike', 'Bream', 'Whitefish', 'Roach']
7
['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']
['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']


3. Encoding dataset 

In [581]:
encoder = Encoder()
encoder.fit(Y)
encoded_y = encoder.transform(Y)
# print(encoded_y)

### Feature Scaling - Normalization

* test fraction - percentage of test data
* xte - test data
* xtr - training data

xtr ->
* mean = 0
* standard deviation = 1

In [582]:
class Dataset:
    def __init__(self, x,y,test_fraction=0.05):
        test_samples = int(test_fraction * y.size)
        self.xte = x[:test_samples,:]   
        self.xtr = x[test_samples:,:]   
        self.yte = y[:test_samples]   
        self.ytr = y[test_samples:]
        self.mean = np.mean(self.xtr, axis=0)
        self.std = np.std(self.xtr, axis=0)#standard deviation
        self.xtr = self.normalize(self.xtr)
        self.xte = self.normalize(self.xte)

    def normalize(self, x):
        x_temp = (x - self.mean)/self.std
        return x_temp

# creating dataset        
fish_ds = Dataset(X,encoded_y,0.20)
# fish_ds.xte 


### Classifiers models

- Random Classifier
- KNN - K - Nearest Neighbors Algorithm
- Linear Regression Model 

In [583]:
class Classfier:
    def __init__(self):
        pass
    def fit(self, xtr, ytr):
        pass
    def predict(self, x):
        pass
    def evaluate(self, xte, yte):
        ypred = self.predict(xte)
        print(yte.astype(int))
        print(ypred)
        acc = np.sum(ypred == yte) / yte.size #Accuracy
        return acc

# Baseline (method)
class RandomClassifier(Classfier):
    #expected accuracy = 1/7
    def fit(self, xtr, ytr):
        self.num_classes = int(np.max(ytr) + 1)
    def predict(self, x):
        return np.random.choice(self.num_classes, x.shape[0])

class KNNClassifier(Classfier):
    def __init__(self, k=3):   #hyperparameters
        self.k = k
    def fit(self, xtr, ytr):
        self.xtr = xtr
        self.ytr = ytr
    def predict(self, x):
        num_samples =  x.shape[0]
        ypred = np.zeros(num_samples, dtype=int)
        for i in range(num_samples):
            distance = np.sum((self.xtr - x[i, :])**2, axis=1)
            order = np.argsort(distance)
            knn_label = self.ytr[order][:self.k].astype(int)
            # print(f'knn labels {knn_label}')
            binc = np.bincount(knn_label)
            # print(f'Bin count {binc}')
            # print(f'Y pred {binc.argmax()}')
            ypred[i] = binc.argmax()    #argument which store max value
        return ypred    
        
class LNRClassifier(Classfier):
    def __init__ (self, num_inputs, num_classes, lr = 0.001):
        self.W = np.random.randn(num_inputs, num_classes)/num_inputs
        self.b = np.zeros(num_classes)
        self.num_classes = num_classes
        self.lr = lr
    def fit(self, xtr, ytr, num_epochs=1):
        for e in range(num_epochs):
            z = np.dot(xtr, self.W) + self.b
            ypred = self._softmax(z)
            loss = self._cross_entropy_loss(ypred, ytr)
            if e%10 == 9:
                print(f'Epoch: {e+1}, loss: {loss:.3}, acc: {self._acc(ytr,ypred):.3}')
            dz = ypred - self._one_hot(ytr)
            dW = np.dot(xtr.T, dz)
            db = np.sum(dz, axis=0)
            self.W -= dW * self.lr
            self.b -= db * self.lr
    def _softmax(self, z):
        max_z = np.max(z, axis=1, keepdims=True)
        return np.exp(z - max_z) / np.sum(np.exp(z - max_z), axis=1, keepdims=True)
    def _cross_entropy_loss(self, ypred, ytr):
        ytr_one_hot = self._one_hot(ytr)
        return np.mean(-ytr_one_hot * np.log(ypred))
    def _one_hot(self, ytr):
        ytr_one_hot = np.zeros((ytr.size, self.num_classes))
        ytr_one_hot[np.arange(ytr.size), ytr.astype(int)] = 1
        return ytr_one_hot
    def _acc(self, ytr, ypred):
        ypred = np.argmax(ypred, axis=1)
        return np.mean(ytr == ypred)
    def predict(self, x):
        z = np.dot(x, self.W) + self.b
        ypred = self._softmax(z)
        ypred = np.argmax(ypred, axis=1)
        return ypred



        

In [584]:
random_classifier = RandomClassifier()
random_classifier.fit(fish_ds.xtr, fish_ds.ytr)
randow_acc = random_classifier.evaluate(fish_ds.xte, fish_ds.yte)
print(f'Random classifier accuracy\n {randow_acc}')

knn_classifier = KNNClassifier(4)
knn_classifier.fit(fish_ds.xtr, fish_ds.ytr)
knn_acc = knn_classifier.evaluate(fish_ds.xte, fish_ds.yte)
print(f'KNN classifier accuracy\n {knn_acc}')

lnr_classifier = LNRClassifier(X.shape[1], int(np.max(encoded_y) + 1))
lnr_classifier.fit(fish_ds.xtr, fish_ds.ytr, 1000)
lnr_acc = lnr_classifier.evaluate(fish_ds.xte, fish_ds.yte)
print(f'lnr classifier accuracy\n {lnr_acc}')


[0 1 0 1 2 3 4 1 3 5 1 6 6 4 1 2 4 2 2 1 1 3 4 0 1 1 4 6 1 4 4]
[4 4 4 1 1 6 5 1 3 2 6 4 2 5 0 6 6 2 3 3 3 0 1 1 3 5 3 5 5 5 2]
Random classifier accuracy
 0.12903225806451613
[0 1 0 1 2 3 4 1 3 5 1 6 6 4 1 2 4 2 2 1 1 3 4 0 1 1 4 6 1 4 4]
[1 1 0 1 2 3 4 1 3 1 6 1 1 4 1 2 4 2 1 1 1 3 4 0 1 1 4 1 1 4 4]
KNN classifier accuracy
 0.7741935483870968
Epoch: 10, loss: 0.229, acc: 0.43
Epoch: 20, loss: 0.201, acc: 0.523
Epoch: 30, loss: 0.183, acc: 0.547
Epoch: 40, loss: 0.17, acc: 0.594
Epoch: 50, loss: 0.16, acc: 0.641
Epoch: 60, loss: 0.152, acc: 0.672
Epoch: 70, loss: 0.145, acc: 0.711
Epoch: 80, loss: 0.14, acc: 0.719
Epoch: 90, loss: 0.135, acc: 0.727
Epoch: 100, loss: 0.131, acc: 0.727
Epoch: 110, loss: 0.128, acc: 0.727
Epoch: 120, loss: 0.124, acc: 0.727
Epoch: 130, loss: 0.122, acc: 0.734
Epoch: 140, loss: 0.119, acc: 0.742
Epoch: 150, loss: 0.117, acc: 0.758
Epoch: 160, loss: 0.115, acc: 0.758
Epoch: 170, loss: 0.113, acc: 0.758
Epoch: 180, loss: 0.111, acc: 0.758
Epoch: 190, loss:

**Note [PL]**
Każdy model w uczeniu maszynowym ma parametry. Jedne mają ich więcej drugie mniej. Jeśli parametr wyliczany jest samodzielnie przez algorytm podczas uczenia nazywamy go po prostu parametrem. Przykładem mogą być wagi w sieciach neuronowych.

Natomiast jeśli parametr podawany jest przez użytkownika, który używa algorytmu, wówczas nazywamy go hiperparametrem