### Fish classifier

Author: Łukasz Szarecki 

Dataset: https://www.kaggle.com/aungpyaeap/fish-market

1. Uploading data to pandas dataframes

In [175]:
import pandas as pd
import numpy as np
fish_df = pd.read_csv("Fish.csv")
print(fish_df.head())

  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


2. We have to mix data because there are sorted.
3. Preapering training and testing data.

In [176]:
#sorting dataset
fish_df = fish_df.sample(frac=1)
#corect indexing
fish_df = fish_df.reset_index(drop=True)
print(fish_df.head())
print(f'There are {len(fish_df)} samples')

  Species  Weight  Length1  Length2  Length3   Height   Width
0    Pike   200.0     30.0     32.3     34.8   5.5680  3.3756
1   Perch   130.0     20.0     22.0     23.5   6.1100  3.5250
2   Bream   714.0     32.7     36.0     41.5  16.5170  5.8515
3   Bream   620.0     31.5     34.5     39.7  15.5227  5.2801
4    Pike   540.0     40.1     43.0     45.8   7.7860  5.1296
There are 159 samples


3. Input and output data

In [177]:
#input
X = np.array(fish_df.iloc[:, 1::])
print(X.shape)
#output
Y = (fish_df.iloc[:,0])
print(Y.shape)

(159, 6)
(159,)


### Encoder class

3. Each fish will have its own identifier
* fit() - create connection between identifiers and fish names
* transform() - fish name (str) to id (int)
* inverse_transform() - id (int) to fish name (str)

In [178]:
class Encoder:
    def __init__(self):
        self.names = [] #to not store duplicated names
        self.num_classes = 0 #number of all classes
    def fit(self, y):
        for sample in y:
            if sample not in self.names:
                self.names.append(sample)
        self.num_classes = len(self.names)
    def transform(self, y):
        encoded_samples = np.zeros(len(y))
        for index,sample in enumerate(y):
            encoded_samples[index] = self.names.index(sample)
        return encoded_samples
    def inverse_transform(self, encoded_y):
        samples = []
        for sample in encoded_y:
            samples.append(self.names[int(sample)])
        return samples

In [179]:
#tests
encoder = Encoder()
encoder.fit(Y)
print(encoder.names)
print(encoder.num_classes) 

test_list = ['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']
encoded_num_test = encoder.transform(test_list)
print(encoder.inverse_transform(encoded_num_test))

['Pike', 'Perch', 'Bream', 'Smelt', 'Whitefish', 'Parkki', 'Roach']
7
['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']


3. Encoding dataset 

In [180]:
encoder = Encoder()
encoder.fit(Y)
encoded_y = encoder.transform(Y)
print(encoded_y)

[0. 1. 2. 2. 0. 3. 2. 1. 4. 3. 1. 1. 5. 2. 1. 1. 6. 3. 5. 6. 4. 6. 1. 3.
 1. 5. 5. 1. 6. 2. 1. 1. 1. 1. 1. 1. 4. 6. 2. 2. 0. 6. 6. 1. 1. 2. 6. 6.
 5. 4. 5. 3. 1. 0. 2. 1. 5. 1. 3. 1. 1. 1. 2. 1. 1. 1. 1. 6. 2. 0. 1. 1.
 6. 2. 2. 2. 1. 5. 0. 0. 2. 1. 2. 1. 1. 6. 4. 3. 1. 1. 1. 2. 2. 1. 6. 2.
 1. 4. 1. 0. 1. 0. 6. 6. 2. 1. 2. 0. 6. 1. 3. 3. 0. 6. 2. 2. 0. 1. 0. 1.
 2. 0. 5. 0. 6. 5. 1. 2. 1. 2. 6. 1. 3. 2. 2. 1. 1. 0. 3. 6. 2. 0. 1. 2.
 1. 3. 1. 2. 1. 1. 5. 3. 2. 2. 2. 1. 1. 3. 2.]


In [181]:
qw = X - np.mean(X, axis=0)
qw
np.mean(qw, axis=0)

array([ 1.14401849e-14, -1.07251734e-15,  9.20577381e-15,  9.20577381e-15,
        3.35161668e-16, -4.24538113e-16])

### Feature Scaling - Normalization

* test fraction - percentage of test data
* xte - test data
* xtr - training data

xtr ->
* mean = 0
* standard deviation = 1

In [182]:
class Dataset:
    def __init__(self, x,y,test_fraction=0.05):
        test_samples = int(test_fraction * y.size)
        self.xte = x[:test_samples,:]   
        self.xtr = x[test_samples:,:]   
        self.yte = y[:test_samples]   
        self.ytr = y[test_samples:]
        self.mean = np.mean(self.xtr, axis=0)
        self.std = np.std(self.xtr, axis=0)#standard deviation
        self.xtr = self.normalize(self.xtr)
        self.xte = self.normalize(self.xte)

    def normalize(self, x):
        x_temp = (x - self.mean)/self.std
        return x_temp
        
fish_ds = Dataset(X,encoded_y,0.15)
# fish_ds.xte 


In [183]:
np.size(fish_ds.yte)


23

### Classifiers models

- Random Classifier
- KNN - K - Nearest Neighbors Algorithm
- Linear Regression Model 

In [184]:
class Classfier:
    def __init__(self):
        pass
    def fit(self, xtr, ytr):
        pass
    def predict(self, x):
        pass
    def evaluate(self, xte, yte):
        ypred = self.predict(xte)
        print(yte.astype(int))
        print(ypred)
        acc = np.sum(ypred == yte) / yte.size #Accuracy
        return acc

# Baseline (method)
class RandomClassifier(Classfier):
    #expected accuracy = 1/7
    def fit(self, xtr, ytr):
        self.num_classes = int(np.max(ytr) + 1)
    def predict(self, x):
        return np.random.choice(self.num_classes, x.shape[0])

class KNNClassifier(Classfier):
    def __init__(self, k=3):   #hyperparameters
        self.k = k
    def fit(self, xtr, ytr):
        self.xtr = xtr
        self.ytr = ytr
    def predict(self, x):
        num_samples =  x.shape[0]
        ypred = np.zeros(num_samples, dtype=int)
        for i in range(num_samples):
            distance = np.sum((self.xtr - x[i, :])**2, axis=1)
            order = np.argsort(distance)
            knn_label = self.ytr[order][:self.k].astype(int)
            print(f'knn labels {knn_label}')
            binc = np.bincount(knn_label)
            print(f'Bin count {binc}')
            print(f'Y pred {binc.argmax()}')
            ypred[i] = binc.argmax()    #argument which store max value
        return ypred    
        


        

In [185]:
random_classifier = RandomClassifier()
random_classifier.fit(fish_ds.xtr, fish_ds.ytr)
randow_acc = random_classifier.evaluate(fish_ds.xte, fish_ds.yte)
print(randow_acc)

knn_classifier = KNNClassifier()
knn_classifier.fit(fish_ds.xtr, fish_ds.ytr)
knn_acc = knn_classifier.evaluate(fish_ds.xte, fish_ds.yte)
print(knn_acc)

[0 1 2 2 0 3 2 1 4 3 1 1 5 2 1 1 6 3 5 6 4 6 1]
[5 1 6 0 3 1 0 5 2 2 5 5 5 5 6 3 2 2 0 6 2 6 2]
0.17391304347826086
knn labels [0 0 0]
Bin count [3]
Y pred 0
knn labels [1 1 1]
Bin count [0 3]
Y pred 1
knn labels [2 2 2]
Bin count [0 0 3]
Y pred 2
knn labels [2 2 2]
Bin count [0 0 3]
Y pred 2
knn labels [0 0 0]
Bin count [3]
Y pred 0
knn labels [3 3 3]
Bin count [0 0 0 3]
Y pred 3
knn labels [2 2 2]
Bin count [0 0 3]
Y pred 2
knn labels [1 1 1]
Bin count [0 3]
Y pred 1
knn labels [1 1 1]
Bin count [0 3]
Y pred 1
knn labels [3 3 3]
Bin count [0 0 0 3]
Y pred 3
knn labels [6 6 6]
Bin count [0 0 0 0 0 0 3]
Y pred 6
knn labels [1 6 1]
Bin count [0 2 0 0 0 0 1]
Y pred 1
knn labels [5 5 5]
Bin count [0 0 0 0 0 3]
Y pred 5
knn labels [2 2 2]
Bin count [0 0 3]
Y pred 2
knn labels [1 6 1]
Bin count [0 2 0 0 0 0 1]
Y pred 1
knn labels [1 1 1]
Bin count [0 3]
Y pred 1
knn labels [6 1 1]
Bin count [0 2 0 0 0 0 1]
Y pred 1
knn labels [3 3 3]
Bin count [0 0 0 3]
Y pred 3
knn labels [5 2 2]
Bin count

**Note [PL]**
Każdy model w uczeniu maszynowym ma parametry. Jedne mają ich więcej drugie mniej. Jeśli parametr wyliczany jest samodzielnie przez algorytm podczas uczenia nazywamy go po prostu parametrem. Przykładem mogą być wagi w sieciach neuronowych.

Natomiast jeśli parametr podawany jest przez użytkownika, który używa algorytmu, wówczas nazywamy go hiperparametrem