### Fish classifier

Author: Łukasz Szarecki 

Dataset: https://www.kaggle.com/aungpyaeap/fish-market

1. Uploading data to pandas dataframes

In [66]:
import pandas as pd
import numpy as np
fish_df = pd.read_csv("Fish.csv")
print(fish_df.head())

  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


2. We have to mix data because there are sorted.
3. Preapering training and testing data.

In [67]:
#sorting dataset
fish_df = fish_df.sample(frac=1)
#corect indexing
fish_df = fish_df.reset_index(drop=True)
print(fish_df.head())
print(f'There are {len(fish_df)} samples')

  Species  Weight  Length1  Length2  Length3   Height   Width
0   Perch   556.0     32.0     34.5     36.5  10.2565  6.3875
1   Perch    51.5     15.0     16.2     17.2   4.5924  2.6316
2   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
3   Perch    32.0     12.5     13.7     14.7   3.5280  1.9992
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340
There are 159 samples


3. Input and output data

In [68]:
#input
X = np.array(fish_df.iloc[:, 1::])
print(X.shape)
#output
Y = (fish_df.iloc[:,0])
print(Y.shape)

(159, 6)
(159,)


### Encoder class

3. Each fish will have its own identifier
* fit() - create connection between identifiers and fish names
* transform() - fish name (str) to id (int)
* inverse_transform() - id (int) to fish name (str)

In [69]:
class Encoder:
    def __init__(self):
        self.names = [] #to not store duplicated names
        self.num_classes = 0 #number of all classes
    def fit(self, y):
        for sample in y:
            if sample not in self.names:
                self.names.append(sample)
        self.num_classes = len(self.names)
    def transform(self, y):
        encoded_samples = np.zeros(len(y))
        for index,sample in enumerate(y):
            encoded_samples[index] = self.names.index(sample)
        return encoded_samples
    def inverse_transform(self, encoded_y):
        samples = []
        for sample in encoded_y:
            samples.append(self.names[int(sample)])
        return samples

In [70]:
#tests
encoder = Encoder()
encoder.fit(Y)
print(encoder.names)
print(encoder.num_classes) 

test_list = ['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']
encoded_num_test = encoder.transform(test_list)
print(encoder.inverse_transform(encoded_num_test))

['Perch', 'Bream', 'Pike', 'Smelt', 'Roach', 'Whitefish', 'Parkki']
7
['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']


3. Encoding dataset 

In [71]:
encoder = Encoder()
encoder.fit(Y)
encoded_y = encoder.transform(Y)
print(encoded_y)

[0. 0. 1. 0. 1. 2. 2. 0. 0. 0. 1. 2. 3. 1. 2. 0. 4. 4. 0. 1. 0. 0. 2. 5.
 1. 0. 1. 6. 1. 0. 6. 3. 0. 0. 5. 0. 0. 0. 2. 1. 1. 2. 4. 0. 6. 0. 4. 4.
 1. 1. 0. 1. 0. 6. 6. 1. 2. 6. 6. 0. 0. 0. 3. 4. 5. 3. 4. 1. 1. 2. 2. 4.
 2. 0. 1. 3. 0. 0. 4. 0. 0. 2. 2. 0. 0. 2. 4. 3. 0. 4. 3. 6. 0. 3. 1. 4.
 1. 1. 1. 0. 0. 1. 3. 1. 4. 1. 6. 1. 0. 1. 5. 2. 0. 0. 0. 0. 5. 1. 0. 0.
 6. 1. 1. 0. 4. 3. 0. 1. 3. 5. 0. 0. 0. 0. 1. 3. 0. 1. 0. 4. 2. 6. 0. 3.
 3. 4. 1. 0. 4. 2. 4. 4. 0. 1. 0. 0. 1. 0. 4.]


In [72]:
qw = X - np.mean(X, axis=0)
qw
np.mean(qw, axis=0)

array([ 1.43002312e-14, -1.25127023e-15, -1.69815245e-15, -1.25127023e-15,
        2.68129334e-16,  4.58054279e-16])

### Feature Scaling - Normalization

* test fraction - percentage of test data
* xte - test data
* xtr - training data

xtr ->
* mean = 0
* standard deviation = 1

In [73]:
class Dataset:
    def __init__(self, x,y,test_fraction=0.05):
        test_samples = int(test_fraction * y.size)
        self.xte = x[:test_samples,:]   
        self.xtr = x[test_samples:,:]   
        self.yte = y[:test_samples]   
        self.ytr = y[test_samples:]
        self.mean = np.mean(self.xtr, axis=0)
        self.std = np.std(self.xtr, axis=0)#standard deviation
        self.xtr = self.normalize(self.xtr)
        self.xte = self.normalize(self.xte)

    def normalize(self, x):
        x_temp = (x - self.mean)/self.std
        return x_temp
        
fish_ds = Dataset(X,encoded_y,0.15)
# fish_ds.xte 


In [78]:
np.size(fish_ds.yte)


23

### Classifiers models

* KNN - K - Nearest Neighbors Algorithm


In [112]:
class Classfier:
    def __init__(self):
        pass
    def fit(self, xtr, ytr):
        pass
    def predict(self, x):
        pass
    def evaluate(self, xte, yte):
        ypred = self.predict(xte)
        print(yte.astype(int))
        print(ypred)
        acc = np.sum(ypred == yte) / yte.size #Accuracy
        return acc

# Baseline (method)
class RandomClassifier(Classfier):
    #expected accuracy = 1/7
    def fit(self, xtr, ytr):
        self.num_classes = int(np.max(ytr) + 1)
    def predict(self, x):
        return np.random.choice(self.num_classes, x.shape[0])

        

In [115]:
random_classifier = RandomClassifier()
random_classifier.fit(fish_ds.xtr, fish_ds.ytr)
randow_acc = random_classifier.evaluate(fish_ds.xte, fish_ds.yte)
print(randow_acc)

[0 0 1 0 1 2 2 0 0 0 1 2 3 1 2 0 4 4 0 1 0 0 2]
[4 6 4 5 3 6 5 0 5 4 0 6 2 4 0 4 3 2 0 1 3 4 0]
0.13043478260869565
