### Fish classifier

Author: Łukasz Szarecki 

Dataset: https://www.kaggle.com/aungpyaeap/fish-market

1. Uploading data to pandas dataframes

In [16]:
import pandas as pd
import numpy as np
fish_df = pd.read_csv("Fish.csv")
print(fish_df.head())

  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


2. We have to mix data because there are sorted.
3. Preapering training and testing data.

In [17]:
#sorting dataset
fish_df = fish_df.sample(frac=1)
#corect indexing
fish_df = fish_df.reset_index(drop=True)
print(fish_df.head())
print(f'There are {len(fish_df)} samples')

     Species  Weight  Length1  Length2  Length3   Height   Width
0      Perch  1015.0     37.0     40.0     42.4  12.3808  7.4624
1      Perch   150.0     20.5     22.5     24.0   6.7920  3.6240
2      Bream   700.0     31.9     35.0     40.5  16.2405  5.5890
3      Roach   140.0     21.0     22.5     25.0   6.5500  3.3250
4  Whitefish   270.0     24.1     26.5     29.3   8.1454  4.2485
There are 159 samples


3. Input and output data

In [18]:
#input
X = np.array(fish_df.iloc[:, 1::])
print(X.shape)
#output
Y = (fish_df.iloc[:,0])
print(Y.shape)

(159, 6)
(159,)


### Encoder class

3. Each fish will have its own identifier
* fit() - create connection between identifiers and fish names
* transform() - fish name (str) to id (int)
* inverse_transform() - id (int) to fish name (str)

In [19]:
class Encoder():
    def __init__(self):
        self.names = [] #to not store duplicated names
        self.num_classes = 0 #number of all classes
    def fit(self, y):
        for sample in y:
            if sample not in self.names:
                self.names.append(sample)
        self.num_classes = len(self.names)
    def transform(self, y):
        encoded_samples = np.zeros(len(y))
        for index,sample in enumerate(y):
            encoded_samples[index] = self.names.index(sample)
        return encoded_samples
    def inverse_transform(self, encoded_y):
        samples = []
        for sample in encoded_y:
            samples.append(self.names[int(sample)])
        return samples

In [20]:
#tests
encoder = Encoder()
encoder.fit(Y)
print(encoder.names)
print(encoder.num_classes) 

test_list = ['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']
encoded_num_test = encoder.transform(test_list)
print(encoder.inverse_transform(encoded_num_test))

['Perch', 'Bream', 'Roach', 'Whitefish', 'Parkki', 'Smelt', 'Pike']
7
['Pike', 'Smelt', 'Perch', 'Pike', 'Pike', 'Bream', 'Bream', 'Roach']


3. Encoding dataset 

In [21]:
encoder = Encoder()
encoder.fit(Y)
encoded_y = encoder.transform(Y)
print(encoded_y)

[0. 0. 1. 2. 3. 0. 1. 0. 4. 0. 4. 4. 4. 4. 1. 4. 0. 0. 5. 0. 4. 1. 6. 1.
 6. 0. 1. 6. 1. 6. 5. 0. 5. 0. 0. 1. 2. 1. 6. 0. 0. 2. 2. 0. 2. 1. 6. 0.
 3. 0. 2. 0. 4. 0. 1. 0. 2. 0. 1. 2. 1. 5. 6. 2. 0. 0. 5. 5. 0. 2. 6. 0.
 0. 6. 0. 1. 0. 6. 0. 0. 0. 1. 2. 4. 2. 5. 0. 5. 0. 1. 6. 1. 2. 5. 5. 1.
 1. 1. 0. 1. 1. 3. 1. 0. 0. 5. 6. 1. 1. 3. 0. 1. 5. 0. 1. 3. 0. 0. 1. 0.
 2. 0. 1. 0. 2. 1. 6. 0. 0. 5. 0. 2. 0. 0. 6. 4. 0. 2. 6. 1. 0. 1. 6. 0.
 6. 2. 5. 0. 1. 0. 0. 1. 0. 4. 0. 2. 2. 3. 1.]


In [45]:
qw = X - np.mean(X, axis=0)
qw
np.mean(qw, axis=0)

array([ 1.28702080e-14,  6.03291002e-15, -1.60877601e-15, -1.02782911e-15,
        2.79301390e-16,  4.35710168e-16])

### Feature Scaling - Normalization

* test fraction - percentage of test data
* xte - test data
* xtr - training data

xtr ->
* mean = 0
* standard deviation = 1

In [54]:
class Dataset():
    def __init__(self, x,y,test_fraction=0.05):
        test_samples = int(test_fraction * y.size)
        self.xte = x[:test_samples,:]   
        self.xtr = x[test_samples:,:]   
        self.yte = y[:test_samples]   
        self.ytr = y[test_samples:]
        self.mean = np.mean(self.xtr, axis=0)
        self.std = np.std(self.xtr, axis=0)#standard deviation
        self.xtr = self.normalize(self.xtr)
        self.xte = self.normalize(self.xte)

    def normalize(self, x):
        x_temp = (x - self.mean)/self.std
        return x_temp
        
fish_ds = Dataset(X,Y,0.15)
# fish_ds.xte 


### KNN - K - Nearest Neighbors Algorithm
