In [1]:
import os
import urllib
import fastai.vision.all as fai_vision
import numpy as np
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import platform
import pathlib
import altair as alt
import pandas as pd

In [2]:
def load_classification_model(model_path='models/perumixed3.pkl'):
    """Load the exported fastai learner used for classification, on CPU.

    Args:
        model_path: Location of the exported learner file. Defaults to the
            path this notebook has always used, so existing zero-argument
            calls behave as before.

    Returns:
        Whatever ``fai_vision.load_learner(model_path, cpu=True)`` returns.
    """
    # NOTE(security): load_learner unpickles the file, and unpickling can run
    # arbitrary code -- only load model files that come from a trusted source.

    # Deliberately not called `plt`: that name is the matplotlib alias
    # imported at the top of this notebook.
    system_name = platform.system()

    original_windows_path = pathlib.WindowsPath
    if system_name in ('Linux', 'Darwin'):
        # Presumably the learner was exported on Windows, so its pickle refers
        # to WindowsPath -- TODO confirm. Alias it to PosixPath while loading
        # so the file can be opened on Linux/macOS.
        pathlib.WindowsPath = pathlib.PosixPath
    try:
        return fai_vision.load_learner(model_path, cpu=True)
    finally:
        # Undo the alias so the rest of the session sees an unmodified pathlib.
        pathlib.WindowsPath = original_windows_path

# Load the learner once at notebook level; the inference cells below reuse this object.
classification_model = load_classification_model()

In [13]:
# Read the tab-separated photo metadata table and preview its first rows.
images_df = pd.read_csv('dropbox_photos_fish_metadata.tsv', sep='\t')
images_df.head()

Unnamed: 0,folder,file_name,genus,species,data_source,country,specimen_id
0,2018 November,DSC_0001-CV1-TRAMPA.JPG,Hoplias,malabaricus,field,Peru,CV1-298
1,2018 November,DSC_0002-CV2-TRAMPA.JPG,Leporinus,friderici,field,Peru,CV2-492
2,2018 November,DSC_0002-KANT1.JPG,Laetacara,flavilabris,field,Peru,CV2-300
3,2018 November,DSC_0003-CV2-TRAMPA.JPG,Hoplias,malabaricus,field,Peru,CV2-493
4,2018 November,DSC_0005-KANT1.JPG,Moenkhausia,sp. 1,field,Peru,CV2-303


In [15]:
# Read the tab-separated mask results and preview the first rows.
# Judging by the preview, each row pairs a `file_name` with a `mask_percent` value.
masked_df = pd.read_csv('mask_results.tsv', sep='\t')
masked_df.head()

Unnamed: 0,file_name,mask_percent
0,DSC_0385.JPG,10.467529
1,DSC_0095-CHT1.JPG,13.569641
2,DSC_0391.JPG,20.661926
3,DSC_1058.JPG,5.815125
4,DSC_0346.JPG,26.457214


In [16]:
# Join the photo metadata with the mask results on `file_name`.
# `merge` defaults to an inner join, so only files present in both tables are kept.
# NOTE(review): `file_name` uniqueness is not checked here -- duplicate names in
# either table would multiply rows; confirm against the source files.
masked_results = images_df.merge(masked_df, on='file_name')
masked_results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1418 entries, 0 to 1417
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   folder        1418 non-null   object 
 1   file_name     1418 non-null   object 
 2   genus         1418 non-null   object 
 3   species       1418 non-null   object 
 4   data_source   1418 non-null   object 
 5   country       1418 non-null   object 
 6   specimen_id   1418 non-null   object 
 7   mask_percent  1418 non-null   float64
dtypes: float64(1), object(7)
memory usage: 99.7+ KB


In [7]:
def predictions_to_output(prediction, classes, top_n=5):
    """Turn one model prediction into a table of its most probable classes.

    Args:
        prediction: Indexable whose element ``2`` is an iterable of per-class
            scores; this notebook passes the return value of
            ``classification_model.predict``.
        classes: Sequence of class labels, indexed by the same position as
            the scores in ``prediction[2]``.
        top_n: Number of highest-scoring rows to return. Defaults to 5.

    Returns:
        A DataFrame with columns ``class`` and ``probability`` (the score as a
        percentage rounded to two decimals), sorted by ``probability`` in
        descending order and truncated to ``top_n`` rows. The index holds each
        class's original position in ``prediction[2]``.

    Raises:
        IndexError: If ``classes`` has fewer entries than ``prediction[2]``.
    """
    pred_rows = [
        {'class': classes[i], 'probability': round(float(conf) * 100, 2)}
        for i, conf in enumerate(prediction[2])
    ]
    # Naming the columns explicitly keeps an empty prediction from producing a
    # column-less frame, which would make sort_values raise KeyError.
    pred_df = pd.DataFrame(pred_rows, columns=['class', 'probability'])
    return pred_df.sort_values('probability', ascending=False).head(top_n)

In [33]:
# Quick batch-inference check on at most ten '*.JPG' files from the 'masked' folder.
# NOTE: glob yields files in whatever order the filesystem returns them, so
# the selected ten may differ between machines.
test_files = list(Path('masked').glob('*.JPG'))[:10]
dl = classification_model.dls.test_dl(test_files)
# with_decoded=True asks get_preds to return the decoded predictions as well.
preds = classification_model.get_preds(dl=dl, with_decoded=True)
print(preds)

(TensorImage([[1.5887e-06, 9.6528e-01, 2.6377e-09, 4.4427e-07, 1.3045e-10, 3.4671e-02,
         7.8634e-07, 6.8744e-11, 1.4317e-08, 6.8989e-09, 1.2996e-06, 4.0655e-09,
         2.1576e-07, 2.7385e-08, 3.7594e-05, 3.5870e-06, 5.4085e-09, 9.5903e-10,
         5.7940e-10, 1.8720e-09, 7.9118e-10, 2.0995e-09, 1.2046e-07, 2.2119e-08,
         4.2950e-10, 1.0140e-07, 3.4909e-07, 4.9847e-09, 1.2002e-07, 1.4946e-07,
         4.0322e-08, 3.6842e-09, 2.6465e-08],
        [1.4758e-04, 1.3632e-04, 3.5703e-05, 1.7881e-04, 5.2279e-07, 3.4862e-04,
         8.3527e-05, 5.9662e-06, 1.2390e-07, 4.5115e-07, 4.7909e-05, 3.0684e-06,
         2.5040e-06, 5.0111e-07, 8.9669e-06, 9.9871e-01, 2.2857e-07, 1.8938e-07,
         1.3364e-07, 2.4547e-05, 4.2755e-05, 1.1767e-07, 1.3868e-06, 2.9763e-06,
         1.8771e-06, 3.7135e-05, 1.0658e-06, 1.0909e-05, 1.6186e-04, 1.0350e-06,
         1.0104e-07, 9.0898e-07, 4.1064e-07],
        [4.8200e-08, 2.9329e-07, 9.9976e-01, 1.7361e-05, 8.5648e-07, 1.9566e-08,
         4.

In [34]:
# Classify every masked image and store its top-ranked classes next to its metadata.
# Each output record is the merged metadata row plus class_<k>/conf_<k> pairs,
# where k is the 1-based rank returned by predictions_to_output.
class_vocab = classification_model.dls.vocab  # loop-invariant, so read it once
classification_results = []
for row in masked_results.to_dict(orient='records'):
    test_file = Path('masked') / row['file_name']
    # The image is no longer opened with PIL here: the opened image was never
    # used, and predict() reads the file itself.
    prediction = classification_model.predict(test_file)
    top_probs = predictions_to_output(prediction, classes=class_vocab)
    for rank, prob_row in enumerate(top_probs.to_dict(orient='records'), start=1):
        row[f'class_{rank}'] = prob_row['class']
        row[f'conf_{rank}'] = prob_row['probability']
    classification_results.append(row)

In [35]:
# Assemble the per-image records into a single table and preview it.
classification_df = pd.DataFrame(classification_results)
classification_df.head()

Unnamed: 0,folder,file_name,genus,species,data_source,country,specimen_id,mask_percent,class_1,conf_1,class_2,conf_2,class_3,conf_3,class_4,conf_4,class_5,conf_5
0,2018 November,DSC_0001-CV1-TRAMPA.JPG,Hoplias,malabaricus,field,Peru,CV1-298,16.755676,Ancistrus,54.52,Erythrinus,41.9,Prochilodus,1.81,Gymnotus,0.87,Bujurquina,0.35
1,2018 November,DSC_0002-CV2-TRAMPA.JPG,Leporinus,friderici,field,Peru,CV2-492,39.595032,Prochilodus,99.92,Moenkhausia,0.06,Ancistrus,0.01,Hemigrammus,0.0,Tetragonopterus,0.0
2,2018 November,DSC_0002-KANT1.JPG,Laetacara,flavilabris,field,Peru,CV2-300,10.177612,Bujurquina,91.77,Corydoras,5.07,Apistogramma,1.3,Bario,0.65,Moenkhausia,0.52
3,2018 November,DSC_0003-CV2-TRAMPA.JPG,Hoplias,malabaricus,field,Peru,CV2-493,41.471863,Prochilodus,99.69,Erythrinus,0.29,Charax,0.02,Moenkhausia,0.0,Otocinclus,0.0
4,2018 November,DSC_0005-KANT1.JPG,Moenkhausia,sp. 1,field,Peru,CV2-303,19.670105,Moenkhausia,93.9,Astyanax,5.16,Tetragonopterus,0.61,Prochilodus,0.13,Bario,0.05


In [36]:
# Print the column, dtype, and non-null summary of the results table.
classification_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1418 entries, 0 to 1417
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   folder        1418 non-null   object 
 1   file_name     1418 non-null   object 
 2   genus         1418 non-null   object 
 3   species       1418 non-null   object 
 4   data_source   1418 non-null   object 
 5   country       1418 non-null   object 
 6   specimen_id   1418 non-null   object 
 7   mask_percent  1418 non-null   float64
 8   class_1       1418 non-null   object 
 9   conf_1        1418 non-null   float64
 10  class_2       1418 non-null   object 
 11  conf_2        1418 non-null   float64
 12  class_3       1418 non-null   object 
 13  conf_3        1418 non-null   float64
 14  class_4       1418 non-null   object 
 15  conf_4        1418 non-null   float64
 16  class_5       1418 non-null   object 
 17  conf_5        1418 non-null   float64
dtypes: float64(6), object(12)
me

In [37]:
# Write the results as a tab-separated file, omitting the DataFrame index.
classification_df.to_csv('classification_results.tsv', sep='\t', index=False)

In [38]:
# Show the class labels held in the learner's DataLoaders vocab.
classification_model.dls.vocab

['Ancistrus', 'Apistogramma', 'Astyanax', 'Bario', 'Bryconops', 'Bujurquina', 'Bunocephalus', 'Characidium', 'Charax', 'Copella', 'Corydoras', 'Creagrutus', 'Curimata', 'Doras', 'Erythrinus', 'Gasteropelecus', 'Gymnotus', 'Hemigrammus', 'Hyphessobrycon', 'Knodus', 'Moenkhausia', 'Otocinclus', 'Oxyropsis', 'Phenacogaster', 'Pimelodella', 'Prochilodus', 'Pygocentrus', 'Pyrrhulina', 'Rineloricaria', 'Sorubim', 'Tatia', 'Tetragonopterus', 'Tyttocharax']