In [2]:
import os
from math import sqrt
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Collect the numeric ids of every .jpg in the test image directory;
# the id is the file name without its extension.
test_ids = {
    int(os.path.splitext(file_name)[0])
    for file_name in os.listdir("../data/Test/")
    if file_name.endswith(".jpg")
}

In [4]:
# Persist the test ids as a JSON array with one id per line.
# The ids are sorted explicitly: `sort_keys` only orders dictionary keys and has
# no effect on a list, and the iteration order of a set is not guaranteed, so
# without this the file contents could differ between runs.
with open("../data/test_sub.json", "w") as jfile:
    json.dump(sorted(test_ids), jfile, indent=2)

In [9]:
# Read the saved test ids back and make sure every entry is an int.
with open("../data/test_sub.json") as jfile:
    test_ids = list(map(int, json.load(jfile)))

In [15]:
# Read the validation ids and make sure every entry is an int.
with open("../data/val.json") as jfile:
    val_ids = list(map(int, json.load(jfile)))

In [16]:
# Sanity check: how many validation ids were loaded.
len(val_ids)

178

In [5]:
# Show the first lines of the saved test-id file.
!head ../data/test_sub.json

[
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,


In [7]:
# Line count of the saved test-id file; it was written with indent=2, so each id
# sits on its own line and the count is roughly the number of ids.
!wc -l ../data/test_sub.json

18637 ../data/test_sub.json


In [17]:
def get_groundtruth_counts(path):
    """Load the per-image ground-truth counts of all sea lion types from a CSV.

    Parameters
    ----------
    path : str or path-like
        CSV file containing a ``train_id`` column and the count columns
        ``adult_males``, ``subadult_males``, ``adult_females``, ``juveniles``
        and ``pups``.

    Returns
    -------
    dict
        Maps ``int(train_id)`` to the list ``[adult_males, subadult_males,
        adult_females, juveniles, pups]`` of that row. When a ``train_id``
        appears more than once, the last row wins.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(path)
    count_columns = ["adult_males", "subadult_males", "adult_females", "juveniles", "pups"]

    # Read every column on its own instead of applying a function row by row:
    # a row-wise apply builds one Series per row (slow) and coerces all columns
    # of the row to a common dtype. tolist() also yields plain Python scalars.
    ids = df["train_id"].tolist()
    per_column = [df[name].tolist() for name in count_columns]
    return {int(tid): list(row) for tid, row in zip(ids, zip(*per_column))}

In [18]:
def get_groundtruth_pups_counts(path):
    """Load the per-image ground-truth pup counts from a CSV.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least the columns ``train_id`` and ``pups``.

    Returns
    -------
    dict
        Maps ``int(train_id)`` to the one-element list ``[pups]`` of that row.
        When a ``train_id`` appears more than once, the last row wins.

    Raises
    ------
    KeyError
        If ``train_id`` or ``pups`` is missing from the CSV.
    """
    df = pd.read_csv(path)

    # Pair the two columns directly rather than applying a function row by row:
    # a row-wise apply builds one Series per row (slow) and coerces all columns
    # of the row to a common dtype. tolist() also yields plain Python scalars.
    ids = df["train_id"].tolist()
    pups = df["pups"].tolist()
    return {int(tid): [n_pups] for tid, n_pups in zip(ids, pups)}

In [19]:
# Ground truth for this evaluation: pup counts only, keyed by train_id.
gt_counts = get_groundtruth_pups_counts("../data/my_correct_counts.csv")

In [20]:
# Number of lines in the predicted pup-count CSV.
# NOTE(review): absolute, machine-specific path — the other cells use paths relative to ../data.
!wc -l /home/ubuntu/sealion/data/Train/pups_mean.csv

178 /home/ubuntu/sealion/data/Train/pups_mean.csv


In [31]:
# Load the predicted pup counts: each line of the CSV is "<train_id>,<count>".
# Every prediction is rounded to a whole number and stored as a one-element
# list so it has the same shape as the ground-truth entries.
pred_counts = dict()
with open("/home/ubuntu/sealion/data/Train/pups_mean.csv", "r") as ifile:
    for line in ifile:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no "id,count" pair to unpack.
            continue
        tid, cnt = line.split(",")
        pred_counts[int(tid)] = [round(float(cnt))]

In [32]:
# One tuple per predicted image: (absolute error, image id, predicted pups, true pups).
# The error comes first so that sorting the list ranks the images by error.
errors = [
    (np.abs(predicted - actual), image_id, predicted, actual)
    for image_id, predicted, actual in (
        (key, counts[0], gt_counts[key][0]) for key, counts in pred_counts.items()
    )
]

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and no visible cell uses
# a `pr` module — looks like an unfinished import; confirm, then complete or remove it.
import pr

In [33]:
# Rank the images from the largest to the smallest absolute error.
sorted(errors, reverse=True)

[(165, 291, 72, 237),
 (163, 364, 208, 45),
 (145, 77, 228, 83),
 (101, 524, 101, 0),
 (75, 298, 108, 183),
 (69, 906, 71, 2),
 (68, 777, 68, 0),
 (63, 590, 237, 300),
 (47, 566, 47, 0),
 (43, 847, 194, 151),
 (38, 240, 151, 113),
 (37, 299, 92, 55),
 (34, 252, 33, 67),
 (33, 686, 33, 0),
 (33, 418, 33, 0),
 (32, 881, 149, 117),
 (31, 569, 74, 105),
 (31, 158, 118, 87),
 (30, 773, 108, 138),
 (30, 720, 54, 24),
 (28, 647, 28, 0),
 (28, 316, 28, 0),
 (28, 130, 28, 0),
 (28, 124, 74, 46),
 (27, 367, 82, 55),
 (26, 704, 79, 105),
 (23, 338, 271, 248),
 (22, 349, 22, 0),
 (22, 78, 27, 5),
 (21, 823, 21, 0),
 (21, 700, 2, 23),
 (21, 229, 21, 0),
 (20, 834, 20, 0),
 (19, 799, 22, 3),
 (19, 627, 19, 0),
 (18, 831, 18, 0),
 (18, 804, 18, 0),
 (18, 722, 91, 109),
 (17, 178, 17, 0),
 (16, 805, 15, 31),
 (15, 837, 15, 0),
 (14, 609, 14, 0),
 (14, 163, 86, 72),
 (14, 145, 14, 0),
 (13, 856, 13, 0),
 (13, 181, 15, 2),
 (12, 576, 12, 0),
 (12, 470, 12, 0),
 (12, 287, 12, 0),
 (11, 28, 11, 0),
 (10, 

In [34]:
def get_rmse(predicted_counts, groundtruth_counts):
    """Return the mean of the per-type RMSEs between predicted and true counts.

    For every sea lion type the root-mean-square error is computed over all
    predicted images; the list of per-type RMSEs is printed and their mean is
    returned.

    Parameters
    ----------
    predicted_counts : dict
        Maps an image id to a sequence of predicted counts, one per type.
        The number of types is taken from the first entry.
    groundtruth_counts : dict
        Maps an image id to a sequence of true counts in the same type order.
        It must contain every id of ``predicted_counts``; extra ids are ignored.

    Returns
    -------
    numpy.float64
        Mean over the types of the per-type RMSE.

    Raises
    ------
    ValueError
        If ``predicted_counts`` is empty (the RMSE is undefined).
    AssertionError
        If a predicted id has no ground-truth entry.
    """
    if not predicted_counts:
        raise ValueError("predicted_counts is empty: RMSE is undefined without predictions")

    # The first prediction determines how many sea lion types are scored.
    n_types = len(next(iter(predicted_counts.values())))

    # Every predicted id needs a ground-truth entry; name the offenders if not.
    missing_ids = set(predicted_counts) - set(groundtruth_counts)
    assert not missing_ids, "no ground truth for predicted ids: {}".format(
        sorted(missing_ids, key=str)
    )

    # Accumulate the squared error of each type over all predicted images.
    squared_error_sums = [0 for _ in range(n_types)]
    for iid, image_pred in predicted_counts.items():
        image_true = groundtruth_counts[iid]
        for sid in range(n_types):
            diff = image_pred[sid] - image_true[sid]
            squared_error_sums[sid] += diff * diff

    n_images = len(predicted_counts)
    rmses = [sqrt(total / n_images) for total in squared_error_sums]
    print(rmses)
    return np.mean(rmses)

In [35]:
# Score the pup predictions against the ground truth; get_rmse also prints the per-type RMSE list.
get_rmse(pred_counts, gt_counts)

[27.28913023922681]


27.289130239226811