In [1]:
import os
from math import sqrt
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!head ../data/sealion/sample_submission.csv

test_id,adult_males,subadult_males,adult_females,juveniles,pups
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0
6,0,0,0,0,0
7,0,0,0,0,0
8,0,0,0,0,0


In [61]:
!wc -l ../data/sealion/sample_submission.csv

18637 ../data/sealion/sample_submission.csv


In [47]:
df_sample = pd.read_csv("../data/sealion/sample_submission.csv")

In [48]:
real_test_ids_expected = set(df_sample.test_id)

In [49]:
# Collect the numeric ids of every .jpg found in the Test directory; the file
# stem (name without extension) is parsed as an integer id.
real_test_ids = {
    int(os.path.splitext(image_name)[0])
    for image_name in os.listdir("../data/sealion/Test/")
    if image_name.endswith(".jpg")
}

In [50]:
real_test_ids == real_test_ids_expected

True

In [15]:
def predict_counts(ids):
    """Baseline predictor that answers zero for every count.

    Args:
        ids: iterable of image ids to predict for.

    Returns:
        dict mapping each id to its own fresh 5-element list of zeros.
    """
    return {image_id: [0] * 5 for image_id in ids}

In [31]:
def write_submission(counts, out_fn):
    """Write predictions to a CSV submission file.

    Args:
        counts: dict mapping an id to a sequence of numeric counts; each
            count is rounded and written as an integer.
        out_fn: path of the file to create (overwritten if it exists).

    Rows are written in ascending id order below a fixed header line.
    """
    header = "test_id,adult_males,subadult_males,adult_females,juveniles,pups\n"
    with open(out_fn, "w") as ofile:
        ofile.write(header)
        for image_id in sorted(counts):
            rounded = ",".join(str(int(round(value))) for value in counts[image_id])
            ofile.write("{iid},{counts}\n".format(iid=image_id, counts=rounded))

In [9]:
def get_rmse(predicted_counts, groundtruth_counts):
    """Score predictions as the mean of the per-column RMSE values.

    Args:
        predicted_counts: dict id -> 5 predicted counts. Must be non-empty,
            since the squared errors are divided by its length.
        groundtruth_counts: dict id -> 5 true counts. Must contain every
            predicted id; extra ids are ignored.

    Returns:
        The average, over the 5 count columns, of the root-mean-squared error
        computed on the predicted ids.

    Raises:
        AssertionError: if a predicted id is missing from groundtruth_counts.
    """
    n_classes = 5

    # Every prediction needs a matching ground-truth entry.
    assert set(predicted_counts.keys()).issubset(set(groundtruth_counts.keys()))

    # Accumulate the squared error separately for each column.
    squared_error_sums = [0] * n_classes
    for image_id, predicted in predicted_counts.items():
        expected = groundtruth_counts[image_id]
        for class_idx in range(n_classes):
            diff = predicted[class_idx] - expected[class_idx]
            squared_error_sums[class_idx] += diff * diff

    n_images = len(predicted_counts)
    per_class_rmse = [sqrt(total / n_images) for total in squared_error_sums]
    return np.mean(per_class_rmse)

In [10]:
# Load the ids listed in train.json and normalise them to ints.
with open("../data/sealion/train.json", "r") as jfile:
    train_ids = [int(raw_id) for raw_id in json.load(jfile)]

In [19]:
# Load the ids listed in val.json and normalise them to ints.
with open("../data/sealion/val.json", "r") as jfile:
    val_ids = [int(raw_id) for raw_id in json.load(jfile)]

In [24]:
# Load the ids listed in test.json and normalise them to ints.
with open("../data/sealion/test.json", "r") as jfile:
    test_ids = [int(raw_id) for raw_id in json.load(jfile)]

In [12]:
def get_groundtruth_counts(path):
    """Load per-image counts from a CSV file.

    The CSV must provide the columns train_id, adult_males, subadult_males,
    adult_females, juveniles and pups.

    Args:
        path: location of the CSV file.

    Returns:
        dict mapping int(train_id) to the list
        [adult_males, subadult_males, adult_females, juveniles, pups].
    """
    count_columns = ["adult_males", "subadult_males", "adult_females", "juveniles", "pups"]
    table = pd.read_csv(path)
    counts_by_id = {}

    def record_row(row):
        # Called once per row purely for its side effect on counts_by_id.
        counts_by_id[int(row.train_id)] = [row[column] for column in count_columns]

    table.apply(record_row, axis=1)
    return counts_by_id

In [13]:
gt_counts = get_groundtruth_counts("../data/sealion/my_correct_counts.csv")

In [29]:
predicted_counts = predict_counts(val_ids)

In [30]:
get_rmse(predicted_counts, gt_counts)

33.434013750741066

In [40]:
def predict_counts_mean(ids, train_counts):
    """Predict, for every id, the rounded per-column mean of the training counts.

    Args:
        ids: iterable of ids to predict for.
        train_counts: dict id -> 5 counts used to compute the means. Must be
            non-empty: the column sums are divided by its length, so an empty
            dict raises ZeroDivisionError.

    Returns:
        dict mapping each id to its own 5-element list of integer means.
    """
    n_classes = 5

    # Sum each column over all training entries.
    totals = [0] * n_classes
    for counts in train_counts.values():
        for sid in range(n_classes):
            totals[sid] += counts[sid]

    means = [int(round(total / len(train_counts))) for total in totals]

    # Give every id an independent copy: sharing one list object would make an
    # in-place edit of a single prediction silently change all the others.
    return {iid: list(means) for iid in ids}

In [41]:
# Ground-truth counts restricted to the training ids.
train_counts = {image_id: gt_counts[image_id] for image_id in train_ids}

In [42]:
predicted_counts_mean = predict_counts_mean(val_ids, train_counts)

In [45]:
get_rmse(predicted_counts_mean, gt_counts)

28.9993039875602

# Cross-validation pipeline

In [62]:
from sklearn.model_selection import KFold

In [96]:
kf = KFold(n_splits=4, shuffle=True, random_state=42)

In [97]:
X = np.array(list(gt_counts.keys())).reshape(-1, 1)

In [109]:
# Score the mean-count baseline on each fold of the K-fold split.
rmses = []
for train_idx, test_idx in kf.split(X):
    # KFold.split yields positional indices into X, not image ids, so map them
    # back to the ids stored in X before looking anything up in gt_counts.
    # Fold-local names keep the train/val/test id lists loaded earlier intact.
    fold_train_ids = X[train_idx, 0].tolist()
    fold_test_ids = X[test_idx, 0].tolist()

    fold_train_counts = {iid: gt_counts[iid] for iid in fold_train_ids}
    fold_predictions = predict_counts_mean(fold_test_ids, fold_train_counts)

    rmse = get_rmse(fold_predictions, gt_counts)
    rmses.append(rmse)
    # Show the fold's (constant) predicted counts next to its score.
    print(next(iter(fold_predictions.values())), rmse)
    

[6, 4, 39, 20, 18] 34.327967592
[6, 4, 40, 22, 17] 30.4188003353
[6, 4, 39, 20, 17] 35.5571450452
[5, 4, 37, 21, 16] 35.2574232483


In [110]:
np.mean(rmses)

33.890334055197286

In [111]:
get_rmse(gt_counts, gt_counts)

0.0

# Create a submission with all train data

In [52]:
real_predicted_counts_mean = predict_counts_mean(real_test_ids, gt_counts)   

In [54]:
from datetime import datetime

In [56]:
int(datetime.timestamp(datetime.now()))

1495097083

In [58]:
write_submission(real_predicted_counts_mean, "../data/submissions/1495097083_mean_prediction_all_train_data.csv")

In [59]:
!head "../data/submissions/1495097083_mean_prediction_all_train_data.csv"

test_id,adult_males,subadult_males,adult_females,juveniles,pups
0,6,4,39,21,17
1,6,4,39,21,17
2,6,4,39,21,17
3,6,4,39,21,17
4,6,4,39,21,17
5,6,4,39,21,17
6,6,4,39,21,17
7,6,4,39,21,17
8,6,4,39,21,17


In [60]:
!wc -l "../data/submissions/1495097083_mean_prediction_all_train_data.csv"

18637 ../data/submissions/1495097083_mean_prediction_all_train_data.csv
