In [2]:
import csv
import bq
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

In [3]:
client = bq.Client.Get()

def Query(q):
    t0 = time.time()
    answer = client.ReadTableRows(client.Query(q)['configuration']['query']['destinationTable'])
    print 'Query time: ' + str(time.time() - t0) + ' seconds.'
    return answer

In [4]:
# see http://ec.europa.eu/fisheries/fleet/index.cfm?method=Codification.Cod_gear
eu_gear_codes = {"DRB":"Boat dredges",
"DRH":"Hand dredges operating from a boat",
"HMD":"Mechanised dredges including suction dredges",
"GNC":"Encircling gillnets",
"GND":"Drift nets",
"GNS":"Set gillnets (anchored)",
"GTN":"Combined gillnets-trammel nets",
"GTR":"Trammel nets",
"HAR":"Harpoons",
"LLS":"Set longlines",
"LLD":"Drifting longlines",
"LHM":"Handlines and pole-lines (mechanised)",
"LHP":"Handlines and pole-lines (hand operated)",
"LLD":"Drifting longlines",
"LLS":"Set longlines",
"LTL":"Trolling lines",
"LNB":"Boat-operated lift nets",
"LNS":"Shore-operated stationary lift nets",
"NO":"No gear",
"SB":"Beach seines",
"SDN":"Danish seines",
"SPR":"Pair seines",
"SSC":"Scottish seines",
"LA":"Lampara nets",
"PS":"Purse seines",
"FPO":"Pots and traps",
"OTB":"Bottom otter trawls",
"OTM":"Midwater otter trawls",
"OTT":"Otter twin trawls",
"PTB":"Bottom pair trawls",
"PTM":"Midwater pair trawls",
"TBB":"Beam trawls",
"NK":"Unknown gear"};


eu_gear2 = {"DRB":"Trawlers",
"DRH":"Trawlers",
"HMD":"Trawlers",
"GNC":"Set gillnets",
"GND":"Fishing vessel",
"GNS":"Set gillnets",
"GTN":"Set gillnets",
"GTR":"Set gillnets",
"HAR":"Fishing vessel",
"LHM":"Pole and line",
"LHP":"Pole and line",
"LLD":"Drifting longlines",
"LLS":"Set longlines",
"LTL":"Trollers",
"LNB":"Fishing vessel",
"LNS":"Fishing vessel",
"NO":"Fishing vessel",
"SB":"Fishing vessel",
"SDN":"Fishing vessel",
"SPR":"Fishing vessel",
"SSC":"Fishing vessel",
"LA":"Set gillnets",
"PS":"Purse seines",
"FPO":"Pots and traps",
"OTB":"Trawlers",
"OTM":"Trawlers",
"OTT":"Trawlers",
"PTB":"Trawlers",
"PTM":"Trawlers",
"TBB":"Trawlers",
"NK":"Unknown gear"}

In [5]:
q = '''select mmsi, group_concat(geartype) geartype, 
group_concat(secondary_geartype) secondary_geartype,
avg(length), avg(tonnage), 
count(*) num_mmsi from
(select a.mmsi mmsi, b.Gear_Main_Code geartype, b.Gear_Sec_Code secondary_geartype,
b.Loa length, b.Ton_Gt tonnage from 
(SELECT mmsi,row_number FROM [world-fishing-827:EU_match_results.EU_v2]) a
left join [Registry_matching_sources.EU_registry_311215] b
on a.row_number = b.row_number)
group by mmsi
having num_mmsi = 1'''
eu = Query(q)

Waiting on bqjob_r4b6956a574fcab6c_000001584a99a2e8_1 ... (0s) Current status: DONE   
Query time: 5.23639297485 seconds.


In [6]:
rows = []
mmsis = []

for c in eu:
    if c[1] in eu_gear_codes: # if it is a shiptype we are including
        mmsi = c[0]
        geartype = eu_gear2[c[1]] # replace with the right geartype
        geartype2 = eu_gear2[c[2]]
        if geartype2 == "Fishing vessel":
            geartype2 = eu_gear_codes[c[2]]
#         if geartype2[:3] == "Set" or geartype2 =="Pots and traps":
#             geartype = "Set gear"
#         length = c[2]
#         tonnage = c[3]
#         if length !=None: length=float(c[2])
#         if tonnage !=None: tonnage = float(c[3])
        rows.append([geartype, geartype2])
        mmsis.append(mmsi)

In [7]:
eu_frame = pd.DataFrame(rows,index=mmsis,columns=['primary_geartype','secondary_geartype'])

In [8]:
eu_frame.groupby(['primary_geartype']).size()
#     print j


primary_geartype
Drifting longlines     206
Fishing vessel          82
Pole and line           48
Pots and traps         141
Purse seines           775
Set gillnets           593
Set longlines          235
Trawlers              4352
Trollers                 7
dtype: int64

In [9]:
eu_frame[eu_frame['primary_geartype'] == "Drifting longlines"].groupby(['primary_geartype','secondary_geartype']).size()

types = []
for e in eu_gear2:
    types.append(eu_gear2[e])

types = list(set(types))
for t in types:
    if t != 'Unknown gear':
        print eu_frame[eu_frame['primary_geartype'] == t].groupby(['primary_geartype','secondary_geartype']).size()


primary_geartype  secondary_geartype
Trawlers          Danish seines           11
                  Drift nets              11
                  Drifting longlines      16
                  No gear               2196
                  Pair seines              1
                  Pole and line           11
                  Pots and traps          26
                  Purse seines           306
                  Scottish seines          8
                  Set gillnets           214
                  Set longlines          199
                  Trawlers              1349
                  Trollers                 1
                  Unknown gear             3
dtype: int64
primary_geartype  secondary_geartype
Set gillnets      Danish seines           4
                  Drift nets              6
                  Drifting longlines     20
                  No gear               116
                  Pair seines             1
                  Pole and line          22
                  P

In [10]:
# get EU values

q = '''
select mmsi, 
group_concat(geartype) geartype, 
avg(length), 
avg(tonnage),
count(*) num_mmsi,
group_concat(secondary_geartype) secondary_geartype
from
(select a.mmsi mmsi, b.Gear_Main_Code geartype, 
b.Loa length, b.Ton_Gt tonnage, b.Gear_Sec_Code as secondary_geartype
from 
(SELECT mmsi,row_number FROM [world-fishing-827:EU_match_results.EU_v2]) a
left join [Registry_matching_sources.EU_registry_311215] b
on a.row_number = b.row_number)
group by mmsi
having num_mmsi = 1

'''
eu = Query(q)

Waiting on bqjob_r41ec5ce79982baaa_000001584a99d78e_2 ... (0s) Current status: DONE   
Query time: 3.94446206093 seconds.


In [11]:
eu_gear2 = {"DRB":"Trawlers",
"DRH":"Trawlers",
"HMD":"Trawlers",
"GNC":"Set gillnets",
"GND":"Fishing vessel",
"GNS":"Set gillnets",
"GTN":"Set gillnets",
"GTR":"Set gillnets",
"HAR":"Fishing vessel",
"LHM":"Pole and line",
"LHP":"Pole and line",
"LLD":"Drifting longlines",
"LLS":"Set longlines",
"LTL":"Trollers",
"LNB":"Fishing vessel",
"LNS":"Fishing vessel",
"NO":"Fishing vessel",
"SB":"Fishing vessel",
"SDN":"Fishing vessel",
"SPR":"Fishing vessel",
"SSC":"Fishing vessel",
"LA":"Set gillnets",
"PS":"Purse seines",
"FPO":"Pots and traps",
"OTB":"Trawlers",
"OTM":"Trawlers",
"OTT":"Trawlers",
"PTB":"Trawlers",
"PTM":"Trawlers",
"TBB":"Trawlers",
"NK":"Unknown gear",
"NO":"No gear"}

In [16]:
rows = []
mmsis = []
bad_rows = []
bad_mmsis = {}

for c in eu:
    if c[1] in eu_gear2 and c[1] != "NO": # if it is a shiptype we are including
        mmsi = c[0]
        geartype2 = eu_gear2[c[1]] # replace with the right geartype
        secondary_geartype = eu_gear2[c[5]]
        if secondary_geartype == "No gear" or secondary_geartype == geartype2:

            geartype = geartype2
            if geartype2[:3] == "Set" or geartype2 =="Pots and traps":
                geartype = "Set gear"
            length = c[2]
            tonnage = c[3]
            if length !=None: length=float(c[2])
            if tonnage !=None: tonnage = float(c[3])
            rows.append([geartype2, secondary_geartype, length, tonnage])
            mmsis.append(mmsi)
        else:
            
            if geartype2[:3] == "Set" or geartype2 =="Pots and traps":
                geartype = "Set gear"
            length = c[2]
            tonnage = c[3]
            if length !=None: length=float(c[2])
            if tonnage !=None: tonnage = float(c[3])
            bad_rows.append([geartype2, secondary_geartype, length, tonnage])
            bad_mmsis[mmsi] = [geartype2, secondary_geartype, length, tonnage]         

In [17]:
len(bad_mmsis)

1640

In [20]:
pybossa_mmsi = []
with open('pybossa/tasks_20160927A.csv', 'rU') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        pybossa_mmsi.append(row['mmsi'])

In [22]:
pybossa_mmsi = set(pybossa_mmsi)

In [31]:
print len(set(bad_mmsis).intersection(pybossa_mmsi))
print len(pybossa_mmsi)

51
222


In [25]:
mmsi_pybossa_double = []
for m in pybossa_mmsi:
    if m in bad_mmsis:
        print m, bad_mmsis[m]

 247100480 ['Trawlers', 'Set gillnets', 27.7, 159.0]
247152310 ['Trawlers', 'Set longlines', 20.1, 75.0]
247144080 ['Trawlers', 'Set longlines', 21.3, 75.0]
247142730 ['Trawlers', 'Set longlines', 26.9, 157.0]
227316570 ['Pots and traps', 'Set gillnets', 11.95, 15.86]
204225000 ['Set longlines', 'Drifting longlines', 25.36, 154.0]
227883000 ['Set gillnets', 'Set longlines', 30.33, 195.0]
247131490 ['Purse seines', 'Set longlines', 17.02, 21.0]
238561110 ['Purse seines', 'Trawlers', 29.44, 144.0]
204248000 ['Purse seines', 'Set gillnets', 21.14, 66.35]
265661180 ['Trollers', 'Pots and traps', 10.2, 8.18]
207421000 ['Trawlers', 'Pole and line', 25.5, 117.36]
250000608 ['Set gillnets', 'Trawlers', 20.4, 103.0]
225382000 ['Set longlines', 'Set gillnets', 30.5, 287.0]
227142200 ['Purse seines', 'Trollers', 15.9, 33.0]
265704020 ['Trollers', 'Fishing vessel', 8.53, 5.3]
238361240 ['Purse seines', 'Trawlers', 26.45, 163.0]
239329000 ['Drifting longlines', 'Pots and traps', 23.22, 81.0]
265629

In [81]:
rows = []
doubles = 0
with open('dumped_predictions_20161107.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['mmsi'] in bad_mmsis:
            doubles += 1
            if row['true'] != row['inferred']:
                rows.append([row['true'],row['inferred'], bad_mmsis[row['mmsi']][0],bad_mmsis[row['mmsi']][1]])




In [82]:
m = 0
for r in rows:
    if r[1]==r[3]: m+=1

print m, len(rows), doubles
        

90 267 695


In [98]:
rows = []
doubles = 0
mmsis = []
with open('dumped_predictions_20161107.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['mmsi'] in bad_mmsis:
            doubles += 1
            if row['true'] != row['inferred']:
                mmsis.append(row['mmsi'])
                rows.append([row['true'],row['inferred'],bad_mmsis[row['mmsi']][1]])

In [99]:
df = pd.DataFrame(rows,index=mmsis,columns=['true','inferred','secondary'])

In [109]:
df.groupby(['true','inferred','secondary']).size()

true                inferred        secondary         
Drifting longlines  Pole and Line   Set gillnets           1
                                    Set longlines          2
                    Pots and Traps  Set gillnets           1
                    Purse seines    Set gillnets           1
                                    Set longlines          1
                    Sailing         Purse seines           1
                    Set gillnets    Set gillnets           1
                                    Set longlines          1
                    Trawlers        Set gillnets           1
                                    Set longlines          3
                                    Trawlers               1
Pole and Line       Pots and Traps  Set gillnets           1
                    Set gillnets    Set gillnets           1
                    Trollers        Fishing vessel         1
Pots and Traps      Purse seines    Fishing vessel         1
                              

In [106]:
with open("net_training_20161107.csv") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['mmsi'] in bad_mmsis and row['label']!="":
            print row['label'], bad_mmsis[row['mmsi']]

Pole and line ['Pole and line', 'Drifting longlines', 20.0, 77.39]
Pole and line ['Pole and line', 'Fishing vessel', 25.0, 148.42]
Pole and line ['Pole and line', 'Drifting longlines', 20.0, 86.64]
Trawlers ['Drifting longlines', 'Set longlines', 34.5, 273.0]
Drifting longlines ['Set longlines', 'Drifting longlines', 25.36, 154.0]
Drifting longlines ['Set longlines', 'Drifting longlines', 24.75, 157.94]
Drifting longlines ['Drifting longlines', 'Set longlines', 28.0, 215.0]
Drifting longlines ['Set longlines', 'Drifting longlines', 28.1, 215.0]
Drifting longlines ['Pole and line', 'Drifting longlines', 28.1, 184.0]
Pole and line ['Pole and line', 'Pots and traps', 25.41, 116.0]
Drifting longlines ['Set longlines', 'Drifting longlines', 29.46, 171.0]
Drifting longlines ['Drifting longlines', 'Set gillnets', 25.4, 82.0]
Drifting longlines ['Drifting longlines', 'Set longlines', 21.3, 80.0]
Drifting longlines ['Drifting longlines', 'Purse seines', 20.0, 70.21]
Trawlers ['Trawlers', 'Set g