In [31]:
import csv
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np
mypath = 'lists/'
files = [f for f in listdir(mypath) if isfile(join(mypath, f)) and ".csv" in f]

In [32]:
detail_to_general = {'Sailing':'Passenger',
                    'Motor passenger':'Passenger',
                    'Passenger':'Passenger',
                    'Tug':'Tug/Pilot/Supply',
                    'Pilot':'Tug/Pilot/Supply',
                    'Supply':'Tug/Pilot/Supply',
                    'Tug/Pilot/Supply':'Tug/Pilot/Supply',
                    'Cargo':'Cargo/Tanker',
                    'Tanker':'Cargo/Tanker',
                    'Cargo/Tanker':'Cargo/Tanker',
                    'Seismic vessel':'Seismic vessel',
                    'Drifting longlines':'Drifting longlines',
                    'Set longlines':'Fixed gear', 
                    'Pole and line':'Pole and line',
                    'Pots and traps':'Fixed gear',
                    'Purse seines':'Purse seines',
                    'Reefer':'Reefer',
                    'Set gillnets':'Fixed gear',
                    'Trawlers':'Trawlers',
                    'Trollers':'Trollers',
                     'Fishing vessel':'Fishing vessel',
                     "Squid":"Squid"
                    }

general_to_fishing = {'Passenger':"Non-fishing",
                     'Tug/Pilot/Supply':"Non-fishing",
                     'Cargo/Tanker': "Non-fishing",
                     "Seismic vessel": "Non-fishing",
                     "Drifting longlines":"Fishing",
                     "Pole and line":"Fishing",
                     "Purse seines":"Fishing",
                     "Reefer":"Non-fishing", # this is debatable
                     "Fixed gear":"Fishing",
                     "Squid":"Fishing",
                      "Trawlers":"Fishing",
                      "Trollers":"Fishing",
                      "Fishing vessel":"Fishing",
                     }

cats_without_subcats = ["Seismic vessel",
                        "Trawlers","Purse seines","Reefer",
                        "Squid","Seismic vessel","Drifting lonlines"
                       "Pole and line","Trollers"]

In [50]:
dfs = []

for f in files:
    if f != "countries.csv": #ignore this for now
#         for i in df.index:
#             try:
#                 if df['mmsi'].ix[i] == 205010000:
#                     print f
#             except:
#                 print f
#                 break
        df = pd.read_csv(mypath+f)
        df = df.set_index('mmsi')
        dfs.append(df)


In [51]:
lv = pd.concat(dfs, join='outer', axis = 1)

In [52]:
headers = list(lv.columns.values)

In [36]:
def average_value(alist):
    if len(alist) == 0:
        return ""
    alist = np.array(alist)
    avg = np.nanmean(alist)
    stddev = np.nanstd(alist)
    if stddev/avg > .1:
        return ""
    else:
        return avg

In [37]:
# this is based on a review of vessels that had two geartypes that were
# labeled in pybossa
# the original google sheet is here: 
#https://docs.google.com/spreadsheets/d/1WTF-KoFCXQbxvn7WJrg97BNs_Ha7I_qojVQyQeB3GyE/edit#gid=0

label_override = {}
with open ("pybossa/double_geartype_override.csv",'rU') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['newlabel'] != "IGNORE":
            label_override[row['mmsi']] = row['newlabel']


In [63]:
rows = []
bad_labels = []
k = 0
for mmsi in lv.index:
    
    ton_list = []
    for h in headers:
        if "tonnage" in h and not np.isnan(lv[h].ix[mmsi]):
            ton_list.append(lv[h].ix[mmsi])
    tonnage = average_value(ton_list)
    
    length_list = []
    for h in headers:
        if "length" in h and not np.isnan(lv[h].ix[mmsi]):
            length_list.append(lv[h].ix[mmsi])  
    
    length = average_value(length_list)
    
    if length > 100 and tonnage < 1000:
        length = ''
    
    # Earlier analysis showed we should get rid of the outliers on 
    # length -- where tonnage is less than 1000 and length is greater 100
    
    
    fishings = []
    fishings_map = {'Fishing':'Fishing',
                             'Non-fishing':'Non-fishing',
                             'Fishing vessel':'Fishing',
                            'Nonfishing vessel':'Non-fishing'} 
    list_sources = []
    for h in headers:
        if "_fishing" in h and str(lv[h].ix[mmsi]) != 'nan':
            fishings.append(fishings_map[lv[h].ix[mmsi]])
            list_sources.append(h.replace("_label",""))

    fish = set(fishings)
    if len(fish)>1:
        continue 
        # skip these vessels if the lists disagree about 
        # whether it is a fishing vessel
    fishing = fishings[0]

    
    labels = []
    for h in headers:
        if "_label" in h and str(lv[h].ix[mmsi]) != 'nan':
            labels.append(lv[h].ix[mmsi]) 
    if 'Fishing vessel' in labels:
        labels.remove('Fishing vessel') 
    fish = set(labels)
    if len(fish)>1 or len(fish) == 0:
        bad_labels.append(fish)
        label = ''
        sublabel = ''
    else:
        label = labels[0]   
    

        sublabels = []
        for h in headers:
            if "_sublabel" in h and str(lv[h].ix[mmsi]) != 'nan':
                sublabels.append(lv[h].ix[mmsi]) 
        if 'Fishing vessel' in sublabels:
            sublabels.remove('Fishing vessel')
        fish = set(sublabels)
        if len(fish)>1 or len(fish) == 0:
            sublabel = ''
        else:
            sublabel = sublabels[0]   
 

    if sublabel == label:
        sublabel = ""
    
    # these have been manually identified they are still a bit unclear, as they 
    # are multi-gear, and we looked at only a few specific months
    if mmsi in label_override:
        sublabel = label_override[mmsi]
        label = detail_to_general[sublabel]
    
    if 'eu2' in list_sources: 
        list_sources.remove("eu2")
    rows.append([mmsi,fishing,label,sublabel,length,tonnage,";".join(list_sources)])
    


In [66]:
k = 0
for b in bad_labels:
    if 'Drifting longlines' in b:
        k +=1
k

125

In [67]:
# we lost 125 drifting longlines because the lists differed on what they were


In [59]:
with open('net_training_20161115.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['mmsi','is_fishing','label','sublabel','length','tonnage','list_sources'])
    writer.writerows(rows)

In [60]:
df = pd.read_csv('net_training_20161115.csv')
df.groupby("is_fishing").count()


Unnamed: 0_level_0,mmsi,label,sublabel,length,tonnage,list_sources
is_fishing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fishing,17680,6676,815,6703,11707,17680
Non-fishing,5291,5283,3779,338,340,5291


In [61]:
df.groupby("list_sources").count()


Unnamed: 0_level_0,mmsi,is_fishing,label,sublabel,length,tonnage
list_sources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alex_fishing,215,215,215,70,0,0
alex_fishing;eu_fishing,1,1,1,0,1,1
alex_fishing;five24_fishing,1,1,1,0,0,0
cargotanker_fishing,2278,2278,2278,2278,0,0
cargotanker_fishing;reefer_fishing,4,4,0,0,3,3
ccamlr_fishing,37,37,35,0,37,37
ccamlr_fishing;eu_fishing;itu_fishing,1,1,1,0,1,1
ccamlr_fishing;itu_fishing,3,3,3,0,3,2
clav_fishing,214,214,214,4,206,199
clav_fishing;eu_fishing,361,361,258,12,359,355


In [62]:
df.groupby("label").count()


Unnamed: 0_level_0,mmsi,is_fishing,sublabel,length,tonnage,list_sources
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cargo/Tanker,3095,3095,3095,0,0,3095
Drifting longlines,177,177,0,174,173,177
Fixed gear,1070,1070,815,809,1060,1070
Passenger,888,888,254,0,0,888
Pole and line,114,114,0,111,113,114
Purse seines,587,587,0,491,493,587
Reefer,705,705,0,333,335,705
Seismic vessel,164,164,0,0,0,164
Squid,164,164,0,0,0,164
Trawlers,4513,4513,0,3712,4092,4513
