In [155]:
import csv
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np
# Directory that holds one CSV per source vessel list.
mypath = 'lists/'
# Keep only regular files whose name *ends* in ".csv" (a substring test would
# also pick up names such as "x.csv.bak"). listdir() returns entries in
# arbitrary order, so sort them to make the column order of the merged table
# reproducible from run to run.
files = sorted(f for f in listdir(mypath)
               if isfile(join(mypath, f)) and f.endswith(".csv"))

In [156]:
# Maps every detailed vessel label to the general class it is reported under.
# General classes also map to themselves, so a label that is already general
# can be looked up without special-casing.
detail_to_general = {
    # passenger vessels
    "Sailing": "Passenger",
    "Motor passenger": "Passenger",
    "Passenger": "Passenger",
    # tug / pilot / supply vessels
    "Tug": "Tug/Pilot/Supply",
    "Pilot": "Tug/Pilot/Supply",
    "Supply": "Tug/Pilot/Supply",
    "Tug/Pilot/Supply": "Tug/Pilot/Supply",
    # cargo and tankers
    "Cargo": "Cargo/Tanker",
    "Tanker": "Cargo/Tanker",
    "Cargo/Tanker": "Cargo/Tanker",
    # classes that are their own general class, plus the "Set gear" members
    "Seismic vessel": "Seismic vessel",
    "Drifting longlines": "Drifting longlines",
    "Set longlines": "Set gear",
    "Pole and line": "Pole and line",
    "Pots and traps": "Set gear",
    "Purse seines": "Purse seines",
    "Reefer": "Reefer",
    "Set gillnets": "Set gear",
    "Trawlers": "Trawlers",
    "Trollers": "Trollers",
    "Fishing vessel": "Fishing vessel",
    "Squid": "Squid",
}

# Maps each general vessel class to the binary fishing / non-fishing flag.
general_to_fishing = {
    "Passenger": "Non-fishing",
    "Tug/Pilot/Supply": "Non-fishing",
    "Cargo/Tanker": "Non-fishing",
    "Seismic vessel": "Non-fishing",
    "Drifting longlines": "Fishing",
    "Pole and line": "Fishing",
    "Purse seines": "Fishing",
    # this is debatable
    "Reefer": "Non-fishing",
    "Set gear": "Fishing",
    "Squid": "Fishing",
    "Trawlers": "Fishing",
    "Trollers": "Fishing",
    "Fishing vessel": "Fishing",
}

# General classes that have no more detailed sub-class in detail_to_general.
# Fixes relative to the previous literal:
#   * a missing comma silently concatenated two adjacent strings into the
#     single bogus entry "Drifting lonlinesPole and line";
#   * "Drifting lonlines" was a typo for "Drifting longlines";
#   * "Seismic vessel" was listed twice.
cats_without_subcats = ["Seismic vessel",
                        "Trawlers", "Purse seines", "Reefer",
                        "Squid", "Drifting longlines",
                        "Pole and line", "Trollers"]

In [157]:
# Read every source list into its own frame, keyed by the vessel's mmsi,
# so the frames can later be aligned side by side on that key.
dfs = [pd.read_csv(mypath + name).set_index('mmsi') for name in files]


In [158]:
# Place all source lists side by side: one row per mmsi, the columns of every
# list next to each other; an mmsi missing from a list gets NaN in its columns.
lv = pd.concat(dfs, axis=1, join='outer')

In [236]:
# Column names of the merged table, as a plain Python list.
headers = lv.columns.tolist()

In [196]:
def average_value(alist, max_rel_std=.1):
    """Return the mean of `alist` if its values agree closely, else "".

    NaN entries are ignored. The values "agree" when the population standard
    deviation is at most `max_rel_std` times the magnitude of the mean.

    Parameters
    ----------
    alist : sequence of numbers
        Measurements of the same quantity coming from different sources.
    max_rel_std : float, optional
        Largest accepted ratio stddev / |mean|. Defaults to 0.1.

    Returns
    -------
    numpy float or str
        The mean when the values agree; the empty string "" when there is
        nothing to average (empty or all-NaN input) or the values disagree.
    """
    values = np.asarray(alist, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        # Empty or all-NaN input: no usable measurement.
        return ""
    avg = values.mean()
    stddev = values.std()
    if avg == 0:
        # A relative spread is undefined for a zero mean; accept only the
        # case where every value is exactly zero.
        return avg if stddev == 0 else ""
    # abs(): without it a negative mean makes the ratio negative and the
    # disagreement check could never trigger.
    if stddev / abs(avg) > max_rel_std:
        return ""
    return avg

In [215]:
# Manual label overrides, based on a review of vessels that had two geartypes
# that were labeled in pybossa.
# The original google sheet is here:
#https://docs.google.com/spreadsheets/d/1WTF-KoFCXQbxvn7WJrg97BNs_Ha7I_qojVQyQeB3GyE/edit#gid=0
#
# Result: label_override maps the 'mmsi' field of each row (a string, exactly
# as it appears in the CSV) to that row's 'newlabel'. Rows whose newlabel is
# "IGNORE" are left out.

label_override = {}
# Plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+, and on
# Python 3 text mode already translates newlines the way 'U' used to.
with open("pybossa/double_geartype_override.csv", 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['newlabel'] != "IGNORE":
            label_override[row['mmsi']] = row['newlabel']


In [222]:
# Build one output row per vessel (mmsi) by reconciling what the different
# source lists say about it. Each row is
#   [mmsi, fishing, label, sublabel, length, tonnage, list_sources]
# and '' stands for "unknown / sources disagree".

# Column roles are recognised by a substring of the column name. The grouping
# does not depend on the vessel, so it is computed once rather than per row.
tonnage_cols = [h for h in headers if "tonnage" in h]
length_cols = [h for h in headers if "length" in h]
fishing_cols = [h for h in headers if "_fishing" in h]
label_cols = [h for h in headers if "_label" in h]
sublabel_cols = [h for h in headers if "_sublabel" in h]

# Normalises the different spellings of the fishing flag to two values.
fishings_map = {'Fishing':'Fishing',
                'Non-fishing':'Non-fishing',
                'Fishing vessel':'Fishing',
                'Nonfishing vessel':'Non-fishing'}

# Generic label that carries no gear information; it is dropped so that it
# cannot conflict with a specific label given by another list.
GENERIC_LABEL = 'Fishing vessel'


def _present(row, columns):
    """Return [(column, value), ...] for the given columns that are not missing in `row`."""
    return [(c, row[c]) for c in columns if pd.notnull(row[c])]


def _override_for(mmsi):
    """Return the manual override label for `mmsi`, or None if there is none.

    The keys of label_override are strings read from a CSV, while the index
    values of the merged table are usually numeric, so the lookup is tried
    with the raw value first and then with its integer string form.
    """
    if mmsi in label_override:
        return label_override[mmsi]
    try:
        key = str(int(mmsi))
    except (TypeError, ValueError, OverflowError):
        key = str(mmsi)
    return label_override.get(key)


rows = []
for mmsi, row in lv.iterrows():

    tonnage = average_value([v for _, v in _present(row, tonnage_cols)])
    length = average_value([v for _, v in _present(row, length_cols)])

    # Earlier analysis showed we should get rid of the outliers on
    # length -- where tonnage is less than 1000 and length is greater 100.
    # average_value returns '' when there is no usable value; only compare
    # when both are numbers (comparing '' with a number raises on Python 3).
    if not isinstance(length, str) and not isinstance(tonnage, str):
        if length > 100 and tonnage < 1000:
            length = ''

    fishings = [fishings_map[v] for _, v in _present(row, fishing_cols)]
    if len(set(fishings)) > 1:
        # skip these vessels if the lists disagree about
        # whether it is a fishing vessel
        continue
    # No list gave a fishing flag: record it as unknown instead of crashing.
    fishing = fishings[0] if fishings else ''

    labelled = _present(row, label_cols)
    # Every list that supplied a label counts as a source, generic or not.
    list_sources = [c.replace("_label", "") for c, _ in labelled]
    # Drop *all* generic labels (list.remove would only drop the first one).
    labels = [v for _, v in labelled if v != GENERIC_LABEL]
    if len(set(labels)) != 1:
        # no label at all, or the lists disagree
        label = ''
        sublabel = ''
    else:
        label = labels[0]
        sublabels = [v for _, v in _present(row, sublabel_cols)
                     if v != GENERIC_LABEL]
        sublabel = sublabels[0] if len(set(sublabels)) == 1 else ''

    # A sublabel identical to the label adds no information.
    if sublabel == label:
        sublabel = ""

    # these have been manually identified they are still a bit unclear, as they
    # are multi-gear, and we looked at only a few specific months
    override = _override_for(mmsi)
    if override is not None:
        sublabel = override
        # Raises KeyError if the override file uses a label that is not a key
        # of detail_to_general.
        label = detail_to_general[sublabel]

    # 'eu2' is never reported as a source.
    list_sources = [s for s in list_sources if s != "eu2"]
    rows.append([mmsi,fishing,label,sublabel,length,tonnage,";".join(list_sources)])
    


In [223]:
# Write the reconciled rows to the training file, preceded by one header line
# that names each field in the same order the rows were built.
# NOTE(review): on Python 3 the csv docs recommend opening with newline='';
# left as-is here so the output stays identical to the previous version.
column_names = ['mmsi','fishing','label','sublabel','length','tonnage','list_sources']
with open('net_training_20161111.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([column_names] + rows)