# Generate Lists for Neural Net

In [367]:
import csv
import bq
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

# Previous Labels
### Fishing
 - Purse seine
 - Longliner
 - Pots and traps
 - Trawler

### Non-fishing
 - Passenger
 - Tug/Pilot/Supply
 - Cargo/Tanker
 - Seismic vessel

# New Labels [and sublabels]
### Fishing
 - Longliners [Set longlines, Drifting longlines] 
 - Pole and Line    
 - Pots and Traps      
 - Purse seines        
 - Reefer              
 - Set gillnets        
 - Squid fishing               
 - Trawlers           
 - Trollers  
 - Squid
 

### Non-fishing
 - Passenger [Sailing, Motor Passenger]
 - Tug/Pilot/Supply [Tug, Piolot, Supply, Other]
 - Cargo/Tanker [Cargo, Tanker]
 - Seismic vessel [Seismic]

In [565]:
detail_to_general = {'Sailing':'Passenger',
                    'Motor Passenger':'Passenger',
                    'Passenger':'Passenger',
                    'Tug':'Tug/Pilot/Supply',
                    'Pilot':'Tug/Pilot/Supply',
                    'Supply':'Tug/Pilot/Supply',
                    'Tug/Pilot/Supply':'Tug/Pilot/Supply',
                    'Cargo':'Cargo/Tanker',
                    'Tanker':'Cargo/Tanker',
                    'Cargo/Tanker':'Cargo/Tanker',
                    'Seismic vessel':'Seismic vessel',
                    'Longliners':'Longliners',
                    'Set longlines':'Longliners', 
                    'Drifting longlines':'Longliners',
                    'Pole and Line':'Pole and Line',
                    'Pots and Traps':'Pots and Traps',
                    'Purse seines':'Purse seines',
                    'Reefer':'Reefer',
                    'Set gillnets':'Set gillnets',
                    'Squid fishing':'Squid fishing',
                    'Trawlers':'Trawlers',
                    'Trollers':'Trollers',
                     'Fishing vessel':'Fishing vessel',
                     "Squid":"Squid"
                     
                    }

# Lists Previously Used by the Neural Net for Training and Testing
As of October 10, 2016, the following lists are used in the Nerual Net Classifier. Below are David's notes on what to change and use instead:

## ITU_Dec_2015_full_list.csv
 - 11774 vessels
 - It is strange that it only has the follwing categories:
```
     'FBT': _PASSENGER,
    'PA': _PASSENGER,
    'TUG': _TUG_PILOT_SUPPLY,
    'LOU': _PASSENGER,
    'GOU': _PASSENGER,
    'SLO': _PASSENGER,
    'VLR': _PASSENGER,
    'YAT': _PASSENGER,
    'RAV': _TUG_PILOT_SUPPLY,
    'LAN': _POTS_AND_TRAPS,
    # TODO: [bitsofbits] More 
```

This pdf gives all the values that are linked http://www.itu.int/net/ITU-R/terrestrial/mars/help/table-2.pdf

This is very weird because we could be including trawlers here.


## CLAVRegistryMatchingv5.csv
 - 4803 different vessels
 - I'm skeptical of the shiptype generated in this list because I know that when I concat the geartypes, it isn't always clear what the vessel type is
 - Bjorn now has newer matches

## KnownVesselCargoTanker.csv
 - 2285 vessels
 - no idea where they came from

## KristinaManualClassification.csv
 - 2184 vessels


## PyBossaNonFishing.csv
 - 153 tug boats

## AlexWManualNonFishing.csv
 - 218 vessles that are Tugs, Passengers, Tangers, Cargo

## EUFishingVesselRegister.csv
 - 6489 vessels that are matched to the EU list, with their EU list geartype. See http://ec.europa.eu/fisheries/fleet/index.cfm?method=Codification.Cod_gear for the geartype

## PeruvianSquidFleet.csv
 - Complied by Bjorn, 104 vessels

## WorldwideSeismicVesselDatabase4Dec15.csv
 - 169 vessels, with length included



 # Lists not Used
 - rivervessels_20160502.csv
 - verify5and24_20160318.csv
 - verify5and24_20160502.csv
 - FishingVesselsV2_HighConfidenceStudents_20160502.csv




# Lists We Are Going to Use


## KnownVesselCargoTanker.csv
 - 2285 vessels
 - no idea where they came from

## KristinaManualClassification.csv
 - 2184 vessels


## PyBossaNonFishing.csv
 - 153 tug boats

## AlexWManualNonFishing.csv
 - 218 vessles that are Tugs, Passengers, Tangers, Cargo

## EUFishingVesselRegister.csv
 - 6489 vessels that are matched to the EU list, with their EU list geartype. See http://ec.europa.eu/fisheries/fleet/index.cfm?method=Codification.Cod_gear for the geartype

## PeruvianSquidFleet.csv
 - Complied by Bjorn, 104 vessels

## WorldwideSeismicVesselDatabase4Dec15.csv
 - 169 vessels, with length included



 # Lists not Used
 - rivervessels_20160502.csv
 - verify5and24_20160318.csv
 - verify5and24_20160502.csv
 - FishingVesselsV2_HighConfidenceStudents_20160502.csv


In [369]:
df_cargotanker = pd.read_csv('../data/classification-list-sources/KnownVesselCargoTanker.csv')

In [370]:
df_cargotanker.groupby(['label']).count()

# Passenger [sailing, motor]
# Tug/Pilot/Supply [Tug, Piolot, Supply, Other]
# Cargo/Tanker [Cargo, Tanker]
# Seismic vessel [Seismic]

Unnamed: 0_level_0,mmsi
label,Unnamed: 1_level_1
Cargo,1128
Tanker,1156


In [371]:
cargotanker_map = {'Cargo': 'Cargo/Tanker', 
                   'Tanker': 'Cargo/Tanker'}

df_cargotanker['cargotanker_sublabel'] = df_cargotanker['label']
df_cargotanker['label'] = df_cargotanker['label'].map(cargotanker_map)


In [372]:
df_cargotanker = df_cargotanker.set_index('mmsi')
df_cargotanker=df_cargotanker.rename(columns = {'label':'cargotanker_label'})
df_cargotanker.head()

Unnamed: 0_level_0,cargotanker_label,cargotanker_sublabel
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1
111111110,Cargo/Tanker,Cargo
204708000,Cargo/Tanker,Cargo
205256290,Cargo/Tanker,Cargo
205258890,Cargo/Tanker,Cargo
205263390,Cargo/Tanker,Cargo


In [373]:
df_kristina = pd.read_csv('../data/classification-list-sources/KristinaManualClassification.csv')
df_kristina.head()

# df_kristina.ix[np.nan(df_kristina['detail'])]#['detail'] = "" 
# df_kristina['detail'][0]# is np.nan

Unnamed: 0,mmsi,label,detail
0,10421670,Purse seine,
1,123450020,Purse seine,
2,123450800,Purse seine,
3,203226200,Passenger,sail
4,203745200,Passenger,yacht


In [374]:
# replace the detail with label
df_kristina['detail'] = df_kristina['detail'].fillna(df_kristina['label'])

In [375]:
df_kristina.head()

Unnamed: 0,mmsi,label,detail
0,10421670,Purse seine,Purse seine
1,123450020,Purse seine,Purse seine
2,123450800,Purse seine,Purse seine
3,203226200,Passenger,sail
4,203745200,Passenger,yacht


In [376]:
df_kristina.groupby(['label','detail'])['mmsi'].count()

label         detail         
Cargo         bulk carrier       564
              cargo              190
              container          153
              timber carrier       2
              vehicle carrier    118
Dredger       dredger              2
Longliner     Longliner           89
Passenger     ferry                1
              passenger           14
              sail               103
              yacht               53
Purse seine   Purse seine        105
Tanker        tanker             303
Trawler       Trawler            268
Tug           tug                 61
Unclassified  chemicals            1
              construction         1
              crane                1
              crew boat            6
              fish carrier         1
              oil tanker           1
              patrol               1
              pilot                2
              platform             3
              reefer              81
              research             7
        

In [377]:
df_subcat_map = {'bluk carrier':'Cargo',
         'cargo':'Cargo',
         'container':'Cargo',
         'timber carrier':'Cargo',
         'ferry':'Motor Passenger',
         'sail':'Sailing',
          'yacht':'Passenger',
           'passenger':'Passenger',
          'tanker':'Tanker',
          'tug':'Tug',
           'reefer': 'Reefer',
           'pilot':'Pilot',
          'fish carrier':'Reefer',
         'Purse seine':'Purse seines',
         'Trawler':'Trawlers',
         'Longliner':'Longliners'
         }

df_kristina = df_kristina[df_kristina['detail'].map(lambda x: x in df_subcat_map)]
df_kristina['detail']=df_kristina['detail'].map(df_subcat_map)
df_kristina['label'] = df_kristina['detail'].map(detail_to_general)


In [378]:
df_kristina.groupby(['label','detail'])['mmsi'].count()

label             detail         
Cargo/Tanker      Cargo              349
                  Tanker             303
Longliners        Longliners          89
Passenger         Motor Passenger      1
                  Passenger           67
                  Sailing            103
Purse seines      Purse seines       105
Reefer            Reefer              86
Trawlers          Trawlers           268
Tug/Pilot/Supply  Pilot                4
                  Tug                 61
Name: mmsi, dtype: int64

In [379]:
df_kristina=df_kristina.rename(columns = {'label':'kristina_label','detail':'kristina_sublabel'})


In [380]:
df_kristina = df_kristina.set_index('mmsi')
df_kristina.head()

Unnamed: 0_level_0,kristina_label,kristina_sublabel
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1
10421670,Purse seines,Purse seines
123450020,Purse seines,Purse seines
123450800,Purse seines,Purse seines
203226200,Passenger,Sailing
203745200,Passenger,Passenger


In [381]:
df_tugs = pd.read_csv('../data/classification-list-sources/PyBossaNonFishing.csv',skiprows=[0])
df_tugs.head()

Unnamed: 0,mmsi,label
0,205203390,Tug
1,205252690,Tug
2,205264290,Tug
3,205273990,Tug
4,205360090,Tug


In [382]:
df_tugs.groupby(['label'])['mmsi'].count()

label
Tug    153
Name: mmsi, dtype: int64

In [383]:
df_tugs['tugs_label'] = df_tugs['label'].map(detail_to_general)
df_tugs = df_tugs.rename(columns = {'label':'tugs_sublabel'})
df_tugs.head()

Unnamed: 0,mmsi,tugs_sublabel,tugs_label
0,205203390,Tug,Tug/Pilot/Supply
1,205252690,Tug,Tug/Pilot/Supply
2,205264290,Tug,Tug/Pilot/Supply
3,205273990,Tug,Tug/Pilot/Supply
4,205360090,Tug,Tug/Pilot/Supply


In [384]:
df_Alex = pd.read_csv('../data/classification-list-sources/AlexWManualNonFishing.csv')

In [385]:
df_Alex.groupby(['label']).count()

Unnamed: 0_level_0,mmsi
label,Unnamed: 1_level_1
Cargo,10
Dredger,1
Longliner,3
Passenger,142
Tanker,20
Trawler,1
Tug,40


In [386]:
alex_subcat_map = {'Cargo':'Cargo',
         'Dredger':'Tug/Pilot/Supply',
         'Longliner':'Longliners',
         'Passenger':'Passenger',
         'Tanker':'Tanker',
         'Trawler':'Trawlers',
                   'Tug':'Tug'
         }

df_Alex['label'] = df_Alex['label'].map(alex_subcat_map)
df_Alex['Alex_label'] = df_Alex['label'].map(detail_to_general)

In [387]:
df_Alex = df_Alex.rename(columns = {'label':'Alex_sublabel'})
df_Alex.head()

Unnamed: 0,mmsi,Alex_sublabel,Alex_label
0,203310200,Passenger,Passenger
1,205269190,Passenger,Passenger
2,205596910,Passenger,Passenger
3,211123610,Passenger,Passenger
4,211149170,Passenger,Passenger


In [388]:
df_Alex = df_Alex.set_index('mmsi')
df_Alex.groupby(['Alex_label']).count()

Unnamed: 0_level_0,Alex_sublabel
Alex_label,Unnamed: 1_level_1
Cargo/Tanker,30
Longliners,3
Passenger,142
Trawlers,1
Tug/Pilot/Supply,41


In [506]:
df_seismic = pd.read_csv('../data/classification-list-sources/WorldwideSeismicVesselDatabase4Dec15.csv')
df_seismic = df_seismic.rename(columns = {'MMSI #':'mmsi','Label':'seismic_label','Vessel length (m)':'seismic_length'})
df_seismic = df_seismic[['mmsi','seismic_label']]
df_seismic = df_seismic.drop_duplicates()
df_seismic = df_seismic.set_index('mmsi')
df_seismic.head()


Unnamed: 0_level_0,seismic_label
mmsi,Unnamed: 1_level_1
209108000,seismic vessel
209587000,seismic vessel
210228000,seismic vessel
210582000,seismic vessel
212338000,seismic vessel


In [507]:
# df_seismic = pd.read_csv('../data/classification-list-sources/WorldwideSeismicVesselDatabase4Dec15.csv')
# # column 'Vessel length (m)' has the length im neters
# # df_seismic.groupby(['Label'])['MMSI #'].count()
# df_seismic = df_seismic.rename(columns = {'MMSI #':'mmsi','Label':'seismic_label','Vessel length (m)':'seismic_length'})
# df_seismic = df_seismic[['seismic_label','seismic_length']]

df_seismic['seismic_label'] = ['Seismic vessel' for s in df_seismic['seismic_label']]
df_seismic['seismic_sublabel'] = df_seismic['seismic_label']

# df_seismic['seismic_label']=['seismic' for s in df_seismic['mmsi']]

In [508]:
df_seismic.head()

Unnamed: 0_level_0,seismic_label,seismic_sublabel
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1
209108000,Seismic vessel,Seismic vessel
209587000,Seismic vessel,Seismic vessel
210228000,Seismic vessel,Seismic vessel
210582000,Seismic vessel,Seismic vessel
212338000,Seismic vessel,Seismic vessel


In [391]:
df_river = pd.read_csv('../data/classification-list-sources/rivervessels_20160502.csv')
df_river.groupby(['label']).count()

Unnamed: 0_level_0,mmsi
label,Unnamed: 1_level_1
Supply,1
bad_data,9
cargo,104
multiple_vessles,1
not_fishing,11
not_known,16
passenger,7
tanker,3


In [392]:
df_river = df_river.set_index('mmsi')
df_river = df_river.rename(columns = {'label':'river_sublabel'})
river_subcat_map = {'cargo':'Cargo',
         'passenger':'Passenger',
         'Supply':'Supply',
         'Passenger':'Passenger',
         'tanker':'Tanker',
         }

df_river = df_river[df_river['river_sublabel'].map(lambda x: x in river_subcat_map)]
df_river['river_sublabel']=df_river['river_sublabel'].map(river_subcat_map)
df_river['river_label'] = df_river['river_sublabel'].map(detail_to_general)

In [393]:
df_river.head()

Unnamed: 0_level_0,river_sublabel,river_label
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1
203999396,Passenger,Passenger
203999399,Passenger,Passenger
211169130,Passenger,Passenger
211489990,Cargo,Cargo/Tanker
211512450,Passenger,Passenger


In [394]:
df = pd.read_csv('../data/classification-list-sources/verify5and24_20160318.csv')

In [395]:
df.groupby(['label']).count()

Unnamed: 0_level_0,task id,mmsi
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Diving_ops,80,80
Dredging_or_underwater_ops,112,112
Law_enforcement,131,131
Medical_transport,15,15
Military_ops,166,166
Passenger,147,147
Pleasure_craft,107,107
Search_and_Rescue,164,164
Tanker,153,153
Towing,4,4


In [396]:
df_five24 = pd.read_csv('../data/classification-list-sources/verify5and24_20160502.csv')
df_five24.head()

Unnamed: 0,task id,mmsi,label
0,2031,244690101,cargo
1,2572,211512210,cargo
2,2587,211668930,cargo
3,2608,244660859,cargo
4,2609,258222000,cargo


In [397]:
df_five24.groupby(['label']).count()

Unnamed: 0_level_0,task id,mmsi
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Diving_ops,80,80
Dredging_or_underwater_ops,112,112
Law_enforcement,133,133
Medical_transport,15,15
Military_ops,168,168
OffShore Support vessel,8,8
Passenger,193,193
Passenger and Cargo,4,4
Passenger/ferry,1,1
Patrol vessel,3,3


In [None]:
df_five24 = pd.read_csv('../data/classification-list-sources/verify5and24_20160502.csv')


# Ignore verify5and24_20160318.csv and use verify5and24_20160502

In [398]:
df_five24 = df_five24.set_index('mmsi')
df_five24 = df_five24.rename(columns = {'label':'five24_sublabel'})

five24_subcat_map = {'Passenger':'Passenger',
         'Sailing':'Sailing',
         'Trawler':'Trawlers',
         'Tug':'Tug',
         'cargo':'Cargo',
         'Port_tender':'Pilot',
         'sailing':'Sailing',
         'passenger/ferry':'Passenger',
         'tanker':'Tanker'
         }

df_five24 = df_five24[df_five24['five24_sublabel'].map(lambda x: x in five24_subcat_map)]
df_five24['five24_sublabel']=df_five24['five24_sublabel'].map(five24_subcat_map)
df_five24['five24_label'] = df_five24['five24_sublabel'].map(detail_to_general)
df_five24 = df_five24[['five24_label','five24_sublabel']]
df_five24.groupby(['five24_label']).count()

Unnamed: 0_level_0,five24_sublabel
five24_label,Unnamed: 1_level_1
Cargo/Tanker,32
Passenger,358
Trawlers,1
Tug/Pilot/Supply,261


# Ignore FishingVesselsV2_HighConfidenceStudents_20160502.csv -- doesn't add many vessels

In [399]:
df_five24.head()

Unnamed: 0_level_0,five24_label,five24_sublabel
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1
244690101,Cargo/Tanker,Cargo
211512210,Cargo/Tanker,Cargo
211668930,Cargo/Tanker,Cargo
244660859,Cargo/Tanker,Cargo
258222000,Cargo/Tanker,Cargo


In [464]:
df_ITU = pd.read_csv('../data/classification-list-sources/ITU_Dec_2015_full_list.csv')
df_ITU.head()

Unnamed: 0,MMSI,Call sign,Selective call number,Name of station,Administration/ Geographical area,Lifeboats,Auxiliary installations (EPIRBs),General classification,Individual classification,Registration number,Gross tonnage,Capacity of persons on board,Telegraph transmission frequency bands,Telegraph transmission frequency bands 2,Accounting Authority Identification Code (AAIC)
0,203002100,OEX2217,,ERIKA,AUT,,,PL,MTB,W21365,,8,,V,AU01
1,203058200,OEX2794,,TUAT GUAT,AUT,1.0,BE1,PL,YAT,N27794,1968.0,8,,V,AU01
2,203111400,OEX4631,,YLVI,AUT,,BE1,PL,MTB,N31105,299.0,6,,V,AU01
3,203116200,OEX6722,,SEVENS,AUT,1.0,BE1,PL,YAT,N29009,1872.0,12,,V,
4,203125100,OEX2289,,N-23782,AUT,,,PL,MTB,N23782,19.0,9,,V,


In [468]:
df_ITU.groupby(['Individual classification']).count()['MMSI']

Individual classification
AUX           1
BLK          16
BLS           1
CA           35
CAB           1
CHA           1
CHR           1
CON           9
CTR           1
DRG          54
EXP           5
GRF           1
HYD           6
LNG           2
LPG           1
MTB         116
OBO           5
OIL          32
PA            7
PH          116
PLT           7
PMX           1
PON           8
RAM           2
ROU           6
SAU           6
SMN           1
SRV           3
TPG          28
TPT           1
TUG          66
VLR           5
X X\nA        1
XX\n5         1
XXX       10949
YAT         203
Name: MMSI, dtype: int64

In [469]:
df_ITU = df_ITU.rename(columns = {'MMSI':'mmsi','Individual classification':'itu_sublabel'})

df_ITU = df_ITU.set_index('mmsi')

# that is a lot of categories... you can see the categories here:
# https://www.ofcom.org.uk/__data/assets/pdf_file/0024/16359/of168a.pdf
# I'm going to use the same distinctions that were used by Alex previously, 
# with some updates to divide into sailboats

itu_sublabels = {
    'FBT': 'Passenger',
 'PA': 'Passenger',
 'TUG': 'Tug',
 'LOU': "Sailing",
 'GOU': "Passenger", # these don't exist...
 'SLO': "Passenger",
 'VLR': "Sailing",
 'YAT': "Passenger",
 'RAV': "Tug/Pilot/Supply"}


df_ITU = df_ITU[df_ITU['itu_sublabel'].map(lambda x: x in itu_sublabels)]
df_ITU['itu_sublabel']=df_ITU['itu_sublabel'].map(itu_sublabels)
df_ITU['itu_label'] = df_ITU['itu_sublabel'].map(detail_to_general)
df_ITU = df_ITU[['itu_label','itu_sublabel']]
df_ITU.groupby(['itu_label']).count()

Unnamed: 0_level_0,itu_sublabel
itu_label,Unnamed: 1_level_1
Passenger,215
Tug/Pilot/Supply,66


In [470]:
ft = pd.read_csv('fishing_training.csv')
ft = ft.set_index('mmsi')
ft.head()

Unnamed: 0_level_0,geartype,secondary geartype,length,tonnage,country
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100000005,Fishing vessel,Fishing vessel,,,
100000018,Fishing vessel,Fishing vessel,,,
100000046,Fishing vessel,Fishing vessel,,,
100000668,Fishing vessel,Fishing vessel,,,
100001001,Fishing vessel,Fishing vessel,,,


In [517]:
result = df_cargotanker.join(df_kristina, how='outer')
result = result.join(df_tugs, how='outer')
result = result.join(df_Alex, how='outer')
result = result.join(df_seismic, how='outer')
result = result.join(df_river, how='outer')
result = result.join(df_five24, how='outer')
result = result.join(ft,how='outer')
result = result.join(df_ITU,how='outer')

In [566]:
rows = []

vessels_to_remove = []

for i in result.index:
    if 1:
        mmsi = i
        kristina = result['kristina_label'].ix[i]
        Alex = result['Alex_label'].ix[i]
        tugs = result['tugs_label'].ix[i]
        seismic = result['seismic_label'].ix[i]
        five24_geartype = result['five24_label'].ix[i]
        fishing = result['geartype'].ix[i]
        river = result['river_label'].ix[i]
        ITU = result['itu_label'].ix[i]
        cargotanker = result['cargotanker_label'].ix[i]
        
        s =  set([kristina,Alex,tugs,seismic,five24_geartype,fishing,ITU,cargotanker,river])
        if np.nan in s:
            s.remove(np.nan)
        if "Fishing vessel" in s and len(s)!=1: # this is not perfect logic
            s.remove("Fishing vessel")
        if len(s)!=1:
            pass
            vessels_to_remove.append(i)
            print s #, result.ix[i],i
#             break
        else:
            label = list(s)[0]
            
            
        kristina = result['kristina_sublabel'].ix[i]
        Alex = result['Alex_sublabel'].ix[i]
        tugs = result['tugs_sublabel'].ix[i]
        seismic = result['seismic_sublabel'].ix[i]
        five24_geartype = result['five24_sublabel'].ix[i]
        fishing = result['secondary geartype'].ix[i]
        river = result['river_sublabel'].ix[i]
        ITU = result['itu_sublabel'].ix[i]
        cargotanker = result['cargotanker_sublabel'].ix[i]
        
        sublabels = [kristina,Alex,tugs,seismic,five24_geartype,fishing,ITU,cargotanker,river]
        s = set(sublabels)
        if np.nan in s:
            s.remove(np.nan)
            
        if "Fishing vessel" in s and len(s)!=1: # this is not perfect logic
            s.remove("Fishing vessel")
            
        if len(s)!=1: #ideally we'll do something smarter here
            print s
#             if len(list(s))==2:
#                 if 'Drifting longlines' in list(s):
#                     sublabel = "Drifting longlines"
#                 elif 'Set longlines' in list(s):
#                     sublabel = 'Set longlines'
#                 else:
            pass
            
        else:
            sublabel = list(s)[0]
        
        
        row = [i,label,sublabel,result.ix[i]['length'],result.ix[i]['tonnage'],result.ix[i]['country']]
        
        if detail_to_general[sublabel] == label:
            rows.append(row)
        
    else:
        vessels_to_remove.append(i)
        print result.ix[i]
        break
            
        


set(['Reefer', 'Cargo/Tanker'])
set(['Cargo', 'Reefer'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Longliners', 'Trawlers'])
set(['Drifting longlines', 'Trawlers'])
set(['Set longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Drifting longlines', 'Longliners'])
set(['Reefer', 'Cargo/Tanker'])
set(['Cargo', 'Reefer'])
set(['Reefer', 'Cargo/Tanker'])
set(['Tanker', 'Reefer'])
set(['Reefer', 'Cargo/Tanker'])
set(['Cargo', 'Reefer'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
set(['Trawlers', 'Trollers'])
se

In [567]:
header = ['mmsi','label','sublabel','length','tonnage','country']
with open("net_training_20161016.csv", 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(rows)

In [568]:
df = pd.read_csv('net_training_20161016.csv')

In [569]:
df.head()

Unnamed: 0,mmsi,label,sublabel,length,tonnage,country
0,0,Tug/Pilot/Supply,Tug,,,
1,1,Tug/Pilot/Supply,Tug,,,
2,2,Tug/Pilot/Supply,Tug,,,
3,3,Tug/Pilot/Supply,Tug,,,
4,4,Tug/Pilot/Supply,Tug,,,


In [570]:
df.groupby(['label','sublabel'])['mmsi'].count()

label             sublabel          
Cargo/Tanker      Cargo                  1612
                  Tanker                 1487
Fishing vessel    Fishing vessel        38895
Longliners        Drifting longlines      192
                  Longliners             1204
                  Set longlines           224
Passenger         Motor Passenger           1
                  Passenger               634
                  Sailing                 254
Pole and Line     Pole and Line           137
Pots and Traps    Pots and Traps          398
Purse seines      Purse seines            806
Reefer            Reefer                  710
Seismic vessel    Seismic vessel          166
Set gillnets      Set gillnets            576
Squid             Squid                   164
Trawlers          Trawlers               5179
Trollers          Trollers                 53
Tug/Pilot/Supply  Pilot                    90
                  Supply                    1
                  Tug                     4