# ITU List Production
This notebook shows some of the logic that was used to normalize the names on the ITU list and then check the ITU list against the AIS data. A good deal of this was done in BigQuery, and the queries are saved here so that one can recreate the matching logic.

The matching is somewhat strict; we could probably include more vessels if we weakened the logic.

Then this notebook reads the ITU list, maps the ITU labels to our labels (see `readme.md`), and saves the result to `../classification-list-sources/itu_2016_processed.csv`.

In [1]:
import csv
import pandas as pd
import pandas.io.gbq as bq
import numpy as np

In [3]:
# First, we need to match the ITU list to the AIS to make sure we have real vessels.
# The strings in this cell are saved BigQuery queries kept for reference; they
# are not executed by the notebook.
# Step 1: pull the identity fields of every AIS message with type 5 or 24.
'''
SELECT type, mmsi, imo, shipname, shiptype_text, callsign, timestamp, lon, lat, speed, course, tagblock_station 
FROM (TABLE_DATE_RANGE([pipeline_740__classify.], TIMESTAMP('2014-01-01'), TIMESTAMP('2015-12-31')))  
WHERE type in (5,24)
'''
# The query above shows a single date range; this was done for each year and the
# results combined into table [world-fishing-827:Vessel_identity_messages.type5_24_2012_2016]

# then normalize the names and group

'''

SELECT 
type,
mmsi,
shipname,
shiptype_text,
callsign,
timestamp,
lon,
lat,
speed,
course,
tagblock_station 
from
normalize5_24Name(select * from [Vessel_identity_messages.type5_24_2012_2016])


/// UDF function

// Convert a roman numeral (any letter case) to its integer value.
// Returns false when the input is empty or is not a well-formed numeral.
function deromanize(str) {
	var roman = str.toUpperCase();
	// Accepts only canonically written numerals (e.g. "IV", never "IIII").
	var wellFormed = /^M*(?:D?C{0,3}|C[MD])(?:L?X{0,3}|X[CL])(?:V?I{0,3}|I[XV])$/;
	// Splits a numeral into additive symbols, keeping subtractive pairs together.
	var symbol = /[MDLV]|C[MD]?|X[CL]?|I[XV]?/g;
	var values = {M:1000,CM:900,D:500,CD:400,C:100,XC:90,L:50,XL:40,X:10,IX:9,V:5,IV:4,I:1};

	if (!roman || !wellFormed.test(roman)) {
		return false;
	}

	var total = 0;
	var hit = symbol.exec(roman);
	while (hit) {
		total += values[hit[0]];
		hit = symbol.exec(roman);
	}
	return total;
};


// Normalize a vessel name so that different spellings of the same name
// compare equal. In order: a trailing roman numeral or numeric word is turned
// into a plain integer, whitespace and "no." markers are removed, a lone run
// of digits is moved to the end, punctuation and the substrings F/B, FB, F/V,
// FV are stripped, and the result is upper-cased. The steps are order
// dependent.
// NOTE(review): vs, rom and no_array are assigned without var, so they are
// implicit globals.
function normalizeName(vessel){ 

    // Convert a trailing roman numeral to its integer value ("II" -> 2).
    // Any last word that is a valid numeral ("MIX", "DI", ...) is converted.
    vs = vessel.split(" ");
    rom = deromanize(vs[vs.length-1]);
    if(rom){
        vs[vs.length-1] = rom;
        vessel = vs.join(' ');
    }
    // if the last word is numeric, e.g. vessel 036, change to vessel 36
    // NOTE(review): parseInt is called without a radix, and isNaN("") is
    // false, so an empty last word (name ending in a space) becomes NaN.
    else if(!isNaN(vs[vs.length-1])){
        vs[vs.length-1] = parseInt(vs[vs.length-1]);
        vessel = vs.join(' ');
    }
    // last word contains "no" followed by digits (case-insensitive test).
    // NOTE(review): replace("no","") only removes the first lower-case "no";
    // an upper-case "NO" is not removed by this branch.
    else if(vs[vs.length-1].match(/no\d+/gi)){
        vs[vs.length-1] = vs[vs.length-1].replace("no","");
        vessel = vs.join(' ');
    }



    vessel = vessel.replace(/\s/g,''); // get rid of white spaces

    // if the vessel starts with "no." or "no" + digit (e.g. no53), drop the
    // two leading characters; names of 3 characters or fewer are left alone.
    // The "." of a leading "no." is removed by the period stripping below.
    if (vessel.length>3 && (vessel.slice(0,3).toUpperCase() == "NO." || 
       (vessel.slice(0,2).toUpperCase() == "NO" && vessel[2].match(/\d+/))) ){vessel = vessel.slice(2,vessel.length);}

    vessel = vessel.replace(/no\./gi,""); // get rid of every remaining "no.", any case

    // find all the runs of digits
    var number_find = /\d+/g;
    no_array =  vessel.match(number_find)||[];
    // if the name contains exactly one run of digits, move it (wherever it
    // is) to the end; names with several numbers are left as they are.
    // The replace on no_array[0] is a no-op because the match is digits only.
    if (no_array.length == 1){
        vessel = vessel.replace(number_find,"")+no_array[0].replace(/no\./i,"");
    }   

    vessel = vessel.replace(/\./g,""); // get rid of periods
    vessel = vessel.replace(/\'/g,""); // get rid of single quotes
    // The four replacements below match anywhere in the name, not only at
    // the start.
    vessel = vessel.replace(/f\/b/ig,''); // get rid of all F/B
    vessel = vessel.replace(/fb/ig,''); // get rid of all FB
    vessel = vessel.replace(/f\/v/ig,''); // get rid of F/V
    vessel = vessel.replace(/fv/ig,''); // get rid of FV and fv
    vessel = vessel.replace(/#/g,''); // get rid of #
    vessel = vessel.replace(/-/g,''); // get rid of -
    return vessel.toUpperCase(); // make uppercase and return

}



// BigQuery legacy-SQL row UDF: copies the identity and position columns of
// one input row to the output, replacing shipname with its normalized form.
// A missing or empty shipname is passed through unchanged.
function normalize5_24Name(row, emit) {
  var shipname = row.shipname;
  if(shipname){
    shipname =  normalizeName(shipname);
  }

  emit({
    type:row.type,
    mmsi:row.mmsi,
    // imo is declared in the output schema below, so it has to be emitted
    // here; previously it was never set and the column was always NULL.
    imo:row.imo,
    shipname:shipname,
    shiptype_text:row.shiptype_text,
    callsign:row.callsign,
    timestamp:row.timestamp,
    lon:row.lon,
    lat:row.lat,
    speed:row.speed,
    course:row.course,
    tagblock_station:row.tagblock_station 
  });
}



bigquery.defineFunction(
  'normalize5_24Name',  // Name of the function exported to SQL

  // Names of input columns: one entry for every field read from `row` above,
  // including imo so that it can be passed through.
  ['type','mmsi','imo','shipname',
   'shiptype_text','callsign',
   'timestamp','lon','lat','speed',
   'course','tagblock_station'],

  // Output schema: one entry for every field emitted above.
  [{'name':'type','type':'integer'},
  {'name':'mmsi','type':'integer'},
  {'name':'imo','type':'integer'},
  {'name':'shipname','type':'string'},
  {'name':'shiptype_text','type':'string'},
  {'name':'callsign','type':'string'},
  {'name':'timestamp','type':'timestamp'},
  {'name':'lon','type':'float'},
  {'name':'lat','type':'float'},
  {'name':'speed','type':'float'},
  {'name':'course','type':'float'},
  {'name':'tagblock_station','type':'string'}],

  normalize5_24Name                       // Reference to JavaScript UDF
);

'''



# Now group the normalized identity messages by (mmsi, shipname, callsign) and
# keep only the combinations that were seen in more than 200 messages.
# NOTE(review): this query reads ...type5_24_2013_2016_NORM while the other
# table names in this cell use 2012_2016 -- confirm which year range is right.

'''SELECT mmsi, shipname, callsign, count(*) num 
FROM [world-fishing-827:Vessel_identity_messages.type5_24_2013_2016_NORM] 
group by mmsi, shipname, callsign
having num > 200'''


# The result was saved to [Vessel_identity_messages.type5_24_2012_2016_grouped]



'SELECT mmsi, shipname, callsign, count(*) num \nFROM [world-fishing-827:Vessel_identity_messages.type5_24_2013_2016_NORM] \ngroup by mmsi, shipname, callsign\nhaving num > 200'

In [4]:
# now normalize the ITU list

'''

SELECT
  mmsi,
  shipname,
  callsign,
  itu_General_classification,
  itu_Individual_classification,
  national_id,
  tonnage,
  itu_Capacity_of_persons_on_board
FROM
  normalize_itu_name([world-fishing-827:ITU_complete_2016.ITU_2016_complete])

/// UDF function

// Convert a roman numeral (any letter case) to its integer value.
// Returns false when the input is empty or is not a well-formed numeral.
function deromanize(str) {
	var roman = str.toUpperCase();
	// Accepts only canonically written numerals (e.g. "IV", never "IIII").
	var wellFormed = /^M*(?:D?C{0,3}|C[MD])(?:L?X{0,3}|X[CL])(?:V?I{0,3}|I[XV])$/;
	// Splits a numeral into additive symbols, keeping subtractive pairs together.
	var symbol = /[MDLV]|C[MD]?|X[CL]?|I[XV]?/g;
	var values = {M:1000,CM:900,D:500,CD:400,C:100,XC:90,L:50,XL:40,X:10,IX:9,V:5,IV:4,I:1};

	if (!roman || !wellFormed.test(roman)) {
		return false;
	}

	var total = 0;
	var hit = symbol.exec(roman);
	while (hit) {
		total += values[hit[0]];
		hit = symbol.exec(roman);
	}
	return total;
};


// Normalize a vessel name so that different spellings of the same name
// compare equal. In order: a trailing roman numeral or numeric word is turned
// into a plain integer, whitespace and "no." markers are removed, a lone run
// of digits is moved to the end, punctuation and the substrings F/B, FB, F/V,
// FV are stripped, and the result is upper-cased. The steps are order
// dependent.
// NOTE(review): vs, rom and no_array are assigned without var, so they are
// implicit globals.
function normalizeName(vessel){ 

    // Convert a trailing roman numeral to its integer value ("II" -> 2).
    // Any last word that is a valid numeral ("MIX", "DI", ...) is converted.
    vs = vessel.split(" ");
    rom = deromanize(vs[vs.length-1]);
    if(rom){
        vs[vs.length-1] = rom;
        vessel = vs.join(' ');
    }
    // if the last word is numeric, e.g. vessel 036, change to vessel 36
    // NOTE(review): parseInt is called without a radix, and isNaN("") is
    // false, so an empty last word (name ending in a space) becomes NaN.
    else if(!isNaN(vs[vs.length-1])){
        vs[vs.length-1] = parseInt(vs[vs.length-1]);
        vessel = vs.join(' ');
    }
    // last word contains "no" followed by digits (case-insensitive test).
    // NOTE(review): replace("no","") only removes the first lower-case "no";
    // an upper-case "NO" is not removed by this branch.
    else if(vs[vs.length-1].match(/no\d+/gi)){
        vs[vs.length-1] = vs[vs.length-1].replace("no","");
        vessel = vs.join(' ');
    }



    vessel = vessel.replace(/\s/g,''); // get rid of white spaces

    // if the vessel starts with "no." or "no" + digit (e.g. no53), drop the
    // two leading characters; names of 3 characters or fewer are left alone.
    // The "." of a leading "no." is removed by the period stripping below.
    if (vessel.length>3 && (vessel.slice(0,3).toUpperCase() == "NO." || 
       (vessel.slice(0,2).toUpperCase() == "NO" && vessel[2].match(/\d+/))) ){vessel = vessel.slice(2,vessel.length);}

    vessel = vessel.replace(/no\./gi,""); // get rid of every remaining "no.", any case

    // find all the runs of digits
    var number_find = /\d+/g;
    no_array =  vessel.match(number_find)||[];
    // if the name contains exactly one run of digits, move it (wherever it
    // is) to the end; names with several numbers are left as they are.
    // The replace on no_array[0] is a no-op because the match is digits only.
    if (no_array.length == 1){
        vessel = vessel.replace(number_find,"")+no_array[0].replace(/no\./i,"");
    }   

    vessel = vessel.replace(/\./g,""); // get rid of periods
    vessel = vessel.replace(/\'/g,""); // get rid of single quotes
    // The four replacements below match anywhere in the name, not only at
    // the start.
    vessel = vessel.replace(/f\/b/ig,''); // get rid of all F/B
    vessel = vessel.replace(/fb/ig,''); // get rid of all FB
    vessel = vessel.replace(/f\/v/ig,''); // get rid of F/V
    vessel = vessel.replace(/fv/ig,''); // get rid of FV and fv
    vessel = vessel.replace(/#/g,''); // get rid of #
    vessel = vessel.replace(/-/g,''); // get rid of -
    return vessel.toUpperCase(); // make uppercase and return

}



// BigQuery row UDF for the ITU list: emits the row's identity and
// classification columns with shipname replaced by its normalized form.
function normalize_itu_name(row, emit) {
  // A missing or empty ship name is passed through untouched.
  var cleanedName = row.shipname ? normalizeName(row.shipname) : row.shipname;

  var out = {};
  out.mmsi = row.mmsi;
  out.shipname = cleanedName;
  out.callsign = row.callsign;
  out.itu_General_classification = row.itu_General_classification;
  out.itu_Individual_classification = row.itu_Individual_classification;
  out.national_id = row.national_id;
  out.tonnage = row.tonnage;
  out.itu_Capacity_of_persons_on_board = row.itu_Capacity_of_persons_on_board;

  emit(out);
}



// Register normalize_itu_name as a BigQuery UDF. The input column list and
// the output schema name the same eight columns, in the same order as the
// object emitted by normalize_itu_name.
bigquery.defineFunction(
  'normalize_itu_name',  // Name of the function exported to SQL
  
  ['mmsi',
   'shipname',
   'callsign',
   'itu_General_classification',
   'itu_Individual_classification',
   'national_id',
   'tonnage',
   'itu_Capacity_of_persons_on_board'],  // Names of input columns
  
  // Output schema: every column except mmsi is declared as a string,
  // including tonnage and the persons-on-board capacity.
  [
  {'name':'mmsi','type':'integer'},
  {'name':'shipname','type':'string'},
  {'name':'callsign','type':'string'},
  {'name':'itu_General_classification','type':'string'},
  {'name':'itu_Individual_classification','type':'string'},
  {'name':'national_id','type':'string'},
  {'name':'tonnage','type':'string'},
  {'name':'itu_Capacity_of_persons_on_board','type':'string'}
  ],
  
  normalize_itu_name                       // Reference to JavaScript UDF
);'''

5

5

In [36]:
# Now see how many of these vessels are in the big list of vessels.
# The query keeps every row of the normalized ITU list whose mmsi appears in
# the join of that list with the grouped AIS identity messages. The join
# requires mmsi, callsign and shipname to all be equal.
# NOTE(review): the bracketed table names are BigQuery legacy SQL; newer
# pandas-gbq releases default to standard SQL, so this call may need
# dialect='legacy' there -- confirm against the installed version.

q = '''select * from [world-fishing-827:ITU_complete_2016.ITU_2016_complete_NORM] where mmsi in (
SELECT a.mmsi as mmsi FROM 
[world-fishing-827:ITU_complete_2016.ITU_2016_complete_NORM]a
inner join 
[Vessel_identity_messages.type5_24_2012_2016_grouped] b
on a.mmsi = b.mmsi
and a.callsign = b.callsign 
and a.shipname = b.shipname
group by mmsi)'''

# Runs the query in project world-fishing-827 and loads the result into df.
df = bq.read_gbq(q, project_id = "world-fishing-827")

Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
  Got page: 1; 57.0% done. Elapsed 7.17 s.
  Got page: 2; 100.0% done. Elapsed 11.69 s.
Got 77369 rows.

Total time taken 12.98 s.
Finished at 2017-01-27 16:26:07.


In [37]:
# Preview the first rows of the matched ITU records.
df.head()

Unnamed: 0,mmsi,shipname,callsign,itu_General_classification,itu_Individual_classification,national_id,tonnage,itu_Capacity_of_persons_on_board
0,319058400,VALYRA,ZGCQ8,Pleasure / Leisure,Yacht,743779,63,12
1,319058500,TRIPLESEVEN,ZGDO7,Pleasure / Leisure,Yacht,739145,1385,26
2,319058600,AMOIXA,ZGDO6,Pleasure / Leisure,Yacht,739106,498,22
3,319058800,NORDICGRACE,ZGDM8,Merchant,Tanker,745168,84598,32
4,319058900,SCOUT,ZGDP4,Pleasure / Leisure,Yacht,744433,496,24


In [39]:
from tabulate import tabulate

# Count the matched vessels in each ITU individual classification.
# The groupby is built once and its group -> row-positions dict is reused;
# the earlier version rebuilt the same groupby for every classification.
class_positions = df.groupby("itu_Individual_classification")['itu_Individual_classification'].indices
rows = [[itu_class, len(positions)] for itu_class, positions in class_positions.items()]

# Largest classes first. `rows` is reused by the next cell.
rows.sort(key=lambda x: -x[1])
# print(...) with a single argument works on both Python 2 and Python 3.
print(tabulate(rows, tablefmt="pipe", headers=['itu_Individual_classification', 'Number']))


| itu_Individual_classification          |   Number |
|:---------------------------------------|---------:|
| Unspecified                            |     8429 |
| Bulk carrier                           |     6410 |
| Pusher /Tug                            |     6213 |
| Yacht                                  |     6092 |
| Fishing vessel                         |     4173 |
| Sailing ship                           |     3942 |
| Cargo ship                             |     3912 |
| Oil tanker                             |     3406 |
| Tanker                                 |     3344 |
| Container ship                         |     3293 |
| Passenger ship                         |     2684 |
| Supply vessel                          |     2190 |
| General cargo                          |     1878 |
| Motor boat                             |     1214 |
| Dry cargo                              |     1023 |
| Trawler                                |      958 |
| Ferry                     

In [40]:
# List the classification names (largest class first) so they can be copied
# into the label mapping. print(...) with a single argument works on both
# Python 2 and Python 3, unlike the bare print statement.
for r in rows:
    print(r[0])

Unspecified
Bulk carrier
Pusher /Tug
Yacht
Fishing vessel
Sailing ship
Cargo ship
Oil tanker
Tanker
Container ship
Passenger ship
Supply vessel
General cargo
Motor boat
Dry cargo
Trawler
Ferry
Sloop
Liquefied gas carrier
Dredger
Research ship /Survey ship
Reefer
Barge
Cargo and passenger
RoRo ship
Launch
Pilot tender
Chemical carrier
Stand-by safety vessel
Vehicle carrier
Cutter
Drilling unit
Salvage ship
Rescue vessel
Ocean-station vessel
Patrol ship
Inspection ship
Support vessel
Transport
Auxiliary ship
Coaster
Ice breaker
Training ship
Pontoon
Ore carrier
Tramp
Coast-guard
Floating crane
Platform
Solvent carrier
Lighter
Ore-bulk-oil carrier
Cement carrier
Forest-product carrier
Warship
Cruiser
Ship used by divers
Cable ship
Hydrofoil
Factory ship
Hydrographic ship
Buoy ship
Livestock carrier
Mine layer
Ketch
Pollution and surface clearance vessel
Tunny ship
Oceanographic ship
Floating storage, offtake 
Frigate
Schooner
Barge carrier
Minesweeper
Firefloat
Fast patrol ship
Liner
Desp

In [41]:
# ITU individual classification -> our label, one "itu_class,label" pair per
# line. The trailing .replace() rewrites every "Non_fishing" to
# "unknown_not_fishing", and .split('\n') turns the text into a list of lines.
# A label written as None stays the literal string "None".
# Labels containing "|" presumably list several candidate labels -- see
# readme.md to confirm.
# NOTE(review): the class printed in the listing above as
# "Floating storage, offtake ..." contains a comma, while the key used here is
# "Floating storage ./ offtake"; the two strings differ, so that class
# presumably gets no label -- confirm.
mapping = '''Unspecified,None
Bulk carrier,Cargo
Pusher /Tug,Tug
Yacht,Motor_passenger
Fishing vessel,unknown_fishing
Sailing ship,Sailing
Cargo ship,Cargo
Oil tanker,Tanker
Tanker,Tanker
Container ship,Cargo
Passenger ship,Passenger
Supply vessel,Non_fishing
General cargo,Cargo
Motor boat,Motor_passenger
Dry cargo,Cargo
Trawler,trawlers
Ferry,Motor_passenger
Sloop,Sailing
Liquefied gas carrier,Tanker
Dredger,Non_fishing
Research ship /Survey ship,None
Reefer,Reefer
Barge,Non_fishing
Cargo and passenger,Cargo|Passenger
RoRo ship,Cargo|Passenger
Launch,Non_fishing
Pilot tender,Non_fishing
Chemical carrier,Tanker
Stand-by safety vessel,Non_fishing
Vehicle carrier,Cargo
Cutter,Non_fishing
Drilling unit,Non_fishing
Salvage ship,Non_fishing
Rescue vessel,Non_fishing
Ocean-station vessel,Non_fishing
Patrol ship,Non_fishing
Inspection ship,Non_fishing
Support vessel,Non_fishing
Transport,Non_fishing
Auxiliary ship,Non_fishing
Coaster,Cargo
Ice breaker,Non_fishing
Training ship,None
Pontoon,None
Ore carrier,Cargo
Tramp,Cargo
Coast-guard,Non_fishing
Floating crane,Non_fishing
Platform,Non_fishing
Solvent carrier,Cargo
Lighter,Cargo
Ore-bulk-oil carrier,Cargo|Tanker
Cement carrier,Cargo
Forest-product carrier,Cargo
Warship,Non_fishing
Cruiser,Non_fishing
Ship used by divers,None
Cable ship,Non_fishing
Hydrofoil,Motor_passenger
Factory ship,None
Hydrographic ship,Non_fishing
Buoy ship,Non_fishing
Livestock carrier,Cargo
Mine layer,Non_fishing
Ketch,Sailing
Pollution and surface clearance vessel,Non_fishing
Tunny ship,unknown_fishing
Oceanographic ship,Non_fishing
Floating storage ./ offtake,Non_fishing
Frigate,Sailing
Schooner,Sailing
Barge carrier,Cargo
Minesweeper,Non_fishing
Firefloat,None
Fast patrol ship,Non_fishing
Liner,Motor_passenger
Despatch vessel,Non_fishing
Customs launch,Non_fishing
Air-cushion vehicle,Non_fishing
Fishing guard,None
Banker,None
Escort ship,Non_fishing
Submarine,Non_fishing
Collier,Cargo
Destroyer,Non_fishing
Whaler,unknown_fishing
Rock breaker,Non_fishing
Lightship,Non_fishing
Corvette,Non_fishing
Hospital ship,Non_fishing
Lighthouse tender,Non_fishing
Grain carrier,Cargo
Helicopter carrier,Non_fishing
Fruit carrier,Cargo
Lobster ship,Pots_and_traps'''.replace("Non_fishing","unknown_not_fishing").split('\n')

In [42]:
# Build the lookup from ITU individual classification to our label.
# Each entry is split on its LAST comma only: our labels never contain a
# comma, but an ITU class name can, and splitting on every comma would then
# store a truncated key with a piece of the class name as its label.
category_mapping = dict(entry.rsplit(",", 1) for entry in mapping)

In [43]:
# Attach our label to every row. Classifications that are not a key of
# category_mapping come out as NaN.
df['label'] = df['itu_Individual_classification'].map(category_mapping)

In [46]:
# Eliminate the tonnages that have commas in them, as it is unclear if
# the commas are American or European

def get_rid_commas(x):
    """Parse a tonnage value into a float, or return None when it is unusable.

    Strings containing a comma are rejected because the comma could be either
    a thousands separator or a decimal mark. Values that are already numeric
    are converted as-is, so applying this function to a column a second time
    keeps the numbers instead of discarding them.

    Args:
        x: the raw tonnage, normally a string; may also be None or a number.

    Returns:
        The value as a float, or None if it contains a comma or cannot be
        parsed as a number.
    """
    try:
        has_comma = "," in x
    except TypeError:
        # x is not a string-like container (e.g. None or a number).
        has_comma = False
    if has_comma:
        return None
    try:
        return float(x)
    except (TypeError, ValueError):
        # TypeError: None and other non-numeric objects.
        # ValueError: text that is not a number.
        return None

In [47]:
# Convert the tonnage column to floats in place; values that get_rid_commas
# rejects (for example those containing a comma) become missing.
df['tonnage'] = df['tonnage'].apply(get_rid_commas)

In [48]:
# Save the labelled list. With to_csv's default index=True the DataFrame index
# is written as an unnamed first column.
df.to_csv('../classification-list-sources/itu_2016_processed.csv')