In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
from datetime import datetime
from math import cos, asin, sqrt, isnan, radians, sin
import utm
from xgboost import XGBClassifier

In [2]:
# Raw inputs, all read from the notebook's working directory.
ports = pd.read_csv("ports.csv")

# Tracking records: exact duplicate rows are dropped, then rows are ordered
# by their 'datetime' column (ascending).
tracking = (
    pd.read_csv("tracking.csv")
    .drop_duplicates()
    .sort_values(by='datetime', ascending=True)
)

voyages = pd.read_csv("voyages.csv")

In [3]:
# Vessels whose voyage history contains fewer than three distinct end ports.
# Vessels are visited in order of first appearance in `voyages`.
dual_pattern = [
    vessel
    for vessel in voyages['vessel'].drop_duplicates().values
    if voyages.loc[voyages.vessel == vessel, 'end_port_id'].nunique(dropna=False) < 3
]

In [4]:
# Classify vessels whose most frequent end port accounts for about half of
# their voyages:
#   first_pattern  - the latest voyage ended at that most frequent port
#   second_pattern - the latest voyage ended somewhere else
first_pattern = []
second_pattern = []

for vessel in voyages.drop_duplicates(subset=['vessel'])['vessel'].values:
    voyages_vessel = voyages[voyages.vessel == vessel]

    # Count end ports once; value_counts() is sorted by descending frequency.
    end_port_counts = voyages_vessel['end_port_id'].value_counts()
    most_common = end_port_counts.index[0]
    most_common_count = end_port_counts.iloc[0]
    latest_port = voyages_vessel.iloc[-1]['end_port_id']

    n_voyages = len(voyages_vessel)
    if n_voyages % 2 == 0:
        half_of_voyages = most_common_count == n_voyages / 2
    else:
        # Floor division is required here: with true division the left-hand
        # side always ends in .5 for an odd count, so the comparison with 1
        # could never succeed and odd-length histories were never classified.
        half_of_voyages = abs(n_voyages // 2 - most_common_count) == 1

    if half_of_voyages:
        if most_common == latest_port:
            first_pattern.append(vessel)
        else:
            second_pattern.append(vessel)
    

In [5]:
# Build the per-voyage feature table: for every voyage, the UTM coordinates of
# its begin port and of the two preceding begin ports of the same vessel.

# Coordinate overrides applied on top of ports.csv, keyed by port id.
# NOTE(review): presumably the CSV coordinates for these ports are wrong - confirm.
PORT_COORDINATE_FIXES = {
    142: (30.600600, 122.104500),
    76: (21.903937, 113.216557),
}
UTM_COLUMNS = ['E', 'N', 'Zone', 'Hemi']

ports_temp = pd.read_csv("ports.csv")
for port_id, (fixed_lat, fixed_long) in PORT_COORDINATE_FIXES.items():
    is_port = ports_temp.port == port_id
    ports_temp.loc[is_port, 'lat'] = fixed_lat
    ports_temp.loc[is_port, 'long'] = fixed_long

voyages = pd.read_csv("voyages.csv")

# Add the previous begin port (lag 1), then the one before it (lag 2). Rows
# without a value are dropped after each step, so every remaining voyage has
# both. `.copy()` detaches the filtered frame so the assignments that follow
# do not write into a slice of the original (SettingWithCopyWarning).
for source_column, lagged_column in [
    ('begin_port_id', 'previous_begin_port_id_1'),
    ('previous_begin_port_id_1', 'previous_begin_port_id_2'),
]:
    voyages[lagged_column] = voyages.groupby(['vessel'])[source_column].shift(1)
    voyages = voyages[voyages[lagged_column].notna()].copy()
    voyages[lagged_column] = voyages[lagged_column].astype(int)

# Convert every port to UTM exactly once; from_latlon returns
# (easting, northing, zone number, zone letter).
utm_coords = [
    utm.from_latlon(lat, long)
    for lat, long in zip(ports_temp['lat'], ports_temp['long'])
]
ports_temp['E'] = [coords[0] for coords in utm_coords]
ports_temp['N'] = [coords[1] for coords in utm_coords]
ports_temp['Zone'] = [coords[2] for coords in utm_coords]
# Zone letter encoded as its 1-based position in the alphabet (a -> 1).
ports_temp['Hemi'] = [ord(coords[3].lower()) - 96 for coords in utm_coords]

# Index the UTM columns by port id. The first row wins when a port id is
# repeated, and .loc raises KeyError for a port id missing from the table.
utm_by_port = ports_temp.drop_duplicates(subset='port').set_index('port')[UTM_COLUMNS]

port_columns = ['begin_port_id', 'previous_begin_port_id_1', 'previous_begin_port_id_2']
for suffix, port_column in enumerate(port_columns, start=1):
    port_ids = voyages[port_column].to_numpy()
    for name in UTM_COLUMNS:
        voyages[f'{name}_{suffix}'] = utm_by_port[name].loc[port_ids].to_numpy()

# Day of the year (1-366) on which the voyage began.
voyages['day'] = [
    datetime.strptime(begin, '%Y-%m-%d %H:%M:%S').timetuple().tm_yday
    for begin in voyages['begin_date']
]

In [6]:
# Training data: every voyage of a vessel that is NOT in `dual_pattern`.
is_dual_vessel = voyages['vessel'].isin(dual_pattern)
train_set_1 = voyages.loc[~is_dual_vessel].copy()

# Features: UTM description of the begin port (_1) and of the two preceding
# begin ports (_2, _3). Target: the port where the voyage ended.
X_1 = train_set_1[[
    'E_1', 'N_1', 'Zone_1', 'Hemi_1',
    'E_2', 'N_2', 'Zone_2', 'Hemi_2',
    'E_3', 'N_3', 'Zone_3', 'Hemi_3',
]]
y_1 = train_set_1['end_port_id']

In [7]:
# Cross-validated fit of the XGBoost classifier on the single training set.
# X and y are kept as one-element lists; only index 0 is used below.
X = [X_1]
y = [y_1]

# Hyper-parameter grid (a single candidate).
eta = [0.01]
max_depth = [7]
hyperF = dict(eta=eta, max_depth=max_depth)

# Shuffled folds need a fixed seed, otherwise the reported CV score changes
# on every run of the notebook.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gridF = GridSearchCV(
    XGBClassifier(eval_metric='mlogloss'),
    hyperF,
    cv=cv_folds,
    verbose=1,
    n_jobs=-1,
)
# GridSearchCV.fit returns the fitted search object itself, so `model` and
# `gridF` refer to the same object.
model = gridF.fit(X[0], y[0])
print(model.best_score_)
print(model.best_params_)



Fitting 5 folds for each of 1 candidates, totalling 5 fits




0.40846271423069513
{'eta': 0.01, 'max_depth': 7}


In [9]:
# Score on the training data itself (not a held-out estimate); printed so the
# computed value is not silently discarded.
train_accuracy = model.score(X_1, y_1)
print(train_accuracy)

# Persist the fitted search object. The context manager guarantees the file
# handle is flushed and closed even if pickling fails.
with open('model_xgb.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [23]:
def next_most_common(port, most_common, second_most_common):
    """Pick the counterpart of `port` among a vessel's two favourite ports.

    Returns `second_most_common` when `port` equals `most_common`; in every
    other case (including when `port` is neither of the two) returns
    `most_common`.
    """
    return second_most_common if port == most_common else most_common
    
def distance(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points given in degrees.

    The result is scaled by 12742 (presumably Earth's diameter in km, which
    would make the return value kilometres).
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    latitude_term = cos((lat2 - lat1) * deg_to_rad) / 2
    longitude_term = (
        cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad)
        * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    )
    hav = 0.5 - latitude_term + longitude_term
    return 12742 * asin(sqrt(hav))

def closest_port(data, v, ports):
    """Find the port nearest to position `v`.

    data  -- iterable of mappings with 'lat' and 'long' entries (candidates)
    v     -- mapping with 'lat' and 'long' entries (the query position)
    ports -- DataFrame with 'lat', 'long' and 'port' columns

    Returns a tuple (port id, distance), where the port id comes from the
    first row of `ports` whose coordinates equal those of the nearest
    candidate exactly.
    """
    def distance_from_v(candidate):
        return distance(v['lat'], v['long'], candidate['lat'], candidate['long'])

    nearest = min(data, key=distance_from_v)
    dist = distance(nearest['lat'], nearest['long'], v['lat'], v['long'])

    same_coordinates = (ports.lat == nearest['lat']) & (ports.long == nearest['long'])
    return (ports[same_coordinates]['port'].values[0], dist)

def get_utm(port):
    """Return (E, N, Zone, Hemi) for `port` from the global `ports_temp` table.

    Values come from the first row whose `port` column equals `port`; an
    IndexError is raised when no row matches.
    """
    matching_rows = ports_temp.loc[ports_temp.port == port]
    return tuple(
        matching_rows[column].values[0]
        for column in ('E', 'N', 'Zone', 'Hemi')
    )

def model_non_duplicate(model, voyage_compare, E_1, N_1, Zone_1, Hemi_1, E_2, N_2, Zone_2, Hemi_2, E_3, N_3, Zone_3, Hemi_3):
    """Predict a class with `model`, skipping `voyage_compare`.

    The twelve feature values are passed to `model.predict_proba` as a single
    row. The most probable class is returned unless it equals
    `voyage_compare`, in which case the runner-up class is returned instead.
    """
    feature_row = np.array([
        E_1, N_1, Zone_1, Hemi_1,
        E_2, N_2, Zone_2, Hemi_2,
        E_3, N_3, Zone_3, Hemi_3,
    ]).reshape(1, -1)

    class_probabilities = model.predict_proba(feature_row)[0]
    # Class positions ordered from most to least probable.
    ranking = np.argsort(-class_probabilities)

    best_class = model.classes_[ranking[0]]
    if best_class != voyage_compare:
        return best_class
    return model.classes_[ranking[1]]

# One {'lat': ..., 'long': ...} record per row of `ports`.
ports_dict = ports.loc[:, ['lat', 'long']].to_dict(orient='records')

In [63]:
# Predict the next three voyages of every vessel.
#   * Vessels in `dual_pattern` simply alternate between their two most
#     frequent end ports.
#   * All other vessels use the trained classifier, optionally mixed with the
#     alternation rule when the vessel is in `first_pattern` / `second_pattern`.
# Relies on the globals `voyages` (feature table), `dual_pattern`,
# `first_pattern`, `second_pattern`, `model`, `get_utm`, `next_most_common`
# and `model_non_duplicate`.


def _top_two_end_ports(vessel_voyages):
    """Return (most common, second most common) end port of one vessel."""
    ranked = vessel_voyages['end_port_id'].value_counts().nlargest(2).keys()
    if len(ranked) < 2:
        # Only one destination was ever observed, so there is no runner-up
        # (indexing it would raise IndexError). Alternate with the departure
        # port of the latest voyage instead.
        return ranked[0], vessel_voyages.iloc[-1]['begin_port_id']
    return ranked[0], ranked[1]


def _predict_with_model(current_port, previous_port, before_previous_port):
    """Classifier prediction for the port after `current_port` (never `current_port` itself)."""
    return model_non_duplicate(
        model,
        current_port,
        *get_utm(current_port),
        *get_utm(previous_port),
        *get_utm(before_previous_port),
    )


def _append_voyages(records, vessel, start_port, destinations):
    """Append one record per predicted voyage, chaining each end port into the next begin port."""
    origin = start_port
    for voyage_number, destination in enumerate(destinations, start=1):
        records.append({'vessel': vessel, 'begin_port_id': origin, 'end_port_id': destination, 'voyage': voyage_number})
        origin = destination


predict_list = []
voyages_new = pd.read_csv('voyages.csv')
dual_vessels = set(dual_pattern)  # constant-time membership test

for vessel in dual_pattern:
    voyages_vessel = voyages_new[voyages_new.vessel == vessel]
    port = voyages_vessel.iloc[-1]['end_port_id']

    most_common, second_most_common = _top_two_end_ports(voyages_vessel)

    voyage_1 = next_most_common(port, most_common, second_most_common)
    voyage_2 = next_most_common(voyage_1, most_common, second_most_common)
    voyage_3 = next_most_common(voyage_2, most_common, second_most_common)

    _append_voyages(predict_list, vessel, port, [voyage_1, voyage_2, voyage_3])

for vessel in voyages['vessel'].unique():
    if vessel in dual_vessels:
        continue

    # Latest voyage with lag features (from `voyages`); frequency ranking from
    # the unfiltered history (`voyages_new`).
    voyages_vessel = voyages[voyages.vessel == vessel]
    most_common, second_most_common = _top_two_end_ports(voyages_new[voyages_new.vessel == vessel])

    latest = voyages_vessel.iloc[-1]
    port_1 = latest['end_port_id']
    port_2 = latest['begin_port_id']
    port_3 = latest['previous_begin_port_id_1']

    if vessel in first_pattern:
        # model -> alternation -> model
        voyage_1 = _predict_with_model(port_1, port_2, port_3)
        voyage_2 = next_most_common(voyage_1, most_common, second_most_common)
        voyage_3 = _predict_with_model(voyage_2, voyage_1, port_1)
    elif vessel in second_pattern:
        # alternation -> model -> alternation
        voyage_1 = next_most_common(port_1, most_common, second_most_common)
        voyage_2 = _predict_with_model(voyage_1, port_1, port_2)
        voyage_3 = next_most_common(voyage_2, most_common, second_most_common)
    else:
        # model for all three steps, feeding each prediction into the next
        voyage_1 = _predict_with_model(port_1, port_2, port_3)
        voyage_2 = _predict_with_model(voyage_1, port_1, port_2)
        voyage_3 = _predict_with_model(voyage_2, voyage_1, port_1)

    # Sanity checks: a predicted voyage should never end where it begins.
    if voyage_1 == port_1:
        print("Error1")

    if voyage_1 == voyage_2:
        print("Error2")

    if voyage_2 == voyage_3:
        print("Error3")

    _append_voyages(predict_list, vessel, port_1, [voyage_1, voyage_2, voyage_3])

predict = pd.DataFrame(predict_list)

In [64]:
# Order the predictions by vessel, then by voyage number within each vessel.
predict = predict.sort_values(by=['vessel', 'voyage'], ascending=True)

In [62]:
# Write the predictions to disk without the DataFrame index column.
predict.to_csv(path_or_buf='predict.csv', index=False)