# IMPORTS

In [None]:
from pyltr.metrics import NDCG
from lightgbm import LGBMRanker
from sklearn.model_selection import train_test_split as tts
from scipy.special import softmax

import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Load datasets

In [None]:
# Read the three prepared inputs; the ports table is keyed by port_index
# straight away because later cells look ports up through their index.
visits = pd.read_parquet('train_visits.parquet.gzip')
vessels = pd.read_parquet('vessels_prep.parquet.gzip')
ports = pd.read_parquet('ports_prep.parquet.gzip').set_index('port_index')

# Take subsample to reduce memory
- Drop irrelevant columns (e.g. datetimes)
- Add target
- Add option (needed for creating the dataset)
- Create an identifier since a vessel can have multiple visits in the dataset

In [None]:
# Keep only visits whose next port is known, draw 25,000 of them and order the
# sample by arrival time.
df = visits[visits['target_port_index'].notnull()].sample(25000).sort_values('entry_datetime')

# Remove the datetime, duration and port-name columns. `columns=` is used
# because the positional axis argument of DataFrame.drop no longer exists in
# pandas 2.x.
df = df.drop(columns=['entry_datetime', 'target_entry_datetime',
                      'exit_datetime', 'previous_exit_datetime',
                      'target_stay_duration', 'target_travel_duration',
                      'previous_portname', 'previous_entry_datetime',
                      'prev2_exit_datetime'])

# Every row here is an observed visit, i.e. a positive example; option 0 marks
# the true target among the candidates that are generated later.
df['target'] = 1
df['option_id'] = 0

# '<mmsi>_<n>' with n counting the vessel's visits in the (time-sorted) sample,
# so that several visits of one vessel stay distinguishable.
df['identifier'] = df['mmsi'].astype(str) + '_' + df.groupby('mmsi').cumcount().astype('str')
df = df.set_index('identifier')

# Create dataset for ranking
- Get pivots; creates pivot tables with new features
- Create dataset; creates a dataset for ranking purposes. Each instance is expanded to 75 instances each with a different potential target port.
- add features; adds new features (also from the pivots) to the dataset

In [None]:
def get_pivots(port_only=False):
    """Build the pivot tables of historical connections used as features.

    Reads 'ports_prep.parquet.gzip' and 'hist_visits.parquet.gzip' from the
    working directory.

    Parameters
    ----------
    port_only : bool
        When true, return only the port -> target port connection counts.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``port_pt`` alone if ``port_only`` is true, otherwise
        ``(port_pt, iso_pt, prev_pt, prev2_pt)``. ``port_pt`` and ``iso_pt``
        hold connection counts plus mean/std/max/min of the target travel and
        stay durations; ``prev_pt`` and ``prev2_pt`` hold connection counts
        only. Each table is indexed by its (origin, target) key pair.
    """
    duration_columns = ['target_travel_duration', 'target_stay_duration']
    statistics = ['mean', 'std', 'max', 'min']

    port_lookup = pd.read_parquet('ports_prep.parquet.gzip').set_index('port_index')
    history = pd.read_parquet('hist_visits.parquet.gzip')

    # Attach the country code of the current port and of the target port.
    history = pd.merge(history, port_lookup[['iso3']].reset_index(),
                       how='left', on='port_index')
    target_country = port_lookup[['iso3']].rename(columns={'iso3': 'target_iso3'})
    history = pd.merge(history, target_country, how='left',
                       left_on='target_port_index', right_index=True)

    # Constant column that the count pivots aggregate over.
    history['hist_connections'] = 1

    def connection_counts(keys, count_name):
        # Number of historical visits per key pair, stored under `count_name`.
        counts = pd.pivot_table(history, index=list(keys),
                                values='hist_connections', aggfunc='count')
        counts = counts.fillna(0).reset_index()
        counts.rename(columns={'hist_connections': count_name}, inplace=True)
        return counts.set_index(list(keys))

    def duration_statistics(keys, prefix):
        # Travel/stay duration statistics per key pair, flattened to
        # '<prefix><statistic>_<column>' column names.
        stats = pd.pivot_table(history, index=list(keys),
                               values=list(duration_columns),
                               aggfunc=list(statistics)).reset_index()
        stats = stats.set_index(list(keys))
        stats.columns = [prefix + '_'.join((str(j), str(k))) for j, k in stats.columns]
        return stats

    port_keys = ['port_index', 'target_port_index']
    iso_keys = ['iso3', 'target_iso3']

    port_pt = connection_counts(port_keys, 'port_hist_connections')
    if port_only:
        return port_pt

    port_durations = duration_statistics(port_keys, 'ports_')
    iso_pt = connection_counts(iso_keys, 'iso_hist_connections')
    iso_durations = duration_statistics(iso_keys, 'iso3_')

    # Counts and duration statistics side by side, per port pair / country pair.
    port_pt = pd.merge(port_pt, port_durations, left_index=True, right_index=True)
    iso_pt = pd.merge(iso_pt, iso_durations, left_index=True, right_index=True)

    # Connections from the previous and the second-previous port to the target.
    prev_pt = connection_counts(['previous_port_index', 'target_port_index'],
                                'prev_hist_connections')
    prev2_pt = connection_counts(['prev2_port_index', 'target_port_index'],
                                 'prev2_hist_connections')

    return port_pt, iso_pt, prev_pt, prev2_pt

In [None]:
def create_dataset(df, n_regular_connections=10, n_random_ports=40):
    """Expand every visit into a group of candidate target ports for ranking.

    The candidates of a visit are the historically most frequent destinations
    of its current port, a number of randomly drawn ports (taken, with
    replacement, from the module-level ``ports`` table) and the true next
    port. Candidates that duplicate the true target are collapsed into the
    positive row.

    Parameters
    ----------
    df : pandas.DataFrame
        Visits indexed by ``identifier`` with at least ``port_index``,
        ``target_port_index``, ``option_id`` and ``target`` columns.
    n_regular_connections : int
        Maximum number of frequent destinations added per visit.
    n_random_ports : int
        Number of random ports added per visit.

    Returns
    -------
    pandas.DataFrame
        One row per (visit, candidate) with ``identifier`` as a regular
        column, the visit's own features, ``target_port_index`` and
        ``target`` (1 for the true next port, 0 otherwise).
    """
    port_pt = get_pivots(port_only=True)

    # Most frequent historical destinations of each visit's current port.
    regular = pd.merge(df[['port_index']].reset_index(),
                       port_pt[['port_hist_connections']].reset_index(),
                       how='left', on='port_index')
    regular = regular.sort_values(['identifier', 'port_hist_connections'],
                                  ascending=[True, False])
    regular = regular.groupby('identifier').head(n_regular_connections)
    # A port without any history comes out of the left merge with a missing
    # target; such a row is not a usable candidate, so it is removed.
    regular = regular.dropna(subset=['target_port_index'])
    regular = regular.drop(columns=['port_index', 'port_hist_connections'])

    # Random ports as additional candidates.
    random_part = pd.concat([pd.DataFrame(index=df.index)] * n_random_ports)
    random_part['target_port_index'] = np.random.choice(
        ports.index.astype(int).values, len(random_part), replace=True)

    # Combine both candidate sources, number the options per visit and mark
    # all of them as negatives.
    dataset = pd.concat([regular, random_part.reset_index()], sort=False).reset_index(drop=True)
    dataset = dataset.sort_values('identifier')
    dataset['option_id'] = dataset.groupby('identifier').cumcount() + 1
    dataset['target'] = 0
    dataset = dataset.set_index('identifier')

    # Append the true targets, then attach the visit features to every row.
    dataset = pd.concat([dataset, df[dataset.columns]])
    dataset = pd.merge(df.drop(columns=['option_id', 'target', 'target_port_index']),
                       dataset, how='right', left_index=True, right_index=True)

    # The true targets were appended last, so keep='last' lets the positive
    # row win whenever a generated candidate equals the true target.
    dataset = dataset.reset_index()
    dataset = dataset.drop_duplicates(subset=['identifier', 'port_index', 'target_port_index'],
                                      keep='last')

    # option_id was only needed while assembling the candidates.
    dataset = dataset.drop(columns='option_id')

    return dataset

In [None]:
def add_features(dataset):
    """Add port, history, distance, rank and speed features to a candidate set.

    Reads 'ports_prep.parquet.gzip' and 'CERDI.xlsx' from the working
    directory and calls ``get_pivots()`` for the historical pivot tables.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per (visit, candidate target port) with ``identifier`` as a
        regular column, e.g. the output of ``create_dataset``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns appended; the frame passed in is
        not modified.
    """
    port_pt, iso_pt, prev_pt, prev2_pt = get_pivots()
    ports = pd.read_parquet('ports_prep.parquet.gzip').set_index('port_index')
    distances = pd.read_excel('CERDI.xlsx').set_index(['iso1', 'iso2'])[['seadistance']]

    # Features of the candidate target port, prefixed with 'target_'.
    dataset = pd.merge(dataset, ports.add_prefix('target_'), how='left',
                       left_on='target_port_index', right_index=True)

    # Historical port -> target and country -> target-country statistics.
    dataset = pd.merge(dataset, port_pt, how='left',
                       left_on=['port_index', 'target_port_index'], right_index=True)
    dataset = pd.merge(dataset, iso_pt, how='left',
                       left_on=['iso3', 'target_iso3'], right_index=True)
    dataset['port_hist_connections'] = dataset['port_hist_connections'].fillna(0)
    dataset['iso_hist_connections'] = dataset['iso_hist_connections'].fillna(0)

    # Favourite destination of the current port: the target with the highest
    # connection count. The pivot is ordered by its keys, so it has to be
    # sorted by count before the first row per port is taken.
    fav = (port_pt.reset_index()
           .sort_values(['port_index', 'port_hist_connections'], ascending=[True, False])
           .groupby('port_index').head(1))
    fav = fav[['port_index', 'target_port_index', 'port_hist_connections']].rename(
        columns={'target_port_index': 'port_fav_port',
                 'port_hist_connections': 'port_fav_port_count'})
    # Coordinates of that favourite port, looked up by the favourite port's
    # own index (not by the candidate target).
    fav_coordinates = ports[['port_lat', 'port_long']].rename(
        columns={'port_lat': 'port_fav_port_lat', 'port_long': 'port_fav_port_long'})
    fav = pd.merge(fav, fav_coordinates, how='left',
                   left_on='port_fav_port', right_index=True)
    dataset = pd.merge(dataset, fav, how='left', on='port_index')
    dataset['port_fav_port_count'] = dataset['port_fav_port_count'].fillna(0)
    dataset['port_fav_port'] = dataset['port_fav_port'].astype('object').fillna('Unknown').astype('category')

    # Connections from the previous and second-previous port to the target.
    dataset = pd.merge(dataset, prev_pt, how='left',
                       left_on=['previous_port_index', 'target_port_index'], right_index=True)
    dataset = pd.merge(dataset, prev2_pt, how='left',
                       left_on=['prev2_port_index', 'target_port_index'], right_index=True)
    dataset['prev_hist_connections'] = dataset['prev_hist_connections'].fillna(0)
    dataset['prev2_hist_connections'] = dataset['prev2_hist_connections'].fillna(0)

    # Sea distance between countries: current, previous and second-previous
    # country to the target country. Missing pairs become 0.
    dataset = pd.merge(dataset, distances[['seadistance']], how='left',
                       left_on=['iso3', 'target_iso3'], right_index=True)
    dataset['seadistance'] = dataset['seadistance'].fillna(0)

    dataset = pd.merge(dataset,
                       distances[['seadistance']].rename(columns={'seadistance': 'prev_tar_seadistance'}),
                       how='left', left_on=['previous_iso3', 'target_iso3'], right_index=True)
    dataset['prev_tar_seadistance'] = dataset['prev_tar_seadistance'].fillna(0)

    dataset = pd.merge(dataset,
                       distances[['seadistance']].rename(columns={'seadistance': 'prev2_tar_seadistance'}),
                       how='left', left_on=['prev2_iso3', 'target_iso3'], right_index=True)
    dataset['prev2_tar_seadistance'] = dataset['prev2_tar_seadistance'].fillna(0)

    # Euclidean distances in coordinate space between two locations. Each
    # entry is (new column, location A, location B); the coordinates are read
    # from '<location>_lat' and '<location>_long'.
    euclidean_pairs = [
        ('eucl_cur_tar', 'port', 'target_port'),
        ('eucl_cur_prev', 'port', 'previous_port'),
        ('eucl_tar_prev', 'target_port', 'previous_port'),
        ('eucl_ves_tar', 'mean_port', 'target_port'),
        ('eucl_ves_cur', 'mean_port', 'port'),
        ('eucl_ves_prev', 'mean_port', 'previous_port'),
        ('eucl_cur_prev2', 'port', 'prev2_port'),
        ('eucl_tar_prev2', 'target_port', 'prev2_port'),
        ('eucl_prev_prev2', 'previous_port', 'prev2_port'),
        ('eucl_ves_prev2', 'mean_port', 'prev2_port'),
        ('eucl_ves_fav_cur', 'vessel_fav_port', 'port'),
        ('eucl_ves_fav_tar', 'vessel_fav_port', 'target_port'),
        ('eucl_ves_fav_prev', 'vessel_fav_port', 'previous_port'),
        ('eucl_ves_fav_prev2', 'vessel_fav_port', 'prev2_port'),
        ('eucl_ves_fav_ves', 'vessel_fav_port', 'mean_port'),
        ('eucl_port_fav_tar', 'port_fav_port', 'target_port'),
        ('eucl_port_fav_prev', 'port_fav_port', 'previous_port'),
        ('eucl_port_fav_ves', 'port_fav_port', 'mean_port'),
        ('eucl_port_fav_ves_fav', 'port_fav_port', 'vessel_fav_port'),
    ]
    for feature, loc_a, loc_b in euclidean_pairs:
        dataset[feature] = np.sqrt((dataset[f'{loc_a}_lat'] - dataset[f'{loc_b}_lat'])**2 +
                                   (dataset[f'{loc_a}_long'] - dataset[f'{loc_b}_long'])**2)

    # Rank of each candidate within its visit. Connection counts rank from
    # high to low, distances from low to high; ties share the lowest rank.
    rank_specs = [
        ('rank_port_connections', 'port_hist_connections', False),
        ('rank_iso_connections', 'iso_hist_connections', False),
        ('rank_prev_connections', 'prev_hist_connections', False),
        ('rank_prev2_connections', 'prev2_hist_connections', False),
        ('rank_seadistance', 'seadistance', True),
        ('rank_prev_tar_seadistance', 'prev_tar_seadistance', True),
        ('rank_eucl_cur_tar', 'eucl_cur_tar', True),
        ('rank_eucl_tar_prev', 'eucl_tar_prev', True),
        ('rank_eucl_tar_ves_fav', 'eucl_ves_fav_tar', True),
        ('rank_eucl_tar_mean', 'eucl_ves_tar', True),
        ('rank_eucl_port_fav_tar', 'eucl_port_fav_tar', True),
    ]
    for feature, source, ascending in rank_specs:
        dataset[feature] = dataset.groupby('identifier')[source].rank(method='min',
                                                                      ascending=ascending)

    # rename a column
    dataset.rename(columns={'n_visits_y': 'n_visits'}, inplace=True)

    # Absolute differences of port statistics: previous vs current port,
    # current vs target port, and the vessel's favourite vs target port.
    for col in ['port_lat', 'port_long', 'n_visits', 'n_unique_vessels',
                'n_high_speed', 'n_medium_speed', 'n_Chemical/Oil Tanker', 'n_Container Ship',
                'n_Crude Oil Tanker', 'n_General Cargo Ship', 'n_Tanker', 'n_large_length',
                'n_medium_length', 'n_small_length', 'n_very large_length', 'n_large_depth',
                'n_medium_depth', 'n_small_depth', 'n_very large_depth',
                'port_avg_distance_to_port', 'port_avg_travel_duration',
                'port_avg_stay_duration']:
        dataset[f'prev_curr_diff_{col}'] = abs(dataset[col] - dataset[f'previous_{col}'])
        dataset[f'curr_targ_diff_{col}'] = abs(dataset[f'target_{col}'] - dataset[col])
        dataset[f'fav_targ_diff_{col}'] = abs(dataset[f'target_{col}'] - dataset[f'vessel_fav_{col}'])

    # Speeds on the last two legs: distance divided by exp(travel duration).
    # NOTE(review): the durations are exponentiated here, so they are
    # presumably stored as logarithms, and 111 presumably converts degrees to
    # km -- confirm both.
    dataset['prev_speed_coor'] = (dataset['eucl_cur_prev']*111) / np.exp(dataset['previous_travel_duration'])
    dataset['prev_speed_sead'] = dataset['previous_seadistance'] / np.exp(dataset['previous_travel_duration'])
    dataset['prev2_speed_coor'] = (dataset['eucl_prev_prev2']*111) / np.exp(dataset['prev2_travel_duration'])
    dataset['prev2_speed_sead'] = dataset['prev2_prev_seadistance'] / np.exp(dataset['prev2_travel_duration'])

    # Log of (distance to the target / speed on an earlier leg). Distances and
    # speeds can be 0 (missing sea distances are filled with 0 above), in
    # which case these columns hold +/-inf or NaN.
    dataset['exp_trav_prev_coor'] = np.log((dataset['eucl_cur_tar']*111) / dataset['prev_speed_coor'])
    dataset['exp_trav_prev_sead'] = np.log(dataset['seadistance'] / dataset['prev_speed_sead'])
    dataset['exp_trav_prev2_coor'] = np.log((dataset['eucl_cur_tar']*111) / dataset['prev2_speed_coor'])
    dataset['exp_trav_prev2_sead'] = np.log(dataset['seadistance'] / dataset['prev2_speed_sead'])

    return dataset

In [None]:
# Expand every sampled visit into candidates: up to 25 frequent destinations
# of its current port, 50 random ports and the true next port.
dataset = create_dataset(df, n_regular_connections=25, n_random_ports=50)
print(dataset.shape)
dataset.head()

In [None]:
# Attach the engineered features to every (visit, candidate) row.
dataset = add_features(dataset)
print(dataset.shape)
dataset.head()

### Mappings
- Mappings are important to make sure that the DSS prediction uses the same mapping between categories and codes as the training data

In [None]:
# Replace the values of every remaining text column by integer codes stored
# as a categorical, and remember the value -> code lookup per column.
# The identifier, iso3 and portname columns are left untouched.
mappings = {}

text_columns = dataset.columns[dataset.dtypes == 'object']
for column in text_columns:
    if column == 'identifier' or 'iso3' in column or 'portname' in column:
        continue

    print(column)
    # Codes follow the order in which the values first appear in the column.
    colmap = {value: code for code, value in enumerate(dataset[column].unique())}
    dataset[column] = dataset[column].map(colmap).astype('category')
    mappings[column] = colmap

In [None]:
# Store the category lookups; the context manager closes (and thereby
# flushes) the file as soon as the dump is done.
with open('mappings.p', 'wb') as mappings_file:
    pickle.dump(mappings, mappings_file)

# TRAIN TEST SPLIT
- Split the data on time
- There are some columns we do not want to use in the model. For example, using the mmsi prevents the model from being effective for unseen vessels. Also, features with many categories (such as mmsi) could induce overfitting

In [None]:
# Move the visit identifier into the index; the split cell selects rows
# through it.
dataset = dataset.set_index(['identifier'])

In [None]:
# LightGBM reads `group` as the sizes of consecutive row blocks, so all
# candidates of one visit have to be adjacent. The stable sort guarantees that
# without changing the order of the candidates inside a visit.
dataset = dataset.sort_index(kind='mergesort')

srch_ids = np.asarray(dataset.index.get_level_values(0).unique())

# First 60% of the visits for training, next 20% for validation, rest for test.
# NOTE(review): identifiers are '<mmsi>_<n>' strings, so this order follows the
# identifier rather than entry_datetime (which was dropped earlier) -- confirm
# whether a chronological split was intended.
train_cutoff = int(len(srch_ids)*0.6)
val_cutoff = int(len(srch_ids)*0.8)

# Helper column; kept because the column-selection cell drops it by name.
dataset['count'] = 1

# For each split: the rows, and the number of candidates per visit in the
# order in which the visits appear in those rows.
traindf = dataset[dataset.index.get_level_values(0).isin(srch_ids[:train_cutoff])]
trainids = traindf.groupby(level=0, sort=False).size().values

valdf = dataset[dataset.index.get_level_values(0).isin(srch_ids[train_cutoff:val_cutoff])]
valids = valdf.groupby(level=0, sort=False).size().values

testdf = dataset[dataset.index.get_level_values(0).isin(srch_ids[val_cutoff:])]
testids = testdf.groupby(level=0, sort=False).size().values

In [None]:
# Columns the model must not see: the label, the helper count column and
# identifiers / names / country codes.
# NOTE(review): 'previous_port_index' is not listed although the other port
# index columns are -- confirm whether it is meant to stay a feature.
cols_not_train = ['count', 'target', 'mmsi', 'portname', 'target_portname', 'prev2_portname',
                 'mmsi_iso3', 'iso3', 'previous_iso3', 'prev2_iso3', 'target_iso3', 'target_port_index',
                 'port_index', 'prev2_port_index', 'vessel_fav_portname', 'vessel_fav_iso3', 'vessel_fav_port_index',
                 'port_fav_port']

# `columns=` instead of a positional axis, which pandas 2.x no longer accepts.
cols_train = traindf.drop(columns=cols_not_train).columns

# FEATURE SELECTION
- Define a model
- Fit a model
- Select the features

In [None]:
# Ranker used only to score feature importance on the full feature set.
model = LGBMRanker(
    n_estimators=500,
    learning_rate=.05,
    verbose=1,
    min_child_samples=5000,
    lambdarank_truncation_level=1,
    max_depth=7,
    num_leaves=15,
    metric='ndcg',
)

In [None]:
# Fit on all candidate features; NDCG@1 is tracked on the training and the
# validation groups, and fitting stops after 250 rounds without improvement.
model.fit(
    X=traindf[cols_train],
    y=traindf['target'].values,
    group=trainids,
    eval_set=[
        (traindf[cols_train], traindf['target'].values),
        (valdf[cols_train], valdf['target'].values),
    ],
    eval_group=[trainids, valids],
    eval_metric='ndcg',
    eval_at=1,
    verbose=10,
    early_stopping_rounds=250,
)

In [None]:
# Importance per feature, least important first.
importance = pd.DataFrame({'Feature': cols_train,
                           'Importance': model.feature_importances_}).sort_values('Importance')

# The 50 most important features are the inputs of the final model.
new_cols = importance.tail(50)['Feature'].values

# Show the top 25.
importance.tail(25)

# MODEL
- Define a model
- Train a model on the selected features
- Check performance
- Save the used features and the model

In [None]:
# Final ranker: more, shallower trees and a smaller learning rate than the
# feature-selection model.
model = LGBMRanker(
    n_estimators=2500,
    learning_rate=.025,
    verbose=1,
    min_child_samples=25000,
    lambdarank_truncation_level=1,
    max_depth=4,
    num_leaves=10,
    metric='ndcg',
)

In [None]:
# Fit on the selected features only; same evaluation setup as the
# feature-selection fit (NDCG@1 on train and validation, patience of 250).
model.fit(
    X=traindf[new_cols],
    y=traindf['target'].values,
    group=trainids,
    eval_set=[
        (traindf[new_cols], traindf['target'].values),
        (valdf[new_cols], valdf['target'].values),
    ],
    eval_group=[trainids, valids],
    eval_metric='ndcg',
    eval_at=1,
    verbose=10,
    early_stopping_rounds=250,
)

In [None]:
# NDCG@1 on the test candidates, compared with a random ordering.
metric = NDCG(k=1)

prediction = model.predict(testdf[new_cols])

test_query_ids = testdf.index.get_level_values(0).values
test_targets = testdf['target'].values

print ('Random ranking:', metric.calc_mean_random(test_query_ids, test_targets))

print ('Our model:', metric.calc_mean(test_query_ids, test_targets, prediction))

In [None]:
# The 25 most important features of the final model, most important last.
pd.DataFrame({'Feature': new_cols, 'Importance': model.feature_importances_}).sort_values('Importance').tail(25)

In [None]:
# Plot the evaluation metric recorded for the eval sets while fitting.
import lightgbm as lgb
lgb.plot_metric(model)

In [None]:
# Store the selected feature names and the fitted model. The context managers
# close (and thereby flush) each file as soon as its dump is done.
with open('port_cols.p', 'wb') as cols_file:
    pickle.dump(new_cols, cols_file)

with open('port_model.p', 'wb') as model_file:
    pickle.dump(model, model_file)

# Rank complete port dataset
- Since the previous test set only contained 75 ports per instance it is not representative of the actual environment in which it will be used
- Therefore the performance is evaluated on a part of the test set where for each instance all ports are possible targets
- This is the final performance of the model
- Lastly save the predictions to be able to analyze them in the results notebook

In [None]:
# Number of distinct visits in the test split (upper bound for `samples` when
# drawing a test subset without replacement).
testdf.index.get_level_values(0).nunique()

In [None]:
def get_subset(dataset, samples=1750, test=True):
    """Draw random visits and return their original rows plus their true port.

    Uses the module-level ``testdf`` / ``traindf`` to pick identifiers and the
    module-level ``visits`` to decide which columns are kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Candidate set indexed by ``identifier`` with a ``target`` column.
    samples : int
        Number of distinct visits to draw (without replacement).
    test : bool
        Draw from the test split when true, otherwise from the training split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, true_ports)``: ``data`` holds the positive row of every drawn
        visit without ``target_port_index`` and with ``identifier`` as a
        column; ``true_ports`` holds ``identifier`` and the true
        ``target_port_index``.
    """
    source = testdf if test else traindf
    random_ids = np.random.choice(source.index.get_level_values(0).unique(),
                                  samples, replace=False)

    # Keep the columns that come from the visits table, plus every n_visits
    # column that does not describe the target.
    kept_columns = [col for col in dataset.columns
                    if (col in visits.columns) or (('n_visits' in col) and
                                                   ('target' not in col))]

    # Only the positive row of each drawn visit.
    selected = (dataset.index.get_level_values(0).isin(random_ids)) & (dataset['target'] == 1)
    data = dataset[selected][kept_columns]

    true_ports = data.reset_index()[['identifier', 'target_port_index']]

    # `columns=` instead of a positional axis, which pandas 2.x no longer accepts.
    return data.reset_index().drop(columns='target_port_index'), true_ports

In [None]:
def create_test_dataset(data, mappings):
    """Pair every row of ``data`` with every port as a candidate target.

    The ports are taken from the module-level ``ports`` table. Rows in which
    the candidate equals the current port are kept. ``mappings`` is accepted
    but not used.

    Returns a frame with ``len(data) * len(ports)`` rows and an added
    ``target_port_index`` column.
    """
    n_rows = len(data)
    n_ports = len(ports)

    # Each row n_ports times in a row, then the full port list once per row.
    repeated_positions = np.repeat(np.arange(n_rows), n_ports)
    candidate_ports = np.tile(ports.index.astype('int').values, n_rows)

    data = data.iloc[repeated_positions]
    data['target_port_index'] = candidate_ports

    return data

In [None]:
# Draw 1500 test visits; `true_ports` keeps their actual next port.
data, true_ports = get_subset(dataset, samples=1500, test=True)
print(data.shape)
data.head()

In [None]:
# Pair every drawn visit with every port as a candidate target.
data = create_test_dataset(data, mappings)
print(data.shape)
data.head()

In [None]:
# Same feature engineering as for the training candidates.
data = add_features(data)
print(data.shape)
data.head()

In [None]:
# Encode the text columns with the lookups built on the training candidates,
# so a value gets the same code here as it did during training.
for column in data.columns[data.dtypes == 'object']:
    if column == 'identifier' or 'iso3' in column or 'portname' in column:
        continue

    print(column)
    data[column] = data[column].map(mappings[column]).astype('category')

In [None]:
# Score every (visit, port) pair with the final model.
data['prediction'] = model.predict(data[new_cols])

In [None]:
# Per visit, order the candidate ports from highest to lowest score ...
results = data[['identifier', 'port_index', 'target_port_index', 'prediction']]
results = results.sort_values(by=['identifier', 'prediction'], ascending=[True, False])
results = results.reset_index(drop=True)

# ... and number them, so rank 1 is the model's first choice.
results['rank'] = results.groupby('identifier').cumcount() + 1

In [None]:
# Flag the row of each visit that holds its true next port; every other row
# has no match in `true_ports` and ends up with 0 after the fill.
true_ports['target'] = 1
results = results.merge(true_ports, how='left', on=['identifier', 'target_port_index'])
results = results.fillna(0)

In [None]:
# Write the ranked predictions to disk without the row index.
results.to_csv('port_prediction_results.csv', index=False)

# Baseline
- Baseline uses the historic visits as well as the train dataset
- Computes the most frequently occurring connections and predicts those
- Save the predictions to evaluate them in the Results notebook

In [None]:
# Observed port -> next port pairs: the historical visits plus the training
# visits. traindf also contains the generated candidate rows (target == 0);
# only the rows with target == 1 are actual connections.
hist_visits = pd.read_parquet('hist_visits.parquet.gzip')
hist_visits = hist_visits[['port_index', 'target_port_index']]
hist_visits = pd.concat([hist_visits,
                         traindf[traindf['target'] == 1][['port_index', 'target_port_index']]])
hist_visits['connections'] = 1

print(hist_visits.shape)
hist_visits.head()

In [None]:
# Count the connections per (port, next port) pair and list, for every port,
# its destinations from most to least frequent.
pt = (
    pd.pivot_table(hist_visits, index=['port_index', 'target_port_index'],
                   values='connections', aggfunc='count')
    .fillna(0)
    .reset_index()
    .sort_values(['port_index', 'connections'], ascending=[True, False])
)

print(pt.shape)
pt.head()

In [None]:
# True (port, next port) pair of the test visits, one visit per vessel.
testbase = testdf[testdf['target'] == 1][['mmsi', 'port_index', 'target_port_index', 'target']].reset_index()
# `columns=` instead of a positional axis, which pandas 2.x no longer accepts.
testbase = testbase.drop_duplicates('mmsi').drop(columns='mmsi')

# Baseline candidates: every known destination of the visit's current port,
# already ordered from most to least frequent.
pt = pd.merge(testbase[['identifier', 'port_index']], pt, how='left', on='port_index')

# Mark the candidate that matches the true next port.
testbase = pd.merge(pt, testbase,
         how='left', on=['identifier', 'port_index', 'target_port_index'])

testbase['target'] = testbase['target'].fillna(0)
# Position of the candidate within its visit; 1 is the most frequent destination.
testbase['rank'] = testbase.groupby('identifier').cumcount()+1

In [None]:
# Preview the ranked baseline candidates.
testbase.head()

In [None]:
# Write the baseline ranking to disk without the row index.
testbase.to_csv('BASELINE_port_prediction_results.csv', index=False)