# IMPORTS

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

# PREPROCESSING 3 DATASETS

## Ports
- Fill iso3 missing values first with the iso3_op
- Fill other missing values with unknown; might want to manually fill iso3 missing values left after first step
- Drop the irrelevant columns
    - Drop latitude/longitude since the correct values are in the visits dataset
- Drop the instance with no information
- Recode some port types, statuses and country codes so that the nautical distance can be determined

In [None]:
def prep_ports():
    """Load ``ports.csv`` and return the cleaned ports table.

    Steps:
      * fill missing ``iso3`` codes from ``iso3_op`` and label the remaining
        gaps in the categorical columns as ``'Unknown'``;
      * drop the columns that are not used downstream (latitude/longitude are
        dropped because the coordinates from the visits dataset are used);
      * drop rows without a ``port_index``;
      * recode a few port-type/status labels and country codes.

    Returns:
        pandas.DataFrame with ``port_index`` as a regular column.
    """
    ports = pd.read_csv('ports.csv', delimiter=';', index_col='port_index', encoding='latin-1')
    print('old shape:', ports.shape)

    ports['iso3'] = ports['iso3'].fillna(ports['iso3_op'])
    categorical = ['prttype', 'prtsize', 'status', 'iso3']
    ports[categorical] = ports[categorical].fillna('Unknown')

    # Use `columns=` rather than a positional axis argument: the positional
    # form was deprecated in pandas 1.3 and is a TypeError from pandas 2.0 on.
    unused = ['code', 'maxdepth', 'maxlength', 'annualcapa', 'country',
              'iso3_op', 'latitude', 'longitude']
    ports = ports.drop(columns=unused).reset_index()
    ports = ports[ports['port_index'].notnull()]

    # The mapping is applied to every column of the table.
    ports = ports.replace({'Sea Port': 'Sea',
                           'Restricted': 'Unknown',
                           'Planned': 'Unknown',
                           'COG COD': 'COG',          # Both Congo
                           'MOZ, MWI, ZWE': 'MOZ',    # Mozambique
                           'MOZ, ZWE': 'MOZ',         # Mozambique
                           'ESH': 'MAR',              # Western Sahara --> Morocco
                           'JEY': 'FRA',              # Jersey --> France
                           'IMY': 'TUR'})             # Milyan language? --> Turkey

    # The only port without an iso3 code that does occur in the visits.
    ports.loc[ports['portname'] == 'Duqm', 'iso3'] = 'OMN'

    print('new shape:', ports.shape)
    return ports

## Vessels
- Compute a linear regression for length and depth with the instances that have both length and depth
- Predict and fill the values for the instances that have depth/length missing
- Fill the last 3 instances (which have neither length nor depth) with the mean length and depth
- Drop the predictions
- Categorize length/depth

In [None]:
def _quartile_category(values):
    """Label every value by the quartile of ``values`` it falls in.

    Values below the 25%/50%/75% quantile become 'small'/'medium'/'large';
    everything else (including missing values) becomes 'very large'.
    """
    q25, q50, q75 = values.quantile([0.25, 0.5, 0.75])
    return np.select([values < q25, values < q50, values < q75],
                     ['small', 'medium', 'large'], default='very large')


def prep_vessels():
    """Load ``vessels_subset.csv``, impute length/depth and bin them into quartile categories.

    A missing length is predicted from depth (and vice versa) with a linear
    regression fitted on the vessels that have both measurements. Vessels
    with neither measurement get the mean of the observed values.

    Compared with the earlier implementation, the predictions are aligned on
    the row index instead of being merged back on ``mmsi``, so a repeated
    mmsi no longer multiplies rows, and a file without any complete vessel
    falls back to the means instead of failing in the regression fit.

    Returns:
        pandas.DataFrame with columns mmsi, ship_type, speed, length, depth,
        where length and depth hold the category labels.
    """
    vessels = pd.read_csv('vessels_subset.csv', delimiter=';')
    print('old shape:', vessels.shape)
    # `columns=`: a positional axis argument is rejected by pandas >= 2.0.
    vessels = vessels.drop(columns='imo')

    has_length = vessels['length'].notnull()
    has_depth = vessels['depth'].notnull()
    complete = vessels[has_length & has_depth]

    # Predictions live in their own index-aligned series, so nothing is
    # written into a filtered slice of `vessels`.
    length_pred = pd.Series(np.nan, index=vessels.index)
    depth_pred = pd.Series(np.nan, index=vessels.index)
    if not complete.empty:
        length_model = LinearRegression().fit(complete[['depth']], complete['length'])
        depth_model = LinearRegression().fit(complete[['length']], complete['depth'])
        length_pred[has_depth] = length_model.predict(vessels.loc[has_depth, ['depth']])
        depth_pred[has_length] = depth_model.predict(vessels.loc[has_length, ['length']])

    # Means are taken over the observed values only, before any filling.
    mean_length = vessels['length'].mean()
    mean_depth = vessels['depth'].mean()
    vessels['length'] = vessels['length'].fillna(length_pred).fillna(mean_length)
    vessels['depth'] = vessels['depth'].fillna(depth_pred).fillna(mean_depth)

    # Categorize length/depth
    vessels['length'] = _quartile_category(vessels['length'])
    vessels['depth'] = _quartile_category(vessels['depth'])

    # reorder columns
    vessels = vessels[['mmsi', 'ship_type', 'speed', 'length', 'depth']]

    print('new shape:', vessels.shape)

    return vessels

## Visits
- Drop irrelevant variables
- Only take one instance per entry/exit line; drop the variable
- Add mid features
- Clean up columns
- Combine visits from one port to the same port
- Add target features
- Remove visits according to the steps defined in the report
- Take log of travel duration/stay duration
- Add previous port and the port before that
- Drop vessels with invalid mmsis

In [None]:
def _shift_within_vessel(frame, column, periods):
    """Return ``frame[column]`` shifted by ``periods`` rows, masked to the same vessel.

    A shifted value is kept only where the row it came from has the same
    ``mmsi``; elsewhere the result is missing. This relies on ``frame`` being
    ordered so that the rows of one vessel are contiguous.
    """
    same_vessel = frame['mmsi'].eq(frame['mmsi'].shift(periods))
    return frame[column].shift(periods).where(same_vessel)


def prep_visits():
    """Load ``ais_port_visits.csv`` and build the visit table with target and history features.

    Pipeline:
      1. pair every 'enter' row with the timestamp of the row that follows it
         (used as the exit time) and keep only the 'enter' rows;
      2. attach the iso3 code that ``mids.csv`` lists for the first three
         mmsi digits, and the first mmsi digit as ``mmsi_region``;
      3. merge visits that return to the same port within 12 hours;
      4. add the target features (next port, travel and stay duration);
      5. drop implausible rows (speed, travel duration, extreme stay duration);
      6. log-transform the durations;
      7. re-append the rows without a known next port (the "dss" rows);
      8. add the features of the previous two visits, then drop singular
         visits and mmsis below 201000000.

    Behaviour that differs from the earlier implementation:
      * Same-port returns are merged into the visit that is kept (the later
        one), which receives the earliest entry time of the chain. The old
        code wrote the entry time into the row *before* the dropped one, by
        hard-coded column position.
      * The dss rows are captured after that merge and their
        ``stay_duration`` is log-transformed like every other row.
      * The travel-duration message reports the threshold that is actually
        applied (0.25).

    Returns:
        pandas.DataFrame sorted by mmsi and entry time, with ``mmsi`` and
        ``port_index`` as the first two columns.
    """
    visits = pd.read_csv('ais_port_visits.csv')
    print('old shape:', visits.shape)
    # `columns=` throughout: a positional axis argument is rejected by pandas >= 2.0.
    visits = visits.drop(columns=['type', 'visit_uuid', 'latest_known_port', 'port_name', 'imo'])

    # NOTE(review): assumes the raw file lists each vessel's rows contiguously
    # with the 'exit' row directly after its 'enter' row - confirm.
    visits['exit_datetime'] = _shift_within_vessel(visits, 'datetime', -1)
    visits = visits[visits['action'] == 'enter'].drop(columns='action')
    print('shape after remvoving exit instances:', visits.shape)

    print(len(visits[visits['stay_duration'] == 0]), 'instances were dropped for having a stay duration of 0.')
    visits = visits[visits['stay_duration'] != 0].copy()

    # mid info: mids.csv maps the first three mmsi digits to an iso3 code
    mids = pd.read_csv('mids.csv', header=None, names=['mid', 'iso2', 'mmsi_iso3', 'idk', 'name'])
    mids = mids.drop(columns=['iso2', 'idk', 'name'])

    visits['mid'] = visits['mmsi'].astype(str).str[:3].astype(int)
    visits = pd.merge(visits, mids, how='left', on='mid')
    visits = visits[visits['mmsi_iso3'].notnull()].copy()
    visits['mmsi_region'] = visits['mmsi'].astype(str).str[0].astype('category')

    # clean up columns
    visits = visits[['mmsi', 'mmsi_iso3', 'mmsi_region', 'port_index', 'port_lat', 'port_long',
                     'distance_to_port', 'datetime', 'stay_duration', 'exit_datetime']]
    visits = visits.rename(columns={'datetime': 'entry_datetime'})
    visits['entry_datetime'] = pd.to_datetime(visits['entry_datetime'])
    visits['exit_datetime'] = pd.to_datetime(visits['exit_datetime'])

    # sort visits
    visits = visits.sort_values(['mmsi', 'entry_datetime']).reset_index(drop=True)

    # Combine visits that return to the same port in less than 12 hours.
    # A flagged row is always followed by a row of the same vessel and port;
    # consecutive flagged rows plus the row after them form one chain. The
    # last row of the chain is kept and gets the chain's earliest entry time.
    next_port = _shift_within_vessel(visits, 'port_index', -1)
    next_entry = _shift_within_vessel(visits, 'entry_datetime', -1)
    hours_to_next = (next_entry - visits['exit_datetime']) / pd.Timedelta(hours=1)
    quick_return = (visits['port_index'] == next_port) & (hours_to_next < 12)
    starts_chain = ~quick_return.shift(1, fill_value=False)
    chain_id = starts_chain.cumsum()
    visits['entry_datetime'] = visits.groupby(chain_id)['entry_datetime'].transform('min')
    visits = visits[~quick_return].reset_index(drop=True)
    print(int(quick_return.sum()), 'instances were dropped for returning to the same port in less than 12 hours.')

    # Target features, computed on the merged visits.
    visits['stay_duration'] = (visits['exit_datetime'] - visits['entry_datetime']) / pd.Timedelta(hours=1)
    visits['target_entry_datetime'] = _shift_within_vessel(visits, 'entry_datetime', -1)
    visits['target_travel_duration'] = visits['target_entry_datetime'].sub(visits['exit_datetime'], axis=0) / np.timedelta64(1, 'h')
    visits['target_port_index'] = _shift_within_vessel(visits, 'port_index', -1)
    visits['target_stay_duration'] = _shift_within_vessel(visits, 'stay_duration', -1)

    # save dss instances (no known next port); the filters below would remove
    # them because all their target values are missing
    missings = visits[visits['target_port_index'].isnull()].copy()
    # same scale as the log-transformed stay_duration of the other rows
    missings['stay_duration'] = np.log(missings['stay_duration'])

    # remove visits according to the preprocessing steps described
    pt_ports = pd.pivot_table(visits, values=['port_lat', 'port_long'], index='port_index', aggfunc='mean')
    visits = pd.merge(visits, pt_ports.rename(columns={'port_lat': 'target_port_lat',
                                                       'port_long': 'target_port_long'}),
                      how='left', left_on='target_port_index', right_index=True)
    # straight-line distance in coordinate degrees; the factor 111 below
    # presumably converts degrees to kilometres (approximation) - confirm
    visits['distance'] = np.sqrt((visits['port_lat'] - visits['target_port_lat'])**2 +
                                 (visits['port_long'] - visits['target_port_long'])**2)
    visits['speed'] = (visits['distance']*111) / visits['target_travel_duration']

    print(len(visits[(visits['speed'] > 50)]), 'instances were dropped for having a speed > 50 kmph.')
    # rows with a missing speed (e.g. the dss rows) fail this comparison too
    visits = visits[(visits['speed'] < 50)].reset_index(drop=True)

    indices = visits[(visits['speed'] < 0.1) & (visits['port_index'] != visits['target_port_index'])].index
    print(len(indices), 'instances were dropped for having a speed < 0.1 kmph.')
    visits = visits.drop(indices)

    visits = visits.drop(columns=['speed', 'distance', 'target_port_lat', 'target_port_long'])

    print(len(visits[(visits['target_travel_duration'] > 1250)]), 'instances were dropped for having a travel duration > 1250.')
    visits = visits[(visits['target_travel_duration'] < 1250)]

    print(len(visits[(visits['target_travel_duration'] < 0.25)]), 'instances were dropped for having a travel duration < 0.25.')
    visits = visits[(visits['target_travel_duration'] > 0.25)]

    print(len(visits[(visits['target_stay_duration'] > visits['target_stay_duration'].quantile(0.99))]),
          'instances were dropped for having a stay duration larger than the 99% percentile.')
    visits = visits[(visits['target_stay_duration'] < visits['target_stay_duration'].quantile(0.99))]

    # take log of stay_duration/travel_duration
    visits['stay_duration'] = np.log(visits['stay_duration'])
    visits['target_stay_duration'] = np.log(visits['target_stay_duration'])
    visits['target_travel_duration'] = np.log(visits['target_travel_duration'])

    # add dss instances again
    visits = pd.concat([visits, missings])
    visits = visits.sort_values(['mmsi', 'entry_datetime']).reset_index(drop=True)

    # add previous port features
    for source in ('port_index', 'distance_to_port', 'exit_datetime', 'entry_datetime', 'stay_duration'):
        visits['previous_' + source] = _shift_within_vessel(visits, source, 1)
    visits['previous_travel_duration'] = visits['entry_datetime'].sub(visits['previous_exit_datetime'], axis=0) / np.timedelta64(1, 'h')
    visits['previous_travel_duration'] = np.log(visits['previous_travel_duration'])

    # add prev2 port features
    for source in ('port_index', 'distance_to_port', 'exit_datetime', 'stay_duration'):
        visits['prev2_' + source] = _shift_within_vessel(visits, source, 2)
    visits['prev2_travel_duration'] = visits['previous_entry_datetime'].sub(visits['prev2_exit_datetime'], axis=0) / np.timedelta64(1, 'h')
    visits['prev2_travel_duration'] = np.log(visits['prev2_travel_duration'])

    # drop visits that have neither a previous nor a next port
    singular = visits['previous_port_index'].isnull() & visits['target_port_index'].isnull()
    print(len(visits[singular]), 'instances were removed because they were singular.')
    visits = visits[~singular]

    # only keep mmsis of at least 201000000
    print(len(visits[visits['mmsi'] < 201000000]), 'instances were removed due to invalid mmsi.')
    visits = visits[visits['mmsi'] >= 201000000].sort_values(['mmsi', 'entry_datetime']).set_index(['mmsi', 'port_index']).reset_index()

    print('new shape:', visits.shape)

    return visits

# Load data

In [None]:
# Run the three preprocessing functions; each one prints the table shape
# before and after cleaning, so a label is printed ahead of every call.
print('\nports')
ports = prep_ports()
print('\nvessels')
vessels = prep_vessels()
print('\nvisits')
visits = prep_visits()

In [None]:
# Preview the first rows of the cleaned ports table.
ports.head()

In [None]:
# Preview the first rows of the cleaned vessels table.
vessels.head()

In [None]:
# Preview the core visit columns together with the target columns.
visits[['mmsi', 'port_index', 'port_lat',
       'port_long', 'entry_datetime', 'stay_duration',
       'exit_datetime', 'target_port_index', 'target_entry_datetime',
       'target_travel_duration']].head(10)

In [None]:
# Distribution of the travel duration to the next port (log-transformed in prep_visits).
visits['target_travel_duration'].hist()

In [None]:
# Distribution of the stay duration at the next port (log-transformed in prep_visits).
visits['target_stay_duration'].hist()

# FEATURE ENGINEERING
- Features are created by using the historic visits
- This is done for ports and vessels
- Also the datetime features for the visits were added

In [None]:
def _visit_counts_by(hist_visits, column, renames):
    """Count the visits per port for every level of ``column``.

    The result has one row per ``port_index`` and one column per level,
    renamed through ``renames``. A level listed in ``renames`` that does not
    occur in ``hist_visits`` still gets a column (all zeros), so the output
    schema does not depend on which levels happen to be present.
    """
    counts = pd.pivot_table(hist_visits, index='port_index', columns=column, values='mmsi',
                            aggfunc='count').fillna(0).reset_index().rename(columns=renames)
    for name in renames.values():
        if name not in counts.columns:
            counts[name] = 0.0
    return counts


def historic_port_feats(hist_visits, vessels):
    """Aggregate the historic visits into one feature row per port.

    Args:
        hist_visits: visit table; the columns mmsi, port_index, mmsi_iso3,
            stay_duration, distance_to_port and previous_travel_duration
            are read.
        vessels: vessel table with mmsi, ship_type, speed, length and depth;
            visits of vessels that are not in it count as 'Unknown'.

    Returns:
        pandas.DataFrame with one row per port holding the number of visits,
        the number of distinct vessel origins, the share of visits per
        vessel speed/type/length/depth category, the number of unique
        vessels per visit, and the mean stay duration, distance to port and
        travel duration.
    """
    hist_visits = pd.merge(hist_visits, vessels, how='left', on='mmsi')
    vessel_cols = ['ship_type', 'speed', 'length', 'depth']
    hist_visits[vessel_cols] = hist_visits[vessel_cols].fillna('Unknown')

    port_features = pd.DataFrame(hist_visits['port_index'].unique(), columns=['port_index'])

    # number of visits / unique vessels / unique vessel origins
    totals = (('mmsi', 'count', 'n_visits'),
              ('mmsi', 'nunique', 'n_unique_vessels'),
              ('mmsi_iso3', 'nunique', 'n_unique_vessel_origins'))
    for source, aggfunc, name in totals:
        table = pd.pivot_table(hist_visits, index='port_index', values=source, aggfunc=aggfunc)
        table = table.reset_index().rename(columns={source: name})
        port_features = pd.merge(port_features, table, how='left', on='port_index')

    # number of visits per vessel speed / type / length / depth category
    category_renames = (
        ('speed', {'High': 'n_high_speed',
                   'Medium': 'n_medium_speed',
                   'Unknown': 'n_unknown_speed'}),
        ('ship_type', {'Chemical/Oil Tanker': 'n_Chemical/Oil Tanker',
                       'Container Ship': 'n_Container Ship',
                       'Crude Oil Tanker': 'n_Crude Oil Tanker',
                       'General Cargo Ship': 'n_General Cargo Ship',
                       'Tanker': 'n_Tanker',
                       'Unknown': 'n_Unknown_shiptype'}),
        ('length', {'large': 'n_large_length',
                    'medium': 'n_medium_length',
                    'small': 'n_small_length',
                    'very large': 'n_very large_length',
                    'Unknown': 'n_Unknown_length'}),
        ('depth', {'large': 'n_large_depth',
                   'medium': 'n_medium_depth',
                   'small': 'n_small_depth',
                   'very large': 'n_very large_depth',
                   'Unknown': 'n_Unknown_depth'}),
    )
    for column, renames in category_renames:
        port_features = pd.merge(port_features, _visit_counts_by(hist_visits, column, renames),
                                 how='left', on='port_index')

    # take percentages instead of counts, since counts are highly correlated
    cols = ['n_unique_vessels', 'n_high_speed', 'n_medium_speed',
            'n_Chemical/Oil Tanker', 'n_Container Ship', 'n_Crude Oil Tanker',
            'n_General Cargo Ship', 'n_Tanker', 'n_large_length',
            'n_medium_length', 'n_small_length', 'n_very large_length',
            'n_large_depth', 'n_medium_depth', 'n_small_depth', 'n_very large_depth', 'n_Unknown_shiptype']
    port_features[cols] = port_features[cols].divide(port_features['n_visits'], axis=0)

    # Average stay duration/travel duration/distance to port
    averages = pd.pivot_table(hist_visits, index='port_index',
                              values=['stay_duration', 'distance_to_port', 'previous_travel_duration'],
                              aggfunc='mean').reset_index().rename(
        columns={'stay_duration': 'port_avg_stay_duration',
                 'distance_to_port': 'port_avg_distance_to_port',
                 'previous_travel_duration': 'port_avg_travel_duration'})
    port_features = pd.merge(port_features, averages, how='left', on='port_index')

    # `columns=`: a positional axis argument is rejected by pandas >= 2.0.
    return port_features.drop(columns=['n_unknown_speed', 'n_Unknown_length', 'n_Unknown_depth'])

In [None]:
def historic_vessel_feats(hist_visits, ports):
    """Aggregate the historic visits into one feature row per vessel.

    Args:
        hist_visits: visit table; its own ``port_lat``/``port_long`` columns
            are discarded in favour of the ones in ``ports``.
        ports: port table joined on ``port_index``; it has to provide
            ``port_lat`` and ``port_long``.

    Returns:
        pandas.DataFrame with one row per mmsi: number of visits, number of
        unique ports, mean stay duration / distance to port / travel
        duration, the most visited port, and the mean and standard deviation
        of the visited port coordinates.
    """
    # `columns=`: a positional axis argument is rejected by pandas >= 2.0.
    hist_visits = pd.merge(hist_visits.drop(columns=['port_lat', 'port_long']), ports,
                           how='left', on='port_index')

    vessel_features = pd.DataFrame(hist_visits['mmsi'].unique(), columns=['mmsi'])

    # number of visits / number of unique ports visited
    for aggfunc, name in (('count', 'n_visits'), ('nunique', 'n_unique_ports')):
        table = pd.pivot_table(hist_visits, index='mmsi', values='port_index', aggfunc=aggfunc)
        table = table.reset_index().rename(columns={'port_index': name})
        vessel_features = pd.merge(vessel_features, table, how='left', on='mmsi')

    # average travel/stay duration
    averages = pd.pivot_table(hist_visits, index='mmsi',
                              values=['stay_duration', 'distance_to_port', 'previous_travel_duration'],
                              aggfunc='mean').reset_index().rename(
        columns={'stay_duration': 'vessel_avg_stay_duration',
                 'distance_to_port': 'vessel_avg_distance_to_port',
                 'previous_travel_duration': 'vessel_avg_travel_duration'})
    vessel_features = pd.merge(vessel_features, averages, how='left', on='mmsi')

    # favorite port: the port column with the highest visit count per vessel
    visits_per_port = pd.pivot_table(hist_visits, columns='port_index', values='mmsi_region',
                                     index='mmsi', aggfunc='count')
    favorite = visits_per_port.idxmax(axis=1).rename('vessel_fav_port_index').reset_index()
    vessel_features = pd.merge(vessel_features, favorite, how='left', on='mmsi')

    # average coordinates: flatten the (aggregate, column) header to e.g. 'mean_port_lat'
    pt = pd.pivot_table(hist_visits, index='mmsi', values=['port_lat', 'port_long'],
                        aggfunc=['mean', 'std'])
    pt.columns = ['_'.join((str(j), str(k))) for j, k in pt.columns]
    vessel_features = pd.merge(vessel_features, pt.reset_index(), how='left', on='mmsi')

    return vessel_features

In [None]:
def features_visits(visits):
    """Add calendar features for the current and the previous exit time.

    For ``previous_exit_datetime`` and ``exit_datetime`` the hour, month,
    weekday, quarter and season (``month % 12 // 3 + 1``) are added as
    categorical columns named ``<prefix>_hour`` etc.

    ``visits`` is modified in place and also returned.

    The function used to end with ``visits.drop('previous_exit_datetime', 1)``,
    whose result was discarded (so the column was never removed) and which
    raises a TypeError on pandas >= 2.0. The statement is removed and the
    column is kept, which matches the output this function has always
    produced.
    """
    for prefix in ('previous_exit', 'exit'):
        # parse once per source column instead of once per derived feature
        stamps = pd.to_datetime(visits[prefix + '_datetime'])
        visits[prefix + '_hour'] = stamps.dt.hour.astype('category')
        visits[prefix + '_month'] = stamps.dt.month.astype('category')
        visits[prefix + '_weekday'] = stamps.dt.weekday.astype('category')
        visits[prefix + '_quarter'] = stamps.dt.quarter.astype('category')
        visits[prefix + '_season'] = (stamps.dt.month % 12 // 3 + 1).astype('category')

    return visits

# Split off historic data and DSS data
- First add datetime features to the visit dataset
- Select the oldest data for the historic dataset
- Split the dataset
- Look at some stats of the historic dataset compared to the train/test dataset

In [None]:
# Add the hour/month/weekday/quarter/season columns for both exit timestamps.
visits = features_visits(visits)

In [None]:
# Only visits with a known next port are candidates for the historic set.
split_columns = ['mmsi', 'port_index', 'entry_datetime',
                 'previous_port_index', 'prev2_port_index']
indices = visits.loc[visits['target_port_index'].notnull(), split_columns]

# Visits without a prev2 port always go to the historic set.
# NOTE(review): the original comment read "without a previous port" while the
# code tests prev2_port_index - confirm which one is meant.
lacks_prev2 = indices[indices['prev2_port_index'].isnull()].index.values

# The oldest 50% of the candidates (by entry time) go to the historic set too.
indices = indices.sort_values(['entry_datetime'], ascending=True)
oldest_half = indices.head(int(len(indices)*0.5)).index.values

# Union of both selections, without duplicates.
hist_indices = pd.unique(np.concatenate([lacks_prev2, oldest_half]))

# Share of all visits that ends up in the historic set.
len(hist_indices) / len(visits)

In [None]:
# hist_indices holds index *labels* of `visits`, so both halves of the split
# select by label. The earlier `.iloc` lookup only gave the same rows because
# `visits` happens to carry a default 0..n-1 index.
hist_visits = visits.loc[hist_indices]
train_visits = visits.loc[~visits.index.isin(hist_indices)]

In [None]:
# Size of the historic set.
hist_visits.shape

In [None]:
# Size of the remaining (train/test and dss) set.
train_visits.shape

In [None]:
# Number of distinct ports in the historic set.
hist_visits['port_index'].nunique()

In [None]:
# Number of distinct ports in the remaining set.
train_visits['port_index'].nunique()

In [None]:
# Number of distinct vessels in the historic set.
hist_visits['mmsi'].nunique()

In [None]:
# Number of distinct vessels in the remaining set.
train_visits['mmsi'].nunique()

## Update ports dataset with correct coordinates and new features
- Find correct coordinates from visit dataset
- Add them to ports
- Create new features
- Add new features to ports

In [None]:
# Start again from a freshly cleaned ports table.
ports = prep_ports()

# Mean coordinates per port as recorded in the visits; the right join keeps
# exactly the ports that occur in the visits.
pt_ports = pd.pivot_table(visits, index='port_index',
                          values=['port_lat', 'port_long'], aggfunc='mean')
ports = ports.merge(pt_ports.reset_index(), how='right', on='port_index')

# Attach the features aggregated from the historic visits.
port_features = historic_port_feats(hist_visits, vessels)
ports = ports.merge(port_features, how='left', on='port_index')

ports.head()

## Update vessel dataset with new features
- Add vessel features

In [None]:
# Aggregate the historic visits per vessel; the outer join keeps vessels that
# occur in only one of the two tables.
vessel_features = historic_vessel_feats(hist_visits, ports)

vessels = vessels.merge(vessel_features, how='outer', on='mmsi')
vessels.head()

In [None]:
# Inspect the last rows of the merged vessels table.
vessels.tail()

# Merge datasets
- Merge the datasets
- Also add the port features for the previous port, the port before that, and the vessel's favorite port
- Add seadistances between countries based on the CERDI dataset

In [None]:
# Vessel features + visits + features of the current port. The port
# coordinates are dropped here because the visits already carry them.
# (`columns=`: a positional axis argument is rejected by pandas >= 2.0.)
df = pd.merge(vessels, train_visits, how='right', on='mmsi')
df = pd.merge(df, ports.drop(columns=['port_lat', 'port_long']), how='left', on='port_index')

# Kept for the later cell that restores the column names of `ports`.
og_ports_cols = ports.columns

# Attach the same port features for the previous port, the port before that
# and the vessel's favorite port. `add_prefix` returns a renamed copy, so
# `ports` itself is never left with prefixed column names.
for prefix in ('previous_', 'prev2_', 'vessel_fav_'):
    df = pd.merge(df, ports.add_prefix(prefix), how='left', on=prefix + 'port_index')

In [None]:
# Sea distances between country pairs, indexed by (iso1, iso2).
distances = pd.read_excel('CERDI.xlsx')
distances = distances.set_index(['iso1', 'iso2'])
sea = distances[['seadistance']]

# (new column, origin country column, destination country column)
legs = (('previous_seadistance', 'previous_iso3', 'iso3'),
        ('prev2_cur_seadistance', 'prev2_iso3', 'iso3'),
        ('prev2_prev_seadistance', 'prev2_iso3', 'previous_iso3'))

for feature, origin, destination in legs:
    df = pd.merge(df, sea.rename(columns={'seadistance': feature}),
                  how='left', left_on=[origin, destination], right_index=True)
    # country pairs that are not in the table get a distance of 0
    df[feature] = df[feature].fillna(0)

In [None]:
# Preview the merged modelling table.
df.head()

# Save datasets
- train_visits contains all the information for the models
    - Note that it also contains the visits for the dss (i.e. train_visits[train_visits['target_port_index'].isnull()])
    - We can only predict the stay duration/travel duration if we have predicted a next port
- hist_visits contains the visits based on which the historic port features were computed
- ports contains all the port information
- vessels contains all the vessel information

In [None]:
# Give `ports` its original column names back before it is saved.
ports.columns = og_ports_cols

In [None]:
# Write every prepared table as a gzip-compressed parquet file (without the index).
outputs = ((ports, 'ports_prep.parquet.gzip'),
           (vessels, 'vessels_prep.parquet.gzip'),
           (df, 'train_visits.parquet.gzip'),
           (hist_visits, 'hist_visits.parquet.gzip'))

for frame, target in outputs:
    frame.to_parquet(target, compression='gzip', index=False)

In [None]:
# Final check of the saved modelling table: shape and first rows.
print(df.shape)
df.head()

In [None]:
# Final check of the saved historic visits: shape and first rows.
print(hist_visits.shape)
hist_visits.head()

In [None]:
# Final check of the saved vessels table: shape and first rows.
print(vessels.shape)
vessels.head()

In [None]:
# Final check of the saved ports table: shape and first rows.
print(ports.shape)
ports.head()