# short_notebook_2
## By Jasper Steinberg  
### Team [200] - 紅燒肉

In [45]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [46]:
#Read the raw AIS files. The training file is pipe-separated; the test file
#is read with read_csv's default separator.
train = pd.read_csv('ais_train.csv', sep='|')
test = pd.read_csv('ais_test.csv')

#Parse 'time' as datetime, then order each frame by vessel and chronologically
#with a fresh 0..n-1 index.
train = (
    train
    .assign(time=pd.to_datetime(train['time']))
    .sort_values(by=['vesselId', 'time'])
    .reset_index(drop=True)
)
test = (
    test
    .assign(time=pd.to_datetime(test['time']))
    .sort_values(by=['vesselId', 'time'])
    .reset_index(drop=True)
)

Basic preprocessing and feature engineering

In [47]:
#Columns that are not used as features further down.
unused_columns = ['rot', 'heading', 'etaRaw', 'portId']

#Drop without inplace, and only what is still present, so that re-running this
#cell on an already-cleaned frame does not raise a KeyError.
train = train.drop(columns=[col for col in unused_columns if col in train.columns])

#Treat placeholder codes as missing: cog in [360, 409.5] and sog == 102.3.
#NOTE(review): presumably the AIS "not available"/invalid codes - confirm
#against the dataset description.
train['cog'] = train['cog'].mask(train['cog'].between(360, 409.5))
train['sog'] = train['sog'].replace(102.3, np.nan)

#Split the speed into a north (latitude) and an east (longitude) component.
#Course over ground is a compass bearing, i.e. degrees clockwise from true
#north, so the north component is sog*cos(cog) and the east component is
#sog*sin(cog). The original cell had sin and cos swapped.
cog_rad = np.deg2rad(train['cog'])
train['v_lat'] = train['sog'] * np.cos(cog_rad)
train['v_lon'] = train['sog'] * np.sin(cog_rad)

Cleaning of teleporting vessels

In [49]:
#Time since the previous fix of the same vessel, in hours
#(NaN for the first fix of every vessel).
train['time_delta'] = train.groupby('vesselId')['time'].diff().dt.total_seconds() / 3600

#Previous position of the same vessel, taken with a per-vessel shift.
#This replaces the positional lookup `train.iloc[row.name - 1]`, which used an
#index *label* as a *position* and was therefore only correct while `train`
#had a fresh 0..n-1 index (e.g. not when this cell is re-run after filtering).
prev_position = train.groupby('vesselId')[['latitude', 'longitude']].shift(1)
train['prev_latitude'] = prev_position['latitude']
train['prev_longitude'] = prev_position['longitude']

#Calculate the distance between consecutive positions
def calculate_distance(row):
    """Geodesic distance in km between a fix and the vessel's previous fix.

    `row` must provide 'time_delta', 'latitude', 'longitude', 'prev_latitude'
    and 'prev_longitude'. Returns NaN when there is no previous fix for the
    vessel (missing time delta or missing previous position).
    """
    if pd.isna(row['time_delta']) or pd.isna(row['prev_latitude']) or pd.isna(row['prev_longitude']):
        return np.nan  #First entry for each vessel
    prev_coords = (row['prev_latitude'], row['prev_longitude'])
    curr_coords = (row['latitude'], row['longitude'])
    return geodesic(prev_coords, curr_coords).kilometers

#Only hand the columns the helper reads to apply(); the rows are independent.
distance_inputs = ['time_delta', 'latitude', 'longitude', 'prev_latitude', 'prev_longitude']
train['distance_km'] = train[distance_inputs].apply(calculate_distance, axis=1)

#Calculate speed in km/h
train['speed_kmh'] = train['distance_km'] / train['time_delta']

# Define a realistic maximum speed for vessels (e.g., 50 km/h)
MAX_SPEED = 50

#Identify rows with unrealistic speeds (NaN speeds compare as False and are kept)
train['teleport'] = train['speed_kmh'] > MAX_SPEED

#Count the total number of teleporting instances
teleport_count = train['teleport'].sum()
print(f'Total number of teleporting instances: {teleport_count}')

#Remove teleporting entries and the auxiliary columns in one step; building a
#new frame (instead of inplace drops) keeps the cell safe to re-run.
auxiliary_columns = [
    'time_delta', 'prev_latitude', 'prev_longitude',
    'distance_km', 'speed_kmh', 'teleport',
]
train = train.loc[~train['teleport']].drop(columns=auxiliary_columns)

Total number of teleporting instances: 471


In [50]:
#Length of one window: 5 days, expressed in seconds
window_duration_seconds = 5 * 24 * 60 * 60

#Seconds elapsed since the earliest timestamp of each vessel
first_seen = train.groupby('vesselId')['time'].transform('min')
elapsed_seconds = (train['time'] - first_seen).dt.total_seconds()

#Index of the 5-day window a row falls into, counted from the vessel's first
#observation (window 0). Only 'window_id' is added to the frame.
train['window_id'] = (elapsed_seconds // window_duration_seconds).astype(int)


In [51]:
#Work on a sorted copy: by vessel, then window, then time
train_new = (
    train
    .sort_values(by=['vesselId', 'window_id', 'time'])
    .reset_index(drop=True)
    .copy()
)

#One row per (vessel, window) holding the window's last values.
#NOTE: GroupBy.last() returns the last *non-null* value of each column, so
#v_lat / v_lon may come from an earlier row than time / latitude / longitude
#when the final row of a window has a missing velocity.
last_obs_per_window = (
    train_new
    .groupby(['vesselId', 'window_id'])
    .last()
    .reset_index()
)

#Restrict to the keys plus the columns that become "previous window" features
last_obs_columns = [
    'vesselId', 'window_id', 'time', 'latitude', 'longitude',
    'v_lat', 'v_lon',
]

#Give the value columns distinct names so they do not clash with the columns
#of 'train_new' when merged back
last_obs = last_obs_per_window[last_obs_columns].rename(columns={
    'time': 'prev_window_time_last',
    'latitude': 'prev_window_last_latitude',
    'longitude': 'prev_window_last_longitude',
    'v_lat': 'prev_window_last_v_lat',
    'v_lon': 'prev_window_last_v_lon',
})

#Columns holding a window's last observation; these are the ones moved to the
#following window
shift_columns = [
    'prev_window_last_latitude',
    'prev_window_last_longitude',
    'prev_window_time_last',
    'prev_window_last_v_lat',
    'prev_window_last_v_lon',
]

def shift_last_observations(df, shift_columns = shift_columns):
    """Move the values in `shift_columns` down by one window.

    df: frame with a 'window_id' column and the columns in `shift_columns`.
    shift_columns: names of the columns to shift.

    Returns a new frame ordered by 'window_id' with a fresh 0..n-1 index in
    which every row carries the `shift_columns` values of the row before it;
    the first row gets missing values. The input frame is not modified.
    """
    ordered = df.sort_values('window_id').reset_index(drop=True)
    #Row i receives the values of row i-1, i.e. of the preceding window present
    previous_values = ordered[shift_columns].shift(1)
    ordered[shift_columns] = previous_values
    return ordered

#Apply the shift per vesselId.
#The groups are iterated explicitly instead of going through groupby.apply:
#apply() operating on the grouping column is deprecated since pandas 2.2, and
#once the grouping column is excluded 'vesselId' would be missing from the
#result and the column selection below would fail. Iterating yields complete
#sub-frames (including 'vesselId') in vesselId order on every pandas version.
last_obs_shifted = pd.concat(
    [shift_last_observations(vessel_obs) for _, vessel_obs in last_obs.groupby('vesselId')],
    ignore_index=True,
)

#Keep only the columns needed to merge
merge_columns = ['vesselId', 'window_id'] + shift_columns
last_obs_shifted = last_obs_shifted[merge_columns]


#Merge back into the original DataFrame: every row of a window receives the
#last observation of the vessel's preceding window (missing for its first
#window). validate guards against duplicated (vesselId, window_id) keys, which
#would silently multiply training rows.
train_new = pd.merge(
    train_new,
    last_obs_shifted,
    on=['vesselId', 'window_id'],
    how='left',
    validate='many_to_one'
)

#Compute 'time_diff': seconds between a row and the previous window's last observation
train_new['time_diff'] = (train_new['time'] - train_new['prev_window_time_last']).dt.total_seconds()


In [52]:
#Coordinates the models predict (one model per target)
targets = ['latitude', 'longitude']

#Model inputs: elapsed time plus the reference observation's position and velocity
features = [
    'time_diff',                   #seconds since the reference observation
    'prev_window_last_latitude',   #reference position
    'prev_window_last_longitude',
    'prev_window_last_v_lat',      #reference velocity components
    'prev_window_last_v_lon',
]



### Training the model on all the data

In [53]:
#Formatting of training sets: keep only rows where every feature and both
#targets are present
required_columns = features + targets
training_set = train_new.dropna(subset=required_columns)

#Shared feature matrix and one label vector per coordinate
X_global = training_set[features]
y_global_lat, y_global_lon = training_set['latitude'], training_set['longitude']

In [54]:
#One DMatrix per target: identical features, different label vector
#(latitude first, longitude second)
global_data_lat, global_data_lon = (
    xgb.DMatrix(X_global, label=labels)
    for labels in (y_global_lat, y_global_lon)
)

In [55]:
#Model parameters shared by both regressors; fairly standard choices, no tuning
params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    learning_rate=0.1,
    max_depth=6,
    alpha=10,
)

In [56]:
#Fit the latitude regressor on the full training set for a fixed 1000 rounds
lat_model = xgb.train(
    params=params,
    dtrain=global_data_lat,
    num_boost_round=1000,
    verbose_eval=False,
)

In [57]:
# Fit the longitude regressor with the same parameters and number of rounds
lon_model = xgb.train(
    params=params,
    dtrain=global_data_lon,
    num_boost_round=1000,
    verbose_eval=False,
)

### Test predictions
We prepare for prediction by merging each vessel's last known observation from the training set (position, velocity components and timestamp) into the test set.

In [58]:
#Order chronologically within each vessel so that "last" means "most recent"
train_new = train_new.sort_values(by=['vesselId', 'time'])

#Columns taken over from the last known data of each vessel
last_known_features = [
    'vesselId',
    'latitude',
    'longitude',
    'v_lat',
    'v_lon',
    'time'
]

#One row per vessel with its most recent values, restricted to the columns above.
#NOTE: GroupBy.last() returns the last *non-null* value of each column, so
#v_lat / v_lon may stem from an earlier row than time / latitude / longitude.
last_known_data = (
    train_new
    .groupby('vesselId')
    .last()
    .reset_index()
    [last_known_features]
)


In [59]:
#Attach every vessel's last known training observation to its test rows.
#Left join keeps all test rows; columns that exist in both frames keep their
#name for the test side and get a '_prev' suffix for the training side.
test_set = pd.merge(
    test,
    last_known_data,
    on='vesselId',
    how='left',
    suffixes=('', '_prev'),
)


In [60]:
#Map the merged columns onto the feature names the models were trained with
feature_names = {
    'latitude': 'prev_window_last_latitude',
    'longitude': 'prev_window_last_longitude',
    'v_lat': 'prev_window_last_v_lat',
    'v_lon': 'prev_window_last_v_lon',
    'time_prev': 'prev_time'
}
test_set = test_set.rename(columns=feature_names)

In [61]:
#'time_diff': seconds between the prediction time and the vessel's last known observation
elapsed_since_last_seen = test_set['time'] - test_set['prev_time']
test_set['time_diff'] = elapsed_since_last_seen.dt.total_seconds()

In [62]:
#Wrap the test features in a DMatrix, using the same columns and order as in training
test_data = xgb.DMatrix(test_set[features])

#Add one prediction column per coordinate
test_set = test_set.assign(
    latitude_predicted=lat_model.predict(test_data),
    longitude_predicted=lon_model.predict(test_data),
)


In [63]:
#Restore the original ID order and keep only the submission columns
submission_columns = ['ID', 'longitude_predicted', 'latitude_predicted']
test_fin = test_set.sort_values(by='ID')[submission_columns].copy()

In [64]:
#Preview the first rows of the submission frame
test_fin.head(n=5)

Unnamed: 0,ID,longitude_predicted,latitude_predicted
5045,0,-80.759438,31.173223
44194,1,115.983269,15.767951
41210,2,10.772911,38.086067
32880,3,173.339096,-41.010788
0,4,6.005428,48.190571


In [None]:
#Write the submission frame to disk without the DataFrame index
submission_path = 'final_XGBoost.csv'
test_fin.to_csv(submission_path, index=False)