In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import math
import os
import utm
import time
from collections import deque

## Pre-processing Implementation

In [2]:
""" Pre-Processing """
class preprocesser:
    """Clean a raw vessel-position table and project it onto planar x/y coordinates.

    The input frame must provide the columns ``MMSI``, ``LAT``, ``LON``,
    ``BaseDateTime``, ``VesselType`` and ``Heading``.  After ``preprocess()``
    the result is available as ``self.df`` with exactly the columns
    ``['MMSI', 'x', 'y', 'time', 'VesselType', 'Heading']``.

    Config keys read here:
        ScaleDown  -- divisor applied to the projected x/y coordinates.
        FilterMap  -- ``[right_x, left_x, top_y, bot_y]`` bounding box (exclusive),
                      expressed in the scaled coordinates.
        VesselType -- keep only rows with this vessel type; ``None`` keeps all.
    """

    def __init__(self, df, config):
        self.df = df
        self.scaleDown = config['ScaleDown']
        self.filterMap = config['FilterMap']
        self.vesselType = config['VesselType']

    def preprocess(self):
        """Run the whole pipeline and leave the cleaned table in ``self.df``."""
        # Strict bounds keep every point inside the latitude/longitude range
        # that the coordinate conversion is asked to handle.
        in_range = (
            (self.df['LAT'] > -80) & (self.df['LAT'] < 84)
            & (self.df['LON'] > -180) & (self.df['LON'] < 180)
        )
        # Copy so that the column assignments below never write into a view
        # of the caller's frame.
        self.df = self.df[in_range].copy()
        self.convertCoord()
        self.scaleDownData()
        self.filterMapToScale()

        if self.vesselType is not None:
            # Filter on this instance's own frame and configured type (the
            # previous code referenced module-level names here).
            self.df = self.df[self.df['VesselType'] == self.vesselType].copy()
        if len(self.df) >= 1:
            self.processTime()
        else:
            # Nothing left: still provide the 'time' column selected below.
            self.df['time'] = []
        self.df = self.df.reset_index(drop=True)
        self.df = self.df[['MMSI', 'x', 'y', 'time', 'VesselType', 'Heading']].copy()

        # Headings outside [0, 360] are treated as missing.
        heading = self.df['Heading']
        self.df['Heading'] = heading.where(heading.between(0, 360))

    def convertCoord(self):
        """Add projected ``x``/``y`` columns computed from ``LAT``/``LON``."""
        if self.df.empty:
            # Do not hand empty arrays to the converter; just create the columns.
            self.df['x'] = pd.Series(dtype=float)
            self.df['y'] = pd.Series(dtype=float)
            return
        latitudes = np.array(self.df['LAT'])
        longitudes = np.array(self.df['LON'])
        # NOTE(review): whole arrays are passed in one call; confirm how the
        # installed utm version chooses the zone for array input, because the
        # FilterMap bounds only make sense for one consistent zone.
        self.df['x'], self.df['y'], _, _ = utm.from_latlon(latitudes, longitudes)

    def scaleDownData(self):
        """Divide the projected coordinates by the configured scale factor."""
        self.df['x'] = self.df['x'] / self.scaleDown
        self.df['y'] = self.df['y'] / self.scaleDown

    def filterMapToScale(self):
        """Keep only rows strictly inside the configured bounding box."""
        right_x, left_x, top_y, bot_y = self.filterMap
        inside = (
            (self.df.x < right_x) & (self.df.x > left_x)
            & (self.df.y < top_y) & (self.df.y > bot_y)
        )
        self.df = self.df[inside].copy()

    def processTime(self):
        """Add ``datetime`` and ``time`` columns and sort rows chronologically.

        ``time`` is the number of seconds elapsed since midnight of the
        earliest timestamp in the frame.  Requires at least one row.
        """
        self.df['datetime'] = pd.to_datetime(self.df['BaseDateTime'])
        self.df = self.df.sort_values(by=['datetime'], ascending=True).reset_index()
        start_datetime = self.df['datetime'].iloc[0].floor('d')
        self.df['time'] = (self.df['datetime'] - start_datetime).dt.total_seconds()
        
        
""" Data Engineering """
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and chained-assignment warnings raised by the
# functions below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
def engineer_data(df, interval):
    """Densify each ship's track by linear interpolation between observations.

    For every pair of consecutive observations of a ship (ordered by
    ``time``), synthetic rows are generated at ``interval``-second steps
    strictly between the two timestamps.  ``x`` and ``y`` are interpolated
    linearly; ``Heading`` is carried forward from the earlier observation
    (``None`` when that heading is missing); ``VesselType`` is taken from the
    ship's first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``MMSI``, ``x``, ``y``, ``time``, ``VesselType`` and
        ``Heading``.
    interval : number
        Step, in the same unit as ``time``, between synthetic rows.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the synthetic rows, grouped per ship.
        The index is not reset, so it may contain duplicates.
    """
    # Collect the pieces and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame per ship copies it every time.
    pieces = [df]
    for ship in df['MMSI'].unique():
        track = (
            df[df['MMSI'] == ship]
            .sort_values(by=['time'], ascending=True)
            .reset_index(drop=True)
        )
        if track.empty:
            # A NaN MMSI never equals itself, so it selects no rows.
            continue

        xs = track['x'].to_numpy()
        ys = track['y'].to_numpy()
        stamps = track['time'].to_numpy()
        headings = track['Heading'].to_numpy()

        new_x, new_y, new_time, new_heading = [], [], [], []
        for i in range(1, len(stamps)):
            gap = stamps[i] - stamps[i - 1]
            # Use the heading of the observation being extended.  An explicit
            # missing-value check keeps a valid heading of 0.0 from being
            # discarded as falsy.
            carried = None if pd.isna(headings[i - 1]) else headings[i - 1]
            for j in range(1, int(gap // interval)):
                cur_time = stamps[i - 1] + j * interval
                ratio = (cur_time - stamps[i - 1]) / gap
                new_x.append(xs[i - 1] + ratio * (xs[i] - xs[i - 1]))
                new_y.append(ys[i - 1] + ratio * (ys[i] - ys[i - 1]))
                new_heading.append(carried)
                new_time.append(cur_time)

        if not new_time:
            # No gaps to fill; skip the empty frame so it cannot disturb dtypes.
            continue
        pieces.append(pd.DataFrame({
            'MMSI': [int(ship)] * len(new_time),
            'x': new_x,
            'y': new_y,
            'time': new_time,
            'VesselType': [track['VesselType'].iloc[0]] * len(new_time),
            'Heading': new_heading,
        }))
    return pd.concat(pieces)

def process_heading(df):
    """Replace each row's ``Heading`` with the bearing towards the ship's next position.

    Rows are processed per ship in the order they appear in ``df``.  For every
    row except a ship's last, the heading becomes the direction from the
    current ``(x, y)`` to the next one, in degrees measured clockwise from the
    +y axis (0 = +y, 90 = +x) in the range [0, 360).  The last row of each ship
    keeps its reported heading, or, when that is missing, reuses the heading
    computed for the previous row.  All headings are rounded to one decimal.

    Note that two consecutive identical positions yield a heading of 90.0,
    because ``arctan2(0, 0)`` is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``MMSI``, ``x``, ``y`` and ``Heading``.

    Returns
    -------
    pandas.DataFrame
        Rows grouped by ship with a fresh 0..n-1 index.
    """
    # The empty seed frame is kept first so that an input without any ships
    # still yields the expected columns.
    pieces = [pd.DataFrame({'MMSI': [], 'x': [], 'y': [], 'time': [],
                            'VesselType': [], 'Heading': []})]
    for ship in df['MMSI'].unique():
        # Copy so that assigning the new column never writes into a view of df.
        track = df[df['MMSI'] == ship].copy()
        count = len(track)
        if count == 0:
            # A NaN MMSI never equals itself, so it selects no rows.
            continue

        xs = track['x'].to_numpy(dtype=float)
        ys = track['y'].to_numpy(dtype=float)
        directions = np.empty(count)

        # Mathematical angle of each step, shifted by a quarter turn, then
        # mirrored into a clockwise bearing in [0, 2*pi).
        angle = np.arctan2(np.diff(ys), np.diff(xs)) - math.pi / 2
        bearing = np.where(angle < 0, -angle,
                           np.where(angle > 0, 2 * math.pi - angle, angle))
        directions[:-1] = np.degrees(bearing)

        last_reported = track['Heading'].iloc[-1]
        if pd.isna(last_reported):
            # `== None` is never true for NaN, so test for missing explicitly.
            directions[-1] = directions[-2] if count > 1 else np.nan
        else:
            directions[-1] = last_reported

        track['Heading'] = np.round(directions, 1)
        pieces.append(track)
    return pd.concat(pieces, ignore_index=True)

## Preprocess and engineer each file before concatenating

In [3]:
# Set config file
# FilterMap order is [right_x, left_x, top_y, bot_y], as unpacked by
# preprocesser.filterMapToScale.
FilterMap = [4090000,4070000,5130000,5110000]
TimeWindows = list(range(1, 61))
config = {'ScaleDown': 1,
          'VesselType': None,
          'FilterMap': FilterMap,
          'NumCol': 1000,
          'NumRow': 1000,
          'Increment': 20,
          'Min_x': FilterMap[1],
          'Min_y': FilterMap[3],
          'MinSC': 2,
          'MinNeigb': 2,
          'StoreParam': [-4, 4, 101],
          'TimeWindows': TimeWindows,
          'Threshold': None,
          }

# Initialize
fp = './AIS_Data/'
SECONDS_PER_DAY = 86400

# os.listdir returns entries in arbitrary order.  Sort once so that the
# per-file time offset below is deterministic.
# NOTE(review): this assumes one file per day whose names sort
# chronologically -- confirm against the contents of the data directory.
files = sorted(os.listdir(fp))
if not files:
    raise FileNotFoundError('No input files found in {}'.format(fp))

# Process every file the same way and concatenate once at the end, instead of
# special-casing the first file and growing `df` inside the loop.
frames = []
for i, file_name in enumerate(files):
    start = time.time()
    temp = pd.read_csv(os.path.join(fp, file_name))
    myPreProcesser = preprocesser(temp, config)
    myPreProcesser.preprocess()
    temp = myPreProcesser.df
    temp = process_heading(temp)
    temp = engineer_data(temp, 1)
    # Shift each file onto a common time axis: file i starts i days in.
    temp['time'] += i * SECONDS_PER_DAY
    frames.append(temp)
    end = time.time()
    # i + 1 so that the first and second files are no longer both labelled 1.
    print('{}st iteration: {}'.format(i + 1, end-start))

df = pd.concat(frames, ignore_index=True)

1st iteration: 153.92707562446594
1st iteration: 19.9549343585968
2st iteration: 18.05855393409729
3st iteration: 19.659127235412598
4st iteration: 137.37540245056152
5st iteration: 19.54961395263672
6st iteration: 19.22675585746765
7st iteration: 19.16850185394287
8st iteration: 19.956040143966675
9st iteration: 18.994231700897217
10st iteration: 19.31592845916748
11st iteration: 159.9523434638977
12st iteration: 18.049937963485718
13st iteration: 18.792357921600342
14st iteration: 17.68850827217102
15st iteration: 17.767965078353882
16st iteration: 18.66211771965027
17st iteration: 146.60677981376648
18st iteration: 17.56329584121704
19st iteration: 16.990862131118774
20st iteration: 18.107818841934204
21st iteration: 17.968629121780396
22st iteration: 17.34468364715576
23st iteration: 17.639122009277344
24st iteration: 18.353692054748535
25st iteration: 122.00297093391418
26st iteration: 130.98199248313904
27st iteration: 17.588838577270508
28st iteration: 16.871620416641235
29st it

In [4]:
# Sanity check of the combined table: dimensions first, then a two-row preview.
row_count, column_count = df.shape
print((row_count, column_count))
df.iloc[:2]

(13457486, 6)


Unnamed: 0,MMSI,x,y,time,VesselType,Heading
0,366856070.0,4076630.0,5117847.0,0.0,31.0,130.5
1,366856070.0,4076632.0,5117846.0,69.0,31.0,332.0


In [5]:
# Persist the fully processed table for later use.
processed_csv_path = './ProcessedData_engined_wutm.csv'
df.to_csv(processed_csv_path)