In [1]:
import numpy as np
import pandas as pd
import os
import geopy.distance as distance
import random
import pickle
import matplotlib.pyplot as plt
import matplotlib
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import utility as ut
from tqdm import tnrange, notebook,tqdm_notebook
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import random

pd.set_option('display.float_format', lambda x: '%.5f' % x)
np.set_printoptions(suppress=True)

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
# import trip stats
with (open('../trajectory/20200106_AISTripStatsData.pkl', "rb")) as openfile:
    df_tripStat = pickle.load(openfile, encoding = 'latin1')

# import trip main
with (open('../trajectory/20200106_AISTripMainData.pkl', "rb")) as openfile:
    df_tripMain = pickle.load(openfile, encoding = 'latin1')
    #df_tripMain = df_tripMain[df_tripMain['trip_id'].isin(df_tripStat['trip_id'].unique())]
    df_tripMain.index = df_tripMain['point_id'] 
    df_tripMain['BaseDateTime'] = pd.to_datetime(df_tripMain['BaseDateTime'])


In [3]:
# find indexes where missing segments are present
index = ut.findMissing(df_tripMain, df_tripStat)

# show index
print(index[0:10])

[75919, 76049, 76594, 109618, 126365, 130279, 130285, 130898, 249543, 249545]


In [4]:
df_tripMain.head(2)

Unnamed: 0_level_0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,mmsiDiff,TimeDiff,Dist,Speed,point_id,Stationary,TimeCondition,medianSpeed,zScoreSpeed,Cos,trip_id,LastValid,TimeSplit
point_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
17394,636626,2017-07-31 16:52:06,37.16915,-76.60553,2.8,173.3,511.0,True,90.0,0.12611,5.04456,17394,True,1.32353,32.05496,-0.89613,0.0,17277,True,False
17395,636626,2017-07-31 16:54:06,37.16767,-76.60873,5.3,-171.8,511.0,True,120.0,0.32826,9.84771,17395,False,1.76471,32.05496,-0.73678,0.9999,17277,True,False


In [7]:
TripID = df_tripMain['trip_id'].unique().tolist()
MissingTripID = index
ValidTripID = list(set(TripID).difference(set(MissingTripID)))
len(ValidTripID)

81218

In [10]:
selectedTrip = random.choice(ValidTripID)
selectedTrip

740666

In [11]:
TripDF = df_tripMain.loc[df_tripMain['trip_id']==selectedTrip]
print(TripDF.shape)
rmStart,rmEnd = 5,10
TripDFKnown = TripDF.iloc[list(range(0,rmStart))+list(range(rmEnd,len(TripDF)))]
TripDFMissing = TripDF.iloc[rmStart:rmEnd]

(330, 20)


In [12]:
avgTimeGap = TripDFKnown['TimeDiff'].mean()
avgTimeGap

68.36

In [13]:
TripDFKnown['timePoint'] = TripDFKnown['BaseDateTime'] - TripDFKnown.iloc[0,1]
TripDFKnown['timePoint'] = TripDFKnown['timePoint'].apply(lambda x: x.days*24*3600 + x.seconds)
MissingStartTimePoint = TripDFKnown.iloc[rmStart-1]['timePoint']
MissingEndTimePoint = TripDFKnown.iloc[rmStart]['timePoint']
TimePointInserted = np.arange(MissingStartTimePoint,MissingEndTimePoint,avgTimeGap)[1:]
TimePointInserted = pd.DataFrame({'timePoint':TimePointInserted})

### how many points missed
shiftMin = len(TimePointInserted)
shiftMin

6

In [14]:
shiftMax = 10
trainDF = TripDFKnown[['LAT','LON','timePoint']]
for i in range(shiftMin,shiftMax+1):
    trainDF['LAT+'+str(i)] = trainDF['LAT'].shift(periods=i)
    trainDF['LON+'+str(i)] = trainDF['LON'].shift(periods=i)
    trainDF['LAT-'+str(i)] = trainDF['LAT'].shift(periods=-i)
    trainDF['LON-'+str(i)] = trainDF['LON'].shift(periods=-i)
    trainDF['timePoint+'+str(i)] = trainDF['timePoint'].shift(periods=i)
    trainDF['timePoint-'+str(i)] = trainDF['timePoint'].shift(periods=-i)
trainDF = trainDF.iloc[shiftMax:-shiftMax]

In [15]:
trainDF

Unnamed: 0_level_0,LAT,LON,timePoint,LAT+6,LON+6,LAT-6,LON-6,timePoint+6,timePoint-6,LAT+7,LON+7,LAT-7,LON-7,timePoint+7,timePoint-7,LAT+8,LON+8,LAT-8,LON-8,timePoint+8,timePoint-8,LAT+9,LON+9,LAT-9,LON-9,timePoint+9,timePoint-9,LAT+10,LON+10,LAT-10,LON-10,timePoint+10,timePoint-10
point_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
1477910,33.99507,-76.71528,1032,34.02734,-76.67647,33.97608,-76.73753,271.00000,1452.00000,34.02885,-76.67397,33.97294,-76.74123,201.00000,1521.00000,34.02945,-76.67202,33.96979,-76.74490,131.00000,1591.00000,34.02946,-76.67093,33.96652,-76.74873,62.00000,1662.00000,34.02935,-76.67077,33.96342,-76.75238,0.00000,1732.00000
1477911,33.99151,-76.71950,1111,34.01096,-76.69656,33.97294,-76.74123,682.00000,1521.00000,34.02734,-76.67647,33.96979,-76.74490,271.00000,1591.00000,34.02885,-76.67397,33.96652,-76.74873,201.00000,1662.00000,34.02945,-76.67202,33.96342,-76.75238,131.00000,1732.00000,34.02946,-76.67093,33.96025,-76.75610,62.00000,1802.00000
1477912,33.98837,-76.72318,1181,34.00784,-76.70036,33.96979,-76.74490,751.00000,1591.00000,34.01096,-76.69656,33.96652,-76.74873,682.00000,1662.00000,34.02734,-76.67647,33.96342,-76.75238,271.00000,1732.00000,34.02885,-76.67397,33.96025,-76.75610,201.00000,1802.00000,34.02945,-76.67202,33.95708,-76.75978,131.00000,1871.00000
1477913,33.98561,-76.72643,1242,34.00471,-76.70401,33.96652,-76.74873,821.00000,1662.00000,34.00784,-76.70036,33.96342,-76.75238,751.00000,1732.00000,34.01096,-76.69656,33.96025,-76.75610,682.00000,1802.00000,34.02734,-76.67647,33.95708,-76.75978,271.00000,1871.00000,34.02885,-76.67397,33.95399,-76.76346,201.00000,1941.00000
1477914,33.98241,-76.73017,1312,34.00146,-76.70780,33.96342,-76.75238,892.00000,1732.00000,34.00471,-76.70401,33.96025,-76.75610,821.00000,1802.00000,34.00784,-76.70036,33.95708,-76.75978,751.00000,1871.00000,34.01096,-76.69656,33.95399,-76.76346,682.00000,1941.00000,34.02734,-76.67647,33.95076,-76.76718,271.00000,2011.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478210,33.19415,-77.92559,21557,33.20561,-77.89787,33.18393,-77.95519,21167.00000,21959.00000,33.20770,-77.89330,33.18225,-77.96003,21101.00000,22025.00000,33.20977,-77.88873,33.18057,-77.96481,21035.00000,22091.00000,33.21177,-77.88425,33.17886,-77.96962,20970.00000,22157.00000,33.21381,-77.88020,33.17718,-77.97447,20909.00000,22223.00000
1478211,33.19245,-77.93049,21624,33.20352,-77.90248,33.18225,-77.96003,21233.00000,22025.00000,33.20561,-77.89787,33.18057,-77.96481,21167.00000,22091.00000,33.20770,-77.89330,33.17886,-77.96962,21101.00000,22157.00000,33.20977,-77.88873,33.17718,-77.97447,21035.00000,22223.00000,33.21177,-77.88425,33.17552,-77.97926,20970.00000,22289.00000
1478212,33.19078,-77.93529,21689,33.20141,-77.90706,33.18057,-77.96481,21299.00000,22091.00000,33.20352,-77.90248,33.17886,-77.96962,21233.00000,22157.00000,33.20561,-77.89787,33.17718,-77.97447,21167.00000,22223.00000,33.20770,-77.89330,33.17552,-77.97926,21101.00000,22289.00000,33.20977,-77.88873,33.17384,-77.98409,21035.00000,22355.00000
1478213,33.18896,-77.94062,21761,33.19946,-77.91175,33.17886,-77.96962,21365.00000,22157.00000,33.20141,-77.90706,33.17718,-77.97447,21299.00000,22223.00000,33.20352,-77.90248,33.17552,-77.97926,21233.00000,22289.00000,33.20561,-77.89787,33.17384,-77.98409,21167.00000,22355.00000,33.20770,-77.89330,33.17219,-77.98891,21101.00000,22421.00000


In [22]:
### polynomial

for i in range(shiftMin,shiftMax+1):
    trainDF['LATx+'+str(i)] = trainDF['LAT+'+str(i)].apply(lambda x: x**i)
    trainDF['LONx+'+str(i)] = trainDF['LON+'+str(i)].apply(lambda x: x**i)
    trainDF['LATx-'+str(i)] = trainDF['LAT-'+str(i)].apply(lambda x: x**i)
    trainDF['LONx-'+str(i)] = trainDF['LON-'+str(i)].apply(lambda x: x**i)
    trainDF['timePoint+'+str(i)] = trainDF.apply(lambda x:x['timePoint']-x['timePoint+'+str(i)],axis=1)
    trainDF['timePoint-'+str(i)] = trainDF.apply(lambda x:np.abs(x['timePoint']-x['timePoint-'+str(i)]),axis=1)
trainDF.head(2)

Unnamed: 0_level_0,LAT,LON,timePoint,LAT+6,LON+6,LAT-6,LON-6,timePoint+6,timePoint-6,LAT+7,LON+7,LAT-7,LON-7,timePoint+7,timePoint-7,LAT+8,LON+8,LAT-8,LON-8,timePoint+8,timePoint-8,LAT+9,LON+9,LAT-9,LON-9,timePoint+9,timePoint-9,LAT+10,LON+10,LAT-10,LON-10,timePoint+10,timePoint-10,LATx+6,LONx+6,LATx-6,LONx-6,LATx+7,LONx+7,LATx-7,LONx-7,LATx+8,LONx+8,LATx-8,LONx-8,LATx+9,LONx+9,LATx-9,LONx-9,LATx+10,LONx+10,LATx-10,LONx-10
point_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
1477910,33.99507,-76.71528,1032,34.02734,-76.67647,33.97608,-76.73753,271.0,612.0,34.02885,-76.67397,33.97294,-76.74123,201.0,543.0,34.02945,-76.67202,33.96979,-76.7449,131.0,473.0,34.02946,-76.67093,33.96652,-76.74873,62.0,402.0,34.02935,-76.67077,33.96342,-76.75238,0.0,332.0,1552272642.20527,203222909495.1438,1538294982.2931,204195843245.6531,52836118675.31127,-15578859270146.348,52231431031.26613,-15674774084340.822,1798205986005.6335,1194249979740616.2,1773139467545.715,1203361731609418.8,61192122529208.42,-9.155384340556312e+16,60181011947348.05,-9.239336391635715e+16,2082267575567132.0,7.019371834205033e+18,2042274700576876.8,7.094446420601182e+18
1477911,33.99151,-76.7195,1111,34.01096,-76.69656,33.97294,-76.74123,682.0,701.0,34.02734,-76.67647,33.96979,-76.7449,271.0,631.0,34.02885,-76.67397,33.96652,-76.74873,201.0,560.0,34.02945,-76.67202,33.96342,-76.75238,131.0,490.0,34.02946,-76.67093,33.96025,-76.7561,62.0,420.0,1547794658.35967,203542597411.51105,1537442182.84511,204254923778.793,52819708969.01709,-15582415323217.107,52197539859.55506,-15680022146219.732,1797952356984.366,1194492988313423.0,1771774439870.4385,1203842251413894.8,61191960690479.41,-9.156555833167208e+16,60131597482729.27,-9.243291765807368e+16,2082334885922796.8,7.019518318978891e+18,2040369329349935.5,7.097885675186546e+18


In [23]:
### unweighted


y = trainDF['LAT'].values
X = trainDF[['LATx+'+str(i) for i in range(shiftMin,shiftMax)]+['LATx-'+str(i) for i in range(shiftMin,shiftMax)]].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
print(r2_score(reg.predict(X_train), y_train))
print(r2_score(reg.predict(X_test), y_test))

0.9999901326894062
0.9999892605645955


In [24]:
### weighted

WeightLambda = np.arange(0,0.1,0.01)

for weight in WeightLambda:
    for i in range(shiftMin,shiftMax):
        trainDF['LATx+'+str(i)] = trainDF.apply(lambda x: x['LAT+'+str(i)]*np.exp(weight*x['timePoint+'+str(i)]),axis=1)
        trainDF['LONx+'+str(i)] = trainDF.apply(lambda x: x['LON+'+str(i)]*np.exp(weight*x['timePoint+'+str(i)]),axis=1)
        trainDF['LATx-'+str(i)] = trainDF.apply(lambda x: x['LAT-'+str(i)]*np.exp(weight*x['timePoint-'+str(i)]),axis=1)
        trainDF['LONx-'+str(i)] = trainDF.apply(lambda x: x['LON-'+str(i)]*np.exp(weight*x['timePoint-'+str(i)]),axis=1)
    y = trainDF['LAT'].values
    X = trainDF[['LAT+'+str(i) for i in range(shiftMin,shiftMax)]+['LAT-'+str(i) for i in range(shiftMin,shiftMax)]].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    regW = LinearRegression().fit(X_train, y_train)
#     print(r2_score(regW.predict(X_train), y_train))
    print(weight,r2_score(regW.predict(X_test), y_test))

0.0 0.9999951962677428
0.01 0.9999951962677428
0.02 0.9999951962677428
0.03 0.9999951962677428
0.04 0.9999951962677428
0.05 0.9999951962677428
0.06 0.9999951962677428
0.07 0.9999951962677428
0.08 0.9999951962677428
0.09 0.9999951962677428
