In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
plt.style.use("ggplot")


## Data understanding

In [None]:
# Load the raw AIS training data (pipe-separated) and parse the timestamps up
# front, so every inspection below sees the final column types.
df_ais = pd.read_csv('ais_train.csv', sep='|')
df_ais['time'] = pd.to_datetime(df_ais['time'])

# Only the last expression of a cell is rendered automatically. The
# intermediate inspections are therefore shown explicitly with display(),
# which IPython makes available in notebook kernels without an import.
display(df_ais.head())

display(df_ais.shape)
#n = 1522065
#columns = 11

display(df_ais.columns)
#latitude and longitude are our targets. Relevant covariates may be time, cog, sog, rot, heading, navstat, etaRaw, vesselId and portId.

display(df_ais.dtypes)
#Recorded observation: cog, latitude and longitude are floats. rot, heading and navstat are ints. etaRaw, vesselId and portId are objects.
#time should show up as a datetime column here, because it is converted with pd.to_datetime above.

df_ais.describe()
#cog: course over ground. From 0 to 360 degrees.
#sog: speed over ground. From 0 to 1023 knots.
#rot: rate of turning (of heading, which is the compass direction the boat's bow/nose is pointing). Degrees per minute.
#heading: direction in which the boat's bow is pointing. Measured in degrees from 0 to 360.
#navstat: navigational status. The number tells the status of the boat. From 0 to 15.
#latitude: north-south position. Degrees. From -90 (south) to +90 (north).
#longitude: east-west position. Degrees. From -180 (west) to +180 (east).

## Data preparation

### Documentation of exploring estimated arrival times (don't run this!)

In [None]:
# Column type sanity check: etaRaw is the estimated time of arrival, so its
# values should look like dates.
df_ais['etaRaw']

# Print the entries at positions 1 and 2 as a sample of the raw format.
print(df_ais['etaRaw'].iloc[1:3])

def parse_time(raw_time, year=2024):
    """Parse a year-less "MM-DD HH:MM" ETA string into a datetime.

    The raw value carries no year, so ``year`` is attached (2024 by default).
    The year is made part of the parsed text instead of being patched in
    afterwards: without a year ``strptime`` assumes 1900, which is not a leap
    year, and would reject "02-29" even though 2024 contains that day.

    Args:
        raw_time: Raw ETA value, expected to be a "MM-DD HH:MM" string.
        year: Year to attach to the parsed value.

    Returns:
        The parsed ``datetime``, or ``None`` when ``raw_time`` is not a string
        or does not describe a valid date/time in the expected format.
    """
    if not isinstance(raw_time, str):
        # Missing values (NaN/None) would otherwise raise TypeError in strptime.
        return None

    try:
        return datetime.strptime(f"{year:04d}-{raw_time}", "%Y-%m-%d %H:%M")
    except ValueError:
        return None


# Parse every raw ETA string; values that cannot be parsed end up missing.
df_ais['etaParsed'] = df_ais['etaRaw'].apply(parse_time)

# Want to ensure that the ETA is the later of the two dates. First check how
# large a share of the parsed ETAs fall in December.
december_count = (df_ais['etaParsed'].dt.month == 12).sum()
total_count = len(df_ais)

# The printed value is a fraction of all rows, not an absolute count.
print(f"Share of instances in December: {december_count/total_count}")

# Flag the rows where the parsed ETA is later than the report time.
df_ais['later'] = df_ais['etaParsed'] > df_ais['time']

# Keep only the rows whose ETA is later than the report time.
df_coolio = df_ais[df_ais['later']].copy()

# Only the last expression of a cell is rendered automatically, so the
# spot-checks are shown explicitly with display(), which IPython makes
# available in notebook kernels without an import.
display(df_coolio[df_coolio['vesselId'] == '61e9f3a8b937134a3c4bfdf7'].head(10))

# NOTE(review): the variable name and the original comment say November, but
# this filter selects month 9 (September) - confirm which month is intended.
df_coolio_nov = df_coolio[df_coolio['etaParsed'].dt.month == 9].copy()

display(df_coolio_nov[df_coolio_nov['vesselId'] == '61e9f42eb937134a3c4c0103'].tail(40))

display(df_ais[df_ais['vesselId'] == '61e9f42eb937134a3c4c0103'].head(10))

# Not fruitful.

# Abandoned attempt, kept for reference only. It used to live in a string
# literal, which made that string the cell's output value.
#
# later_count = df_ais[df_ais['later']  == False].shape[0]
#
# print(f"Number of instances where time is later than eta: {later_count}")
# df_ais[df_ais['later']  == False].head()
#
# #I have instances where time is later than eta if I do this solution.
#
# #So when later is False I want to ensure that the year of etaParsed is 2025.
#
# def adjust_year(time1,time2):
#     if time1 <= time2:
#         return time1.replace(year = 2025)
#     return time1
#
# df_ais['etaParsed'] = df_ais.apply(lambda row: adjust_year(row['etaParsed'], row['time']), axis = 1)
# df_ais[df_ais['later']  == False].head() #It's fixed



### Actual useful stuff

In [3]:
def parse_time(raw_time, year=2024):
    """Parse a year-less "MM-DD HH:MM" ETA string into a datetime.

    The raw value carries no year, so ``year`` is attached as a placeholder
    (2024 by default). The year is made part of the parsed text instead of
    being patched in afterwards: without a year ``strptime`` assumes 1900,
    which is not a leap year, and would reject "02-29" even though 2024
    contains that day.

    Args:
        raw_time: Raw ETA value, expected to be a "MM-DD HH:MM" string.
        year: Placeholder year to attach to the parsed value.

    Returns:
        The parsed ``datetime``, or ``None`` when ``raw_time`` is not a string
        or does not describe a valid date/time in the expected format.
    """
    if not isinstance(raw_time, str):
        # Missing values (NaN/None) would otherwise raise TypeError in strptime.
        return None

    try:
        return datetime.strptime(f"{year:04d}-{raw_time}", "%Y-%m-%d %H:%M")
    except ValueError:
        return None


# etaRaw carries no year and the correct one cannot be determined, so
# parse_time stamps every parsed ETA with the same year, 2024.
df_ais['etaParsed'] = df_ais['etaRaw'].map(parse_time)

In [None]:
# DataFrame.drop returns a new frame, so the result has to be assigned back
# for etaRaw to actually be removed. errors='ignore' keeps this cell
# re-runnable once the column is already gone.
df_ais = df_ais.drop(columns=['etaRaw'], errors='ignore')

# Missing values per column. Only the last expression of a cell is rendered
# automatically, so this one is shown explicitly with display(), which
# IPython makes available in notebook kernels without an import.
# Recorded result: estimated arrival time (1615 entries) and portId (1615 entries).
display(df_ais.isna().sum())

# Fully duplicated rows. Recorded result: no duplicated rows.
df_ais.loc[df_ais.duplicated()]

