In [3]:
import  pandas as pd 
import numpy as np 
import re
from datetime import datetime, timedelta 

In [4]:
# Location of the raw vessel export; the whole file is read into `df`.
file_path = 'Vessel_Data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Parse the 'UTCPortTime' column into pandas datetimes.
df['UTCPortTime'] = df['UTCPortTime'].pipe(pd.to_datetime)

In [10]:
# Turn the text in 'Port' into numbers; an unparseable value raises here
# because no `errors=` override is given.
df['Port'] = df['Port'].pipe(pd.to_numeric)

In [11]:
# Convert 'SourceMMSI' from text to a true integer column.
# errors='coerce' turns unparseable entries into NaN, which on its own forces
# the column to float64 (values would then be written out as e.g. 123456789.0).
# Casting to the nullable 'Int64' dtype keeps whole numbers and stores gaps as
# <NA>. NOTE(review): the cast raises if a parsed value has a fractional part.
df['SourceMMSI'] = pd.to_numeric(df['SourceMMSI'], errors='coerce').astype('Int64')

In [13]:
# Convert 'MessageID' to integer, writing the result back to the same column
# (a misspelled target name would add a new column and leave 'MessageID' as-is).
df['MessageID'] = df['MessageID'].astype(int)

In [14]:
# Store 'RepeatIndicator' as text; every other column keeps its dtype.
df = df.astype({'RepeatIndicator': str})

In [19]:
# Convert columns to appropriate data types
df['NavigationalStatus'] = df['NavigationalStatus'].astype('category')
# Cast to str before using the .str accessor so this cell also works when the
# column is already numeric (for example when the cell is run a second time).
# Text that is not a number, including 'nan', is coerced to NaN.
df['SpeedOverGround'] = pd.to_numeric(
    df['SpeedOverGround'].astype(str).str.replace(' knots', '', regex=False),
    errors='coerce',
)

In [31]:
def classify_vessel_status(navigational_status):
    """Map a free-text navigational status to a short status label.

    The status is lower-cased and searched for known phrases; the first
    matching rule (in the order listed below) decides the label.

    Parameters
    ----------
    navigational_status : str or missing value
        Status text. ``None``/NaN are accepted; other non-string values are
        converted with ``str()`` before matching.

    Returns
    -------
    str
        One of "Underway using engine", "Anchored", "Moored",
        "Engaged in fishing", "Constrained by her draught",
        "Restricted maneuverability", "Underway Sailing", "Undefined"
        (also used for missing input) or "Other" (no phrase matched).
    """
    # Missing cells are floats/None, not text, so they cannot be lower-cased.
    if pd.isna(navigational_status):
        return "Undefined"

    status = str(navigational_status).lower()

    # Ordered (phrases, label) rules; any phrase of a rule may match.
    rules = (
        (("under way using engine",), "Underway using engine"),
        (("at anchor",), "Anchored"),
        (("moored",), "Moored"),
        (("engaged in fishing",), "Engaged in fishing"),
        (("constrained by her draught",), "Constrained by her draught"),
        # British and American spellings, plus the misspelled variant this
        # function historically searched for, so no input stops matching.
        (
            (
                "restricted manoeuvrability",
                "restricted maneuverability",
                "restricted maneuvarability",
            ),
            "Restricted maneuverability",
        ),
        (("under way sailing",), "Underway Sailing"),
        (("undefined",), "Undefined"),
    )

    for phrases, label in rules:
        if any(phrase in status for phrase in phrases):
            return label
    return "Other"

In [28]:
def infer_vessel_type(speed):
    """Guess a coarse vessel category from a single speed value.

    Parameters
    ----------
    speed : number or missing value
        Speed to classify. The notebook passes 'SpeedOverGround' values, which
        are parsed from text with a ' knots' suffix and may be NaN.

    Returns
    -------
    str
        "Fishing/Small vessel" for speed < 3, "Tug/Support Vessel" for
        3 <= speed < 10, "Cargo/Passenger Ship" for 10 <= speed < 20,
        "Fast-moving vessel" for speed >= 20, and "Unknown" when the speed is
        missing (``None``/NaN).
    """
    # NaN fails every comparison below, so without this guard a missing speed
    # would fall through to the last category.
    if pd.isna(speed):
        return "Unknown"

    # Each test only needs the upper bound: lower values already returned.
    if speed < 3:
        return "Fishing/Small vessel"
    if speed < 10:
        return "Tug/Support Vessel"
    if speed < 20:
        return "Cargo/Passenger Ship"
    return "Fast-moving vessel"

In [32]:
# Add the two derived label columns: one from the navigational status text,
# one from the numeric speed over ground.
df = df.assign(
    Vesselstatus=lambda frame: frame['NavigationalStatus'].apply(classify_vessel_status),
    InferredVesselType=lambda frame: frame['SpeedOverGround'].apply(infer_vessel_type),
)

In [34]:
# Store 'RateOfTurn' as text; every other column keeps its dtype.
df = df.astype({'RateOfTurn': str})

In [35]:
# Store 'PositionAccuracy' as text; every other column keeps its dtype.
df = df.astype({'PositionAccuracy': str})

In [38]:
def convert_latitude_dms_to_dd(dms_str):
    """Convert a latitude in degrees/minutes/seconds text to signed decimal degrees.

    Parameters
    ----------
    dms_str : str or missing value
        Text such as ``12°34'56"N``. The components are separated by ``°``,
        ``'`` and ``"``; minutes and seconds may be omitted. The final
        character must be the hemisphere letter ``N`` or ``S`` (upper or
        lower case). Leading and trailing whitespace is ignored.

    Returns
    -------
    float
        Decimal degrees, negated for ``S``. ``nan`` is returned when
        ``dms_str`` is missing (``None``/NaN) or blank, so the function can be
        applied to a column that contains gaps.

    Raises
    ------
    ValueError
        If the final character is not ``N``/``S`` or a component cannot be
        parsed as a number.
    """
    # Missing cells arrive as NaN/None rather than text; pass them through.
    if pd.isna(dms_str):
        return float('nan')

    text = dms_str.strip()
    if not text:
        return float('nan')

    hemisphere = text[-1]
    if hemisphere.upper() not in ('N', 'S'):
        raise ValueError(f"Invalid direction '{hemisphere}' in DMS string")

    # Everything before the hemisphere letter holds the numeric components.
    components = [part.strip() for part in re.split('[°\'"]', text[:-1])]

    degrees = float(components[0])
    # Minutes and seconds default to zero when absent or empty.
    minutes = float(components[1]) if len(components) > 1 and components[1] else 0.0
    seconds = float(components[2]) if len(components) > 2 and components[2] else 0.0

    dd = degrees + (minutes / 60.0) + (seconds / 3600.0)

    # Southern latitudes are negative in decimal degrees.
    return -dd if hemisphere.upper() == 'S' else dd

In [39]:
def convert_longitude_dms_to_dd(dms_str):
    """Convert a longitude in degrees/minutes/seconds text to signed decimal degrees.

    Parameters
    ----------
    dms_str : str or missing value
        Text such as ``101°15'30"E``. The components are separated by ``°``,
        ``'`` and ``"``; minutes and seconds may be omitted. The final
        character must be the hemisphere letter ``E`` or ``W`` (upper or
        lower case). Leading and trailing whitespace is ignored.

    Returns
    -------
    float
        Decimal degrees, positive for ``E`` and negative for ``W``. ``nan`` is
        returned when ``dms_str`` is missing (``None``/NaN) or blank, so the
        function can be applied to a column that contains gaps.

    Raises
    ------
    ValueError
        If the final character is not ``E``/``W`` or a component cannot be
        parsed as a number.
    """
    # Missing cells arrive as NaN/None rather than text; pass them through.
    if pd.isna(dms_str):
        return float('nan')

    text = dms_str.strip()
    if not text:
        return float('nan')

    hemisphere = text[-1]
    if hemisphere.upper() not in ('E', 'W'):
        raise ValueError(f"Invalid direction '{hemisphere}' in DMS string")

    # Everything before the hemisphere letter holds the numeric components.
    components = [part.strip() for part in re.split('[°\'"]', text[:-1])]

    degrees = float(components[0])
    # Minutes and seconds default to zero when absent or empty.
    minutes = float(components[1]) if len(components) > 1 and components[1] else 0.0
    seconds = float(components[2]) if len(components) > 2 and components[2] else 0.0

    dd = degrees + (minutes / 60.0) + (seconds / 3600.0)

    # Western longitudes are negative in decimal degrees; eastern stay positive.
    return -dd if hemisphere.upper() == 'W' else dd

In [40]:
# Replace the DMS text in 'Latitude' with the converter's numeric result.
df = df.assign(Latitude=lambda frame: frame['Latitude'].apply(convert_latitude_dms_to_dd))

In [41]:
# Replace the DMS text in 'Longitude' with the converter's numeric result.
df = df.assign(Longitude=lambda frame: frame['Longitude'].apply(convert_longitude_dms_to_dd))

In [47]:
def clean_course_over_ground(cog):
    """Parse a course-over-ground value into a float.

    Parameters
    ----------
    cog : str, number or missing value
        Course value, optionally containing the ``°`` symbol (for example
        ``"123.4°"``). Values that are already numeric are accepted as well.

    Returns
    -------
    float or None
        The numeric course, or ``None`` when the value is missing
        (``None``/NaN) or is not a valid number once ``°`` is removed.
    """
    # Missing cells are not strings, so handle them before any text processing.
    if pd.isna(cog):
        return None

    # str() lets numeric input pass through the same path as text input.
    text = str(cog).replace('°', '')

    try:
        return float(text)
    except ValueError:
        return None

In [49]:
# Run every 'CourseOverGround' entry through the cleaner defined for it.
df = df.assign(
    CourseOverGround=lambda frame: frame['CourseOverGround'].apply(clean_course_over_ground)
)

In [50]:
# Store 'Heading' as text; every other column keeps its dtype.
df = df.astype({'Heading': str})

In [51]:
# Destination for the cleaned dataset; the row index is left out of the file.
output_file = 'Vesssel_Data 2.csv'
df.to_csv(path_or_buf=output_file, index=False)

In [52]:
# Confirm where the cleaned data was written.
print("cleaned saved data to", output_file)

cleaned saved data to Vesssel_Data 2.csv
