In [21]:
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta 

In [22]:
# Read the CSV named by file_path into the working DataFrame.
file_path = 'week4_february_2024.csv'
df = pd.read_csv(file_path)

In [3]:
def convert_to_datetime(time_str):
    """Parse an ``HH:MM.SS`` string whose hour field may exceed 23.

    Hours of 24 or more are folded into whole days that are added to the
    parsed value, so ``'25:10.05'`` lands one day after ``'01:10.05'``.
    ``strptime`` supplies its default date (1900-01-01) for the date part.

    NOTE(review): a later cell in this notebook defines another function with
    the same name; whichever definition was executed last is the one in use.

    Args:
        time_str: Text of the form ``'<hours>:<minutes>.<seconds>'``.

    Returns:
        A ``datetime`` built from the time fields.

    Raises:
        ValueError: If the hour field is not an integer or the text does not
            match ``%H:%M.%S``.
        IndexError: If ``time_str`` contains no ``':'`` separator.
    """
    fields = time_str.split(':')
    hour_count = int(fields[0])
    remainder = fields[1]

    # In-range hours: parse the caller's text exactly as it was given.
    if hour_count <= 23:
        return datetime.strptime(time_str, '%H:%M.%S')

    # Out-of-range hours: split into whole days plus an hour of the day.
    day_offset, hour_of_day = divmod(hour_count, 24)
    wrapped = f'{hour_of_day:02d}:{remainder}'
    return datetime.strptime(wrapped, '%H:%M.%S') + timedelta(days=day_offset)

In [5]:
def convert_to_datetime(datetime_str):
    """Parse a timestamp string into a ``datetime``, or return ``None``.

    Formats are tried in order: full date and time (``%Y-%m-%d %H:%M:%S``),
    then time only (``%H:%M:%S``). A time-only match carries ``strptime``'s
    default date of 1900-01-01.

    Args:
        datetime_str: The value to parse. A value that is already a
            ``datetime`` is returned unchanged, so applying this function a
            second time to a converted column does not discard data. Any other
            non-string value (for example ``None`` or the float NaN that
            pandas uses for an empty CSV cell) is treated as unparseable.

    Returns:
        The parsed ``datetime``, or ``None`` when the value cannot be parsed.
    """
    if isinstance(datetime_str, datetime):
        return datetime_str
    if not isinstance(datetime_str, str):
        # strptime raises TypeError (not ValueError) for non-strings, which
        # would otherwise escape the per-format handling below.
        return None

    for fmt in ('%Y-%m-%d %H:%M:%S', '%H:%M:%S'):
        try:
            return datetime.strptime(datetime_str, fmt)
        except ValueError:
            continue
    return None

In [7]:
def convert_repeat_indicator(value):
    """Convert a repeat-indicator value to an integer count.

    Args:
        value: Usually a text label. The exact label ``'Not repeated'`` maps
            to 0; otherwise the first run of digits found in the value's text
            form is used. Non-string values are accepted: NaN/None contain no
            digits and yield ``None``, and an already-converted number (for
            example when the cell is re-run) yields its integer part.

    Returns:
        The integer count, or ``None`` when the value contains no digits.
    """
    if value == 'Not repeated':
        return 0
    # str() keeps re.search from raising TypeError on NaN or numeric input.
    match = re.search(r'(\d+)', str(value))
    return int(match.group(1)) if match else None

In [8]:
def convert_to_float(knots_str):
    """Return the first whitespace-separated token of ``knots_str`` as a float.

    NOTE(review): a later cell in this notebook defines another function with
    the same name; whichever definition was executed last is the one in use.

    Raises:
        IndexError: If the text is empty or only whitespace.
        ValueError: If the first token is not a valid float literal.
    """
    leading_token = knots_str.split()[0]
    return float(leading_token)

In [9]:
def convert_to_float(knots_str):
    """Return the leading number of a value such as ``'11.7 knots'``.

    Args:
        knots_str: Usually text whose first whitespace-separated token is a
            number. Numeric input is passed through as a float so the
            conversion can be applied again to an already-converted column.

    Returns:
        The parsed float, or ``None`` when the value is empty, missing
        (None/NaN) or its first token is not a number.
    """
    if not isinstance(knots_str, str):
        try:
            number = float(knots_str)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    tokens = knots_str.split()
    if not tokens:
        # Empty or whitespace-only text: indexing tokens[0] would raise
        # IndexError, which the ValueError handler below would not catch.
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None

In [10]:
def extract_numeric(value):
    """Return the first number embedded in ``value`` as a float.

    The value is converted to text first, so non-string input is accepted.
    A number is an optional sign followed by digits with an optional decimal
    part (``12``, ``-5``, ``098.6``, ``.5``).

    Args:
        value: Any object; its ``str()`` form is searched.

    Returns:
        The first number found, or ``None`` when the text contains no digits.
    """
    # The decimal point is escaped and any number of integer digits may
    # precede it, so '11.7' is read as 11.7 rather than 11.
    match = re.search(r'[-+]?(?:\d+\.\d*|\.\d+|\d+)', str(value))
    if match is None:
        return None
    return float(match.group(0))

In [99]:
def convert_to_float_or_na(value_str):
    """Convert a value such as ``'098.6°'`` to a float, or return ``None``.

    Every degree sign is removed from text input before conversion.

    Args:
        value_str: Usually text. Numeric input is passed through as a float
            so the conversion can be applied again to an already-converted
            column.

    Returns:
        The parsed float, or ``None`` when the value is missing (None/NaN) or
        is not a number once the degree signs are removed.
    """
    if not isinstance(value_str, str):
        try:
            number = float(value_str)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    try:
        return float(value_str.replace('°', ''))
    except ValueError:
        return None

In [100]:
def convert_to_float_or_na(value_str):
    """Convert a value such as ``'098.6°'`` to a float, or return ``None``.

    Every degree sign is removed from text input before conversion.

    Args:
        value_str: Usually text. Numeric input is passed through as a float
            so the conversion can be applied again to an already-converted
            column.

    Returns:
        The parsed float, or ``None`` when the value is missing (None/NaN) or
        is not a number once the degree signs are removed.
    """
    if not isinstance(value_str, str):
        # Missing CSV cells arrive as float NaN, which has no .replace();
        # handle non-strings here instead of letting AttributeError escape.
        try:
            number = float(value_str)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if number != number else number

    try:
        return float(value_str.replace('°', ''))
    except ValueError:
        return None

In [101]:
# NavigationalStatus labels, ordered so that each label's position in the
# list equals the numeric code it maps to.
_NAVIGATIONAL_STATUS_LABELS = [
    '0 - under way using engine',
    '1 - at anchor',
    '2 - not under command',
    '3 - restricted manoeuverability',
    '4 - constrained by her draught',
    '5 - moored',
    '6 - aground',
    '7 - engaged in fishing',
    '8 - under way sailing',
    '9 - reserved for future amendment',
    '10 - reserved for future amendment',
    '11 - reserved for future amendment',
    '12 - reserved for future amendment',
    '13 - reserved for future amendment',
    '14 - ais-sart is active',
    '15 - not defined',
]

# Lookup table: exact label text -> integer status code.
navigational_status_map = {
    label: code for code, label in enumerate(_NAVIGATIONAL_STATUS_LABELS)
}


def convert_navigational_status(value):
    """Return the integer code for a NavigationalStatus label.

    The lookup is an exact match against the keys of
    ``navigational_status_map``; any value that is not a key yields ``None``.
    """
    return navigational_status_map.get(value)

In [102]:
def convert_position_accuracy(value):
    """Map a position-accuracy label to a flag.

    Returns 1 when the text contains ``'High'``, 0 when it contains ``'Low'``
    (``'High'`` is checked first), and ``None`` when it contains neither.
    The check is a case-sensitive substring test.
    """
    for marker, flag in (('High', 1), ('Low', 0)):
        if marker in value:
            return flag
    return None

In [103]:
def convert_latitude_dms_to_dd(dms_str):
    """Convert a latitude such as ``"34° 44.3046' S"`` to decimal degrees.

    The last non-blank character must be ``N`` or ``S``. The remaining text is
    split on the degree, minute (``'``) and second (``"``) marks; degrees and
    minutes are required, seconds default to 0 when absent.

    Args:
        dms_str: Latitude text ending in a hemisphere letter.

    Returns:
        Decimal degrees, negative for the southern hemisphere.

    Raises:
        ValueError: If the hemisphere letter is not ``N``/``S`` or a field is
            not numeric.
        IndexError: If the text is empty or has no minutes field.
    """
    trimmed = dms_str.strip()

    hemisphere = trimmed[-1]
    if hemisphere not in ('N', 'S'):
        raise ValueError(f"Invalid direction '{hemisphere}' in DMS string")

    # Everything before the hemisphere letter holds the numeric fields.
    fields = re.split('[°\'"]', trimmed[:-1].strip())

    deg_part = float(fields[0].strip())
    min_part = float(fields[1].strip())
    sec_part = 0.0
    if len(fields) > 2 and fields[2].strip():
        sec_part = float(fields[2].strip())

    magnitude = deg_part + (min_part / 60.0) + (sec_part / 3600.0)
    return -magnitude if hemisphere == 'S' else magnitude

In [104]:
def convert_longitude_dms_to_dd(dms_str):
    """Convert a longitude such as ``"016° 01.3110' E"`` to decimal degrees.

    The last non-blank character must be ``E`` or ``W``. The remaining text is
    split on the degree, minute (``'``) and second (``"``) marks; degrees and
    minutes are required, seconds default to 0 when absent.

    Args:
        dms_str: Longitude text ending in a hemisphere letter.

    Returns:
        Decimal degrees, positive for east and negative for west.

    Raises:
        ValueError: If the hemisphere letter is not ``E``/``W`` or a field is
            not numeric.
        IndexError: If the text is empty or has no minutes field.
    """
    # Remove leading and trailing whitespace
    dms_str = dms_str.strip()

    # The last character is the direction indicator
    direction = dms_str[-1]
    if direction not in ['E', 'W']:
        raise ValueError(f"Invalid direction '{direction}' in DMS string")

    # Remove the direction character from the DMS string
    dms_str = dms_str[:-1].strip()

    # Split the DMS string into parts
    parts = re.split('[°\'"]', dms_str)

    # Convert the parts to float
    degrees = float(parts[0].strip())
    minutes = float(parts[1].strip())
    seconds = float(parts[2].strip()) if len(parts) > 2 and parts[2].strip() else 0.0

    # Calculate the decimal degree
    dd = degrees + (minutes / 60.0) + (seconds / 3600.0)

    # Western longitudes are negative, mirroring how the latitude converter
    # negates 'S'. East needs no adjustment.
    if direction == 'W':
        dd = -dd

    return dd

In [6]:
# Parse every UTCPortTime value with convert_to_datetime.
# NOTE(review): convert_to_datetime is defined twice in this notebook with
# different expected formats; the definition executed last is the one used.
df['UTCPortTime'] = df['UTCPortTime'].apply(convert_to_datetime)

In [108]:
# Cast the Port column to integer dtype.
# NOTE(review): the recorded output of this cell is KeyError: 'Port', i.e. df
# had no 'Port' column when it last ran — confirm the cell order and that the
# loaded CSV provides this column.
df['Port'] = df['Port'].astype(int)

KeyError: 'Port'

In [74]:
# Cast the SourceMMSI column to integer dtype.
df['SourceMMSI'] = df['SourceMMSI'].astype(int)

In [75]:
# Cast the MessageID column to integer dtype.
df['MessageID'] = df['MessageID'].astype(int)

In [76]:
# Replace each RepeatIndicator label with the value returned by
# convert_repeat_indicator ('Not repeated' becomes 0).
df['RepeatIndicator'] = df['RepeatIndicator'].apply(convert_repeat_indicator)

In [77]:
# Replace each NavigationalStatus label with its code from
# navigational_status_map; labels that are not keys of the map become None.
df['NavigationalStatus'] = df['NavigationalStatus'].apply(convert_navigational_status)

In [78]:
# Convert RateOfTurn with convert_to_float_or_na.
# NOTE(review): the sample rows shown later in this notebook hold values such
# as 'Turning Right 000°/min' and 'No turn information available'. These are
# not bare numbers once '°' is removed, so convert_to_float_or_na returns None
# for them — confirm that discarding the turn rate is intended.
df['RateOfTurn'] = df['RateOfTurn'].apply(convert_to_float_or_na)

In [79]:
# Parse the leading number of each SpeedOverGround value (e.g. '11.7 knots').
df['SpeedOverGround'] = df['SpeedOverGround'].apply(convert_to_float)

In [80]:
# Replace each PositionAccuracy label with the flag returned by
# convert_position_accuracy (1 for 'High', 0 for 'Low', otherwise None).
df['PositionAccuracy'] = df['PositionAccuracy'].apply(convert_position_accuracy)

In [81]:
# Convert Latitude text (e.g. "34° 44.3046' S") to decimal degrees; a value
# that does not end in N or S raises ValueError.
df['Latitude'] = df['Latitude'].apply(convert_latitude_dms_to_dd)

In [82]:
# Convert Longitude text (e.g. "016° 01.3110' E") to decimal degrees; a value
# that does not end in E or W raises ValueError.
df['Longitude'] = df['Longitude'].apply(convert_longitude_dms_to_dd)

In [83]:
# Strip the degree sign from CourseOverGround values (e.g. '098.6°') and
# convert them to floats; values that are not numeric become None.
df['CourseOverGround'] = df['CourseOverGround'].apply(convert_to_float_or_na)

In [84]:
# Strip the degree sign from Heading values (e.g. '099°') and convert them to
# floats; non-numeric entries such as 'Unavailable' become None.
df['Heading'] = df['Heading'].apply(convert_to_float_or_na)

In [86]:
# Coerce UTCTimeStamp to integers: values that are not numeric become NaN
# under errors='coerce' and are then replaced with the default value 0.
# NOTE(review): 'UTCTimeStamp' is also listed in columns_to_remove in a later
# cell, so this conversion may be redundant — confirm it is still needed.
df['UTCTimeStamp'] = pd.to_numeric(df['UTCTimeStamp'], errors='coerce').fillna(0).astype(int)


In [23]:
# Columns dropped from df before it is written out.
columns_to_remove = [
    'UTCTimeStamp',
    'SpecialManoeuvreIndicator',
    'Spare',
    'RAIMFlag',
    'CommunicationStateSelectorFlag',
    'CommSyncState',
    'CommStateSlotTimeOut',
    'CommStateSubMessage',
    'CommStateSlotIncrement',
    'CommStateNumberOfSlots',
    'CommStateKeepFlag',
]
df = df.drop(columns=columns_to_remove)

In [24]:
# Preview the first rows of df.
# NOTE(review): the recorded output of this cell still shows raw text values
# ('Not repeated', '11.7 knots', "34° 44.3046' S", ...), so the conversion
# cells had not been applied to this df when it was displayed — confirm the
# notebook is run top to bottom before exporting.
df.head()

Unnamed: 0,UTCPortTime,Port,SourceMMSI,MessageID,RepeatIndicator,NavigationalStatus,RateOfTurn,SpeedOverGround,PositionAccuracy,Latitude,Longitude,CourseOverGround,Heading
0,2024-02-22 00:00:01,1,271002721,1,Not repeated,0 - under way using engine,Turning Right 000°/min,11.7 knots,Low (>10m),34° 44.3046' S,016° 01.3110' E,098.6°,099°
1,2024-02-22 00:00:07,1,563140400,1,Not repeated,0 - under way using engine,Turning Right 011°/min,11.1 knots,Low (>10m),36° 00.6191' S,018° 36.5918' E,080.5°,080°
2,2024-02-22 00:00:17,1,636021527,1,Not repeated,0 - under way using engine,Turning Right at more than 005°/30s (No TI ava...,8.6 knots,High (<=10m),34° 29.9695' S,015° 58.2794' E,281.8°,272°
3,2024-02-22 00:15:38,1,636016212,1,Not repeated,0 - under way using engine,Turning Right 000°/min,10.0 knots,Low (>10m),32° 13.8489' S,032° 16.5483' E,062.8°,060°
4,2024-02-22 00:15:48,1,601305800,3,Not repeated,0 - under way using engine,No turn information available,0.0 knots,Low (>10m),28° 47.7290' S,032° 04.7515' E,327.3°,Unavailable


In [25]:
# Write df to output_file as CSV; index=False leaves the row index out of the file.
output_file = 'RSA_COASTDATA 6.csv'
df.to_csv(output_file, index=False)

In [26]:
# Report the name of the file that was just written.
print(f"cleaned saved data to {output_file}")

cleaned saved data to RSA_COASTDATA 6.csv
