# Filtering the raw data files

In [11]:
import pandas as pd

# Step 1: Define the input and output file paths
file_path = "aisdk-2025-01-25.csv"  # Replace with your actual file path
output_file_path = file_path.replace(".csv", "_filtered.csv")

# Step 2: Load the CSV file
print(f"Loading data from: {file_path}...")
df = pd.read_csv(file_path, parse_dates=['# Timestamp'])  # Ensure '# Timestamp' is recognized as datetime
print(f"Data loaded successfully. Total rows: {len(df)}")


Loading data from: aisdk-2025-01-25.csv...


  df = pd.read_csv(file_path, parse_dates=['# Timestamp'])  # Ensure '# Timestamp' is recognized as datetime


Data loaded successfully. Total rows: 17422586
Filtering data...
Applying two-minute time gap filtering...


  df = df.groupby('MMSI', group_keys=False).apply(filter_two_minutes)


Filtering complete. Remaining rows: 76337
Saving the filtered data to: aisdk-2025-01-25_filtered.csv...
Filtered data saved successfully: aisdk-2025-01-25_filtered.csv


# Filteration Critera
1) Remove the rows having Longitude = 0
2) Keep only Ship type Cargo, Tanker and if Unknown then check length and if it is greater then 100 then include in data.
3) Remove duplicate rows
4) For each MMSI/vessel, take only 1 record in two minutes time interverl

In [None]:
# Step 3: Filter the data
print("Filtering data...")
# Remove rows where 'Longitude' is 0
df = df[df['Longitude'] != 0]

# Keep rows where 'Ship type' is 'Cargo', 'Tanker', or 'Undefined' (if Length > 100)
df = df[
    (df['Ship type'].isin(['Cargo', 'Tanker'])) |
    ((df['Ship type'] == 'Undefined') & (df['Length'] > 100))
]

# Remove duplicate rows based on 'IMO' and '# Timestamp'
df = df.drop_duplicates(subset=['IMO', '# Timestamp'])

# Sort by 'MMSI' and '# Timestamp' to prepare for time filtering
df = df.sort_values(by=['MMSI', '# Timestamp'])

# Apply two-minute time gap filtration for each MMSI
print("Applying two-minute time gap filtering...")
def filter_two_minutes(group):
    group = group.sort_values(by='# Timestamp')
    return group[group['# Timestamp'].diff().dt.total_seconds().fillna(120) >= 120]

df = df.groupby('MMSI', group_keys=False).apply(filter_two_minutes)


# Remove the un-necessary columns

In [None]:
# Remove unnecessary columns
columns_to_drop = [
    'Width', 'Type of position fixing device', 'Data source type', 
    'A', 'B', 'C', 'D'
]
df = df.drop(columns=columns_to_drop, errors='ignore')  # Ignore errors if columns don't exist

print(f"Filtering complete. Remaining rows: {len(df)}")

# Save the file after flteration

In [None]:
# Step 4: Save the filtered data
print(f"Saving the filtered data to: {output_file_path}...")
df.to_csv(output_file_path, index=False)
print(f"Filtered data saved successfully: {output_file_path}")