### Following the MongoDB setup guide

https://www.mongodb.com/resources/products/compatibilities/deploying-a-mongodb-cluster-with-docker


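1) Create a Docker network with `docker network create mongoCluster`.
2) Start a `mongod` instance in Docker on that network with `docker run` (the full command is in the guide linked above).
3) Start 2 more such instances.
4) Use pymongo to connect to the replica set once it has been initiated.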

In [10]:
from pymongo import MongoClient

# Open a direct connection to the node reachable at localhost:27017.
# The replica set itself was initiated and configured beforehand.
client = MongoClient(host='localhost', port=27017, directConnection=True, replicaSet='myReplicaSet')

In [11]:
def print_member_status(client):
    """Print the name and state of every replica-set member.

    Runs the ``replSetGetStatus`` command against the ``admin`` database of
    ``client`` and prints one ``<name> <stateStr>`` line for each entry in
    the reply's ``members`` list.
    """
    status_reply = client.admin.command({'replSetGetStatus': 1})
    for member in status_reply['members']:
        print(member['name'], member['stateStr'])

# Show the current state of each replica-set member as reported through `client`.
print_member_status(client)

mongo1:27017 PRIMARY
mongo2:27017 SECONDARY
mongo3:27017 SECONDARY


# Data

In [13]:
import pandas as pd

# Path of the AIS CSV export; reused later as CSV_FILE_PATH.
file = "aisdk_test.csv"

# Preview only: the first 100 rows are enough to show the structure.
# The tasks further down process the full file.
df_sample = pd.read_csv(file, nrows=100)
df_sample.head()

Unnamed: 0,# Timestamp,Type of mobile,MMSI,Latitude,Longitude,Navigational status,ROT,SOG,COG,Heading,...,Length,Type of position fixing device,Draught,Destination,ETA,Data source type,A,B,C,D
0,01/05/2025 00:00:00,Base Station,2194005,56.34425,4.272,Unknown value,,,,,...,,Surveyed,,Unknown,,AIS,,,,
1,01/05/2025 00:00:00,Base Station,2190064,56.716555,11.519008,Unknown value,,,,,...,,GPS,,Unknown,,AIS,,,,
2,01/05/2025 00:00:00,Class A,244170000,57.848112,10.373403,Under way using engine,0.0,9.6,268.7,266.0,...,,Undefined,,Unknown,,AIS,,,,
3,01/05/2025 00:00:00,Class A,265610940,56.893248,12.488332,Under way using engine,0.0,0.0,302.5,120.0,...,,Undefined,,Unknown,,AIS,,,,
4,01/05/2025 00:00:00,Class A,265610940,56.893248,12.488332,Under way using engine,0.0,0.0,302.5,120.0,...,,Undefined,,Unknown,,AIS,,,,


In [35]:
# Number of columns in the sample frame.
df_sample.shape[1]

26

## Setup: Imports and Configuration

In [15]:
import multiprocessing as mp
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np # For NaN checks if needed, and for histogram binning
import pandas as pd
from pymongo import MongoClient, ASCENDING

# Configuration
# Connection settings shared by every MongoClient created below.
DB_HOST = 'localhost'
DB_PORT = 27017
REPLICA_SET = 'myReplicaSet'  # Same replica-set name as in the connection cell at the top
DB_NAME = 'ais_assignment_db'
RAW_COLLECTION_NAME = 'ais_raw_data'  # Receives the rows loaded from the CSV (Task 2)
FILTERED_COLLECTION_NAME = 'ais_filtered_data'  # Receives the noise-filtered rows (Task 3)
CSV_FILE_PATH = file  # Reuses `file` from the data-preview cell above ('aisdk_test.csv')

CHUNK_SIZE = 10000  # Rows per chunk when the CSV is read for insertion
NUM_PROCESSES = mp.cpu_count()  # Size of each multiprocessing pool used by the tasks

# Echo the effective settings so the run documents itself.
print(f"Using CSV file: {CSV_FILE_PATH}")
print(f"Number of CPU cores available: {NUM_PROCESSES}")
print(f"MongoDB Host: {DB_HOST}:{DB_PORT}, Replica Set: {REPLICA_SET}")

Using CSV file: aisdk_test.csv
Number of CPU cores available: 22
MongoDB Host: localhost:27017, Replica Set: myReplicaSet


## Task 2: Data Insertion in Parallel
This task involves reading data from the CSV file (`aisdk_test.csv`) and inserting it into a MongoDB collection (`ais_raw_data`) in parallel. Timestamps are converted to datetime objects. Each worker process uses its own MongoClient instance via the MongoWorker class.

In [25]:
class MongoWorker:
    """Context manager that gives one worker its own MongoDB connection.

    The connection is opened in ``__enter__`` and closed in ``__exit__``.
    Inside the ``with`` block, ``client``, ``db`` and ``collection`` refer to
    the configured server, database and collection.
    """

    def __init__(self, db_host, db_port, replica_set, db_name, collection_name):
        """Store the connection settings; nothing is connected yet."""
        self.db_host = db_host
        self.db_port = db_port
        self.replica_set = replica_set
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = None
        self.db = None
        self.collection = None

    def __enter__(self):
        """Create the client and resolve the target database and collection."""
        self.client = MongoClient(self.db_host, self.db_port, directConnection=True, replicaSet=self.replica_set)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the client if one was created; exceptions are not suppressed."""
        if self.client:
            self.client.close()

    def insert_chunk(self, chunk_df):
        """Parse the timestamps of ``chunk_df`` and insert its rows.

        The ``'# Timestamp'`` column is parsed with the format
        ``%d/%m/%Y %H:%M:%S``; rows whose timestamp cannot be parsed are
        dropped. The remaining rows are inserted with an unordered
        ``insert_many``. ``chunk_df`` itself is left unmodified.

        Returns:
            The number of records handed to ``insert_many``, or 0 if any
            exception was raised (the error is printed, not re-raised).
        """
        try:
            parsed_timestamps = pd.to_datetime(
                chunk_df['# Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce'
            )
            # Build a new frame instead of assigning / dropping in place, so the
            # caller's DataFrame keeps its original rows and dtypes.
            prepared = (
                chunk_df
                .assign(**{'# Timestamp': parsed_timestamps})
                .dropna(subset=['# Timestamp'])
            )
            records = prepared.to_dict(orient='records')
            print(len(records), "records to insert")
            if records:
                self.collection.insert_many(records, ordered=False)
            return len(records)
        except Exception as e:
            # NOTE(review): with ordered=False some documents may already be
            # stored when insert_many raises; the 0 returned here would then
            # under-report. Confirm whether that is acceptable.
            print(f"Error in worker during insert_chunk: {e}")
            return 0

def process_chunk_for_insertion(chunk_df_tuple):
    """Pool entry point: insert one CSV chunk through its own MongoWorker.

    ``chunk_df_tuple`` is ``(chunk_df, db_host, db_port, replica_set,
    db_name, collection_name)``. Returns whatever
    ``MongoWorker.insert_chunk`` returns for the chunk.
    """
    frame, host, port, rs_name, database, coll_name = chunk_df_tuple
    worker_cm = MongoWorker(host, port, rs_name, database, coll_name)
    with worker_cm as worker:
        inserted = worker.insert_chunk(frame)
    return inserted


In [26]:
# Start from a clean slate: drop the raw collection so a re-run of the insertion
# does not add the same rows on top of an earlier load.
with MongoClient(DB_HOST, DB_PORT, directConnection=True, replicaSet=REPLICA_SET) as main_client:
    db = main_client[DB_NAME]
    print(f"Dropping collection: {RAW_COLLECTION_NAME}...")
    db[RAW_COLLECTION_NAME].drop()
    print(f"Collection {RAW_COLLECTION_NAME} dropped.")

Dropping collection: ais_raw_data...
Collection ais_raw_data dropped.


In [27]:

print(f"Starting data insertion into {RAW_COLLECTION_NAME} from {CSV_FILE_PATH}...")

# Every task carries its chunk plus the connection settings, so a worker can
# open its own client without relying on notebook globals.
connection_settings = (DB_HOST, DB_PORT, REPLICA_SET, DB_NAME, RAW_COLLECTION_NAME)
chunk_iterator = pd.read_csv(CSV_FILE_PATH, chunksize=CHUNK_SIZE)

# NOTE: building the list consumes the iterator, so all chunks are held in
# memory at once before the pool starts.
tasks = [(chunk, *connection_settings) for chunk in chunk_iterator]
len(tasks)


Starting data insertion into ais_raw_data from aisdk_test.csv...


1

In [28]:
# Fan the chunks out to the pool and add up the per-chunk insert counts as
# results arrive (in completion order, not submission order).
total_inserted = 0
with mp.Pool(processes=NUM_PROCESSES) as pool:
    chunk_results = pool.imap_unordered(process_chunk_for_insertion, tasks)
    for chunk_no, inserted_count in enumerate(chunk_results, start=1):
        total_inserted = total_inserted + inserted_count
        print(f"Processed chunk {chunk_no}, {inserted_count} records inserted. Total so far: {total_inserted}")

print(f"Total documents inserted into {RAW_COLLECTION_NAME}: {total_inserted}")

9999 records to insert
 records to insert
Processed chunk 1, 9999 records inserted. Total so far: 9999
Total documents inserted into ais_raw_data: 9999
Processed chunk 1, 9999 records inserted. Total so far: 9999
Total documents inserted into ais_raw_data: 9999


In [29]:
# Display the final insert count as this cell's output.
total_inserted

9999

In [None]:
# Index the raw collection on the fields the later per-vessel queries filter
# and sort by. The client is opened in a `with` block, like in the other
# cells, so the connection is closed as soon as the indexes exist.
with MongoClient(DB_HOST, DB_PORT, directConnection=True, replicaSet=REPLICA_SET) as main_client:
    db = main_client[DB_NAME]
    raw_collection = db[RAW_COLLECTION_NAME]

    # Count once and reuse the value for both the printout and the guard.
    raw_document_count = raw_collection.count_documents({})
    print(raw_document_count)
    if raw_document_count > 0:
        print(f"Creating indexes on {RAW_COLLECTION_NAME} for MMSI and # Timestamp...")
        raw_collection.create_index([("MMSI", ASCENDING)])
        raw_collection.create_index([("# Timestamp", ASCENDING)])
        raw_collection.create_index([("MMSI", ASCENDING), ("# Timestamp", ASCENDING)])
        print("Indexes created on raw data collection.")

9999
Creating indexes on ais_raw_data for MMSI and # Timestamp...
Indexes created on raw data collection.


## Task 3: Data Noise Filtering in Parallel
This task filters noise from the `ais_raw_data` collection. 
1. Vessels with fewer than 100 data points in `ais_raw_data` are identified and excluded.
2. For the remaining vessels, data points are filtered based on missing or invalid essential fields: 
   - Required fields: `MMSI`, `Latitude`, `Longitude`, `ROT`, `SOG`, `COG`, `Heading` (must not be missing/NaN).
   - `Navigational status` must not be 'Unknown value' or missing.
The filtered data is stored in `ais_filtered_data`. Each worker process handles a distinct MMSI.

In [None]:
def _is_missing(value):
    """Return True when ``value`` is ``None`` or a float NaN."""
    return value is None or (isinstance(value, float) and np.isnan(value))


def _is_valid_ais_point(point):
    """Return True when the document ``point`` passes the noise filter.

    A point is kept only if none of the required fields is missing and its
    ``'Navigational status'`` is neither missing nor ``'Unknown value'``.
    """
    required_fields = ('MMSI', 'Latitude', 'Longitude', 'ROT', 'SOG', 'COG', 'Heading')
    if any(_is_missing(point.get(field)) for field in required_fields):
        return False
    nav_status = point.get('Navigational status')
    # A status that was empty in the CSV arrives here as NaN rather than None,
    # so it needs the same missing-value check as the numeric fields.
    return not _is_missing(nav_status) and nav_status != 'Unknown value'


def process_mmsi_for_filtering(filter_args):
    """Pool entry point: copy the valid data points of one vessel.

    ``filter_args`` is ``(mmsi, db_host, db_port, replica_set, db_name,
    raw_coll_name, filtered_coll_name)``. All documents of ``mmsi`` are read
    from the raw collection in ascending ``'# Timestamp'`` order, the ones
    that pass ``_is_valid_ais_point`` are inserted into the filtered
    collection, and the number of inserted documents is returned.
    """
    mmsi, db_host, db_port, replica_set, db_name, raw_coll_name, filtered_coll_name = filter_args
    with MongoWorker(db_host, db_port, replica_set, db_name, raw_coll_name) as raw_worker:
        vessel_data = raw_worker.collection.find({'MMSI': mmsi}).sort('# Timestamp', ASCENDING)
        filtered_points_for_mmsi = [point for point in vessel_data if _is_valid_ais_point(point)]

        if filtered_points_for_mmsi:
            # Both collections live in the same database, so the connection
            # that is already open is reused instead of opening a second client.
            filtered_collection = raw_worker.db[filtered_coll_name]
            filtered_collection.insert_many(filtered_points_for_mmsi, ordered=False)
        return len(filtered_points_for_mmsi)

if __name__ == '__main__':
    # Phase 1 (parent process): reset the target collection and decide which
    # vessels are filtered. The connection is closed at the end of this phase,
    # so this cell does not hold a client open while worker processes run.
    raw_is_empty = True
    mmsis_to_filter = []
    with MongoClient(DB_HOST, DB_PORT, directConnection=True, replicaSet=REPLICA_SET) as main_client:
        db = main_client[DB_NAME]
        raw_collection = db[RAW_COLLECTION_NAME]

        print(f"Dropping collection: {FILTERED_COLLECTION_NAME}...")
        db[FILTERED_COLLECTION_NAME].drop()
        print(f"Collection {FILTERED_COLLECTION_NAME} dropped.")

        raw_is_empty = raw_collection.count_documents({}) == 0
        if not raw_is_empty:
            print("Identifying vessels with >= 100 data points from raw data...")
            # Count documents per MMSI and keep the vessels with at least 100.
            pipeline = [
                {'$group': {'_id': '$MMSI', 'count': {'$sum': 1}}},
                {'$match': {'count': {'$gte': 100}}}
            ]
            mmsi_docs_to_process = list(raw_collection.aggregate(pipeline))
            mmsis_to_filter = [doc['_id'] for doc in mmsi_docs_to_process]
            print(f"Found {len(mmsis_to_filter)} vessels with >= 100 data points to filter.")

    # Phase 2 (worker pool): one task per vessel; each worker opens its own connection.
    if raw_is_empty:
        print(f"{RAW_COLLECTION_NAME} is empty. Skipping filtering task.")
    elif not mmsis_to_filter:
        print("No vessels found meeting the >= 100 data points criteria for filtering.")
    else:
        filter_tasks = [(mmsi, DB_HOST, DB_PORT, REPLICA_SET, DB_NAME, RAW_COLLECTION_NAME, FILTERED_COLLECTION_NAME) for mmsi in mmsis_to_filter]
        total_filtered_inserted = 0
        with mp.Pool(processes=NUM_PROCESSES) as pool:
            filter_results = pool.imap_unordered(process_mmsi_for_filtering, filter_tasks)
            # enumerate(start=1) supplies the progress counter directly.
            for processed_mmsi_count, inserted_count in enumerate(filter_results, start=1):
                total_filtered_inserted += inserted_count
                print(f"Filtered MMSI {processed_mmsi_count}/{len(mmsis_to_filter)}, {inserted_count} records kept. Total: {total_filtered_inserted}")
        print(f"Total documents inserted into {FILTERED_COLLECTION_NAME}: {total_filtered_inserted}")

    # Phase 3 (parent process): index the filtered collection if it received data.
    with MongoClient(DB_HOST, DB_PORT, directConnection=True, replicaSet=REPLICA_SET) as main_client:
        db = main_client[DB_NAME]
        filtered_collection = db[FILTERED_COLLECTION_NAME]
        if filtered_collection.count_documents({}) > 0:
            print(f"Creating indexes on {FILTERED_COLLECTION_NAME} for MMSI and # Timestamp...")
            filtered_collection.create_index([("MMSI", ASCENDING)])
            filtered_collection.create_index([("# Timestamp", ASCENDING)])
            filtered_collection.create_index([("MMSI", ASCENDING), ("# Timestamp", ASCENDING)])
            print("Indexes created on filtered data collection.")
        else:
            print(f"Skipping index creation as {FILTERED_COLLECTION_NAME} is empty.")

## Task 4: Calculation of Delta t and Histogram Generation
This task calculates the time difference (delta t) in milliseconds between subsequent data points for each vessel in the `ais_filtered_data` collection. A histogram of these delta t values is then generated; only strictly positive values are plotted, and the frequency axis is logarithmic. Each worker process handles delta_t calculation for a distinct MMSI.

In [None]:
def calculate_delta_t_for_mmsi(delta_t_args):
    """Pool entry point: time gaps between consecutive points of one vessel.

    ``delta_t_args`` is ``(mmsi, db_host, db_port, replica_set, db_name,
    filtered_coll_name)``. The vessel's documents are read in ascending
    ``'# Timestamp'`` order and, for each consecutive pair, the difference in
    milliseconds is collected when it is non-negative. Pairs in which either
    timestamp is not a ``datetime`` are reported with a warning and skipped.

    Returns:
        A list of delta t values in milliseconds (possibly empty).
    """
    mmsi, db_host, db_port, replica_set, db_name, filtered_coll_name = delta_t_args
    with MongoWorker(db_host, db_port, replica_set, db_name, filtered_coll_name) as worker:
        ordered_docs = list(worker.collection.find({'MMSI': mmsi}).sort('# Timestamp', ASCENDING))
        gaps_ms = []
        # Walk neighbouring documents pairwise; zip stops one short of the end.
        for i, (earlier, later) in enumerate(zip(ordered_docs, ordered_docs[1:])):
            t1 = earlier['# Timestamp']
            t2 = later['# Timestamp']
            if not (isinstance(t1, datetime) and isinstance(t2, datetime)):
                print(f"Warning: Non-datetime object for MMSI {mmsi}, idx {i}. t1:{type(t1)}, t2:{type(t2)}")
                continue
            delta = (t2 - t1).total_seconds() * 1000
            if delta >= 0:
                gaps_ms.append(delta)
        return gaps_ms

if __name__ == '__main__':
    all_delta_t_values = []
    distinct_mmsis_for_dt = []

    # Phase 1 (parent process): list the vessels present in the filtered data.
    # The connection is closed before the pool starts, so this cell does not
    # hold a client open while worker processes run.
    with MongoClient(DB_HOST, DB_PORT, directConnection=True, replicaSet=REPLICA_SET) as main_client:
        db = main_client[DB_NAME]
        filtered_collection = db[FILTERED_COLLECTION_NAME]

        if filtered_collection.count_documents({}) == 0:
            print(f"{FILTERED_COLLECTION_NAME} is empty. Skipping delta_t calculation.")
        else:
            print(f"Fetching distinct MMSIs from {FILTERED_COLLECTION_NAME} for delta_t...")
            distinct_mmsis_for_dt = filtered_collection.distinct('MMSI')
            print(f"Found {len(distinct_mmsis_for_dt)} distinct MMSIs for delta_t.")

    # Phase 2 (worker pool): one task per vessel; each worker opens its own connection.
    if distinct_mmsis_for_dt:
        delta_t_tasks = [(mmsi, DB_HOST, DB_PORT, REPLICA_SET, DB_NAME, FILTERED_COLLECTION_NAME) for mmsi in distinct_mmsis_for_dt]
        with mp.Pool(processes=NUM_PROCESSES) as pool:
            delta_t_results = pool.imap_unordered(calculate_delta_t_for_mmsi, delta_t_tasks)
            # enumerate(start=1) supplies the progress counter directly.
            for processed_mmsi_dt_count, dt_list_for_mmsi in enumerate(delta_t_results, start=1):
                all_delta_t_values.extend(dt_list_for_mmsi)
                print(f"Delta_t for MMSI {processed_mmsi_dt_count}/{len(distinct_mmsis_for_dt)}. Found {len(dt_list_for_mmsi)} values.")
        print(f"Total delta_t values collected: {len(all_delta_t_values)}")

    # Phase 3: histogram of the collected values.
    if all_delta_t_values:
        # Zero deltas (consecutive points with the same timestamp) are left out
        # of both the plot and the statistics. The log scale below applies to
        # the frequency axis, not to the delta values.
        positive_delta_t = [dt for dt in all_delta_t_values if dt > 0]
        if not positive_delta_t:
            print("No positive delta_t values to plot.")
        else:
            # The figure is created only once there is something to draw, so
            # the "nothing to plot" path leaves no empty figure behind.
            plt.figure(figsize=(12, 7))
            plt.hist(positive_delta_t, bins=100, edgecolor='black')
            plt.yscale('log', nonpositive='clip')
            plt.title('Histogram of Delta t values (ms) between AIS data points')
            plt.xlabel('Delta t (milliseconds)')
            plt.ylabel('Frequency (log scale)')
            plt.grid(True, which="both", linestyle='--')
            print(f"Delta_t stats (ms): Min: {np.min(positive_delta_t):.2f}, Max: {np.max(positive_delta_t):.2f}, Mean: {np.mean(positive_delta_t):.2f}, Median: {np.median(positive_delta_t):.2f}")
            plt.show()
    else:
        print("No delta_t values to plot.")