# 1 – Cleaning and Tidying of Orbital Element and Maneuver Data

This module performs initial data cleaning and organization. It consolidates the raw CSV files of orbital elements (one per satellite) and the maneuver logs into single structured datasets, and formats the temporal and identification fields for downstream analysis.

Key operations include:
- Reading raw orbital element files with consistent structure.
- Parsing the 'epoch' column into a full datetime format.
- Creating an additional 'epoch_date' column with date-only values to support daily modeling.
- Tagging each observation with its corresponding satellite name.
- Exporting the cleaned and tidy orbital elements to a single consolidated CSV file.

A similar process is applied to maneuver logs:
- A new column is created for maneuver duration in minutes.
- Both full timestamps and date-only fields are preserved for flexible alignment with orbital data.
- Only relevant columns are retained in the output.

This stage prepares the data for resampling, transformation, and modeling in later preprocessing steps.

In [50]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import datetime, timedelta

# Orbital Elements Cleaning

In [88]:
# Folder holding the raw orbital-element CSV files (one file per satellite).
# NOTE: this is an absolute, machine-specific path; edit it before running
# the notebook on another machine.
OE_path = r"C:\Users\Suare\satellite_research_project\orbital_elements"

In [90]:
# Destination folder for the cleaned outputs. It is defined here because this
# cell previously used `output_path` without any cell assigning it, which
# raised a NameError on a fresh kernel (Restart & Run All).
output_path = r"C:\Users\Suare\satellite_anomaly_project\data\cleaned"
os.makedirs(output_path, exist_ok=True)

# List all CSV files in the folder. os.listdir() returns names in arbitrary
# order, so sort them to make the row order of the output reproducible.
csv_files = sorted(f for f in os.listdir(OE_path) if f.endswith('.csv'))
if not csv_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in: {OE_path}")

# One tidy DataFrame per satellite file
dfs = []

for file in csv_files:
    file_path = os.path.join(OE_path, file)

    # The first column holds the observation timestamp; give it a stable name
    satellite_df = pd.read_csv(file_path)
    satellite_df = satellite_df.rename(columns={satellite_df.columns[0]: 'epoch'})

    # Keep the full timestamp and add a midnight-floored copy for daily modeling
    satellite_df['epoch'] = pd.to_datetime(satellite_df['epoch'])
    satellite_df['epoch_date'] = satellite_df['epoch'].dt.floor('D')

    # The satellite name is the file name without its extension
    satellite_df['satellite_name'] = os.path.splitext(file)[0]

    dfs.append(satellite_df)

# Stack all satellites into a single frame with a fresh 0..n-1 index
consolidated_df = pd.concat(dfs, ignore_index=True)

# Save to CSV
output_file = os.path.join(output_path, "all_satellite_orbitals.csv")
consolidated_df.to_csv(output_file, index=False)

print(f"Consolidated CSV saved at: {output_file}")

Consolidated CSV saved at: C:\Users\Suare\satellite_anomaly_project\data\cleaned\all_satellite_orbitals.csv


In [92]:
# Read the consolidated file back from disk and preview its first rows
file_path = r"C:\Users\Suare\satellite_anomaly_project\data\cleaned\all_satellite_orbitals.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,epoch,eccentricity,argument of perigee,inclination,mean anomaly,Brouwer mean motion,right ascension,epoch_date,satellite_name
0,2010-04-25 12:13:31.467936,0.00119,3.773156,1.606104,-3.772437,0.063403,5.077397,2010-04-25,CryoSat_2
1,2010-04-26 13:01:57.579456,0.001207,3.69766,1.60611,-3.696826,0.063403,5.08176,2010-04-26,CryoSat_2
2,2010-04-27 22:06:32.422176,0.001224,3.603564,1.606097,-3.602503,0.063403,5.087575,2010-04-27,CryoSat_2
3,2010-04-28 12:59:36.035519,0.001233,3.566036,1.606094,-3.564917,0.063403,5.090192,2010-04-28,CryoSat_2
4,2010-04-29 00:34:12.213407,0.001252,3.52915,1.606097,-3.52792,0.063403,5.092225,2010-04-29,CryoSat_2


In [98]:
# Per-satellite overview: number of observations and first/last epoch date
per_satellite = df.groupby('satellite_name')
summary_table = per_satellite.agg(
    observations=pd.NamedAgg(column='epoch', aggfunc='count'),
    min_epoch=pd.NamedAgg(column='epoch_date', aggfunc='min'),
    max_epoch=pd.NamedAgg(column='epoch_date', aggfunc='max'),
)
summary_table = summary_table.reset_index()
summary_table

Unnamed: 0,satellite_name,observations,min_epoch,max_epoch
0,CryoSat_2,4308,2010-04-25,2022-09-28
1,Fengyun_2D,1187,2011-01-27,2015-04-16
2,Fengyun_2E,2375,2011-03-13,2018-10-21
3,Fengyun_2F,2985,2012-09-06,2022-01-11
4,Fengyun_2H,1053,2019-01-13,2022-01-24
5,Fengyun_4A,1305,2018-05-17,2022-02-27
6,Haiyang_2A,2998,2011-10-08,2020-06-02
7,Jason_1,3996,2001-12-22,2013-06-06
8,Jason_2,3921,2008-07-04,2019-09-27
9,Jason_3,2410,2016-01-31,2022-10-03


# Maneuver Tidying and Cleaning

In [24]:
# Lookup from maneuver file name to the satellite key used for
# cross-referencing with the orbital-element data.
ORBITAL_MAPPING = dict([
    ("cs2man.txt", "CryoSat_2"),
    ("manFY2D.txt.fy", "Fengyun_2D"),
    ("manFY2E.txt.fy", "Fengyun_2E"),
    ("manFY2F.txt.fy", "Fengyun_2F"),
    ("manFY2H.txt.fy", "Fengyun_2H"),
    ("manFY4A.txt.fy", "Fengyun_4A"),
    ("h2aman.txt", "Haiyang_2A"),
    ("ja1man.txt", "Jason_1"),
    ("ja2man.txt", "Jason_2"),
    ("ja3man.txt", "Jason_3"),
    ("srlman.txt", "SARAL"),
    ("s3aman.txt", "Sentinel_3A"),
    ("s3bman.txt", "Sentinel_3B"),
    ("s6aman.txt", "Sentinel_6A"),
    ("topman.txt", "TOPEX"),
])

In [26]:
def normalize_datetime_format(datetime_str):
    """Return the leading 'YYYY-MM-DDTHH:MM:SS' part of a timestamp string.

    Every '/' is first turned into '-'. The pattern is anchored at the start
    of the string; anything following the seconds is discarded.

    Args:
        datetime_str: Raw timestamp text.

    Returns:
        str | None: The normalized timestamp, or None when the text does not
        begin with the expected pattern.
    """
    dashed = "-".join(datetime_str.split("/"))
    found = re.match(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', dashed)
    if found is None:
        return None
    return found.group(1)

In [28]:
# Character spans (0-based, end-exclusive) of the integer timing fields in the
# fixed-width maneuver files, listed in output-column order.
_TIMING_FIELD_SPANS = (
    ('start_year', 6, 10),
    ('start_doy', 11, 14),
    ('start_hour', 15, 17),
    ('start_minute', 18, 20),
    ('end_year', 21, 25),
    ('end_doy', 26, 29),
    ('end_hour', 30, 32),
    ('end_minute', 33, 35),
)
_TIMING_FIELD_NAMES = tuple(name for name, _, _ in _TIMING_FIELD_SPANS)


def process_maneuver_file(filepath, filename):
    """
    Parse a plain-text maneuver file into one row per maneuver.

    'topman.txt' is whitespace-delimited and carries no maneuver type,
    parameter type or burn count, so those are filled with "Unknown" / 0.
    Every other file is read by fixed character positions. Blank lines are
    ignored; lines that cannot be parsed are reported and skipped.

    Args:
        filepath: Path to the maneuver file.
        filename: Name of the maneuver file (selects the format and the
            OrbitalKeyName looked up in ORBITAL_MAPPING).

    Returns:
        pandas.DataFrame: One row per parsed line, or None when the file
        does not exist.
    """
    try:
        with open(filepath, 'r') as handle:
            raw_lines = handle.readlines()
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
        return None

    orbital_key = ORBITAL_MAPPING.get(filename, "Unknown")
    is_topex = filename.lower() == "topman.txt"
    records = []

    for raw in raw_lines:
        line = raw.strip()
        if line == "":
            continue

        try:
            if is_topex:
                # Whitespace-delimited: satellite id followed by eight integers
                tokens = line.split()
                if len(tokens) < 9:
                    print(f"Skipping malformed line in {filename}: {line}")
                    continue
                satellite = tokens[0]
                timing = dict(zip(_TIMING_FIELD_NAMES,
                                  (int(tok) for tok in tokens[1:9])))
                maneuver_type = parameter_type = "Unknown"
                num_burns = 0  # not available in topman.txt
            else:
                # Fixed-width layout shared by the remaining .txt files
                satellite = line[0:5].strip()
                timing = {name: int(line[lo:hi])
                          for name, lo, hi in _TIMING_FIELD_SPANS}
                maneuver_type = line[36:39].strip()
                parameter_type = line[40:43].strip()
                num_burns = int(line[44:46])
        except ValueError:
            print(f"Error parsing line in {filename}: {line}")
            continue

        # Key order here fixes the column order of the resulting DataFrame
        records.append({
            'OrbitalKeyName': orbital_key,
            'satellite': satellite,
            **timing,
            'maneuver_type': maneuver_type,
            'parameter_type': parameter_type,
            'num_burns': num_burns,
        })

    return pd.DataFrame(records)

In [30]:
def process_txt_fy_file(filepath, filename):
    """Parse a '.txt.fy' maneuver file into one row per maneuver.

    Each usable line must contain exactly two double-quoted timestamps (start
    and end). The first two whitespace-separated tokens of the line are taken
    as the maneuver type and the satellite identifier. Seconds are not kept:
    only year, day-of-year, hour and minute are stored.

    Args:
        filepath: Path to the maneuver file.
        filename: Name of the file, used to look up OrbitalKeyName in
            ORBITAL_MAPPING.

    Returns:
        pandas.DataFrame: One row per parsed line, or None when the file
        does not exist.
    """
    try:
        with open(filepath, 'r') as source:
            raw_lines = source.readlines()
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
        return None

    orbital_key = ORBITAL_MAPPING.get(filename, "Unknown")
    timestamp_layout = "%Y-%m-%dT%H:%M:%S"
    records = []

    for entry in (raw.strip() for raw in raw_lines):
        if entry == "":
            continue

        try:
            # The start and end timestamps are the two quoted substrings
            quoted = re.findall(r'"(.*?)"', entry)
            if len(quoted) != 2:
                print(f"Skipping malformed line in {filepath}: {entry}")
                continue

            normalized = [normalize_datetime_format(text) for text in quoted]
            if not all(normalized):
                print(f"Skipping malformed line in {filepath}: {entry}")
                continue

            start_dt, end_dt = (datetime.strptime(text, timestamp_layout)
                                for text in normalized)

            # tokens[0] is the maneuver type (e.g. GEO-NS-STATION-KEEPING),
            # tokens[1] the satellite identifier (e.g. 2012-002A)
            tokens = entry.split()
            maneuver_type, satellite = tokens[0], tokens[1]

            records.append({
                'OrbitalKeyName': orbital_key,
                'satellite': satellite,
                'start_year': start_dt.year,
                'start_doy': start_dt.timetuple().tm_yday,
                'start_hour': start_dt.hour,
                'start_minute': start_dt.minute,
                'end_year': end_dt.year,
                'end_doy': end_dt.timetuple().tm_yday,
                'end_hour': end_dt.hour,
                'end_minute': end_dt.minute,
                'maneuver_type': maneuver_type,
                'parameter_type': "Unknown",
                'num_burns': 0,
            })

        except Exception as e:
            print(f"Error processing line in {filepath}: {entry}, Error: {e}")

    return pd.DataFrame(records)

In [32]:
def process_folder(directory):
    """Parse every maneuver file in a folder and stack the results.

    Files ending in '.txt.fy' go to process_txt_fy_file, files ending in
    '.txt' go to process_maneuver_file, and anything else is ignored.
    Entries are visited in sorted name order because os.listdir() returns
    them in arbitrary order, which made the combined row order depend on the
    file system. Sub-directories are skipped even if their name matches.

    Args:
        directory: Path of the folder containing the maneuver files.

    Returns:
        pandas.DataFrame: All parsed rows with a fresh 0..n-1 index, or an
        empty DataFrame when no file produced any rows.
    """
    frames = []

    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        # A directory named like a data file would make open() fail in the parsers
        if not os.path.isfile(filepath):
            continue

        # A name ending in '.txt' can never also end in '.txt.fy', so the two
        # suffix checks are mutually exclusive.
        if filename.endswith(".txt.fy"):
            parsed = process_txt_fy_file(filepath, filename)
        elif filename.endswith(".txt"):
            parsed = process_maneuver_file(filepath, filename)
        else:
            continue  # Skip non-matching files

        # Parsers return None for a missing file and an empty frame when no
        # line could be parsed; neither contributes rows.
        if parsed is not None and not parsed.empty:
            frames.append(parsed)

    if not frames:
        return pd.DataFrame()  # Return empty DataFrame if no data
    return pd.concat(frames, ignore_index=True)

In [38]:
# Run the folder-level parser on the raw maneuver directory
folder_path = r"C:\Users\Suare\satellite_research_project\manoeuvres"
result_df = process_folder(directory=folder_path)

In [40]:
# Preview the first rows of the combined maneuver table
result_df.head()

Unnamed: 0,OrbitalKeyName,satellite,start_year,start_doy,start_hour,start_minute,end_year,end_doy,end_hour,end_minute,maneuver_type,parameter_type,num_burns
0,CryoSat_2,CRYO2,2010,105,17,47,2010,105,17,48,,6,1
1,CryoSat_2,CRYO2,2010,123,17,55,2010,124,0,35,,6,2
2,CryoSat_2,CRYO2,2010,124,18,38,2010,124,18,48,,6,1
3,CryoSat_2,CRYO2,2010,125,17,44,2010,125,17,59,,6,1
4,CryoSat_2,CRYO2,2010,126,18,26,2010,126,18,51,,6,1


In [48]:
# Show the column dtypes of the combined maneuver table
print(result_df.dtypes)

OrbitalKeyName    object
satellite         object
start_year         int64
start_doy          int64
start_hour         int64
start_minute       int64
end_year           int64
end_doy            int64
end_hour           int64
end_minute         int64
maneuver_type     object
parameter_type    object
num_burns          int64
dtype: object


In [52]:
def convert_doy_to_datetime(year, doy, hour, minute):
    """Build a datetime from a year, a 1-based day-of-year, an hour and a minute.

    The result is January 1st of `year` shifted by (doy - 1) days plus the
    given hours and minutes; no range checking is done on the inputs.
    """
    year_start = datetime(year, 1, 1)
    elapsed = timedelta(days=doy - 1, hours=hour, minutes=minute)
    return year_start + elapsed

In [78]:
def _assemble_timestamp(frame, prefix):
    """Combine '<prefix>_year/_doy/_hour/_minute' columns into one datetime Series.

    Vectorized equivalent of "January 1st of the year + (doy - 1) days +
    hours + minutes", replacing a slow row-by-row DataFrame.apply.
    """
    jan_first = pd.to_datetime(
        pd.DataFrame({'year': frame[f'{prefix}_year'], 'month': 1, 'day': 1})
    )
    offset = (
        pd.to_timedelta(frame[f'{prefix}_doy'] - 1, unit='D')
        + pd.to_timedelta(frame[f'{prefix}_hour'], unit='h')
        + pd.to_timedelta(frame[f'{prefix}_minute'], unit='min')
    )
    return jan_first + offset


# Create new datetime columns (re-running this cell simply overwrites them)
result_df['start_datetime'] = _assemble_timestamp(result_df, 'start')

# Date-only version (midnight) of the start timestamp
result_df['start_date'] = result_df['start_datetime'].dt.normalize()

result_df['end_datetime'] = _assemble_timestamp(result_df, 'end')

# Calculate duration in minutes
result_df['duration_minutes'] = (
    result_df['end_datetime'] - result_df['start_datetime']
).dt.total_seconds() / 60

# Keep only the useful columns. `.copy()` makes this an independent frame so
# later edits to it cannot trigger SettingWithCopyWarning or touch result_df.
cleaned_maneuver_df = result_df[[
    'OrbitalKeyName', 'satellite', 'start_datetime',
    'start_date', 'end_datetime', 'duration_minutes',
]].copy()

In [80]:
# Preview the first rows of the trimmed maneuver table
cleaned_maneuver_df.head()

Unnamed: 0,OrbitalKeyName,satellite,start_datetime,start_date,end_datetime,duration_minutes
0,CryoSat_2,CRYO2,2010-04-15 17:47:00,2010-04-15,2010-04-15 17:48:00,1.0
1,CryoSat_2,CRYO2,2010-05-03 17:55:00,2010-05-03,2010-05-04 00:35:00,400.0
2,CryoSat_2,CRYO2,2010-05-04 18:38:00,2010-05-04,2010-05-04 18:48:00,10.0
3,CryoSat_2,CRYO2,2010-05-05 17:44:00,2010-05-05,2010-05-05 17:59:00,15.0
4,CryoSat_2,CRYO2,2010-05-06 18:26:00,2010-05-06,2010-05-06 18:51:00,25.0


In [82]:
# Show the column dtypes of the trimmed maneuver table
print(cleaned_maneuver_df.dtypes)

OrbitalKeyName              object
satellite                   object
start_datetime      datetime64[ns]
start_date          datetime64[ns]
end_datetime        datetime64[ns]
duration_minutes           float64
dtype: object


In [84]:
# Persist the full maneuver table (all columns currently in result_df)
file_path = r"C:\Users\Suare\satellite_anomaly_project\data\cleaned\all_satellite_maneuvers.csv"
result_df.to_csv(path_or_buf=file_path, index=False)

In [86]:
# Export the trimmed maneuver table. This cell previously wrote `result_df`
# again, so cleaned_maneuvers.csv was a duplicate of the full table instead
# of containing only the retained columns.
file_path = r"C:\Users\Suare\satellite_anomaly_project\data\cleaned\cleaned_maneuvers.csv"
cleaned_maneuver_df.to_csv(file_path, index=False)