In [2]:
import pandas as pd
import re   
from pathlib import Path
import glob
import zipfile

## Process section (`.sec`) files

In [15]:

def sec_to_csv(sec_path, csv_path):
    """Convert one ``.sec`` section file into a CSV of longitude/latitude points.

    Blank lines are ignored. Of the remaining lines, the first is the section
    name (leading ``>`` characters are removed), the second is a metadata line
    that is printed but not parsed, and every following line must hold a
    whitespace-separated ``longitude latitude`` pair.

    Parameters
    ----------
    sec_path : str or path-like
        Section file to read.
    csv_path : str or path-like
        Destination CSV. It always has the columns ``section``, ``longitude``
        and ``latitude``, even when the section contains no coordinates.

    Raises
    ------
    ValueError
        If the file has fewer than two non-blank lines, or a coordinate line
        is not exactly two numbers.
    """
    with open(sec_path, "r") as f:
        lines = [line.strip() for line in f if line.strip()]

    if len(lines) < 2:
        raise ValueError(
            f"{sec_path}: expected a section-name line and a metadata line, "
            f"found {len(lines)} non-blank line(s)"
        )

    section_name = lines[0].lstrip(">")
    metadata = lines[1]  # not parsed here, but can be saved if needed
    print(str(metadata))

    rows = []
    for line in lines[2:]:
        try:
            lon, lat = map(float, line.split())
        except ValueError as exc:
            # Covers both non-numeric tokens and a wrong number of tokens.
            raise ValueError(
                f"{sec_path}: expected a 'longitude latitude' pair, got {line!r}"
            ) from exc
        rows.append({"section": section_name, "longitude": lon, "latitude": lat})

    # Explicit columns keep the CSV header stable when there are no rows.
    df = pd.DataFrame(rows, columns=["section", "longitude", "latitude"])
    df.to_csv(csv_path, index=False)
    print(f"Saved {len(df)} records to {csv_path}")

# Example: convert a single section file to CSV.
# NOTE(review): `frm` is an absolute, machine-specific path; the cell only runs
# where that directory exists. The output is written relative to the working directory.
frm = "D:/DS & ML/MLE/dataset/ewoce/data/ctd/cfgAtlanticSections/A1E.sec"
to = "A1E.csv"
sec_to_csv(frm, to)


1 0.100000 6 0 8
Saved 6 records to A1E.csv


In [None]:

def all_sec_to_single_csv(sec_folder, output_csv):
    """Merge every ``*.sec`` file in a folder into one longitude/latitude CSV.

    Each file is read like this: blank lines are dropped, the first remaining
    line is the section name (leading ``>`` removed), the second is skipped,
    and the rest are whitespace-separated ``longitude latitude`` pairs. Files
    with fewer than three non-blank lines are ignored. Coordinate lines that
    are not exactly two numbers are skipped and counted.

    Parameters
    ----------
    sec_folder : str or path-like
        Folder searched (non-recursively) for ``*.sec`` files.
    output_csv : str or path-like
        Destination CSV with columns ``section``, ``longitude``, ``latitude``.
        Nothing is written when no coordinate could be parsed.
    """
    all_rows = []
    skipped_lines = 0
    # glob returns paths in arbitrary order; sort so the row order is reproducible.
    for sec_path in sorted(glob.glob(f"{sec_folder}/*.sec")):
        with open(sec_path, "r") as f:
            lines = [line.strip() for line in f if line.strip()]
        if len(lines) < 3:
            continue
        section_name = lines[0].lstrip(">")
        for line in lines[2:]:
            try:
                lon, lat = map(float, line.split())
            except ValueError:
                # Non-numeric token or wrong token count: skip the line but
                # keep a tally so the data loss is visible.
                skipped_lines += 1
                continue
            all_rows.append({
                "section": section_name,
                "longitude": lon,
                "latitude": lat,
            })
    if all_rows:
        df = pd.DataFrame(all_rows)
        df.to_csv(output_csv, index=False)
        print(f"Saved {len(df)} records from {df['section'].nunique()} sections to {output_csv}")
    else:
        print("No valid .sec files found or parsed.")
    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed coordinate line(s).")

# Example usage: merge every .sec file of one folder into a single CSV.
# NOTE(review): `sec_folder` is an absolute, machine-specific path; point it at
# the local copy of the data before running.
sec_folder = r"D:/DS & ML/MLE/dataset/ewoce/data/ctd/cfgAtlanticSections"  # update as needed
output_csv = "all_sections_simple.csv"
all_sec_to_single_csv(sec_folder, output_csv)


Saved 128 records from 24 sections to all_sections_simple.csv


In [None]:

# Flatten every CSV found inside the *_ct1.zip archives of the working
# directory into one DataFrame, tagging each row with its archive name.
all_rows = []  # one DataFrame per CSV member (name kept for later cells)

root = Path("")

# Path.glob yields entries in arbitrary order; sort for a reproducible row order.
for cruise_zip in sorted(root.glob("*_ct1.zip")):
    with zipfile.ZipFile(cruise_zip, "r") as z:
        for fname in z.namelist():
            if fname.endswith(".csv"):
                with z.open(fname) as f:
                    # NOTE(review): only '#' lines are skipped here, whereas
                    # `stripping` in this notebook also handles KEY = value
                    # header lines, a units row and END_DATA. Confirm that
                    # read_csv parses these files as intended.
                    df = pd.read_csv(f, comment="#")  # skip header comments
                    # metadata may be in header rows — parse separately if needed
                    df["cruise"] = cruise_zip.stem
                    all_rows.append(df)

if all_rows:
    final_df = pd.concat(all_rows, ignore_index=True)
    final_df.to_parquet("woce_ctd_flat.parquet")
else:
    # pd.concat raises ValueError on an empty list; keep the cell runnable
    # when no archive is present and do not write an empty parquet file.
    final_df = pd.DataFrame()
    print("No *_ct1.zip archives with CSV members found; nothing written.")


## Process CTD files (`*_ct1.csv`)

In [55]:
def stripping(filepath):
    """Parse one WOCE CTD ``*_ct1.csv`` file into a DataFrame.

    The file is scanned top to bottom:

    * Before the column header, the first line starting with ``LATITUDE`` and
      the first line starting with ``LONGITUDE`` are read as ``KEY = value``
      and converted to float. They may appear in either order and need not be
      adjacent.
    * The first line starting with ``CTDPRS`` is the comma-separated column
      header; the line right after it (usually the units row) is skipped.
    * Every following non-empty line that does not start with ``#`` is a data
      row, until a line starting with ``END_DATA`` or the end of the file.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``LATITUDE`` and ``LONGITUDE`` followed by the file's own
        columns. Cells are floats; empty or whitespace-only cells become
        missing values. The frame is empty, with no columns, when no
        ``CTDPRS`` header line is found. A coordinate that is absent from
        the file is left as a missing value.

    Raises
    ------
    ValueError
        If a coordinate or data cell is not numeric, or a data row's length
        does not match the header.
    """
    import pandas as pd

    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [raw.strip() for raw in f]

    coords = {'LATITUDE': None, 'LONGITUDE': None}
    header = None
    data_lines = []

    pending = iter(lines)
    for line in pending:
        if header is None:
            if line.startswith('CTDPRS'):
                # Add LATITUDE and LONGITUDE in front of the file's columns.
                header = ['LATITUDE', 'LONGITUDE'] + [col.strip() for col in line.split(',')]
                next(pending, None)  # skip the next line (usually units)
                continue
            for key in coords:
                # Only the first occurrence of each key is used.
                if coords[key] is None and line.startswith(key):
                    coords[key] = float(line.split('=', 1)[1].strip().strip(','))
            continue

        # Stop at END_DATA
        if line.startswith('END_DATA'):
            break
        # Only process non-empty, non-comment lines
        if line and not line.startswith('#'):
            values = [cell.strip() for cell in line.split(',')]
            # Prepend lat/lon to each row; blank cells become missing values.
            data_lines.append(
                [coords['LATITUDE'], coords['LONGITUDE']]
                + [float(v) if v else None for v in values]
            )

    return pd.DataFrame(data_lines, columns=header)

# Example usage: parse a single cast file from the working directory.
# This binds the notebook-level name `df`, which the following cells reuse.
df = stripping('49UP20140609_00001_00001_ct1.csv')

In [56]:
# Parse a cast file and append it to `df`, printing the shape before and after.
# NOTE(review): this is the same file `df` was built from in the previous cell,
# so every row ends up duplicated. The cell also rebinds `df`, so re-running it
# keeps growing the frame; run the previous cell first to reset it.
df2 = stripping("49UP20140609_00001_00001_ct1.csv")
print(df.shape)
df = pd.concat([df,df2],ignore_index=True)
print(df.shape)

(471, 10)
(942, 10)


In [6]:
# Load the aggregated table from the working directory. This rebinds `df`,
# replacing the frame built in the cells above.
df = pd.read_csv("Aggregated.csv")
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31127 entries, 0 to 31126
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   LATITUDE   31127 non-null  float64
 1   LONGITUDE  31127 non-null  float64
 2   CTDPRS     31127 non-null  float64
 3   CTDTMP     31127 non-null  float64
 4   CTDSAL     31127 non-null  float64
dtypes: float64(5)
memory usage: 1.2 MB
