In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import os

# Load every CSV file found directly under `directory` into one DataFrame.
# The directory is read exactly once: an earlier version of this cell loaded
# the same files twice (glob, then os.listdir) and discarded the first result.
directory = 'rawcsv/twins'
csv_files = sorted(
    os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')
)
if not csv_files:
    # pd.concat would otherwise fail with a less helpful "No objects to concatenate".
    raise FileNotFoundError(f"No .csv files found in {directory!r}")

# A generator avoids building an intermediate list of frames before concat.
df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# NOTE(review): LMST is handled as text further down (via .str), so this sort is
# lexicographic; that matches time order only while the fields stay zero-padded
# as in the recorded output -- confirm for the full data set.
df = df.sort_values(by='LMST')
print(df['LMST'].min())
print(df['LMST'].max())
print(df.shape)

00004M06:46:33.826
00120M06:00:22.668
(2350433, 25)


In [29]:
# Per-bit weight lists: four entries for the air-temperature flags, three for
# the wind flags.
# NOTE(review): neither list is referenced by the confidence functions visible
# in this section (they weight every bit equally), and the entries do not sum
# to 1 -- confirm whether these are still needed.
AIR_TEMP_WEIGHTS = [0.2] * 4
WIND_WEIGHTS = [0.2] * 3

def air_temp_confidence(val):
    """Convert a 4-bit operational-flag value into a confidence in [0, 1].

    The confidence is the fraction of bits that are set (e.g. ``1101`` -> 0.75).

    Parameters
    ----------
    val : str, int, float or missing
        The flag pattern. Numeric inputs are accepted because a column of bit
        patterns is typically loaded as integers (or floats when values are
        missing), which strips leading zeros: ``0011`` arrives as ``11`` or
        ``11.0``. Such values are zero-padded back to 4 digits.

    Returns
    -------
    float
        ``sum(bits) / 4``, or ``np.nan`` when ``val`` is missing or is not a
        valid pattern of exactly four ``0``/``1`` characters.
    """
    n_bits = 4

    # Missing input stays missing.
    if pd.isna(val):
        return np.nan

    bitstr = str(val).strip()

    # Floats such as 1101.0 render with a trailing ".0".
    if bitstr.endswith(".0"):
        bitstr = bitstr[:-2]

    # Restore leading zeros lost to numeric parsing. Strings are left alone:
    # a short string is treated as malformed rather than silently padded.
    if not isinstance(val, str):
        bitstr = bitstr.zfill(n_bits)

    # Anything that is not exactly n_bits characters drawn from {0, 1} is invalid.
    if len(bitstr) != n_bits or not set(bitstr).issubset({"0", "1"}):
        return np.nan

    return bitstr.count("1") / n_bits

def wind_confidence(val):
    """Convert a 3-bit operational-flag value into a confidence in [0, 1].

    The confidence is the fraction of bits that are set (e.g. ``101`` -> 2/3).

    Parameters
    ----------
    val : str, int, float or missing
        The flag pattern. Numeric inputs are accepted because a column of bit
        patterns is typically loaded as integers (or floats when values are
        missing), which strips leading zeros: ``011`` arrives as ``11`` or
        ``11.0``. Such values are zero-padded back to 3 digits.

    Returns
    -------
    float
        ``sum(bits) / 3``, or ``np.nan`` when ``val`` is missing or is not a
        valid pattern of exactly three ``0``/``1`` characters.
    """
    n_bits = 3

    # Missing input stays missing.
    if pd.isna(val):
        return np.nan

    bitstr = str(val).strip()

    # Floats such as 101.0 render with a trailing ".0".
    if bitstr.endswith(".0"):
        bitstr = bitstr[:-2]

    # Restore leading zeros lost to numeric parsing. Strings are left alone:
    # a short string is treated as malformed rather than silently padded.
    if not isinstance(val, str):
        bitstr = bitstr.zfill(n_bits)

    # Anything that is not exactly n_bits characters drawn from {0, 1} is invalid.
    if len(bitstr) != n_bits or not set(bitstr).issubset({"0", "1"}):
        return np.nan

    return bitstr.count("1") / n_bits

In [30]:

# --- Derived per-sample columns ---------------------------------------------
# Row-wise mean of the BASE/MID/TIP rod temperature columns for each prefix.
for prefix in ("BMY", "BPY"):
    rod_cols = [f"{prefix}_{pos}_ROD_TEMP" for pos in ("BASE", "MID", "TIP")]
    ave_col = f"{prefix}_AVE_ROD_TEMP"
    # The source columns are dropped just below, so when this cell is executed
    # a second time they no longer exist. Keep the already-computed average in
    # that case instead of failing with a KeyError; on a first run (no average
    # yet) missing source columns still raise as before.
    if ave_col not in df.columns or set(rod_cols).issubset(df.columns):
        df[ave_col] = df[rod_cols].mean(axis=1)

# Remove the columns that are no longer needed; errors="ignore" tolerates
# columns that are already absent.
df = df.drop(columns=["BMY_BASE_ROD_TEMP", "BMY_MID_ROD_TEMP", "BMY_TIP_ROD_TEMP", "BPY_BASE_ROD_TEMP", "BPY_MID_ROD_TEMP", "BPY_TIP_ROD_TEMP", "AOBT", "SCLK", "BMY_WIND_FREQUENCY", "BPY_AIR_TEMP_FREQUENCY"], errors="ignore")

# Turn each operational-flag bit pattern into a 0-1 confidence value.
df["BMY_AIR_CONF"] = df["BMY_AIR_TEMP_OPERATIONAL_FLAGS"].apply(air_temp_confidence)
df["BPY_AIR_CONF"] = df["BPY_AIR_TEMP_OPERATIONAL_FLAGS"].apply(air_temp_confidence)

df["BMY_WIND_CONF"] = df["BMY_WS_OPERATIONAL_FLAGS"].apply(wind_confidence)
df["BPY_WIND_CONF"] = df["BPY_WS_OPERATIONAL_FLAGS"].apply(wind_confidence)

# Split LMST (e.g. "00004M06:46:33.826") into integer sol / hour / minute.
# expand=False makes the single-group extract return a Series, not a
# one-column DataFrame.
df["sol"] = df["LMST"].str.extract(r"(\d+)M", expand=False).astype(int)

# Two capture groups -> a two-column frame: 0 is the hour, 1 is the minute.
time_part = df["LMST"].str.extract(r"M(\d+):(\d+):")
df["hour"] = time_part[0].astype(int)
df["minute"] = time_part[1].astype(int)

# --- One row per (sol, hour, minute) ------------------------------------------
# Average every numeric column within each minute; non-numeric columns are
# left out by numeric_only=True.
# NOTE(review): any raw *_OPERATIONAL_FLAGS column that was loaded as a number
# is averaged here like an ordinary measurement -- confirm that is intended.
minute_avg = (
    df
    .groupby(["sol", "hour", "minute"])
    .mean(numeric_only=True)
    .reset_index()
)

# Rebuild a minute-resolution label in the same zero-padded layout as LMST,
# e.g. "00004M06:46".
minute_avg["LMST_minute"] = (
    minute_avg["sol"].astype(str).str.zfill(5) + "M" +
    minute_avg["hour"].astype(str).str.zfill(2) + ":" +
    minute_avg["minute"].astype(str).str.zfill(2)
)

# Defensive: remove LMST if it survived the aggregation.
minute_avg = minute_avg.drop(columns=["LMST"], errors="ignore")

# Sanity checks. These are printed explicitly because a bare expression only
# renders when it is the last statement of a cell, which is not the case here.
print(minute_avg.head())
print(minute_avg.shape)
# Distribution of the number of minute rows available per sol.
print(minute_avg.groupby("sol").size().describe())

minute_avg.to_csv("twins_calib_minute_avg.csv", index=False)