In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob

# Method 1: Using glob to find all CSV files
def _read_all_csv(paths, pattern):
    """Read every CSV in ``paths`` and stack them into one DataFrame.

    Parameters
    ----------
    paths : list of str
        CSV file paths, read and concatenated in the order given.
    pattern : str
        The glob pattern the paths came from; used only in the error message.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``paths`` is empty (pd.concat would otherwise fail with the
        less helpful "No objects to concatenate").
    """
    if not paths:
        raise FileNotFoundError(f"No CSV files match {pattern!r}")
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps the
# row order of the combined frames reproducible between runs and machines, which
# matters because later aggregation uses 'first' within each group.
twins_csv = sorted(glob.glob('rawcsv/twins/*.csv'))
df = _read_all_csv(twins_csv, 'rawcsv/twins/*.csv')

ps_csv = sorted(glob.glob('rawcsv/ps/*.csv'))
ps_df = _read_all_csv(ps_csv, 'rawcsv/ps/*.csv')

# Method 2: Using os.listdir (alternative)
#directory = 'rawcsv/twins'
#csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
#df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
#df = df.sort_values(by='LMST')
#print(df['LMST'].min())
#print(df['LMST'].max())
#print(df.shape)

KeyboardInterrupt: 

In [None]:
# Discard the columns that play no part in the per-minute pressure averages.
# errors='ignore' keeps this safe when a column is already absent.
ps_df = ps_df.drop(
    columns=["AOBT", "SCLK", "LTST", "UTC", "PRESSURE_FREQUENCY", "PRESSURE_TEMP_FREQUENCY"],
    errors='ignore',
)

# Pull the sol number (digits before "M") and the hour/minute fields
# (the two digit groups after "M") out of the LMST string.
ps_time_part = ps_df["LMST"].str.extract(r"M(\d+):(\d+):")
ps_df["sol"] = ps_df["LMST"].str.extract(r"(\d+)M").astype(int)
ps_df["hour"] = ps_time_part[0].astype(int)
ps_df["minute"] = ps_time_part[1].astype(int)

# One row per (sol, hour, minute): both pressure channels are plain means.
minute_avg_ps = (
    ps_df
    .groupby(["sol", "hour", "minute"])
    .agg(
        PRESSURE=("PRESSURE", "mean"),
        PRESSURE_TEMP=("PRESSURE_TEMP", "mean"),
    )
    .reset_index()
)

# Rebuild a minute-resolution LMST label (zero-padded sol, "M", hh:mm) for readability.
_ps_sol_txt = minute_avg_ps["sol"].astype(str).str.zfill(5)
_ps_hour_txt = minute_avg_ps["hour"].astype(str).str.zfill(2)
_ps_minute_txt = minute_avg_ps["minute"].astype(str).str.zfill(2)
minute_avg_ps["LMST_minute"] = _ps_sol_txt + "M" + _ps_hour_txt + ":" + _ps_minute_txt



In [None]:
# Per-bit weights for reducing an operational-flag bitstring (e.g. 1101) to a confidence value.
# NOTE(review): neither list is referenced by the confidence functions in the visible cells.
AIR_TEMP_WEIGHTS = [0.25] * 4
WIND_WEIGHTS = [0.33] * 3

def _flag_confidence(val, n_bits):
    """Return the fraction of set bits in an ``n_bits``-wide flag value.

    Parameters
    ----------
    val : str, int, float or missing
        The flag as a bitstring ("1101") or as the number a CSV parser made of
        it (1101 or 1101.0).
    n_bits : int
        Expected width of the bitstring.

    Returns
    -------
    float
        ``(number of 1 bits) / n_bits``, or NaN when ``val`` is missing or is
        not a valid ``n_bits``-wide bitstring.
    """
    # Missing data stays missing.
    if pd.isna(val):
        return np.nan

    from_text = isinstance(val, str)
    bitstr = str(val).strip()

    # Flags parsed as floats render like "1101.0"; remove the decimal suffix.
    if bitstr.endswith(".0"):
        bitstr = bitstr[:-2]

    # A flag parsed as a number has lost its leading zeros (0011 -> 11), so
    # restore them. Text input is left as-is so a short string is still
    # rejected as malformed.
    if not from_text:
        bitstr = bitstr.zfill(n_bits)

    if len(bitstr) != n_bits or not set(bitstr).issubset({"0", "1"}):
        return np.nan

    # Simple linear confidence: every bit carries equal weight.
    return bitstr.count("1") / n_bits


def air_temp_confidence(val):
    """Confidence for a 4-bit air-temperature operational flag.

    Returns the fraction of bits set (0.0, 0.25, 0.5, 0.75 or 1.0), or NaN for
    missing or invalid input. Numeric inputs are zero-padded to 4 bits first.
    """
    return _flag_confidence(val, 4)


def wind_confidence(val):
    """Confidence for a 3-bit wind-sensor operational flag.

    Returns the fraction of bits set (0, 1/3, 2/3 or 1), or NaN for missing or
    invalid input. Numeric inputs are zero-padded to 3 bits first.
    """
    return _flag_confidence(val, 3)

In [None]:
##Average the Base-Mid-Tip temperatures to one value, remove old data after
df["BMY_AVE_ROD_TEMP"] = df[
    ["BMY_BASE_ROD_TEMP", "BMY_MID_ROD_TEMP", "BMY_TIP_ROD_TEMP"]
].mean(axis=1)

df["BPY_AVE_ROD_TEMP"] = df[
    ["BPY_BASE_ROD_TEMP", "BPY_MID_ROD_TEMP", "BPY_TIP_ROD_TEMP"]
].mean(axis=1)

df = df.drop(columns=["BMY_BASE_ROD_TEMP", "BMY_MID_ROD_TEMP", "BMY_TIP_ROD_TEMP", "BPY_BASE_ROD_TEMP",
                       "BPY_MID_ROD_TEMP", "BPY_TIP_ROD_TEMP", "AOBT", "SCLK", "BMY_WIND_FREQUENCY", "BPY_AIR_TEMP_FREQUENCY"], errors="ignore")


##Apply operational flags confidence functions
df["BMY_AIR_CONF"] = df["BMY_AIR_TEMP_OPERATIONAL_FLAGS"].apply(air_temp_confidence)
df["BPY_AIR_CONF"] = df["BPY_AIR_TEMP_OPERATIONAL_FLAGS"].apply(air_temp_confidence)

df["BMY_WIND_CONF"] = df["BMY_WS_OPERATIONAL_FLAGS"].apply(wind_confidence)
df["BPY_WIND_CONF"] = df["BPY_WS_OPERATIONAL_FLAGS"].apply(wind_confidence)


# Extract sol, minute and hour from LMST
df["sol"] = df["LMST"].str.extract(r"(\d+)M").astype(int)
time_part = df["LMST"].str.extract(r"M(\d+):(\d+):")
df["hour"] = time_part[0].astype(int)
df["minute"] = time_part[1].astype(int)

# Group and aggregate differently for confidence vs other numeric columns
# Confidence values should NOT be averaged
minute_avg = (
    df
    .groupby(["sol", "hour", "minute"])
    .agg({
        'BMY_HORIZONTAL_WIND_SPEED': 'mean',
        'BMY_WIND_DIRECTION': 'mean',
        'BMY_AVE_ROD_TEMP': 'mean',
        'BMY_AIR_CONF': 'first',  # Take first non-null
        'BMY_WIND_CONF': 'first',
        'BPY_HORIZONTAL_WIND_SPEED': 'mean',
        'BPY_WIND_DIRECTION': 'mean',
        'BPY_AVE_ROD_TEMP': 'mean',
        'BPY_AIR_CONF': 'first',  # Take first non-null
        'BPY_WIND_CONF': 'first'
    })
    .reset_index()
)

##Rebuild orig LMST String for readability
minute_avg["LMST_minute"] = (
    minute_avg["sol"].astype(str).str.zfill(5) + "M" +
    minute_avg["hour"].astype(str).str.zfill(2) + ":" +
    minute_avg["minute"].astype(str).str.zfill(2)
)

#Drop useless columns
minute_avg = minute_avg.drop(columns=["LMST", "BMY_WS_OPERATIONAL_FLAGS", "BMY_AIR_TEMP_FREQUENCY", "BMY_AIR_TEMP_OPERATIONAL_FLAGS",
                                      "BPY_WIND_FREQUENCY", "BPY_WS_OPERATIONAL_FLAGS", "BPY_AIR_TEMP_OPERATIONAL_FLAGS",
                                      "BPY_VERTICAL_WIND_SPEED", "BMY_VERTICAL_WIND_SPEED"], errors="ignore")

#Round down confidence values to 2 decimal points because precision is not needed for this
minute_avg["BPY_WIND_CONF"] = minute_avg["BPY_WIND_CONF"].round(2)
minute_avg["BPY_AIR_CONF"] = minute_avg["BPY_AIR_CONF"].round(2)
minute_avg["BMY_WIND_CONF"] = minute_avg["BMY_WIND_CONF"].round(2)
minute_avg["BMY_AIR_CONF"] = minute_avg["BMY_AIR_CONF"].round(2)

#Sort rows for readability
minute_avg = minute_avg[['LMST_minute', 'sol', 'hour', 'minute'] + 
        [c for c in minute_avg.columns if c.startswith('BMY')] + 
        [c for c in minute_avg.columns if c.startswith('BPY')]]

minute_avg.to_csv("twins_calib_minute_avg.csv", index=False)
minute_avg_ps.to_csv("ps_calib_minute_avg.csv", index=False)

In [None]:
# Combine the per-minute TWINS and pressure tables into one table.
# Both inputs carry an 'LMST_minute' label derived from (sol, hour, minute). It is
# included in the join keys so the result has a single 'LMST_minute' column rather
# than 'LMST_minute_x' / 'LMST_minute_y' copies that are each NaN on rows present
# in only one input. The numeric keys come first so that key ordering is driven
# by sol/hour/minute.
all_readings = pd.merge(
    minute_avg,
    minute_avg_ps,
    on=['sol', 'hour', 'minute', 'LMST_minute'],
    how='outer'  # Use 'outer' to keep all rows from both, 'inner' for only matching rows
)

# Save the result
all_readings.to_csv('combined_minute_avg.csv', index=False)