In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import hmean
import pandas as pd
import numpy as np
from scipy.stats import linregress
import matplotlib.dates as mdates
from scipy.interpolate import UnivariateSpline
import itertools
import gsw
from scipy.optimize import minimize

In [None]:
def process_data(filepath, num_header_rows=14, data_start_marker="# Data"):
    """
    Split one multi-record CSV export into a DataFrame per record code.

    The file is read with ``pd.read_csv`` and must contain a ``Code`` column.
    The first ``num_header_rows`` rows are scanned; the first non-null value of
    each is taken as a record code. For every code, all rows whose ``Code``
    equals it are collected, the first of those rows supplies the column names,
    and the remaining rows become that code's DataFrame.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything ``pd.read_csv`` accepts.
    num_header_rows : int, default 14
        Number of leading rows scanned for record codes.
    data_start_marker : str, default "# Data"
        ``Code`` value of the separator row, which is excluded from the data.

    Returns
    -------
    dict
        ``{code: DataFrame}`` for every scanned code with at least one matching
        row. Values are left exactly as parsed by ``pd.read_csv`` (no numeric
        conversion is done here).
    """
    data_df = pd.read_csv(filepath)

    header_rows = data_df.iloc[:num_header_rows]
    # Only the marker row is removed, so the header rows are still present in
    # data_rows; that is what lets the first matching row act as the header.
    data_rows = data_df[data_df["Code"] != data_start_marker]

    dataframes_dict = {}
    for _, row in header_rows.iterrows():
        non_null_values = row.dropna().values
        if len(non_null_values) == 0:
            # A completely empty row carries no code; skip it instead of crashing.
            continue
        code = non_null_values[0]

        # Copy the selection so relabelling its columns never acts on a slice
        # of the parent frame.
        corresponding_data_rows = data_rows[data_rows["Code"] == code].copy()
        if corresponding_data_rows.empty:
            continue

        # Promote the first matching row to column names, then drop it.
        corresponding_data_rows.columns = corresponding_data_rows.iloc[0].values
        dataframes_dict[code] = corresponding_data_rows.drop(corresponding_data_rows.index[0])

    return dataframes_dict

def create_nested_dictionary(filepaths):
    """
    Run ``process_data`` on every path and gather the results in one dictionary.

    Each result is stored under the second-to-last underscore-separated field
    of the path string (e.g. ``"2502"`` for ``"Data_..._006870_2502_.csv"``).
    If two paths yield the same identifier, the later one replaces the earlier.
    """
    return {path.split("_")[-2]: process_data(path) for path in filepaths}

# Check and convert to datetime if not already
def ensure_datetime(df, column_name):
    """
    Convert ``df[column_name]`` to datetime in place.

    Nothing is done when the column already has a datetime dtype. Returns None.
    """
    already_datetime = pd.api.types.is_datetime64_any_dtype(df[column_name])
    if already_datetime:
        return
    df[column_name] = pd.to_datetime(df[column_name])
        
def remove_outliers(df, column_name):
    """
    Keep only the rows whose ``column_name`` value lies inside the IQR fences.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR (both inclusive), computed
    from the column itself. Rows with NaN in the column are dropped because
    they fail both comparisons. The input frame is not modified.
    """
    values = df[column_name]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    within_fences = (values <= third_quartile + fence_width) & (values >= first_quartile - fence_width)
    return df[within_fences]

def calculate_harmonic_mean(series1, series2):
    """
    Row-wise harmonic mean of two index-aligned series.

    The series are aligned on their index; any label where either value is
    missing is discarded. Returns a Series indexed by the surviving labels.
    """
    paired = pd.concat([series1, series2], axis=1)
    complete_rows = paired.dropna()
    return pd.Series(hmean(complete_rows, axis=1), index=complete_rows.index)


# Function to iterate over timeseries or use a constant value
def iterate_or_constant(value):
    """
    Return something that can be zipped: the sequence itself, or an endless
    repeat of a scalar.

    Lists, numpy arrays (1-d or higher), pandas Series and pandas Index objects
    are returned unchanged. Anything else, including strings and 0-d numpy
    arrays, is wrapped in ``itertools.repeat`` so it pairs with every element
    of the other inputs.
    """
    if isinstance(value, np.ndarray) and value.ndim == 0:
        # A 0-d array is not iterable; treat it as the scalar it holds.
        return itertools.repeat(value.item())
    if isinstance(value, (list, np.ndarray, pd.Series, pd.Index)):
        return value
    return itertools.repeat(value)

# Function using GSW with TEOS-10 conversions
def velocity_timeseries_TEOS10(T_series, P_series_kPa, S_series):
    """
    Sound speed per sample via gsw: SA_from_SP -> CT_from_t -> sound_speed.

    Each argument may be a sequence or a single constant (see
    ``iterate_or_constant``); constants are paired with every element of the
    sequences. Pressure is given in kPa and converted to dbar (x 0.1).

    ``longitude`` and ``latitude`` are read from module-level names that are
    not defined in this part of the notebook; they must exist before calling.

    Returns a list with one sound speed per zipped sample. If every argument
    is a constant, a single-element list is returned.
    """
    inputs = [iterate_or_constant(values) for values in (T_series, P_series_kPa, S_series)]
    if all(isinstance(stream, itertools.repeat) for stream in inputs):
        # Three endless repeats would make zip() run forever; evaluate once.
        inputs = [[T_series], [P_series_kPa], [S_series]]

    velocity_series = []
    for T, P_kPa, S in zip(*inputs):
        P_dbar = P_kPa * 0.1  # Convert kPa to dbar
        SA = gsw.SA_from_SP(S, P_dbar, longitude, latitude)
        CT = gsw.CT_from_t(SA, T, P_dbar)
        velocity_series.append(gsw.sound_speed(SA, CT, P_dbar))
    return velocity_series

# Function using basic T, P, S with GSW
def velocity_timeseries_basic(T_series, P_series_kPa, S_series):
    """
    Sound speed per sample from ``gsw.sound_speed(S, T, P_dbar)`` with the
    inputs passed through unconverted.

    Each argument may be a sequence or a single constant (see
    ``iterate_or_constant``). Pressure is given in kPa and converted to dbar
    (x 0.1).

    NOTE(review): gsw documents ``sound_speed`` as taking Absolute Salinity and
    Conservative Temperature; here S and T are used as given. Confirm that this
    approximation is intended.

    Returns a list with one sound speed per zipped sample. If every argument
    is a constant, a single-element list is returned.
    """
    inputs = [iterate_or_constant(values) for values in (T_series, P_series_kPa, S_series)]
    if all(isinstance(stream, itertools.repeat) for stream in inputs):
        # Three endless repeats would make zip() run forever; evaluate once.
        inputs = [[T_series], [P_series_kPa], [S_series]]

    velocity_series = []
    for T, P_kPa, S in zip(*inputs):
        P_dbar = P_kPa * 0.1  # Convert kPa to dbar
        velocity_series.append(gsw.sound_speed(S, T, P_dbar))
    return velocity_series

# Function using Chen and Millero equation
def velocity_timeseries_chen_millero(T_series, P_series_kPa, S_series):
    """
    Sound speed per sample from ``sound_speed_chen_millero(S, T, P_bar)``.

    Each argument may be a sequence or a single constant (see
    ``iterate_or_constant``). Pressure is given in kPa and converted to bar
    (x 0.01). ``sound_speed_chen_millero`` is not defined in this part of the
    notebook and must exist before calling.

    Returns a list with one sound speed per zipped sample. If every argument
    is a constant, a single-element list is returned.
    """
    inputs = [iterate_or_constant(values) for values in (T_series, P_series_kPa, S_series)]
    if all(isinstance(stream, itertools.repeat) for stream in inputs):
        # Three endless repeats would make zip() run forever; evaluate once.
        inputs = [[T_series], [P_series_kPa], [S_series]]

    velocity_series = []
    for T, P_kPa, S in zip(*inputs):
        P_bar = P_kPa * 0.01  # Convert kPa to bar
        velocity_series.append(sound_speed_chen_millero(S, T, P_bar))
    return velocity_series

# Function to perform linear regression and extrapolation
def fit_and_extrapolate(df, start_time, end_time, target_time, value_column='sea_water_practical_salinity'):
    """
    Fit a straight line to ``value_column`` against time and evaluate it at
    ``target_time``.

    Parameters
    ----------
    df : DataFrame
        Must contain a datetime ``'time'`` column and ``value_column``.
    start_time, end_time : Timestamp
        Inclusive bounds of the rows used for the fit.
    target_time : Timestamp
        Time at which the fitted line is evaluated; it may lie outside the
        fit window.
    value_column : str, default 'sea_water_practical_salinity'
        Column that is regressed against time.

    Returns
    -------
    float
        Value of the fitted line at ``target_time``. Times are converted to
        POSIX seconds with ``Timestamp.timestamp`` for the regression.
    """
    in_window = (df['time'] >= start_time) & (df['time'] <= end_time)
    subset = df[in_window]
    fit = linregress(
        x=subset['time'].map(pd.Timestamp.timestamp),
        y=subset[value_column],
    )
    return fit.slope * target_time.timestamp() + fit.intercept

In [None]:
# List of filepaths to process
filepaths = [
    "Data_230912111909_East_006870_2502_.csv",
    "Data_230912112033_West_006874_2503_.csv",
    "Data_230913091309_North_00687A_2504_.csv",
]

# Parse every export into {identifier: {record code: DataFrame}}
df_dict = create_nested_dictionary(filepaths)

# Per instrument: make the sound speed numeric, drop IQR outliers and keep
# only the time and (renamed) sound-speed columns
result_dfs = {}
identifiers = ['2502', '2503', '2504']

for identifier in identifiers:
    ssp_df = df_dict[identifier]['SSP'].copy()
    ssp_df['SoundSpeed (m/s)'] = pd.to_numeric(ssp_df['SoundSpeed (m/s)'], errors='coerce')
    cleaned_df = remove_outliers(ssp_df, 'SoundSpeed (m/s)')
    result_df = cleaned_df[['Record Time', 'SoundSpeed (m/s)']].rename(
        columns={'SoundSpeed (m/s)': 'SoundSpeed'}
    )
    result_dfs[identifier] = result_df

# Parse 'Record Time' into datetimes for every cleaned frame
for identifier, df in result_dfs.items():
    df['Record Time'] = pd.to_datetime(df['Record Time'])

# Blank out every sound-speed value of 2503
result_dfs['2503']['SoundSpeed'] = np.nan


# Set larger font sizes
plt.rcParams.update({'font.size': 14, 'legend.fontsize': 14})
plt.rcParams['figure.dpi'] = 300
plt.figure(figsize=(12, 5))

# Only 2502 and 2504 are plotted
identifiers = ['2502', '2504']
titles = ["Filtered SSP for 2502", "Filtered SSP for 2504"]
colors = ['blue', 'orange']  # One colour per plotted identifier

for identifier, title, color in zip(identifiers, titles, colors):
    plotted_df = result_dfs[identifier]
    plt.plot(
        plotted_df['Record Time'],
        plotted_df['SoundSpeed'],
        label=title,
        color=color,
        linewidth=2,  # Increased line width
    )

plt.xlabel("Record Time")
plt.ylabel("SoundSpeed (m/s)")
plt.grid(True, linestyle='--', alpha=0.7)  # Lighter grid lines
plt.xticks(rotation=45)
plt.legend(loc='best')  # Optimal legend placement
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
# List of identifiers
identifiers = ["2504", "2503", "2502"]

# Collect the TMP, DQZ and INC frames of each instrument; a record type that
# is absent from the export is stored as None
data_extracted = {}
for identifier in identifiers:
    instrument_frames = df_dict[identifier]
    data_extracted[identifier] = {
        record_code: instrument_frames.get(record_code)
        for record_code in ("TMP", "DQZ", "INC")
    }

# Coerce each measurement column to numbers and drop, in place, the rows that
# fail to parse. Processed in this order for every instrument.
numeric_columns = [
    ("TMP", "Temperature Deg C"),
    ("DQZ", "Pressure (kPa)"),
    ("DQZ", 'Temperature (Deg C)'),
    ("INC", 'Pitch (deg)'),
    ("INC", 'Roll (deg)'),
]
for identifier in identifiers:
    for record_code, column in numeric_columns:
        frame = data_extracted[identifier][record_code]
        if frame is None:
            continue
        frame[column] = pd.to_numeric(frame[column], errors='coerce')
        frame.dropna(subset=[column], inplace=True)

# One row of panels per instrument: temperature on the left, pressure on the right
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15),dpi=300)
fig.tight_layout(pad=6.0)

for idx, identifier in enumerate(identifiers):
    extracted = data_extracted[identifier]

    # Removing outliers and plotting temperature
    if extracted["TMP"] is not None:
        extracted["TMP"] = remove_outliers(extracted["TMP"], "Temperature Deg C")
        temperature_ax = axes[idx, 0]
        temperature_ax.plot(pd.to_datetime(extracted["TMP"]["Record Time"]),
                            extracted["TMP"]["Temperature Deg C"])
        temperature_ax.set_title(f"Temperature Timeseries for {identifier}")
        temperature_ax.set_xlabel("Time")
        temperature_ax.set_ylabel("Temperature (°C)")

    # Removing outliers (pressure first, then DQZ temperature) and plotting pressure
    if extracted["DQZ"] is not None:
        extracted["DQZ"] = remove_outliers(extracted["DQZ"], "Pressure (kPa)")
        extracted["DQZ"] = remove_outliers(extracted["DQZ"], 'Temperature (Deg C)')
        pressure_ax = axes[idx, 1]
        pressure_ax.plot(pd.to_datetime(extracted["DQZ"]["Record Time"]),
                         extracted["DQZ"]["Pressure (kPa)"])
        pressure_ax.set_title(f"Pressure Timeseries for {identifier}")
        pressure_ax.set_xlabel("Time")
        pressure_ax.set_ylabel("Pressure (kPa)")

plt.rcParams.update({'font.size': 14})
plt.show()

In [None]:
# Show the 'Record Time' values of 2502 that repeat an earlier row
record_times_2502 = result_dfs['2502']['Record Time']
record_times_2502[record_times_2502.duplicated()]

## Observed Velocity

In [None]:
# Harmonic-mean frames, keyed '<first id>_<second id>'
harmonic_mean_dfs = {}

# Every pairing of the three instruments
pairs = [('2502', '2503'), ('2502', '2504'), ('2503', '2504')]

for pair in pairs:
    first_id, second_id = pair

    # Index both series by time, keeping the first row of any repeated timestamp
    df1 = result_dfs[first_id].set_index('Record Time')
    df2 = result_dfs[second_id].set_index('Record Time')
    df1 = df1[~df1.index.duplicated(keep='first')]
    df2 = df2[~df2.index.duplicated(keep='first')]

    # Harmonic mean at the timestamps both instruments share
    harmonic_mean = calculate_harmonic_mean(df1['SoundSpeed'], df2['SoundSpeed'])
    harmonic_mean_df = pd.DataFrame({
        'Record Time': harmonic_mean.index,
        'Harmonic Mean': harmonic_mean.values,
    })

    harmonic_mean_dfs[f'{first_id}_{second_id}'] = harmonic_mean_df

# The two pairs involving 2503 are replaced by the single-instrument series of
# the other member (2502 and 2504 respectively)
harmonic_mean_dfs['2502_2503'] = result_dfs['2502'][['Record Time', 'SoundSpeed']]
harmonic_mean_dfs['2503_2504'] = result_dfs['2504'][['Record Time', 'SoundSpeed']]

In [None]:
# Display the harmonic-mean frame of the 2502/2504 pair (columns 'Record Time' and 'Harmonic Mean')
harmonic_mean_dfs['2502_2504']

## Salinity Correction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
from scipy.ndimage import uniform_filter1d


# Load the first two columns of the CSV (time and practical salinity)
file_path = 'ooi-rs03ccal-mj03f-12-ctdpfb305_fedb_e1a6_bd2b.csv'#'ooi-rs03ccal-mj03f-12-ctdpfb305_40d5_df03_3834 (1).csv'
df = pd.read_csv(file_path, usecols=[0, 1])

# Drop the first row (with units) and reset the index
df = df.drop(0).reset_index(drop=True)

# Convert the 'time' column to datetime format
df['time'] = pd.to_datetime(df['time'])

# The units row can leave the salinity column as text; make it numeric so the
# range filter below compares numbers (unparseable entries become NaN and are
# removed by that filter)
df['sea_water_practical_salinity'] = pd.to_numeric(df['sea_water_practical_salinity'], errors='coerce')

# Filter out rows where 'sea_water_practical_salinity' is above 100 or below 34.48
df_filtered = df[(df['sea_water_practical_salinity'] <= 100) & (df['sea_water_practical_salinity'] >= 34.48)].copy()

# Remove timezone information from 'time'. Plain column assignment replaces the
# column (and its dtype) instead of writing into the existing tz-aware one.
df_filtered['time'] = df_filtered['time'].dt.tz_localize(None)

smoothing_start_date = pd.to_datetime('2023-08-01')

# POSITION of the first row at or after the start date. df_filtered keeps the
# labels of df after filtering, so an index label is not a valid position for
# the slices below.
on_or_after_start = (df_filtered['time'] >= smoothing_start_date).to_numpy()
salinity_values = df_filtered['sea_water_practical_salinity'].to_numpy(dtype=float, copy=True)

if on_or_after_start.any():
    smoothing_start_index = int(on_or_after_start.argmax())
    # Moving average over a 250-sample window, applied only from the start position onward
    smoothed_salinity = uniform_filter1d(salinity_values[smoothing_start_index:], size=250)
    salinity_values[smoothing_start_index:] = smoothed_salinity
else:
    # No data at or after the start date: nothing to smooth
    smoothing_start_index = len(df_filtered)
    smoothed_salinity = salinity_values[:0]

# Unchanged head followed by the smoothed tail
df_filtered['sea_water_practical_salinity'] = salinity_values

# Define the range
start_time = pd.to_datetime('2022-08-14 05:49:53')
end_time = pd.to_datetime('2024-09-05 23:59:00')

# Filter the salinity data based on the defined range
mask = (df_filtered['time'] >= start_time) & (df_filtered['time'] <= end_time)
salinity_df = df_filtered.loc[mask].copy()


# Define the discrete points and their actual salinity values
discrete_points = pd.to_datetime(['2022-08-14 05:49:53', '2022-08-30 19:37:13', '2023-09-17 03:35:09'])
actual_salinity_values = [34.52543602, 34.5272034, 34.527000]

# Function to find the closest time in the DataFrame to a given discrete point
def find_closest_time(df, target_time):
    """
    Return the row of ``df`` whose ``'time'`` is nearest to ``target_time``.

    The row is returned as a Series; on ties the first minimal row wins
    (``idxmin`` semantics).
    """
    gaps = (df['time'] - target_time).abs()
    return df.loc[gaps.idxmin()]

# First Correction: shift the whole series by a constant so that the sample
# nearest the second discrete point matches its reference salinity
second_point_data = find_closest_time(salinity_df, discrete_points[1])
second_point_salinity = second_point_data['sea_water_practical_salinity']
salinity_adjustment = actual_salinity_values[1] - second_point_salinity
salinity_df['adjusted_salinity'] = salinity_adjustment + salinity_df['sea_water_practical_salinity']

# Extrapolation function using linregress
def fit_and_extrapolate(df, start_time, end_time, target_time, value_column='adjusted_salinity'):
    """
    Fit a straight line to ``value_column`` against time and evaluate it at
    ``target_time``.

    This definition replaces the earlier ``fit_and_extrapolate`` in the
    notebook; its default fitted column is ``'adjusted_salinity'``.

    Parameters
    ----------
    df : DataFrame
        Must contain a datetime ``'time'`` column and ``value_column``.
    start_time, end_time : Timestamp
        Inclusive bounds of the rows used for the fit.
    target_time : Timestamp
        Time at which the fitted line is evaluated; it may lie outside the
        fit window.
    value_column : str, default 'adjusted_salinity'
        Column that is regressed against time.

    Returns
    -------
    float
        Value of the fitted line at ``target_time``. Times are converted to
        POSIX seconds with ``Timestamp.timestamp`` for the regression.
    """
    in_window = (df['time'] >= start_time) & (df['time'] <= end_time)
    subset = df[in_window]
    fit = linregress(
        x=subset['time'].map(pd.Timestamp.timestamp),
        y=subset[value_column],
    )
    return fit.slope * target_time.timestamp() + fit.intercept

# Second-stage correction: drift between the second and third discrete points.
# A line fitted to 'adjusted_salinity' over the window below is evaluated at the
# third discrete point; its gap to the reference value is the drift to remove.
fit_start_2 = pd.to_datetime('2023-06-15 03:35:09')
fit_end_2 = pd.to_datetime('2024-09-07 23:59:00')
extrapolated_end_salinity = fit_and_extrapolate(salinity_df, fit_start_2, fit_end_2, discrete_points[2])
drift_correction_at_third_point = actual_salinity_values[2] - extrapolated_end_salinity

time_difference = discrete_points[2] - discrete_points[1]


def _second_stage_value(row):
    """Add the drift, ramped from 0 at the second point to its full value at the third; rows outside that span are unchanged."""
    if not (discrete_points[1] <= row['time'] <= discrete_points[2]):
        return row['adjusted_salinity']
    remaining_fraction = (discrete_points[2] - row['time']) / time_difference
    return row['adjusted_salinity'] + drift_correction_at_third_point * (1 - remaining_fraction)


salinity_df.loc[:, 'second_stage_corrected_salinity'] = salinity_df.apply(_second_stage_value, axis=1)

# Final correction: drift between the first and second discrete points.
# The line is again fitted to 'adjusted_salinity' and evaluated at the first point.
fit_start_1 = pd.to_datetime('2022-08-14 15:54:00')
fit_end_1 = pd.to_datetime('2022-09-15 03:35:09')
extrapolated_first_point_salinity = fit_and_extrapolate(salinity_df, fit_start_1, fit_end_1, discrete_points[0])
drift_correction_at_first_point = actual_salinity_values[0] - extrapolated_first_point_salinity

time_difference = discrete_points[1] - discrete_points[0]


def _final_value(row):
    """Add the drift, ramped from its full value at the first point to 0 at the second; rows outside that span are unchanged."""
    if not (discrete_points[0] <= row['time'] <= discrete_points[1]):
        return row['second_stage_corrected_salinity']
    elapsed_fraction = (row['time'] - discrete_points[0]) / time_difference
    return row['second_stage_corrected_salinity'] + drift_correction_at_first_point * (1 - elapsed_fraction)


salinity_df.loc[:, 'corrected_salinity'] = salinity_df.apply(_final_value, axis=1)


In [None]:
# Plot the fully corrected salinity series over the whole selected time range.
plt.figure(figsize=(16, 5))
#plt.plot(salinity_df['time'], salinity_df['sea_water_practical_salinity'], label="Original Salinity", alpha=0.7)
plt.plot(salinity_df['time'], salinity_df['corrected_salinity'], label="After Final Correction", linestyle=':', color = 'black')
#plt.scatter(discrete_points, actual_salinity_values, color='red', marker='o', label="Discrete Data Points")
#plt.plot(discrete_points[0:3], actual_salinity_values[0:3],'-.', label="Discrete Points", color='red')
# NOTE(review): the font size is raised after the line has been drawn, so it
# presumably applies only to text created from here on (the axis labels below)
# and to later figures - confirm this is the intended look.
plt.rcParams.update({'font.size': 18})
plt.xlabel('Time')
plt.ylabel('Salinity (PSU)')
#plt.title('Salinity Corrections over Time with Discrete Data Points')
#plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Zoomed comparison of the salinity before and after correction around the
# first two discrete points. Uses salinity_df, discrete_points and
# actual_salinity_values from the salinity-correction cell.

# Window: 20 days before the first discrete point to 5 days after the second.
# Note that this rebinds start_time / end_time.
start_time = discrete_points[0]- pd.Timedelta(days=20)
end_time = discrete_points[1]+ pd.Timedelta(days=5)


# Filter the DataFrame for the zoomed-in range
zoomed_df = salinity_df[(salinity_df['time'] >= start_time) & (salinity_df['time'] <= end_time)]

# Plotting
plt.figure(figsize=(12, 7))
# Legend labels name what each curve actually is: the uncorrected column and
# the fully corrected column
plt.plot(zoomed_df['time'], zoomed_df['sea_water_practical_salinity'],'--', label="Original Salinity", color='blue')
plt.plot(zoomed_df['time'], zoomed_df['corrected_salinity'],'--', label="After Final Correction", color='orange')
#plt.plot(zoomed_df['time'], zoomed_df['third_stage_corrected_salinity'],'--', label="3rd", color='green')
#plt.plot(discrete_points[0:2], actual_salinity_values[0:2],'-.', label="Discrete Points", color='red')

# Reference salinity at the first two discrete points
plt.scatter(discrete_points[0:2], actual_salinity_values[0:2], label="Discrete Points", color='red')

plt.xlabel('Time')
plt.ylabel('Salinity (PSU)')
plt.title('Salinity Correction between 1st and 2nd Discrete Points')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
# Set after all text of this figure exists, so it affects later figures only
plt.rcParams.update({'font.size': 14})
plt.show()


## Init Storage

#### 2504

In [None]:
import pandas as pd

# Build the per-sample working frame for instrument 2504: DQZ pressure joined
# with TMP temperature, plus DQZ temperature and the corrected salinity.

# Convert the 'Record Time' to datetime and set as index for pressure DataFrame
pressure_df4 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2504']['DQZ']['Record Time']),
    'Pressure (kPa)': data_extracted['2504']['DQZ']['Pressure (kPa)']
}).set_index('Record Time')

# Temperature reported by the DQZ records, indexed by the same record times
DQZtemp_df4 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2504']['DQZ']['Record Time']),
    'Temperature Deg C': data_extracted['2504']['DQZ']['Temperature (Deg C)']
}).set_index('Record Time')

# Convert the 'Record Time' to datetime and set as index for temperature DataFrame
# NOTE(review): a constant +0.351 is added to the TMP temperature for 2504 only;
# the reason is not stated in this notebook section - confirm and document.
temperature_df4 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2504']['TMP']['Record Time']),
    'Temperature Deg C': data_extracted['2504']['TMP']['Temperature Deg C']+0.351
}).set_index('Record Time')

# Pitch and roll from the INC records ('Record Time' stays a regular column here)
INC_df4 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2504']['INC']['Record Time']),
    'Pitch': data_extracted['2504']['INC']['Pitch (deg)'],
    'Roll': data_extracted['2504']['INC']['Roll (deg)']
})

# Merge pressure and temperature data on their indexes (Record Time).
# Left join: every pressure row is kept; temperature is NaN where no TMP row
# has exactly the same timestamp.
combined_df4 = pressure_df4.join(temperature_df4, how='left')
# Keep the first DQZ temperature per timestamp, then align it to combined_df4 by index
DQZtemp_df4 = DQZtemp_df4[~DQZtemp_df4.index.duplicated(keep='first')]
combined_df4['TempDQZ']=DQZtemp_df4['Temperature Deg C']

# Salinity is sampled at different times, so index it by its own time first.
# The 'adjusted_salinity' column here holds salinity_df['corrected_salinity'].
salinity_dfs4 = pd.DataFrame({
    'time': pd.to_datetime(salinity_df['time']),
    'adjusted_salinity': salinity_df['corrected_salinity']
}).set_index('time')

# Reindex the salinity data to the pressure record times, taking the NEAREST
# salinity sample in time (not a forward fill)
aligned_salinity_df4 = salinity_dfs4.reindex(combined_df4.index, method='nearest')

# Add the aligned salinity data to the combined dataframe
combined_df4['Salinity'] = aligned_salinity_df4['adjusted_salinity']

# Reset index to make 'Record Time' a column again, if needed
combined_df4.reset_index(inplace=True)

# NOTE(review): this rename looks like a no-op - the index was named
# 'Record Time' by set_index, so reset_index should already produce a
# 'Record Time' column rather than 'index'. Confirm before removing.
combined_df4.rename(columns={'index': 'Record Time'}, inplace=True)

combined_df4

#### 2503

In [None]:
# Build the per-sample working frame for instrument 2503: DQZ pressure joined
# with TMP temperature, plus the corrected salinity.

# Convert the 'Record Time' to datetime and set as index for pressure DataFrame
pressure_df3 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2503']['DQZ']['Record Time']),
    'Pressure (kPa)': data_extracted['2503']['DQZ']['Pressure (kPa)']
}).set_index('Record Time')

# Convert the 'Record Time' to datetime and set as index for temperature DataFrame
temperature_df3 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2503']['TMP']['Record Time']),
    'Temperature Deg C': data_extracted['2503']['TMP']['Temperature Deg C']
}).set_index('Record Time')

# Pitch and roll from the INC records ('Record Time' stays a regular column here)
INC_df3 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2503']['INC']['Record Time']),
    'Pitch': data_extracted['2503']['INC']['Pitch (deg)'],
    'Roll': data_extracted['2503']['INC']['Roll (deg)']
})

# Merge pressure and temperature data on their indexes (Record Time).
# Left join: every pressure row is kept; temperature is NaN where no TMP row
# has exactly the same timestamp.
combined_df3 = pressure_df3.join(temperature_df3, how='left')

# Salinity is sampled at different times, so index it by its own time first.
# The 'adjusted_salinity' column here holds salinity_df['corrected_salinity'].
salinity_dfs3 = pd.DataFrame({
    'time': pd.to_datetime(salinity_df['time']),
    'adjusted_salinity': salinity_df['corrected_salinity']
}).set_index('time')

# Reindex the salinity data to the pressure record times, taking the NEAREST
# salinity sample in time (not a forward fill)
aligned_salinity_df3 = salinity_dfs3.reindex(combined_df3.index, method='nearest')

# Add the aligned salinity data to the combined dataframe
combined_df3['Salinity'] = aligned_salinity_df3['adjusted_salinity']

# Reset index to make 'Record Time' a column again, if needed
combined_df3.reset_index(inplace=True)

# NOTE(review): this rename looks like a no-op - the index was named
# 'Record Time' by set_index, so reset_index should already produce a
# 'Record Time' column rather than 'index'. Confirm before removing.
combined_df3.rename(columns={'index': 'Record Time'}, inplace=True)

combined_df3

#### 2502

In [None]:
# Build the per-sample working frame for instrument 2502: DQZ pressure joined
# with TMP temperature, plus the corrected salinity.

# Convert the 'Record Time' to datetime and set as index for pressure DataFrame
pressure_df = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2502']['DQZ']['Record Time']),
    'Pressure (kPa)': data_extracted['2502']['DQZ']['Pressure (kPa)']
}).set_index('Record Time')

# Convert the 'Record Time' to datetime and set as index for temperature DataFrame
temperature_df1 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2502']['TMP']['Record Time']),
    'Temperature Deg C': data_extracted['2502']['TMP']['Temperature Deg C']
}).set_index('Record Time')

# Pitch and roll from the INC records ('Record Time' stays a regular column here)
INC_df2 = pd.DataFrame({
    'Record Time': pd.to_datetime(data_extracted['2502']['INC']['Record Time']),
    'Pitch': data_extracted['2502']['INC']['Pitch (deg)'],
    'Roll': data_extracted['2502']['INC']['Roll (deg)']
})

# Merge pressure and temperature data on their indexes (Record Time).
# Left join: every pressure row is kept; temperature is NaN where no TMP row
# has exactly the same timestamp.
combined_df2 = pressure_df.join(temperature_df1, how='left')

# Salinity is sampled at different times, so index it by its own time first.
# The 'adjusted_salinity' column here holds salinity_df['corrected_salinity'].
salinity_dfs2 = pd.DataFrame({
    'time': pd.to_datetime(salinity_df['time']),
    'adjusted_salinity': salinity_df['corrected_salinity']
}).set_index('time')

# Reindex the salinity data to the pressure record times, taking the NEAREST
# salinity sample in time (not a forward fill)
aligned_salinity_df2 = salinity_dfs2.reindex(combined_df2.index, method='nearest')

# Add the aligned salinity data to the combined dataframe
combined_df2['Salinity'] = aligned_salinity_df2['adjusted_salinity']

# Reset index to make 'Record Time' a column again, if needed
combined_df2.reset_index(inplace=True)

# NOTE(review): this rename looks like a no-op - the index was named
# 'Record Time' by set_index, so reset_index should already produce a
# 'Record Time' column rather than 'index'. Confirm before removing.
combined_df2.rename(columns={'index': 'Record Time'}, inplace=True)

combined_df2

## Pitch and Roll 

In [None]:
# Display the pitch/roll frame built for 2504 (columns 'Record Time', 'Pitch', 'Roll')
INC_df4

In [None]:
# Keep only the 2503 samples whose pitch lies strictly between -3.5 and -3 degrees
pitch_in_band = (INC_df3['Pitch'] > -3.5) & (INC_df3['Pitch'] < -3)
INC_dfs3 = INC_df3[pitch_in_band]

# Plotting
plt.figure(figsize=(12, 5))
plt.rcParams.update({'font.size': 16})
plt.scatter(
    INC_dfs3['Record Time'],
    INC_dfs3['Pitch'],
    s=2,
    label='2503 Pitch',
)
plt.xlabel("Time")
plt.ylabel("Pitch (Deg)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend()
plt.show()


In [None]:
# Keep only the 2503 samples with negative roll (this rebinds INC_dfs3)
negative_roll = INC_df3['Roll'] < 0
INC_dfs3 = INC_df3[negative_roll]

# Plotting
plt.figure(figsize=(12, 5))
plt.rcParams.update({'font.size': 16})
plt.scatter(
    INC_dfs3['Record Time'],
    INC_dfs3['Roll'],
    s=2,
    label='2503 Roll',
)
plt.xlabel("Time")
plt.ylabel("Roll (Deg)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend()
plt.show()


## Pressure Correction

In [None]:
#new correction model
import pandas as pd
import numpy as np
from scipy.optimize import least_squares

def parse_to_dataframe(data):
    """
    Parse whitespace-separated tidal prediction text into a DataFrame.

    Every non-blank line must hold seven fields:
    ``Year Month Day Hour Minute Second Value``.

    Parameters
    ----------
    data : str
        Full text of a prediction file.

    Returns
    -------
    DataFrame
        Columns ``'Value'`` (numeric) and ``'DateTime'`` (assembled from the
        six date/time fields), one row per input line.
    """
    date_fields = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']
    # strip() also rejects whitespace-only lines, which would otherwise become
    # an all-None row and break the conversions below
    parsed_data = [line.split() for line in data.splitlines() if line.strip()]
    df = pd.DataFrame(parsed_data, columns=date_fields + ['Value'])
    df['DateTime'] = pd.to_datetime(df[date_fields])
    df['Value'] = pd.to_numeric(df['Value'])
    return df.drop(columns=date_fields)

# Reading the contents of the files
with open('pred_F_2022.txt', 'r') as file_2022, open('pred_F_2023.txt', 'r') as file_2023, open('pred_F_2024.txt', 'r') as file_2024:
    data_2022, data_2023, data_2024 = file_2022.read(), file_2023.read(), file_2024.read()

df_2022, df_2023, df_2024 = (
    parse_to_dataframe(raw_text) for raw_text in (data_2022, data_2023, data_2024)
)

# One continuous tidal series indexed by timestamp
tidal_df = pd.concat([df_2022, df_2023, df_2024], ignore_index=True).set_index('DateTime')

# combined_df4 comes from the 2504 storage cell and supplies
# combined_df4['Pressure (kPa)'] and combined_df4['Record Time']

# Index combined_df4 by timestamp (in place) so the tidal series can be aligned to it
combined_df4['DateTime'] = pd.to_datetime(combined_df4['Record Time'])
combined_df4.set_index('DateTime', inplace=True)

# Optimization function
def optimize_tidal_influence(params, tidal_df, pressure_df):
    """
    Objective for the tidal fit: variance of the tide-corrected pressure.

    Parameters
    ----------
    params : sequence of two floats
        ``(amplitude, rho)``: scale factor applied to the tidal values, and
        the density used in ``rho * 9.81 * height * 0.001``.
    tidal_df : DataFrame
        Time-indexed frame with a ``'Value'`` column. It is not modified.
    pressure_df : DataFrame
        Time-indexed frame with a ``'Pressure (kPa)'`` column.

    Returns
    -------
    float
        ``np.var`` of ``pressure - tidal term``, where each pressure sample
        uses the tidal value nearest to it in time.
    """
    amplitude, rho = params
    # Select the nearest tidal sample first, then scale; same numbers as
    # scaling the whole series, without writing a scratch column into tidal_df
    nearest_tide = tidal_df['Value'].reindex(pressure_df.index, method='nearest')
    tidal_term = (rho * 9.81 * (amplitude * nearest_tide)) * 0.001
    corrected_pressure = pressure_df['Pressure (kPa)'] - tidal_term
    return np.var(corrected_pressure)

# Fit the tidal scaling for 2504 and store the tide-corrected pressure in combined_df4.

# Initial guesses for amplitude and rho. No phase term is fitted: initial_phase
# is assigned but not used anywhere below.
initial_amplitude = 1  # Adjust based on your data
initial_phase = 0      # Adjust based on your data
initial_rho = 1025     # Initial guess for the density of seawater (kg/m^3)

# Perform optimization
# NOTE(review): the objective returns one scalar (a variance), so least_squares
# sees a single residual. amplitude and rho enter the objective only through
# their product, so their individual fitted values are presumably not uniquely
# determined - confirm before interpreting optimized_rho on its own.
result = least_squares(optimize_tidal_influence, x0=[initial_amplitude, initial_rho],
                       args=(tidal_df, combined_df4))

optimized_amplitude, optimized_rho = result.x

# Apply optimized parameters: scaled tide -> pressure term in kPa -> subtract
tidal_df['Adjusted Value'] = optimized_amplitude * tidal_df['Value']
combined_df4['Tidal Influence (kPa)'] = (optimized_rho * 9.81 * tidal_df.reindex(combined_df4.index, method='nearest')['Adjusted Value']) * 0.001
combined_df4['Corrected Pressure (kPa)'] = combined_df4['Pressure (kPa)'] - combined_df4['Tidal Influence (kPa)']

# Proceed with any further analysis or steps as needed
# Reset the index: 'DateTime' becomes a regular column again
# ('Record Time' was never removed from the columns)
combined_df4.reset_index(inplace=True)

# Drop the tidal frame from the namespace; later cells rebuild it
del tidal_df
combined_df4

In [None]:
#new correction model
import pandas as pd
import numpy as np
from scipy.optimize import least_squares

def parse_to_dataframe(data):
    """
    Parse whitespace-separated tidal prediction text into a DataFrame.

    Every non-blank line must hold seven fields:
    ``Year Month Day Hour Minute Second Value``.

    Parameters
    ----------
    data : str
        Full text of a prediction file.

    Returns
    -------
    DataFrame
        Columns ``'Value'`` (numeric) and ``'DateTime'`` (assembled from the
        six date/time fields), one row per input line.
    """
    date_fields = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']
    # strip() also rejects whitespace-only lines, which would otherwise become
    # an all-None row and break the conversions below
    parsed_data = [line.split() for line in data.splitlines() if line.strip()]
    df = pd.DataFrame(parsed_data, columns=date_fields + ['Value'])
    df['DateTime'] = pd.to_datetime(df[date_fields])
    df['Value'] = pd.to_numeric(df['Value'])
    return df.drop(columns=date_fields)

# Reading the contents of the files.
# The 2024 predictions are loaded as in the 2504 correction cell: with
# reindex(method='nearest'), any pressure sample later than the last loaded
# prediction would otherwise silently reuse that last value.
with open('pred_F_2022.txt', 'r') as file_2022, open('pred_F_2023.txt', 'r') as file_2023, open('pred_F_2024.txt', 'r') as file_2024:
    data_2022 = file_2022.read()
    data_2023 = file_2023.read()
    data_2024 = file_2024.read()

df_2022 = parse_to_dataframe(data_2022)
df_2023 = parse_to_dataframe(data_2023)
df_2024 = parse_to_dataframe(data_2024)

# One continuous tidal series indexed by timestamp
tidal_df = pd.concat([df_2022, df_2023, df_2024], ignore_index=True)
tidal_df.set_index('DateTime', inplace=True)

# combined_df3 comes from the 2503 storage cell and supplies
# combined_df3['Pressure (kPa)'] and combined_df3['Record Time']

# Convert 'Record Time' to DateTime and set as index
combined_df3['DateTime'] = pd.to_datetime(combined_df3['Record Time'])
combined_df3.set_index('DateTime', inplace=True)

# Optimization function
def optimize_tidal_influence(params, tidal_df, pressure_df):
    """
    Objective for the tidal fit: variance of the tide-corrected pressure.

    Parameters
    ----------
    params : sequence of two floats
        ``(amplitude, rho)``: scale factor applied to the tidal values, and
        the density used in ``rho * 9.81 * height * 0.001``.
    tidal_df : DataFrame
        Time-indexed frame with a ``'Value'`` column. It is not modified.
    pressure_df : DataFrame
        Time-indexed frame with a ``'Pressure (kPa)'`` column.

    Returns
    -------
    float
        ``np.var`` of ``pressure - tidal term``, where each pressure sample
        uses the tidal value nearest to it in time.
    """
    amplitude, rho = params
    # Select the nearest tidal sample first, then scale; same numbers as
    # scaling the whole series, without writing a scratch column into tidal_df
    nearest_tide = tidal_df['Value'].reindex(pressure_df.index, method='nearest')
    tidal_term = (rho * 9.81 * (amplitude * nearest_tide)) * 0.001
    corrected_pressure = pressure_df['Pressure (kPa)'] - tidal_term
    return np.var(corrected_pressure)

# Fit the tidal scaling for 2503 and store the tide-corrected pressure in combined_df3.

# Initial guesses for amplitude and rho. No phase term is fitted: initial_phase
# is assigned but not used anywhere below.
initial_amplitude = 1  # Adjust based on your data
initial_phase = 0      # Adjust based on your data
initial_rho = 1025     # Initial guess for the density of seawater (kg/m^3)

# Perform optimization
# NOTE(review): the objective returns one scalar (a variance), so least_squares
# sees a single residual. amplitude and rho enter the objective only through
# their product, so their individual fitted values are presumably not uniquely
# determined - confirm before interpreting optimized_rho on its own.
result = least_squares(optimize_tidal_influence, x0=[initial_amplitude, initial_rho],
                       args=(tidal_df, combined_df3))

optimized_amplitude, optimized_rho = result.x

# Apply optimized parameters: scaled tide -> pressure term in kPa -> subtract
tidal_df['Adjusted Value3'] = optimized_amplitude * tidal_df['Value']
combined_df3['Tidal Influence (kPa)'] = (optimized_rho * 9.81 * tidal_df.reindex(combined_df3.index, method='nearest')['Adjusted Value3']) * 0.001
combined_df3['Corrected Pressure (kPa)'] = combined_df3['Pressure (kPa)'] - combined_df3['Tidal Influence (kPa)']

# Proceed with any further analysis or steps as needed
# Reset the index: 'DateTime' becomes a regular column again
# ('Record Time' was never removed from the columns)
combined_df3.reset_index(inplace=True)
combined_df3

In [None]:
#new correction model
import pandas as pd
import numpy as np
from scipy.optimize import least_squares

def parse_to_dataframe(data):
    """
    Parse whitespace-separated tidal prediction text into a DataFrame.

    Every non-blank line must hold seven fields:
    ``Year Month Day Hour Minute Second Value``.

    Parameters
    ----------
    data : str
        Full text of a prediction file.

    Returns
    -------
    DataFrame
        Columns ``'Value'`` (numeric) and ``'DateTime'`` (assembled from the
        six date/time fields), one row per input line.
    """
    date_fields = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']
    # strip() also rejects whitespace-only lines, which would otherwise become
    # an all-None row and break the conversions below
    parsed_data = [line.split() for line in data.splitlines() if line.strip()]
    df = pd.DataFrame(parsed_data, columns=date_fields + ['Value'])
    df['DateTime'] = pd.to_datetime(df[date_fields])
    df['Value'] = pd.to_numeric(df['Value'])
    return df.drop(columns=date_fields)

# Reading the contents of the files.
# The 2024 predictions are loaded as in the 2504 correction cell: with
# reindex(method='nearest'), any pressure sample later than the last loaded
# prediction would otherwise silently reuse that last value.
with open('pred_F_2022.txt', 'r') as file_2022, open('pred_F_2023.txt', 'r') as file_2023, open('pred_F_2024.txt', 'r') as file_2024:
    data_2022 = file_2022.read()
    data_2023 = file_2023.read()
    data_2024 = file_2024.read()

df_2022 = parse_to_dataframe(data_2022)
df_2023 = parse_to_dataframe(data_2023)
df_2024 = parse_to_dataframe(data_2024)

# One continuous tidal series indexed by timestamp
tidal_df = pd.concat([df_2022, df_2023, df_2024], ignore_index=True)
tidal_df.set_index('DateTime', inplace=True)

# combined_df2 comes from the 2502 storage cell and supplies
# combined_df2['Pressure (kPa)'] and combined_df2['Record Time']

# Convert 'Record Time' to DateTime and set as index
combined_df2['DateTime'] = pd.to_datetime(combined_df2['Record Time'])
combined_df2.set_index('DateTime', inplace=True)

# Optimization function
def optimize_tidal_influence(params, tidal_df, pressure_df):
    """Variance of the pressure record after removing a scaled tidal signal.

    Parameters
    ----------
    params : sequence of two floats
        ``(amplitude, rho)``: multiplier applied to ``tidal_df['Value']`` and
        the density factor used in ``rho * 9.81 * tide * 0.001``.
    tidal_df : pandas.DataFrame
        Must have a ``Value`` column; its index is matched to
        ``pressure_df.index`` by nearest label.
    pressure_df : pandas.DataFrame
        Must have a ``Pressure (kPa)`` column.

    Returns
    -------
    float
        ``np.var`` of the corrected pressure series.

    Notes
    -----
    The scaled tide is kept in a local Series. Nothing is written back to
    ``tidal_df``, so repeated evaluations by an optimiser no longer mutate the
    caller's frame.
    """
    amplitude, rho = params
    adjusted_tidal = amplitude * tidal_df['Value']
    # Nearest-label lookup of the tide at every pressure timestamp.
    interpolated_tidal = adjusted_tidal.reindex(pressure_df.index, method='nearest')
    corrected_pressure = pressure_df['Pressure (kPa)'] - (rho * 9.81 * interpolated_tidal) * 0.001
    return np.var(corrected_pressure)

# Initial guesses for amplitude, phase, and rho
initial_amplitude = 1  # Adjust based on your data
initial_phase = 0      # Adjust based on your data (defined but not passed to the optimiser below)
initial_rho = 1025     # Initial guess for the density of seawater (kg/m^3)

# Perform optimization
# NOTE(review): optimize_tidal_influence returns one scalar (a variance), and
# amplitude and rho only ever enter it as the product amplitude * rho, so the
# two fitted values are not separately identifiable -- confirm this is intended.
result = least_squares(optimize_tidal_influence, x0=[initial_amplitude, initial_rho],
                       args=(tidal_df, combined_df2))

optimized_amplitude, optimized_rho = result.x

# Apply optimized parameters
# The tide is scaled, looked up at the nearest pressure timestamp, converted
# with rho * 9.81 * 0.001 and subtracted from the measured pressure.
tidal_df['Adjusted Value2'] = optimized_amplitude * tidal_df['Value']
combined_df2['Tidal Influence (kPa)'] = (optimized_rho * 9.81 * tidal_df.reindex(combined_df2.index, method='nearest')['Adjusted Value2']) * 0.001
combined_df2['Corrected Pressure (kPa)'] = combined_df2['Pressure (kPa)'] - combined_df2['Tidal Influence (kPa)']

# Proceed with any further analysis or steps as needed
# Reset the index so 'DateTime' becomes an ordinary column again
# ('Record Time' was never removed from the columns).
combined_df2.reset_index(inplace=True)
combined_df2

In [None]:
from scipy.interpolate import interp1d
# Create a function to interpolate velocity
def interpolate_velocity(time_series, velocity_df):
    """Linearly interpolate recorded sound speed onto another set of timestamps.

    Parameters
    ----------
    time_series : array-like of datetimes or datetime strings
        Timestamps at which sound speed is wanted.
    velocity_df : pandas.DataFrame
        Needs ``Record Time`` and ``SoundSpeed`` columns. ``Record Time`` is
        converted to datetime *in place*, so the caller's frame is modified.

    Returns
    -------
    numpy.ndarray
        Sound speed at each requested timestamp; values outside the recorded
        span are linearly extrapolated.
    """
    # In-place conversion: the caller's frame keeps the parsed timestamps.
    velocity_df['Record Time'] = pd.to_datetime(velocity_df['Record Time'])

    # Work on integer representations of the timestamps so interp1d can treat
    # time as an ordinary numeric axis.
    known_times = velocity_df['Record Time'].astype('int64')
    wanted_times = pd.to_datetime(time_series).astype('int64')

    speed_at = interp1d(
        known_times,
        velocity_df['SoundSpeed'],
        bounds_error=False,
        fill_value='extrapolate',
    )
    return speed_at(wanted_times)

# Apply interpolation to get velocity values for combined_df4 datetimes
# The recorded sound speed of result_dfs['2504'] is interpolated onto
# combined_df4's timestamps and stored as the 'interp_v' column. Note that
# interpolate_velocity also converts result_dfs['2504']['Record Time'] in place.
combined_df4['Record Time'] = pd.to_datetime(combined_df4['Record Time'])
combined_df4['interp_v'] = interpolate_velocity(combined_df4['Record Time'], result_dfs['2504'])

In [None]:
# Flatten combined_df2's index into columns only when it still carries a real
# index. An unconditional reset on an already-flat frame inserts a junk 'index'
# column, then 'level_0', and raises ValueError on the run after that, which
# makes the cell unsafe to re-run.
if not isinstance(combined_df2.index, pd.RangeIndex):
    combined_df2.reset_index(inplace=True)

In [None]:

# Time series of the corrected salinity (salinity_df is built in another cell).
plt.figure(figsize=(16, 4))
#plt.plot(salinity_df['time'], salinity_df['sea_water_practical_salinity'], label="Original Salinity", alpha=0.7)
plt.plot(salinity_df['time'], salinity_df['corrected_salinity'], label="After Final Correction", linestyle=':', color = 'black')
#plt.scatter(discrete_points, actual_salinity_values, color='red', marker='o', label="Discrete Data Points")
#plt.plot(discrete_points[0:3], actual_salinity_values[0:3],'-.', label="Discrete Points", color='red')
# rcParams is global matplotlib state: this font size also applies to every
# figure drawn by later cells until it is changed again.
plt.rcParams.update({'font.size': 18})
plt.xlabel('Time')
plt.ylabel('Salinity (PSU)')
#plt.title('Salinity Corrections over Time with Discrete Data Points')
#plt.legend()
#plt.xticks(rotation=45)
plt.tight_layout()

plt.show()


In [None]:
# Measured pressure from combined_df2 against record time.
plt.figure(figsize=(16, 4))
# NOTE(review): the label says 2504, but other cells label combined_df2 as
# station 2502 -- confirm which is right (no legend is drawn here, so the
# label is currently invisible).
plt.plot(combined_df2['Record Time'], combined_df2['Pressure (kPa)'], label='2504 Measured Pressure')
plt.rcParams.update({'font.size': 18})
plt.ylabel("Pressure (kPa)")
plt.xlabel('Time')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Temperature from combined_df2 against record time.
plt.figure(figsize=(16, 4))
plt.plot(combined_df2['Record Time'],combined_df2['Temperature Deg C'],label='Temperature', color = 'orange')
plt.rcParams.update({'font.size': 18})  # global setting; persists for later figures
plt.xlabel('Time')
plt.ylabel('Temperature Deg C')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Sound speed as recorded in result_dfs['2502'] against record time.
plt.figure(figsize=(16, 4))
plt.plot(result_dfs['2502']['Record Time'], result_dfs['2502']['SoundSpeed'],label='2502 Recorded Velocity', color = 'green')
plt.rcParams.update({'font.size': 18})  # global setting; persists for later figures
plt.xlabel('Time')
plt.ylabel('Velocity m/s')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Quick schema check: list the columns currently present on combined_df2.
print(combined_df2.columns)

In [None]:
import matplotlib.pyplot as plt

# Create one 16x13 figure holding 4 vertically stacked subplots
fig, axs = plt.subplots(4, 1, figsize=(16, 13))
plt.rcParams.update({'font.size': 24})
# Plot 1: Velocity
# NOTE(review): combined_df2['velocity'] must already exist; in this notebook
# it is assigned in the "Velocity Calculation" section further down.
axs[0].plot(result_dfs['2502']['Record Time'], result_dfs['2502']['SoundSpeed'], label='2502 Recorded Velocity', color='green')
axs[0].plot(combined_df2['Record Time'],combined_df2['velocity'], label='2502 Calculated Velocity', color='black',linewidth=1)
axs[0].set_ylabel('Velocity m/s')
axs[0].tick_params(axis='x', labelbottom=False)  # Remove x-axis labels

# Plot 2: Temperature
axs[1].plot(combined_df2['Record Time'], combined_df2['Temperature Deg C'], label='Temperature', color='orange')
axs[1].set_ylabel('Temperature C°')
axs[1].tick_params(axis='x', labelbottom=False)  # Remove x-axis labels

# Plot 3: Pressure
# NOTE(review): labelled 2504 although the data comes from combined_df2, which
# the velocity panel above labels as 2502 -- confirm.
axs[2].plot(combined_df2['Record Time'], combined_df2['Pressure (kPa)'], label='2504 Measured Pressure')
axs[2].set_ylabel('Pressure (kPa)')
axs[2].tick_params(axis='x', labelbottom=False)  # Remove x-axis labels

# Plot 4: Salinity (only this bottom panel keeps its x tick labels)
axs[3].plot(salinity_df['time'], salinity_df['corrected_salinity'], label="After Final Correction", linestyle=':', color='black')
axs[3].set_xlabel('Time')
axs[3].set_ylabel('Salinity (PSU)')
axs[3].tick_params(axis='x', rotation=45)

plt.tight_layout()

# Display the plot
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Create a figure and a set of subplots
fig, axs = plt.subplots(4, 1, figsize=(12, 12))  # 4 subplots, one above the other

# Adjust the font size
plt.rcParams.update({'font.size': 14, 'legend.fontsize': 14})

# Every panel shows only rows 200-299 of its frame, so the raw and
# tide-corrected curves can be compared over a short window.

# First subplot for combined_df2
axs[0].plot(combined_df2['Record Time'][200:300], combined_df2['Pressure (kPa)'][200:300], color='gold', label='2502 Measured Pressure')
axs[0].plot(combined_df2['Record Time'][200:300], combined_df2['Corrected Pressure (kPa)'][200:300], color='red', label='Measured Pressure - Tidal Model')
axs[0].set_ylabel("Pressure (kPa)")
axs[0].legend()
axs[0].tick_params(axis='x', rotation=45)

# Second subplot for combined_df3
axs[1].plot(combined_df3['Record Time'][200:300], combined_df3['Pressure (kPa)'][200:300], color='green', label='2503 Measured Pressure')
axs[1].plot(combined_df3['Record Time'][200:300], combined_df3['Corrected Pressure (kPa)'][200:300], color='red', label='Measured Pressure - Tidal Model')
axs[1].set_ylabel("Pressure (kPa)")
axs[1].legend()
axs[1].tick_params(axis='x', rotation=45)

# Third subplot for combined_df4
axs[2].plot(combined_df4['Record Time'][200:300], combined_df4['Pressure (kPa)'][200:300], label='2504 Measured Pressure')
axs[2].plot(combined_df4['Record Time'][200:300], combined_df4['Corrected Pressure (kPa)'][200:300], color='red', label='Measured Pressure - Tidal Model')
axs[2].set_ylabel("Pressure (kPa)")
axs[2].legend()
axs[2].tick_params(axis='x', rotation=45)

# Fourth subplot: the modelled tidal pressure itself (taken from combined_df3)
axs[3].plot(combined_df3['Record Time'][200:300], combined_df3['Tidal Influence (kPa)'][200:300], color='black', label='Tidal Model Pressure')
axs[3].set_xlabel("Time")
axs[3].set_ylabel("Pressure (kPa)")
axs[3].legend()
axs[3].tick_params(axis='x', rotation=45)


# Adjust the layout
plt.tight_layout()



# Show the plot
plt.show()


## Temperature Correction

In [None]:
import numpy as np
import pandas as pd
import gsw
from scipy.optimize import minimize
import matplotlib.pyplot as plt


# Assuming combined_df4 and result_dfs are already loaded DataFrames



def velocity_from_teos10(temp, salinity, pressure):
    """Sound speed from in-situ temperature, practical salinity and pressure.

    Parameters
    ----------
    temp : float or array-like
        In-situ temperature passed to ``gsw.CT_from_t`` (deg C).
    salinity : float or array-like
        Practical salinity passed to ``gsw.SA_from_SP``.
    pressure : float or array-like
        Pressure in kPa; converted to dbar with a factor of 0.1.
        NOTE(review): gsw works with sea pressure (absolute minus
        atmospheric) -- confirm the input is already sea pressure.

    Returns
    -------
    float or numpy.ndarray
        Result of ``gsw.sound_speed`` at the fixed station position.
    """
    # Convert pressure from kPa to dbar (1 dbar = 10 kPa)
    pressure_dbar = pressure*0.1
    # Latitude and Longitude
    longitude = -130.01149966
    latitude = 45.95882833

    # Calculate Absolute Salinity (SA) and Conservative Temperature (CT)
    SA = gsw.SA_from_SP(salinity, pressure_dbar, longitude, latitude)
    CT = gsw.CT_from_t(SA, temp, pressure_dbar)

    # gsw.sound_speed takes (SA, CT, p) in that order. The previous call passed
    # (CT, SA, p), which silently evaluated the equation of state with
    # temperature and salinity exchanged.
    return gsw.sound_speed(SA, CT, pressure_dbar)

def objective_function(delta_T, temp_obs, salinity, pressure, velocity_obs):
    """Mean squared misfit between modelled and observed sound speed.

    ``delta_T`` is added to ``temp_obs`` before the sound speed is recomputed
    with ``velocity_from_teos10``; the return value is the mean of the squared
    differences from ``velocity_obs``.
    """
    modelled_velocity = velocity_from_teos10(temp_obs + delta_T, salinity, pressure)
    misfit = modelled_velocity - velocity_obs
    return np.mean(misfit ** 2)

def find_best_temperature_offset(temp_obs, salinity, pressure, velocity_obs):
    """Fit the temperature offset that minimises ``objective_function``.

    Starts the search at an offset of 0 and returns the optimiser's solution
    (``result.x``, an array holding the single fitted offset).
    """
    fit_inputs = (temp_obs, salinity, pressure, velocity_obs)
    return minimize(objective_function, x0=0, args=fit_inputs).x

# Extracting data for analysis
# Plain numpy arrays taken from combined_df4; 'interp_v' is the recorded sound
# speed interpolated onto combined_df4's timestamps in an earlier cell.
temp_obs = combined_df4['Temperature Deg C'].values
salinity = combined_df4['Salinity'].values
pressure = combined_df4['Corrected Pressure (kPa)'].values
interp_v = combined_df4['interp_v'].values

# Finding the best temperature offset
# best_offset is the optimiser's solution array (one element), not a scalar.
best_offset = find_best_temperature_offset(temp_obs, salinity, pressure, interp_v)
print(f"Best-fit Temperature Offset: {best_offset}")

# Calculating velocity with best-fit temperature offset
adjusted_temp = temp_obs + best_offset
calculated_velocity = velocity_from_teos10(adjusted_temp, salinity, pressure)
#calculated_velocity0 = velocity_from_teos10(temp_obs, salinity, pressure)

# Plotting the results for comparison
plt.figure(figsize=(12, 6))
plt.plot(combined_df4['Record Time'], interp_v, label='Interpolated Observed Sound Speed', color='blue')
plt.plot(combined_df4['Record Time'], calculated_velocity, label='Calculated Velocity with Adjusted Temperature', color='red', linestyle='--')
plt.xlabel('Time')
plt.ylabel('Velocity (m/s)')
plt.title('Comparison of Interpolated Observed Sound Speed and Calculated Velocity')
plt.legend()
plt.show()

# Bare expression: only displayed when it is the last statement of its cell.
temp_obs

# Compare the recorded 2504 sound speed with velocities computed for a few
# fixed temperature offsets.
longitude = -130.01149966
latitude = 45.95882833
temp_obs = combined_df4['Temperature Deg C'].values
adjusted_temp = temp_obs + 0.2
# velocity_timeseries_TEOS10 is called as (temperature, pressure_kPa, salinity)
# everywhere else in this notebook; these calls previously passed salinity and
# pressure in the opposite order.
calculated_velocity0 = velocity_timeseries_TEOS10(temp_obs, pressure, salinity)
calculated_velocity1 = velocity_timeseries_TEOS10(temp_obs+0.1, pressure, salinity)
calculated_velocity2 = velocity_timeseries_TEOS10(temp_obs+0.2, pressure, salinity)
calculated_velocity3 = velocity_timeseries_TEOS10(temp_obs+0.3, pressure, salinity)
plt.figure(figsize=(12, 6))
plt.plot(result_dfs['2504']['Record Time'], result_dfs['2504']['SoundSpeed'], label='2504', color='red')

plt.plot(combined_df4['Record Time'], calculated_velocity0, label='Calculated Velocity with Temperature', linestyle='--')
plt.plot(combined_df4['Record Time'], calculated_velocity2, label='Calculated Velocity with Adjusted Temperature 0.2', linestyle='--')
plt.plot(combined_df4['Record Time'], calculated_velocity3, label='Calculated Velocity with Adjusted Temperature 0.3', linestyle='--')

plt.xlabel('Time')
plt.ylabel('Velocity (m/s)')
plt.title('Comparison of Interpolated Observed Sound Speed and Calculated Velocity')
plt.legend()
plt.show()

# v uses temperature + 0.4 (adjusted); v1 uses the temperature as recorded
# (unadjusted). The two legend labels were previously attached the wrong way round.
v=velocity_from_teos10(combined_df4['Temperature Deg C'].values+0.4,combined_df4['Salinity'].values,combined_df4['Corrected Pressure (kPa)'].values)
v1=velocity_from_teos10(combined_df4['Temperature Deg C'].values,combined_df4['Salinity'].values,combined_df4['Corrected Pressure (kPa)'].values)
plt.plot(combined_df4['Record Time'],v1,label='unadjusted')
plt.plot(combined_df4['Record Time'],v,label='adjusted')
plt.plot(result_dfs['2504']['Record Time'],result_dfs['2504']['SoundSpeed'],label = 'recorded')
#plt.plot(combined_df4['Record Time'], interpolated_velocity, label='Interpolated Observed Sound Speed')
plt.legend()

## Sensitivity Analysis

In [None]:
import gsw

# Initial conditions
# Single reference state around which the sensitivities below are evaluated.
salinity = 34.5265  # PSU
temperature = 1.98  # degrees Celsius
pressure_kpa = 15561  # kPa
pressure_dbar = pressure_kpa * 0.1  # dbar for GSW
pressure_bar_cm = pressure_kpa * 0.01  # bars for Chen and Millero (not used elsewhere in this cell)
longitude = -130.01149966
latitude = 45.95882833

# Function to calculate sensitivities and baseline velocity using GSW with TEOS-10 conversions
def calculate_sensitivities_TEOS10(salinity, temperature, pressure_dbar, longitude, latitude):
    """Forward-difference sound-speed sensitivities via the TEOS-10 conversions.

    Salinity, temperature and pressure are each perturbed by 0.1 in turn while
    the other two are held at their reference values (salinity and temperature
    are perturbed before conversion to SA / CT).

    Returns
    -------
    tuple
        ``(baseline_velocity, sensitivity_S, sensitivity_T, sensitivity_P)``;
        each sensitivity is the change in ``gsw.sound_speed`` divided by 0.1,
        so the pressure term is per dbar.
    """
    step = 0.1  # perturbation applied to each input

    SA_ref = gsw.SA_from_SP(salinity, pressure_dbar, longitude, latitude)
    CT_ref = gsw.CT_from_t(SA_ref, temperature, pressure_dbar)
    baseline_velocity = gsw.sound_speed(SA_ref, CT_ref, pressure_dbar)

    def slope(perturbed_velocity):
        # One-sided finite difference against the baseline.
        return (perturbed_velocity - baseline_velocity) / step

    # Salinity: only SA is recomputed, CT stays at its reference value.
    SA_shifted = gsw.SA_from_SP(salinity + step, pressure_dbar, longitude, latitude)
    sensitivity_S = slope(gsw.sound_speed(SA_shifted, CT_ref, pressure_dbar))

    # Temperature: CT is recomputed from the shifted in-situ temperature.
    CT_shifted = gsw.CT_from_t(SA_ref, temperature + step, pressure_dbar)
    sensitivity_T = slope(gsw.sound_speed(SA_ref, CT_shifted, pressure_dbar))

    # Pressure: SA and CT stay fixed, only the pressure argument moves.
    sensitivity_P = slope(gsw.sound_speed(SA_ref, CT_ref, pressure_dbar + step))

    return baseline_velocity, sensitivity_S, sensitivity_T, sensitivity_P

# Function to calculate sensitivities and baseline velocity using basic T, P, S
def calculate_sensitivities_basic(salinity, temperature, pressure_dbar):
    """Forward-difference sound-speed sensitivities using the raw inputs.

    The inputs are handed straight to ``gsw.sound_speed`` without the
    SA / CT conversions, and each is perturbed by 0.1 in turn.
    NOTE(review): ``gsw.sound_speed`` is documented in terms of Absolute
    Salinity and Conservative Temperature -- confirm that feeding it
    unconverted values is the intended "basic" comparison.

    Returns
    -------
    tuple
        ``(baseline_velocity, sensitivity_S, sensitivity_T, sensitivity_P)``.
    """
    step = 0.1  # perturbation applied to each input
    baseline_velocity = gsw.sound_speed(salinity, temperature, pressure_dbar)

    # One perturbed argument tuple per input, in S, T, P order.
    perturbed_inputs = (
        (salinity + step, temperature, pressure_dbar),
        (salinity, temperature + step, pressure_dbar),
        (salinity, temperature, pressure_dbar + step),
    )
    sensitivity_S, sensitivity_T, sensitivity_P = (
        (gsw.sound_speed(*inputs) - baseline_velocity) / step
        for inputs in perturbed_inputs
    )

    return baseline_velocity, sensitivity_S, sensitivity_T, sensitivity_P


# Perform calculations
baseline_velocity_gsw_TEOS10, sensitivity_S_gsw_TEOS10, sensitivity_T_gsw_TEOS10, sensitivity_P_gsw_TEOS10 = calculate_sensitivities_TEOS10(salinity, temperature, pressure_dbar, longitude, latitude)
baseline_velocity_gsw_basic, sensitivity_S_gsw_basic, sensitivity_T_gsw_basic, sensitivity_P_gsw_basic = calculate_sensitivities_basic(salinity, temperature, pressure_dbar)

# Initial data: Changes in velocity for standard deviation changes in each variable
# (hand-entered values, not derived from the sensitivities computed above)
delta_v_temperature = 0.1133  # Change in velocity for 0.025°C change in temperature (m/s)
delta_v_pressure = 0.0119     # Change in velocity for 7 kPa change in pressure (m/s)
delta_v_salinity = 0.0049     # Change in velocity for 0.003 PSU change in salinity (m/s)

# Squaring the changes in velocity
delta_v_temperature_squared = delta_v_temperature ** 2
delta_v_pressure_squared = delta_v_pressure ** 2
delta_v_salinity_squared = delta_v_salinity ** 2

# Calculating the total impact of all squared changes
total_squared_changes = delta_v_temperature_squared + delta_v_pressure_squared + delta_v_salinity_squared

# Calculating the exact percentage contribution of each variable
percentage_temperature = (delta_v_temperature_squared / total_squared_changes) * 100
percentage_pressure = (delta_v_pressure_squared / total_squared_changes) * 100
percentage_salinity = (delta_v_salinity_squared / total_squared_changes) * 100

# Output baseline velocities
print("Baseline Velocities:")
print(f"GSW with TEOS-10: {baseline_velocity_gsw_TEOS10} m/s")
print(f"GSW Basic T, P, S: {baseline_velocity_gsw_basic} m/s")

# Output sensitivity results with units
# The pressure sensitivities are computed per dbar. Since 1 dbar = 10 kPa, the
# per-kPa value is the per-dbar value divided by 10 (it was previously
# multiplied by 10, overstating the figure by a factor of 100).
print("\nGSW Method Sensitivities with TEOS-10 (m/s/PSU, m/s/°C, m/s/kPa):")
print(f"Salinity: {sensitivity_S_gsw_TEOS10}, Temperature: {sensitivity_T_gsw_TEOS10}, Pressure: {sensitivity_P_gsw_TEOS10/10}")
print(f"Percentage Variance explained by Salinity: {percentage_salinity} %")
print(f"Percentage Variance explained by Temperature: {percentage_temperature} %")
print(f"Percentage Variance explained by Pressure: {percentage_pressure} %")

# NOTE(review): the heading says "Chen and Millero", but the numbers come from
# gsw.sound_speed on unconverted inputs -- confirm the wording.
print("\nChen and Millero Method Sensitivities (Basic T, P, S) (m/s/PSU, m/s/°C, m/s/kPa):")
print(f"Salinity: {sensitivity_S_gsw_basic}, Temperature: {sensitivity_T_gsw_basic}, Pressure: {sensitivity_P_gsw_basic/10}")


## Velocity Calculation

In [None]:
# Define constants
longitude = -130.01149966
latitude = 45.95882833

# Compute the sound-speed series for each station frame and store it in a
# 'velocity' column. One loop replaces three copy-pasted stanzas; the loop
# order (4, 3, 2) matches the original statement order.
_computed_velocities = []
for _frame in (combined_df4, combined_df3, combined_df2):
    T_series = np.array(_frame['Temperature Deg C'])
    P_series_kPa = np.array(_frame['Corrected Pressure (kPa)'])
    S_constant = np.array(_frame['Salinity'])  # salinity is a per-row series here, not a constant

    _velocity = velocity_timeseries_TEOS10(T_series, P_series_kPa, S_constant)
    _frame['velocity'] = _velocity
    _computed_velocities.append(_velocity)

    # Flatten the index only when the frame still has a real one. An
    # unconditional reset adds junk 'index' / 'level_0' columns on every run
    # and eventually raises, which made this cell unsafe to re-run.
    if not isinstance(_frame.index, pd.RangeIndex):
        _frame.reset_index(inplace=True)

# Keep the module-level names the original cell defined.
TEOS10_4, TEOS10_3, TEOS10_2 = _computed_velocities

In [None]:
# Temperature offset applied to station 2504. Every plot that shows
# 'velocity_c' labels it V(T+0.351,P,S), so the column has to be computed with
# that offset; previously the offset was commented out and 'velocity_c' was
# identical to the uncorrected 'velocity'.
TEMP_OFFSET_2504_C = 0.351

T_series = np.array(combined_df4['Temperature Deg C'])
T_DQZ = np.array(combined_df4['TempDQZ'])
P_series_kPa = np.array(combined_df4['Corrected Pressure (kPa)'])
S_constant =  np.array(combined_df4['Salinity'])  # per-row salinity series

# Velocity from the offset-corrected recorded temperature.
TEOS1 = velocity_timeseries_TEOS10(T_series + TEMP_OFFSET_2504_C, P_series_kPa, S_constant)
combined_df4['velocity_c']=TEOS1
# Velocity from the 'TempDQZ' temperature column, with no offset applied.
TEOSDQZ= velocity_timeseries_TEOS10(T_DQZ, P_series_kPa, S_constant)
combined_df4['velocity_dqz']=TEOSDQZ

In [None]:
# Temperature at the three stations; only the 2504 series is shifted by the
# 0.351 offset, the other two are plotted as recorded.
plt.figure(figsize=(12, 6))
plt.plot(combined_df2['Record Time'],combined_df2['Temperature Deg C'],label='2502 Temperature')
plt.plot(combined_df3['Record Time'],combined_df3['Temperature Deg C'],label='2503 Temperature')
plt.plot(combined_df4['Record Time'],combined_df4['Temperature Deg C']+0.351,label='2504 Temperature (Corrected)')
plt.xlabel('Time')
plt.ylabel('Temperature Deg C')
#plt.title('Comparison of Calculated Velocity')
plt.legend()
plt.rcParams.update({'font.size': 14})  # global setting; persists for later figures
plt.show()

In [None]:
# Three temperature series from combined_df4: as recorded, the 'TempDQZ'
# column, and the recorded series shifted by the 0.351 offset.
plt.figure(figsize=(12, 6))
plt.plot(combined_df4['Record Time'],combined_df4['Temperature Deg C'],label='Recorded Temperature')
plt.plot(combined_df4['Record Time'],combined_df4['TempDQZ'],label='DQZ Temperature')
plt.plot(combined_df4['Record Time'],combined_df4['Temperature Deg C']+0.351,label='Corrected Temperature')
plt.xlabel('Time')
plt.ylabel('Temperature Deg C')
#plt.title('Comparison of Calculated Velocity')
plt.legend()
plt.rcParams.update({'font.size': 14})  # global setting; persists for later figures
plt.show()

In [None]:
# Recorded 2504 sound speed against the three computed variants stored on
# combined_df4 ('velocity', 'velocity_dqz', 'velocity_c').
plt.figure(figsize=(12, 6))
plt.plot(combined_df4['Record Time'],combined_df4['velocity'],label='V(T,P,S)')
plt.plot(result_dfs['2504']['Record Time'], result_dfs['2504']['SoundSpeed'],label='Recorded Velocity')
# Legend text fixed: the closing parenthesis was missing.
plt.plot(combined_df4['Record Time'],combined_df4['velocity_dqz'],label='V(T(DQZ),P,S)')
plt.plot(combined_df4['Record Time'],combined_df4['velocity_c'],label='V(T+0.351,P,S)',color='black')

plt.xlabel('Time')
plt.ylabel('Velocity (m/s)')
#plt.title('Comparison of Calculated Velocity')
plt.legend()
plt.rcParams.update({'font.size': 14})
plt.show()

In [None]:
# Make sure 'Record Time' is available as a column on combined_df3. Resetting
# an already-flat index only adds junk 'index' / 'level_0' columns (and
# eventually raises), so only reset when a real index is present.
if not isinstance(combined_df3.index, pd.RangeIndex):
    combined_df3.reset_index(inplace=True)

plt.figure(figsize=(12, 6))
plt.plot(result_dfs['2504']['Record Time'], result_dfs['2504']['SoundSpeed'],label='2504 Recorded Velocity')
plt.plot(combined_df4['Record Time'],combined_df4['velocity_c'],label='V(T+0.351,P,S) 2504',linewidth=1)
# combined_df3 is labelled as station 2503 in the other cells of this notebook;
# this curve was previously mislabelled as 2504.
plt.plot(combined_df3['Record Time'],combined_df3['velocity'],label='V(T,P,S) 2503',linewidth=1)

plt.plot(result_dfs['2502']['Record Time'], result_dfs['2502']['SoundSpeed'],label='2502 Recorded Velocity')
plt.plot(combined_df2['Record Time'],combined_df2['velocity'],label='V(T,P,S) 2502',color = 'black',linewidth=1)

plt.xlabel('Time')
plt.ylabel('Velocity (m/s)')
#plt.title('Comparison of Calculated Velocity')
plt.legend()
plt.rcParams.update({'font.size': 14})
plt.show()

In [None]:
import numpy as np

def calculate_rms_error(values1, values2):
    """Root-mean-square difference between two equally shaped sequences.

    Positions where the difference is NaN (either input missing) are left
    out of the average.

    Parameters
    ----------
    values1, values2 : array-like
        Values to compare element by element.

    Returns
    -------
    float
        Square root of the mean squared difference over the non-NaN pairs.
    """
    delta = np.array(values1) - np.array(values2)
    # Keep only the pairs for which a difference could be formed.
    usable = delta[~np.isnan(delta)]
    return np.sqrt(np.nanmean(np.square(usable)))

# RMS misfit between the interpolated recorded sound speed ('interp_v') and the
# computed 'velocity_c' column of combined_df4.
interp_v = combined_df4['interp_v'].values
velocity = combined_df4['velocity_c'].values

# Calculate the RMS error
rms_error = calculate_rms_error(interp_v, velocity)
print(f"RMS Error: {rms_error}")


In [None]:
# Display the '2502_2503' entry of harmonic_mean_dfs (built in another cell).
harmonic_mean_dfs['2502_2503']

In [None]:
# RMS misfit as a function of temperature offset. Both lists are typed in by
# hand (not computed in this cell); the curve has its minimum at 0.351.
plt.figure(figsize=(12, 6))
off = [0.36,0.354,0.352,0.351, 0.35,0.348,0.345,0.34] #0
rmserror = [0.04271085839137481,0.0212513857,0.017255038520958124,0.016665532022, 0.017196258148926, 0.0211084845, 0.030905983,0.0506384479773202]#1.5357631742372628
plt.plot(pd.to_numeric(off),pd.to_numeric(rmserror))
plt.xlabel('Temperature offset (C)')
plt.ylabel('RMS (m/s)')
#plt.title('Comparison of Calculated Velocity')
plt.rcParams.update({'font.size': 14})  # global setting; persists for later figures
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sample data frame creation (to be replaced with your actual data)
# combined_df4 = pd.DataFrame({
#     'Temperature Deg C': [/* your temperature data */],
#     'Corrected Pressure (kPa)': [/* your pressure data */],
#     'Salinity': [/* your salinity data */],
#     'interp_v': [/* your interp_v data */]
# })

def velocity_timeseries_TEOS10(T_series, P_series_kPa, S_constant):
    """TEOS-10 sound speed for temperature, pressure and salinity series.

    Replaces the former placeholder that returned ``T + P + S``. That
    placeholder silently shadowed the real computation for every cell run
    after this one.

    Parameters
    ----------
    T_series : float or array-like
        In-situ temperature passed to ``gsw.CT_from_t`` (deg C).
    P_series_kPa : float or array-like
        Pressure in kPa; converted to dbar with a factor of 0.1.
    S_constant : float or array-like
        Practical salinity passed to ``gsw.SA_from_SP``; may be a scalar or a
        series.

    Returns
    -------
    float or numpy.ndarray
        Result of ``gsw.sound_speed`` at the fixed station position.
    """
    # Station position used for the Absolute Salinity conversion (same values
    # as the constants defined elsewhere in this notebook).
    station_longitude = -130.01149966
    station_latitude = 45.95882833

    pressure_dbar = np.asarray(P_series_kPa) * 0.1  # 1 dbar = 10 kPa
    SA = gsw.SA_from_SP(S_constant, pressure_dbar, station_longitude, station_latitude)
    CT = gsw.CT_from_t(SA, T_series, pressure_dbar)
    return gsw.sound_speed(SA, CT, pressure_dbar)



# Scan a range of temperature offsets and record the RMS misfit for each.
# np.arange takes (start, stop, step); the previous (0.1, 0.1, 1) produced an
# empty range, so the loop body never ran.
# NOTE(review): 0.1 .. 0.9 in steps of 0.1 is an assumed reading of the
# original arguments -- adjust the range and step as needed.
offsets = np.arange(0.1, 1, 0.1)
rms_erro = []

# The inputs do not depend on the offset, so read them once.
T_series = np.array(combined_df4['Temperature Deg C'])
P_series_kPa = np.array(combined_df4['Corrected Pressure (kPa)'])
S_constant = np.array(combined_df4['Salinity'])
interp_v = combined_df4['interp_v'].values

for offset in offsets:
    # Work on local arrays only: writing each trial into
    # combined_df4['velocity_c'] would leave the frame holding whichever
    # offset happened to be scanned last.
    TEOS1 = velocity_timeseries_TEOS10(T_series + offset, P_series_kPa, S_constant)
    # Append to the list (it was previously overwritten by the float result,
    # which made the following .append() fail).
    rms_erro.append(calculate_rms_error(interp_v, TEOS1))

# Plotting: one figure for the whole scan instead of one figure per offset.
plt.figure(figsize=(10, 6))
plt.plot(offsets, rms_erro, label='RMS Error', color='blue')
plt.scatter(offsets, rms_erro, color='red')
plt.xlabel('Offset')
plt.ylabel('RMS Error')
plt.title('RMS Error for Different Offsets')
plt.legend()
plt.grid(True)
plt.rcParams.update({'font.size': 14})
plt.show()
    

In [None]:
import numpy as np
import pandas as pd

def find_best_offset(combined_df4, offsets=None):
    """Grid-search the temperature offset that minimises the sound-speed RMS misfit.

    Parameters
    ----------
    combined_df4 : pandas.DataFrame
        Needs ``Temperature Deg C``, ``Corrected Pressure (kPa)``,
        ``Salinity`` and ``interp_v`` columns. The frame is only read.
    offsets : iterable of float, optional
        Candidate offsets to try. Defaults to ``np.arange(0.1, 0.5, 0.01)``.
        NOTE(review): the original call was ``np.arange(0.1, 0.01, 0.5)``,
        which is empty (stop < start), so the function always returned
        ``(None, inf)``. The default assumes stop and step were transposed --
        confirm the intended range.

    Returns
    -------
    tuple
        ``(best_offset, min_rms_error)``; ``(None, inf)`` when ``offsets`` is
        empty.
    """
    if offsets is None:
        offsets = np.arange(0.1, 0.5, 0.01)  # start, stop, step

    # The inputs do not change between candidates, so extract them once.
    T_series = np.array(combined_df4['Temperature Deg C'])
    P_series_kPa = np.array(combined_df4['Corrected Pressure (kPa)'])
    S_constant = np.array(combined_df4['Salinity'])
    interp_v = combined_df4['interp_v'].values

    min_rms_error = float('inf')
    best_offset = None

    for offset in offsets:
        # Trial velocities stay local; the caller's 'velocity_c' column is not
        # overwritten with whichever candidate was evaluated last.
        velocity = velocity_timeseries_TEOS10(T_series + offset, P_series_kPa, S_constant)
        rms_error = calculate_rms_error(interp_v, velocity)

        if rms_error < min_rms_error:
            min_rms_error = rms_error
            best_offset = offset

    return best_offset, min_rms_error

# Find the best offset
# Grid search over temperature offsets for combined_df4; prints the winning
# offset together with its RMS misfit.
best_offset, min_rms_error = find_best_offset(combined_df4)
print(f"Best Offset: {best_offset}, Min RMS Error: {min_rms_error}")


In [None]:
# Display combined_df3 for inspection.
combined_df3

### Compute Harmonic Means

In [None]:
import pandas as pd
from scipy.stats import hmean
import matplotlib.pyplot as plt

# Assuming combined_df2, combined_df3, and combined_df4 are already defined

# Function to set 'Record Time' as datetime index if it's not already
def set_datetime_index(df):
    """Give ``df`` a datetime index, modifying it in place.

    If a ``Record Time`` column exists it is parsed to datetime and moved into
    the index. Otherwise the existing index is parsed to datetime, unless it
    already is a ``DatetimeIndex``. Returns ``None``.
    """
    if 'Record Time' not in df.columns:
        # No time column left: fall back to converting the index itself.
        if not isinstance(df.index, pd.DatetimeIndex):
            df.index = pd.to_datetime(df.index)
        return

    df['Record Time'] = pd.to_datetime(df['Record Time'])
    df.set_index('Record Time', inplace=True)

# Applying the function to each DataFrame
# In place: after this, any 'Record Time' column on these frames has been
# moved into the index.
set_datetime_index(combined_df4)
set_datetime_index(combined_df3)
set_datetime_index(combined_df2)

# Define a function to compute the harmonic mean for a pair of DataFrames
def compute_harmonic_mean(df1, df2, column='velocity'):
    """Element-wise harmonic mean of one column over the shared timestamps.

    This single definition replaces two back-to-back definitions of the same
    name, the first of which was silently shadowed by the second.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames indexed by timestamp, both containing ``column``.
    column : str, default 'velocity'
        Column to average.

    Returns
    -------
    (pandas.Index, numpy.ndarray)
        The timestamps that are present in both frames *and* have a non-NaN
        value in both, together with the harmonic mean at each of them.

    Notes
    -----
    The two series are paired by timestamp before rows with a NaN are
    dropped. Previously each frame was NaN-filtered on its own and then
    truncated to a common length, which paired values from different
    timestamps and returned index labels that no longer matched the values
    whenever the NaNs were not in identical positions.
    """
    first = df1[column]
    second = df2[column]
    # Repeated timestamps would make the label lookups below fan out; keep the
    # first reading for each timestamp.
    first = first[~first.index.duplicated(keep='first')]
    second = second[~second.index.duplicated(keep='first')]

    # Intersect the indices (Record Time) of both DataFrames
    common_index = first.index.intersection(second.index)

    # Pair by timestamp, then drop rows where either side is missing.
    paired = pd.DataFrame({
        'first': first.loc[common_index],
        'second': second.loc[common_index],
    }).dropna()

    if paired.empty:
        return paired.index, np.array([], dtype=float)

    # Ensure no zero or negative values as they are invalid for harmonic mean
    paired = paired.clip(lower=0.0001)

    # hmean along axis 0 of a (2, n) array gives one mean per timestamp.
    harmonic_mean_values = hmean(paired.to_numpy().T, axis=0)

    return paired.index, harmonic_mean_values

# Compute the harmonic mean for each pair
# (combined_df4 / combined_df3 / combined_df2 correspond to the 2504 / 2503 /
# 2502 suffixes used in the variable names.)
index_2504_2503, harmonic_mean_2504_2503 = compute_harmonic_mean(combined_df4, combined_df3)
index_2504_2502, harmonic_mean_2504_2502 = compute_harmonic_mean(combined_df4, combined_df2)
index_2503_2502, harmonic_mean_2503_2502 = compute_harmonic_mean(combined_df3, combined_df2)


# Collect the results under keys that put the lower station number first.
harmonic_means = {
    '2502_2503': (index_2503_2502, harmonic_mean_2503_2502),
    '2502_2504': (index_2504_2502, harmonic_mean_2504_2502),
    '2503_2504': (index_2504_2503, harmonic_mean_2504_2503)
}

# Dictionary to store DataFrames
harmonic_df_dict = {}

for key, (index, values) in harmonic_means.items():
    # Create a DataFrame without setting 'Record Time' as the index
    df = pd.DataFrame({'Record Time': index, 'HMean': values})
    harmonic_df_dict[key] = df

# Now, harmonic_df_dict contains DataFrames for each harmonic mean series under the specified keys


# Plotting
# The harmonic mean computed here (2503-2502) is drawn against the 'SoundSpeed'
# columns of harmonic_mean_dfs, which is built in another cell.
plt.figure(figsize=(10, 4))

#plt.plot(index_2504_2503, harmonic_mean_2504_2503, label='Harmonic Mean 2504-2503')
#plt.plot(index_2504_2502, harmonic_mean_2504_2502, label='Harmonic Mean 2504-2502')
plt.plot(index_2503_2502, harmonic_mean_2503_2502, label='TEOS 2503-2502')
plt.plot(harmonic_mean_dfs['2502_2503']['Record Time'],harmonic_mean_dfs['2502_2503']['SoundSpeed'],label='Velo 2503-2502')
plt.plot(harmonic_mean_dfs['2503_2504']['Record Time'],harmonic_mean_dfs['2503_2504']['SoundSpeed'],label='Velo 2503-2504')

#plt.title('Harmonic Mean Velocity between 2504-2502')
plt.xlabel('Record Time')
plt.ylabel('Harmonic Mean Velocity')
plt.legend()
plt.grid(True)
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
# Display the whole harmonic_mean_dfs mapping (built in another cell).
harmonic_mean_dfs

## Range Calculation

In [None]:
# Take the 'BSL' table of unit 2504 and normalise its types in place
# (bsl_df is the same object as df_dict['2504']['BSL'], not a copy).
bsl_df = df_dict['2504']['BSL']
bsl_df['RangeAddress'] = bsl_df['RangeAddress'].astype(int).astype(str)
# Unparseable entries become NaN rather than raising.
bsl_df['Range(ms)'] = pd.to_numeric(bsl_df['Range(ms)'], errors='coerce')
bsl_df['TAT(ms)'] = pd.to_numeric(bsl_df['TAT(ms)'], errors='coerce')
# Rows whose RangeAddress is '2503', as an independent copy.
bsl_2503_df = bsl_df[bsl_df['RangeAddress'] == '2503'].copy()

plt.figure(figsize=(10, 4))
plt.scatter(bsl_2503_df['Record Time'],bsl_2503_df['Range(ms)'],s=1)
plt.title('Two way travel time from 2504-2503 ')
plt.xlabel('Record Time')
plt.ylabel('Range (ms)')
plt.grid(True)
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
# Scatter of the outlier-filtered 2504-2502 ranges.
# NOTE(review): filtered_bsl_2502_copy is created in the "Final Figures" cell
# further down, so on a fresh kernel this cell only works after that one has
# been run.
plt.figure(figsize=(10, 4))
plt.scatter(filtered_bsl_2502_copy['Record Time'],filtered_bsl_2502_copy['Range(ms)'],s=1)
plt.title('Two way travel time from 2504-2502 ')
plt.xlabel('Record Time')
plt.ylabel('Range (ms)')
plt.grid(True)
plt.rcParams.update({'font.size': 14})
plt.show()


## Final Figures

#### 2504 to east and west

In [None]:
# Extract DataFrames based on the 'RangeAddress' column values
# (bsl_df is the same object as df_dict['2504']['BSL']; the conversions below
# modify it in place).
bsl_df = df_dict['2504']['BSL']
bsl_df['RangeAddress'] = bsl_df['RangeAddress'].astype(int).astype(str)
bsl_df['Range(ms)'] = pd.to_numeric(bsl_df['Range(ms)'], errors='coerce')
bsl_df['TAT(ms)'] = pd.to_numeric(bsl_df['TAT(ms)'], errors='coerce')
bsl_2501_df = bsl_df[bsl_df['RangeAddress'] == '2501'].copy()
bsl_2502_df = bsl_df[bsl_df['RangeAddress'] == '2502'].copy()
bsl_2503_df = bsl_df[bsl_df['RangeAddress'] == '2503'].copy()
# 'Range(ms)' and 'TAT(ms)' are already numeric on these copies, so the extra
# per-copy pd.to_numeric calls the cell used to make were no-ops and are gone.

# Outlier fence for 2504 -> 2502. The quantiles are the 15th and 85th
# percentiles, not the quartiles, so "IQR" here is the 15-85 spread.
Q1 = bsl_2502_df['Range(ms)'].quantile(0.15)
Q3 = bsl_2502_df['Range(ms)'].quantile(0.85)
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

total_rows1 = len(bsl_2502_df)

# Filter out outliers
filtered_bsl_2502_df = bsl_2502_df[(bsl_2502_df['Range(ms)'] >= lower_bound) & (bsl_2502_df['Range(ms)'] <= upper_bound)]
outside_values_df1 = bsl_2502_df[(bsl_2502_df['Range(ms)'] < lower_bound) | (bsl_2502_df['Range(ms)'] > upper_bound)]

# Number of rows after removing outliers
rows_after_filtering1 = len(filtered_bsl_2502_df)
# Calculate the number of outlier rows
num_outliers1 = total_rows1 - rows_after_filtering1
# Calculate the percentage of outliers (0 when there are no rows at all,
# instead of dividing by zero)
percent_outliers_2504_2502 = (num_outliers1 / total_rows1) * 100 if total_rows1 else 0.0

total_rows = len(bsl_2503_df)

# Filter out data points with 'Range(ms)' values less than 2618 from the original bsl_2503_df
filtered_bsl_2503_df = bsl_2503_df[bsl_2503_df['Range(ms)'] >= 2618]
outside_values_df2 = bsl_2503_df[bsl_2503_df['Range(ms)'] < 2618]

# Explicit copies so the new columns below do not trigger SettingWithCopyWarning
filtered_bsl_2502_copy = filtered_bsl_2502_df.copy()
filtered_bsl_2503_copy = filtered_bsl_2503_df.copy()

# Every frame whose 'Record Time' feeds np.interp goes through ensure_datetime.
# Previously only one of the two speed tables per station pair was converted.
# (ensure_datetime is defined elsewhere; presumably it converts the named
# column in place -- TODO confirm.)
for _frame in (
    filtered_bsl_2502_copy,
    filtered_bsl_2503_copy,
    harmonic_df_dict['2502_2504'],
    harmonic_df_dict['2503_2504'],
    harmonic_mean_dfs['2502_2504'],
    harmonic_mean_dfs['2503_2504'],
):
    ensure_datetime(_frame, 'Record Time')


def _interp_speed_onto_ranges(range_df, speed_df, speed_col):
    """Linearly interpolate ``speed_df[speed_col]`` onto ``range_df``'s timestamps."""
    # .astype('int64') replaces Series.view(np.int64), which pandas has deprecated.
    return np.interp(
        range_df['Record Time'].astype('int64'),
        speed_df['Record Time'].astype('int64'),
        speed_df[speed_col],
    )


def _one_way_travel_time_s(range_df):
    """(Range - TAT) / 2000: ms to seconds, halved for the two-way travel."""
    return (range_df['Range(ms)'] - range_df['TAT(ms)']) / 2000


# ---- 2504 -> 2502 -------------------------------------------------------------
# Distance from the TEOS-10 based harmonic mean computed in this notebook
filtered_bsl_2502_copy['Interpolated Sound Speed'] = _interp_speed_onto_ranges(
    filtered_bsl_2502_copy, harmonic_df_dict['2502_2504'], 'HMean'
)
filtered_bsl_2502_copy['Calculated Distance (m)'] = (
    filtered_bsl_2502_copy['Interpolated Sound Speed'] * _one_way_travel_time_s(filtered_bsl_2502_copy)
)

# Distance from the harmonic_mean_dfs speed table
# NOTE(review): this pair reads the 'Harmonic Mean' column while every other
# use of harmonic_mean_dfs in the notebook reads 'SoundSpeed' -- confirm the
# column name for the '2502_2504' entry.
filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] = _interp_speed_onto_ranges(
    filtered_bsl_2502_copy, harmonic_mean_dfs['2502_2504'], 'Harmonic Mean'
)
filtered_bsl_2502_copy['Harmonic Distance (m)'] = (
    filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] * _one_way_travel_time_s(filtered_bsl_2502_copy)
)

R2504_2502 = filtered_bsl_2502_copy
#-------------------------------------------------------------------------------------------------------

# ---- 2504 -> 2503 -------------------------------------------------------------
# Distance from the TEOS-10 based harmonic mean computed in this notebook
filtered_bsl_2503_copy['Interpolated Sound Speed'] = _interp_speed_onto_ranges(
    filtered_bsl_2503_copy, harmonic_df_dict['2503_2504'], 'HMean'
)
filtered_bsl_2503_copy['Calculated Distance (m)'] = (
    filtered_bsl_2503_copy['Interpolated Sound Speed'] * _one_way_travel_time_s(filtered_bsl_2503_copy)
)

# Distance from the harmonic_mean_dfs speed table
filtered_bsl_2503_copy['Interpolated Harmonic Velocity'] = _interp_speed_onto_ranges(
    filtered_bsl_2503_copy, harmonic_mean_dfs['2503_2504'], 'SoundSpeed'
)
filtered_bsl_2503_copy['Harmonic Distance (m)'] = (
    filtered_bsl_2503_copy['Interpolated Harmonic Velocity'] * _one_way_travel_time_s(filtered_bsl_2503_copy)
)

R2504_2503 = filtered_bsl_2503_copy



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Central -> West baseline: per-record harmonic distance and its 30-day rolling mean, in cm.

# Parse the timestamps on the existing frame; the time-based window below needs datetimes.
R2504_2503['Record Time'] = pd.to_datetime(R2504_2503['Record Time'])

# Order chronologically, index by time, attach the rolling mean (m -> cm),
# then turn the time index back into a regular column for plotting.
R2504_2503 = (
    R2504_2503
    .sort_values(by='Record Time')
    .set_index('Record Time')
    .assign(**{
        'Moving Average': lambda frame: frame['Harmonic Distance (m)'].rolling('30D').mean() * 100
    })
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.plot(
    R2504_2503['Record Time'],
    R2504_2503['Harmonic Distance (m)'] * 100,
    '.',
    color='salmon',
    label="Distance",
)
plt.plot(
    R2504_2503['Record Time'],
    R2504_2503['Moving Average'],
    'r.',
    label="30-day Moving Average",
)

plt.title("Central -> West Distance")
plt.xlabel("Record Time")
plt.ylabel("Distance (cm)")
plt.ylim(176650, 176690)  # fixed y-window in cm; values outside it are not visible
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Central-West baseline: mean and spread of the harmonic distance in consecutive 30-day bins.

# Time-based resampling needs datetime timestamps.
R2504_2503['Record Time'] = pd.to_datetime(R2504_2503['Record Time'])

# Drop rows lacking a harmonic distance (in place, so R2504_2503 itself loses those rows).
R2504_2503.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Per-bin mean and standard deviation, scaled from metres to centimetres.
# reset_index() exposes the bin label as the 'Record Time' column.
binned_data = (
    R2504_2503
    .resample('30D', on='Record Time')['Harmonic Distance (m)']
    .agg(['mean', 'std'])
    .mul(100)
    .rename(columns={'mean': 'Mean Distance', 'std': 'Std Dev'})
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.errorbar(
    x=binned_data['Record Time'],
    y=binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-',
    capsize=5,
    color='blue',
    label="30-day Binned Mean (±1σ)",
)
plt.gca().set(
    title="Central-West Distance (30-Day Binned) with Error Bars",
    xlabel="Record Time",
    ylabel="Calculated Distance (cm)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Central -> East baseline: per-record harmonic distance and its 30-day rolling mean, in cm.

# Parse the timestamps on the existing frame; the time-based window below needs datetimes.
R2504_2502['Record Time'] = pd.to_datetime(R2504_2502['Record Time'])

# Order chronologically, index by time, attach the rolling mean (m -> cm),
# then turn the time index back into a regular column for plotting.
R2504_2502 = (
    R2504_2502
    .sort_values(by='Record Time')
    .set_index('Record Time')
    .assign(**{
        'Moving Average': lambda frame: frame['Harmonic Distance (m)'].rolling('30D').mean() * 100
    })
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.plot(
    R2504_2502['Record Time'],
    R2504_2502['Harmonic Distance (m)'] * 100,
    '.',
    color='salmon',
    label="Velocimeter Distance",
)
plt.plot(
    R2504_2502['Record Time'],
    R2504_2502['Moving Average'],
    'r.',
    label="30-day Moving Average",
)

plt.title("Central -> East Distance")
plt.xlabel("Record Time")
plt.ylabel("Harmonic Distance (cm)")
plt.ylim(164200, 164225)  # fixed y-window in cm; values outside it are not visible
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Central-East baseline: mean and spread of the calculated distance in consecutive 30-day bins.
# NOTE(review): this cell bins 'Calculated Distance (m)' whereas the other binned-plot
# cells in this notebook bin 'Harmonic Distance (m)' - confirm that is intended.

# Time-based resampling needs datetime timestamps.
R2504_2502['Record Time'] = pd.to_datetime(R2504_2502['Record Time'])

# Drop rows lacking a calculated distance (in place, so R2504_2502 itself loses those rows).
R2504_2502.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Per-bin mean and standard deviation, scaled from metres to centimetres.
# reset_index() exposes the bin label as the 'Record Time' column.
binned_data = (
    R2504_2502
    .resample('30D', on='Record Time')['Calculated Distance (m)']
    .agg(['mean', 'std'])
    .mul(100)
    .rename(columns={'mean': 'Mean Distance', 'std': 'Std Dev'})
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.errorbar(
    x=binned_data['Record Time'],
    y=binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-',
    capsize=5,
    color='blue',
    label="30-day Binned Mean (±1σ)",
)
plt.gca().set(
    title="Central-East Distance (30-Day Binned) with Error Bars",
    xlabel="Record Time",
    ylabel="Calculated Distance (cm)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


## 2503 to central and east

In [None]:
# Baselines ranged from station 2503 (labelled "West" in the plot titles of this notebook).
# Steps: split the BSL records by 'RangeAddress' (2502 / 2504), reject range outliers with
# 1.5*IQR fences, then convert travel times into distances with interpolated sound speeds.
# Results are published as R2503_2502 and R2503_2504.
bsl_df = df_dict['2503']['BSL']
bsl_df['RangeAddress'] = bsl_df['RangeAddress'].astype(int).astype(str)
bsl_df['Range(ms)'] = pd.to_numeric(bsl_df['Range(ms)'], errors='coerce')
bsl_df['TAT(ms)'] = pd.to_numeric(bsl_df['TAT(ms)'], errors='coerce')
bsl_2502_df = bsl_df[bsl_df['RangeAddress'] == '2502'].copy()
bsl_2504_df = bsl_df[bsl_df['RangeAddress'] == '2504'].copy()

# --- Outlier rejection for the 2503 -> 2502 ranges ---
Q1 = bsl_2502_df['Range(ms)'].quantile(0.25)
Q3 = bsl_2502_df['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows whose range is NaN satisfy neither condition, so they land in neither frame.
filtered_bsl_2502_df = bsl_2502_df[(bsl_2502_df['Range(ms)'] >= lower_bound) & (bsl_2502_df['Range(ms)'] <= upper_bound)]
outside_values_df1 = bsl_2502_df[(bsl_2502_df['Range(ms)'] < lower_bound) | (bsl_2502_df['Range(ms)'] > upper_bound)]

# --- Outlier rejection for the 2503 -> 2504 ranges ---
Q1 = bsl_2504_df['Range(ms)'].quantile(0.25)
Q3 = bsl_2504_df['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

filtered_bsl_2504_df = bsl_2504_df[(bsl_2504_df['Range(ms)'] >= lower_bound) & (bsl_2504_df['Range(ms)'] <= upper_bound)]
outside_values_df2 = bsl_2504_df[(bsl_2504_df['Range(ms)'] < lower_bound) | (bsl_2504_df['Range(ms)'] > upper_bound)]

# Work on explicit copies so the column assignments below do not trigger SettingWithCopyWarning.
filtered_bsl_2502_copy = filtered_bsl_2502_df.copy()
filtered_bsl_2504_copy = filtered_bsl_2504_df.copy()

# Every 'Record Time' column that is reinterpreted as int64 below must be datetime first.
# harmonic_mean_dfs['2502_2504'] is included here so this cell does not rely on another
# cell having converted it already.
ensure_datetime(filtered_bsl_2502_copy, 'Record Time')
ensure_datetime(filtered_bsl_2504_copy, 'Record Time')
ensure_datetime(harmonic_df_dict['2502_2503'], 'Record Time')
ensure_datetime(harmonic_df_dict['2502_2504'], 'Record Time')
ensure_datetime(harmonic_mean_dfs['2502_2504'], 'Record Time')

# ---------------------------------------------------------------------------
# 2503 -> 2502
# ---------------------------------------------------------------------------
# Sound speed from harmonic_df_dict, linearly interpolated onto the range timestamps.
filtered_bsl_2502_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2502_copy['Record Time'].astype(np.int64),
    harmonic_df_dict['2502_2503']['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2503']['HMean']
)

# (Range - TAT) is in ms and covers the round trip; dividing by 2000 converts to
# seconds and halves it to the one-way travel time.
filtered_bsl_2502_copy['Calculated Distance (m)'] = filtered_bsl_2502_copy['Interpolated Sound Speed'] * (
    (filtered_bsl_2502_copy['Range(ms)'] - filtered_bsl_2502_copy['TAT(ms)']) / 2000
)

# NOTE(review): the harmonic velocity for this 2503 -> 2502 path is taken from the
# '2502_2504' entry of harmonic_mean_dfs, while the sound speed above uses '2502_2503'.
# This looks like a copy/paste leftover - confirm which pair is intended before trusting
# 'Harmonic Distance (m)' (the fixed plot y-limits further down were tuned to this output).
filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] = np.interp(
    filtered_bsl_2502_copy['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Harmonic Mean']
)

filtered_bsl_2502_copy['Harmonic Distance (m)'] = filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] * (
    (filtered_bsl_2502_copy['Range(ms)'] - filtered_bsl_2502_copy['TAT(ms)']) / 2000
)

R2503_2502 = filtered_bsl_2502_copy

# ---------------------------------------------------------------------------
# 2503 -> 2504
# ---------------------------------------------------------------------------
# NOTE(review): both interpolations below read the '2502_2504' entries although this is
# the 2503 -> 2504 path. An earlier, commented-out variant of this section used
# harmonic_df_dict['2503_2504'] instead - confirm which source is intended.
filtered_bsl_2504_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2504_copy['Record Time'].astype(np.int64),
    harmonic_df_dict['2502_2504']['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2504']['HMean']
)

filtered_bsl_2504_copy['Calculated Distance (m)'] = filtered_bsl_2504_copy['Interpolated Sound Speed'] * (
    (filtered_bsl_2504_copy['Range(ms)'] - filtered_bsl_2504_copy['TAT(ms)']) / 2000
)

filtered_bsl_2504_copy['Interpolated Harmonic Velocity'] = np.interp(
    filtered_bsl_2504_copy['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Harmonic Mean']
)

filtered_bsl_2504_copy['Harmonic Distance (m)'] = filtered_bsl_2504_copy['Interpolated Harmonic Velocity'] * (
    (filtered_bsl_2504_copy['Range(ms)'] - filtered_bsl_2504_copy['TAT(ms)']) / 2000
)

R2503_2504 = filtered_bsl_2504_copy



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# West -> East baseline: per-record harmonic distance and its 30-day rolling mean, in cm.

# Parse the timestamps on the existing frame; the time-based window below needs datetimes.
R2503_2502['Record Time'] = pd.to_datetime(R2503_2502['Record Time'])

# Order chronologically, index by time, attach the rolling mean (m -> cm),
# then turn the time index back into a regular column for plotting.
R2503_2502 = (
    R2503_2502
    .sort_values(by='Record Time')
    .set_index('Record Time')
    .assign(**{
        'Moving Average': lambda frame: frame['Harmonic Distance (m)'].rolling('30D').mean() * 100
    })
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.plot(
    R2503_2502['Record Time'],
    R2503_2502['Harmonic Distance (m)'] * 100,
    '.',
    color='salmon',
    label="Velocimeter Distance",
)
plt.plot(
    R2503_2502['Record Time'],
    R2503_2502['Moving Average'],
    'r.',
    label="30-day Moving Average",
)

plt.title("West -> East Distance")
plt.xlabel("Record Time")
plt.ylabel("Harmonic Distance (cm)")
plt.ylim(326200, 326245)  # fixed y-window in cm; values outside it are not visible
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# West-East baseline: mean and spread of the harmonic distance in consecutive 30-day bins.

# Time-based resampling needs datetime timestamps.
R2503_2502['Record Time'] = pd.to_datetime(R2503_2502['Record Time'])

# Drop rows lacking a harmonic distance (in place, so R2503_2502 itself loses those rows).
R2503_2502.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Per-bin mean and standard deviation, scaled from metres to centimetres.
# reset_index() exposes the bin label as the 'Record Time' column.
binned_data = (
    R2503_2502
    .resample('30D', on='Record Time')['Harmonic Distance (m)']
    .agg(['mean', 'std'])
    .mul(100)
    .rename(columns={'mean': 'Mean Distance', 'std': 'Std Dev'})
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.errorbar(
    x=binned_data['Record Time'],
    y=binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-',
    capsize=5,
    color='blue',
    label="30-day Binned Mean (±1σ)",
)
plt.gca().set(
    title="West-East Distance (30-Day Binned) with Error Bars",
    xlabel="Record Time",
    ylabel="Calculated Distance (cm)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# West -> Central baseline: per-record harmonic distance and its 30-day rolling mean, in cm.

# Parse the timestamps on the existing frame; the time-based window below needs datetimes.
R2503_2504['Record Time'] = pd.to_datetime(R2503_2504['Record Time'])

# Order chronologically, index by time, attach the rolling mean (m -> cm),
# then turn the time index back into a regular column for plotting.
R2503_2504 = (
    R2503_2504
    .sort_values(by='Record Time')
    .set_index('Record Time')
    .assign(**{
        'Moving Average': lambda frame: frame['Harmonic Distance (m)'].rolling('30D').mean() * 100
    })
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.plot(
    R2503_2504['Record Time'],
    R2503_2504['Harmonic Distance (m)'] * 100,
    '.',
    color='salmon',
    label="Distance",
)
plt.plot(
    R2503_2504['Record Time'],
    R2503_2504['Moving Average'],
    'r.',
    label="30-day Moving Average",
)

plt.title("West -> Central Distance")
plt.xlabel("Record Time")
plt.ylabel("Distance (cm)")
plt.ylim(176600, 176630)  # fixed y-window in cm; values outside it are not visible
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# West-Central baseline: mean and spread of the harmonic distance in consecutive 30-day bins.

# Time-based resampling needs datetime timestamps.
R2503_2504['Record Time'] = pd.to_datetime(R2503_2504['Record Time'])

# Drop rows lacking a harmonic distance (in place, so R2503_2504 itself loses those rows).
R2503_2504.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Per-bin mean and standard deviation, scaled from metres to centimetres.
# reset_index() exposes the bin label as the 'Record Time' column.
binned_data = (
    R2503_2504
    .resample('30D', on='Record Time')['Harmonic Distance (m)']
    .agg(['mean', 'std'])
    .mul(100)
    .rename(columns={'mean': 'Mean Distance', 'std': 'Std Dev'})
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.errorbar(
    x=binned_data['Record Time'],
    y=binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-',
    capsize=5,
    color='blue',
    label="30-day Binned Mean (±1σ)",
)
plt.gca().set(
    title="West-Central Distance (30-Day Binned) with Error Bars",
    xlabel="Record Time",
    ylabel="Calculated Distance (cm)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


## 2502 to west and central

In [None]:
import pandas as pd
import numpy as np

# Baselines ranged from station 2502 (labelled "East" in the plot titles of this notebook).
# Steps: split the BSL records by 'RangeAddress' (2503 / 2504), keep ranges inside
# hand-picked windows, then convert travel times into distances with interpolated
# sound speeds. Results are published as R2502_2503 and R2502_2504.

# Hand-picked acceptance limits for 'Range(ms)' (values taken over from the original filters).
RANGE_WINDOW_2502_2503_MS = (4636.2, 4638)
MIN_RANGE_2502_2504_MS = 2400

bsl_dfs = df_dict['2502']['BSL']
bsl_dfs['RangeAddress'] = bsl_dfs['RangeAddress'].astype(int).astype(str)
bsl_dfs['Range(ms)'] = pd.to_numeric(bsl_dfs['Range(ms)'], errors='coerce')
# Coerce TAT on bsl_dfs itself (not on the similarly named bsl_df left over from another
# cell), so every subset taken below already carries numeric turn-around times.
bsl_dfs['TAT(ms)'] = pd.to_numeric(bsl_dfs['TAT(ms)'], errors='coerce')
bsl_2503_dfs = bsl_dfs[bsl_dfs['RangeAddress'] == '2503'].copy()
bsl_2504_dfs = bsl_dfs[bsl_dfs['RangeAddress'] == '2504'].copy()

# --- 2502 -> 2503 ---
# 1.5*IQR fences are computed for reference only; the filter uses the fixed window above.
Q1 = bsl_2503_dfs['Range(ms)'].quantile(0.25)
Q3 = bsl_2503_dfs['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

range_2503_min, range_2503_max = RANGE_WINDOW_2502_2503_MS
filtered_bsl_2503_dfs = bsl_2503_dfs[
    (bsl_2503_dfs['Range(ms)'] >= range_2503_min) & (bsl_2503_dfs['Range(ms)'] <= range_2503_max)
]
# A rejected range lies below OR above the window; combining the two tests with '&' can
# never match. Stored as *_df1 (first target) / *_df2 (second target) like the other
# station cells, so the 2504 assignment below no longer overwrites it.
outside_values_df1 = bsl_2503_dfs[
    (bsl_2503_dfs['Range(ms)'] < range_2503_min) | (bsl_2503_dfs['Range(ms)'] > range_2503_max)
]

# --- 2502 -> 2504 ---
# Again informational only; the filter below uses the fixed minimum.
Q1 = bsl_2504_dfs['Range(ms)'].quantile(0.25)
Q3 = bsl_2504_dfs['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

filtered_bsl_2504_dfs = bsl_2504_dfs[bsl_2504_dfs['Range(ms)'] >= MIN_RANGE_2502_2504_MS]
outside_values_df2 = bsl_2504_dfs[bsl_2504_dfs['Range(ms)'] < MIN_RANGE_2502_2504_MS]

# Explicit copies so the column assignments below do not trigger SettingWithCopyWarning.
filtered_bsl_2503_copy = filtered_bsl_2503_dfs.copy()
filtered_bsl_2504_copy = filtered_bsl_2504_dfs.copy()

# Every 'Record Time' column reinterpreted as int64 below must be datetime first.
# harmonic_df_dict['2502_2503'] is the frame the 2503 interpolation actually reads;
# the harmonic_mean_dfs call is kept because the original cell performed it.
ensure_datetime(filtered_bsl_2503_copy, 'Record Time')
ensure_datetime(filtered_bsl_2504_copy, 'Record Time')
ensure_datetime(harmonic_df_dict['2502_2503'], 'Record Time')
ensure_datetime(harmonic_df_dict['2502_2504'], 'Record Time')
ensure_datetime(harmonic_mean_dfs['2502_2503'], 'Record Time')

# NOTE(review): in this cell 'Harmonic Distance (m)' is derived from harmonic_df_dict[...]['HMean'];
# the other station cells feed that source into 'Calculated Distance (m)' and take
# 'Harmonic Distance (m)' from harmonic_mean_dfs - confirm the naming is intended.

# Interpolation for 2503
filtered_bsl_2503_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2503_copy['Record Time'].astype(np.int64),
    harmonic_df_dict['2502_2503']['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2503']['HMean']
)

# (Range - TAT) is in ms and covers the round trip; /2000 gives one-way seconds.
filtered_bsl_2503_copy['Harmonic Distance (m)'] = filtered_bsl_2503_copy['Interpolated Sound Speed'] * (
    (filtered_bsl_2503_copy['Range(ms)'] - filtered_bsl_2503_copy['TAT(ms)']) / 2000
)

R2502_2503 = filtered_bsl_2503_copy

# Interpolation for 2504
filtered_bsl_2504_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2504_copy['Record Time'].astype(np.int64),
    harmonic_df_dict['2502_2504']['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2504']['HMean']
)

filtered_bsl_2504_copy['Harmonic Distance (m)'] = filtered_bsl_2504_copy['Interpolated Sound Speed'] * (
    (filtered_bsl_2504_copy['Range(ms)'] - filtered_bsl_2504_copy['TAT(ms)']) / 2000
)

R2502_2504 = filtered_bsl_2504_copy

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# East -> West baseline: per-record harmonic distance and its 30-day rolling mean, in cm.

# Parse the timestamps on the existing frame; the time-based window below needs datetimes.
R2502_2503['Record Time'] = pd.to_datetime(R2502_2503['Record Time'])

# Order chronologically, index by time, attach the rolling mean (m -> cm),
# then turn the time index back into a regular column for plotting.
R2502_2503 = (
    R2502_2503
    .sort_values(by='Record Time')
    .set_index('Record Time')
    .assign(**{
        'Moving Average': lambda frame: frame['Harmonic Distance (m)'].rolling('30D').mean() * 100
    })
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.plot(
    R2502_2503['Record Time'],
    R2502_2503['Harmonic Distance (m)'] * 100,
    '.',
    color='salmon',
    label="Distance",
)
plt.plot(
    R2502_2503['Record Time'],
    R2502_2503['Moving Average'],
    'r.',
    label="30-day Moving Average",
)

plt.title("East -> west Distance")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.ylim(326000, 326040)  # fixed y-window in cm; values outside it are not visible
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# East-West baseline: mean and spread of the harmonic distance in consecutive 30-day bins.

# Time-based resampling needs datetime timestamps.
R2502_2503['Record Time'] = pd.to_datetime(R2502_2503['Record Time'])

# Drop rows lacking a harmonic distance (in place, so R2502_2503 itself loses those rows).
R2502_2503.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Per-bin mean and standard deviation, scaled from metres to centimetres.
# reset_index() exposes the bin label as the 'Record Time' column.
binned_data = (
    R2502_2503
    .resample('30D', on='Record Time')['Harmonic Distance (m)']
    .agg(['mean', 'std'])
    .mul(100)
    .rename(columns={'mean': 'Mean Distance', 'std': 'Std Dev'})
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.errorbar(
    x=binned_data['Record Time'],
    y=binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-',
    capsize=5,
    color='blue',
    label="30-day Binned Mean (±1σ)",
)
plt.gca().set(
    title="East-West Distance (30-Day Binned) with Error Bars",
    xlabel="Record Time",
    ylabel="Calculated Distance (cm)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
# Show the 2502 -> 2504 frame as this cell's output for a quick visual inspection.
R2502_2504

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# East -> Central baseline: per-record harmonic distance and its 30-day rolling mean, in cm.

# Parse the timestamps on the existing frame; the time-based window below needs datetimes.
R2502_2504['Record Time'] = pd.to_datetime(R2502_2504['Record Time'])

# Order chronologically, index by time, attach the rolling mean (m -> cm),
# then turn the time index back into a regular column for plotting.
R2502_2504 = (
    R2502_2504
    .sort_values(by='Record Time')
    .set_index('Record Time')
    .assign(**{
        'Moving Average': lambda frame: frame['Harmonic Distance (m)'].rolling('30D').mean() * 100
    })
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.plot(
    R2502_2504['Record Time'],
    R2502_2504['Harmonic Distance (m)'] * 100,
    '.',
    color='salmon',
    label="Distance",
)
plt.plot(
    R2502_2504['Record Time'],
    R2502_2504['Moving Average'],
    'r.',
    label="30-day Moving Average",
)

plt.title("East -> Central Distance")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.ylim(164200, 164222)  # fixed y-window in cm; values outside it are not visible
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# East-Central baseline: mean and spread of the harmonic distance in consecutive 30-day bins.

# Time-based resampling needs datetime timestamps.
R2502_2504['Record Time'] = pd.to_datetime(R2502_2504['Record Time'])

# Drop rows lacking a harmonic distance (in place, so R2502_2504 itself loses those rows).
R2502_2504.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Per-bin mean and standard deviation, scaled from metres to centimetres.
# reset_index() exposes the bin label as the 'Record Time' column.
binned_data = (
    R2502_2504
    .resample('30D', on='Record Time')['Harmonic Distance (m)']
    .agg(['mean', 'std'])
    .mul(100)
    .rename(columns={'mean': 'Mean Distance', 'std': 'Std Dev'})
    .reset_index()
)

plt.figure(figsize=(12, 7))
plt.errorbar(
    x=binned_data['Record Time'],
    y=binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-',
    capsize=5,
    color='blue',
    label="30-day Binned Mean (±1σ)",
)
plt.gca().set(
    title="East-Central Distance (30-Day Binned) with Error Bars",
    xlabel="Record Time",
    ylabel="Calculated Distance (cm)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
# Baselines ranged from station 2504 (labelled "Central" in the plot titles of this notebook).
# Steps: split the BSL records by 'RangeAddress', reject range outliers, then convert
# travel times into distances with two interpolated sound-speed sources. The results stay
# in filtered_bsl_2502_copy and filtered_bsl_2503_copy.

# Hand-picked minimum accepted 'Range(ms)' for the 2504 -> 2503 path.
MIN_RANGE_2504_2503_MS = 2618

bsl_df = df_dict['2504']['BSL']
bsl_df['RangeAddress'] = bsl_df['RangeAddress'].astype(int).astype(str)
bsl_df['Range(ms)'] = pd.to_numeric(bsl_df['Range(ms)'], errors='coerce')
bsl_df['TAT(ms)'] = pd.to_numeric(bsl_df['TAT(ms)'], errors='coerce')
bsl_2501_df = bsl_df[bsl_df['RangeAddress'] == '2501'].copy()  # not used further in this cell
bsl_2502_df = bsl_df[bsl_df['RangeAddress'] == '2502'].copy()
bsl_2503_df = bsl_df[bsl_df['RangeAddress'] == '2503'].copy()

# --- 2504 -> 2502: 1.5*IQR fences on the raw range ---
Q1 = bsl_2502_df['Range(ms)'].quantile(0.25)
Q3 = bsl_2502_df['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

total_rows1 = len(bsl_2502_df)

# Rows whose range is NaN satisfy neither condition, so they land in neither frame.
filtered_bsl_2502_df = bsl_2502_df[(bsl_2502_df['Range(ms)'] >= lower_bound) & (bsl_2502_df['Range(ms)'] <= upper_bound)]
outside_values_df1 = bsl_2502_df[(bsl_2502_df['Range(ms)'] < lower_bound) | (bsl_2502_df['Range(ms)'] > upper_bound)]

# Share of rows removed by the filter; NaN (instead of ZeroDivisionError) when the
# 2502 subset is empty.
rows_after_filtering1 = len(filtered_bsl_2502_df)
num_outliers1 = total_rows1 - rows_after_filtering1
percent_outliers_2504_2502 = (num_outliers1 / total_rows1) * 100 if total_rows1 else np.nan

# --- 2504 -> 2503: fixed lower limit on the raw range ---
total_rows = len(bsl_2503_df)
filtered_bsl_2503_df = bsl_2503_df[bsl_2503_df['Range(ms)'] >= MIN_RANGE_2504_2503_MS]
outside_values_df2 = bsl_2503_df[bsl_2503_df['Range(ms)'] < MIN_RANGE_2504_2503_MS]

# Explicit copies so the column assignments below do not trigger SettingWithCopyWarning.
filtered_bsl_2502_copy = filtered_bsl_2502_df.copy()
filtered_bsl_2503_copy = filtered_bsl_2503_df.copy()

# Every 'Record Time' column reinterpreted as int64 below must be datetime first; all four
# sound-speed frames read by this cell are converted here so it does not depend on other
# cells having done so.
ensure_datetime(filtered_bsl_2502_copy, 'Record Time')
ensure_datetime(filtered_bsl_2503_copy, 'Record Time')
ensure_datetime(harmonic_df_dict['2502_2504'], 'Record Time')
ensure_datetime(harmonic_df_dict['2503_2504'], 'Record Time')
ensure_datetime(harmonic_mean_dfs['2502_2504'], 'Record Time')
ensure_datetime(harmonic_mean_dfs['2503_2504'], 'Record Time')

# ---------------------------------------------------------------------------
# 2504 -> 2502
# ---------------------------------------------------------------------------
filtered_bsl_2502_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2502_copy['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2504']['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2504']['HMean']
)

# (Range - TAT) is in ms and covers the round trip; dividing by 2000 converts to
# seconds and halves it to the one-way travel time.
filtered_bsl_2502_copy['Calculated Distance (m)'] = filtered_bsl_2502_copy['Interpolated Sound Speed'] * (
    (filtered_bsl_2502_copy['Range(ms)'] - filtered_bsl_2502_copy['TAT(ms)']) / 2000
)

filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] = np.interp(
    filtered_bsl_2502_copy['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Harmonic Mean']
)

filtered_bsl_2502_copy['Harmonic Distance (m)'] = filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] * (
    (filtered_bsl_2502_copy['Range(ms)'] - filtered_bsl_2502_copy['TAT(ms)']) / 2000
)

# ---------------------------------------------------------------------------
# 2504 -> 2503
# ---------------------------------------------------------------------------
filtered_bsl_2503_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2503_copy['Record Time'].view(np.int64),
    harmonic_df_dict['2503_2504']['Record Time'].view(np.int64),
    harmonic_df_dict['2503_2504']['HMean']
)

filtered_bsl_2503_copy['Calculated Distance (m)'] = filtered_bsl_2503_copy['Interpolated Sound Speed'] * (
    (filtered_bsl_2503_copy['Range(ms)'] - filtered_bsl_2503_copy['TAT(ms)']) / 2000
)

# For this pair the velocity column in harmonic_mean_dfs is named 'SoundSpeed'
# (the 2502_2504 entry uses 'Harmonic Mean').
filtered_bsl_2503_copy['Interpolated Harmonic Velocity'] = np.interp(
    filtered_bsl_2503_copy['Record Time'].view(np.int64),
    harmonic_mean_dfs['2503_2504']['Record Time'].view(np.int64),
    harmonic_mean_dfs['2503_2504']['SoundSpeed']
)

filtered_bsl_2503_copy['Harmonic Distance (m)'] = filtered_bsl_2503_copy['Interpolated Harmonic Velocity'] * (
    (filtered_bsl_2503_copy['Range(ms)'] - filtered_bsl_2503_copy['TAT(ms)']) / 2000
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 2504-2502 baseline: overlay both distance estimates and their quadratic trends.
# Expects filtered_bsl_2502_copy to carry 'Record Time', 'Calculated Distance (m)'
# and 'Harmonic Distance (m)'.

# Regression abscissa: the timestamps as integer seconds
# (assumes nanosecond-resolution datetimes - TODO confirm).
filtered_bsl_2502_copy['Record Time Numeric'] = (
    pd.to_datetime(filtered_bsl_2502_copy['Record Time']).astype(np.int64) // 10**9
)

# The fits need complete rows; incomplete ones are removed from the frame itself.
filtered_bsl_2502_copy.dropna(
    subset=['Calculated Distance (m)', 'Harmonic Distance (m)'],
    inplace=True,
)

# Degree-1 and degree-2 least-squares fits of 'Calculated Distance (m)' against time.
coefficients_linear_calc, coefficients_poly_calc = (
    np.polyfit(
        filtered_bsl_2502_copy['Record Time Numeric'],
        filtered_bsl_2502_copy['Calculated Distance (m)'],
        degree,
    )
    for degree in (1, 2)
)
linear_calc = np.poly1d(coefficients_linear_calc)
polynomial_calc = np.poly1d(coefficients_poly_calc)

# Same pair of fits for 'Harmonic Distance (m)'.
coefficients_linear_harmonic, coefficients_poly_harmonic = (
    np.polyfit(
        filtered_bsl_2502_copy['Record Time Numeric'],
        filtered_bsl_2502_copy['Harmonic Distance (m)'],
        degree,
    )
    for degree in (1, 2)
)
linear_harmonic = np.poly1d(coefficients_linear_harmonic)
polynomial_harmonic = np.poly1d(coefficients_poly_harmonic)

# Evaluate every fit on 100 evenly spaced instants spanning the data.
x_values = np.linspace(
    filtered_bsl_2502_copy['Record Time Numeric'].min(),
    filtered_bsl_2502_copy['Record Time Numeric'].max(),
    100,
)
y_linear_calc = linear_calc(x_values)
y_poly_calc = polynomial_calc(x_values)
y_linear_harmonic = linear_harmonic(x_values)
y_poly_harmonic = polynomial_harmonic(x_values)

# Back to datetimes so the fitted curves share the x-axis with the raw points.
x_dates = pd.to_datetime(x_values, unit='s')

plt.figure(figsize=(12, 7))

# Raw points for both distance estimates.
plt.plot(
    filtered_bsl_2502_copy['Record Time'],
    filtered_bsl_2502_copy['Calculated Distance (m)'],
    '.',
    color='blue',
    label="TEOS-10 Distance",
)
plt.plot(
    filtered_bsl_2502_copy['Record Time'],
    filtered_bsl_2502_copy['Harmonic Distance (m)'],
    '.',
    color='green',
    label="Velocimeter Distance",
)

# Quadratic trends only; the linear fits are computed above but not drawn here.
plt.plot(x_dates, y_poly_calc, color='Red', label="Quadratic Fit - TEOS10")
plt.plot(x_dates, y_poly_harmonic, color='orange', label="Quadratic Fit - Velocimeter")

plt.gca().set(
    title="Distance 2504-2502 Timeseries",
    xlabel="Record Time",
    ylabel="Distance (m)",
)
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): set after the figure is built, so it presumably only affects text
# created from here on (mostly later figures) - confirm that is intended.
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
# Standard deviation of the 'Harmonic Distance (m)' column, shown as this cell's output.
filtered_bsl_2502_copy['Harmonic Distance (m)'].std()

In [None]:
from matplotlib.ticker import ScalarFormatter
import matplotlib.ticker as mticker

# Reject points lying more than 3 residual standard deviations from the
# quadratic fits (polynomial_calc / polynomial_harmonic are fitted in an earlier
# cell), then plot both distance series together with those fits.
# Every plotted series is metres * 100 (cm) minus a fixed 164200 offset.

# Residuals and their spread for the TEOS-10 ("Calculated") distance
filtered_bsl_2502_copy['Residuals Calc'] = (
    filtered_bsl_2502_copy['Calculated Distance (m)']
    - polynomial_calc(filtered_bsl_2502_copy['Record Time Numeric'])
)
std_dev_calc = filtered_bsl_2502_copy['Residuals Calc'].std()

# .copy() detaches the filtered frame from filtered_bsl_2502_copy so that later
# cells can add or overwrite columns on it without SettingWithCopyWarning.
filtered_bsl_2502_no_outliers_calc = filtered_bsl_2502_copy[
    filtered_bsl_2502_copy['Residuals Calc'].abs() <= 3 * std_dev_calc
].copy()

# Residuals and their spread for the velocimeter ("Harmonic") distance
filtered_bsl_2502_copy['Residuals Harmonic'] = (
    filtered_bsl_2502_copy['Harmonic Distance (m)']
    - polynomial_harmonic(filtered_bsl_2502_copy['Record Time Numeric'])
)
std_dev_harmonic = filtered_bsl_2502_copy['Residuals Harmonic'].std()

filtered_bsl_2502_no_outliers_harmonic = filtered_bsl_2502_copy[
    filtered_bsl_2502_copy['Residuals Harmonic'].abs() <= 3 * std_dev_harmonic
].copy()

# Plotting with outliers removed
plt.figure(figsize=(12, 7))

# Data for Calculated Distance without outliers
plt.plot(filtered_bsl_2502_no_outliers_calc['Record Time'], filtered_bsl_2502_no_outliers_calc['Calculated Distance (m)']*100-164200, '.', label="TEOS-10 Distance", color='grey')

# Data for Harmonic Distance without outliers
plt.plot(filtered_bsl_2502_no_outliers_harmonic['Record Time'], filtered_bsl_2502_no_outliers_harmonic['Harmonic Distance (m)']*100-164200, '.', label="Velocimeter Distance", color='salmon')

# Quadratic fits (x_dates / y_poly_* were generated in the fitting cell)
plt.plot(x_dates, y_poly_calc*100-164200, label="Quadratic Fit - TEOS10", color='black',linewidth=3)
plt.plot(x_dates, y_poly_harmonic*100-164200, label="Quadratic Fit - Velocimeter", color='red',linewidth=3)

plt.title("Distance 2504-2502 Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Distance (cm)")
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})

plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# 30-day rolling mean of the velocimeter-based distance, plotted over the
# individual points. Values are metres * 100 (cm) minus a fixed 164200 offset.

# Work on an explicit copy: this frame is derived from filtered_bsl_2502_copy by
# boolean indexing, and assigning columns onto such a selection can emit
# SettingWithCopyWarning.
filtered_bsl_2502_no_outliers_harmonic = filtered_bsl_2502_no_outliers_harmonic.copy()

# 'Record Time' must be datetime64 for the time-based rolling window
filtered_bsl_2502_no_outliers_harmonic['Record Time'] = pd.to_datetime(
    filtered_bsl_2502_no_outliers_harmonic['Record Time']
)

# Chronological order with time as the index, as rolling('30D') requires
filtered_bsl_2502_no_outliers_harmonic = (
    filtered_bsl_2502_no_outliers_harmonic
    .sort_values(by='Record Time')
    .set_index('Record Time')
)

# Rolling mean, stored already converted to the plotted units (cm, offset removed)
filtered_bsl_2502_no_outliers_harmonic['Moving Average'] = (
    filtered_bsl_2502_no_outliers_harmonic['Harmonic Distance (m)']
    .rolling('30D')
    .mean()
) * 100 - 164200

# Restore 'Record Time' as an ordinary column for plotting
filtered_bsl_2502_no_outliers_harmonic = filtered_bsl_2502_no_outliers_harmonic.reset_index()

plt.figure(figsize=(12, 7))
plt.plot(filtered_bsl_2502_no_outliers_harmonic['Record Time'], filtered_bsl_2502_no_outliers_harmonic['Harmonic Distance (m)']*100-164200, '.', label="Velocimeter Distance", color='salmon')

# Plot moving average
plt.plot(filtered_bsl_2502_no_outliers_harmonic['Record Time'],
         filtered_bsl_2502_no_outliers_harmonic['Moving Average'], 'r.',
         label="30-day Moving Average")

# Formatting
plt.xlabel("Record Time")
plt.ylabel("Harmonic Distance (cm)")
plt.legend()
plt.title("East-Central Distance")
plt.grid(True)
plt.show()


In [None]:
# Keep a second name for the current velocimeter frame. This binds the same
# object (no copy is made), so in-place changes to it are visible through
# EastCentral; rebinding filtered_bsl_2502_no_outliers_harmonic later is not.
EastCentral=filtered_bsl_2502_no_outliers_harmonic

In [None]:
# Standard deviation of 'Harmonic Distance (m)' over the rows of the
# calc-filtered frame.
# NOTE(review): the frame is the one filtered on the 'Residuals Calc' column
# while the column read is the harmonic distance — confirm that
# filtered_bsl_2502_no_outliers_harmonic was not intended here.
filtered_bsl_2502_no_outliers_calc['Harmonic Distance (m)'].std()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Fit a cubic and a straight line to 'Calculated Distance (m)' against Unix time
# and draw both over the raw points.

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2502_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2502_copy['Record Time']).astype(np.int64) // 10**9

# Rows without a distance cannot be fitted
filtered_bsl_2502_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Cubic (3rd degree) fit
# NOTE(review): the regressor is raw Unix seconds, so the cubic term is very
# large; np.polyfit may report a poorly conditioned fit — consider centring time.
coefficients_poly = np.polyfit(filtered_bsl_2502_copy['Record Time Numeric'], filtered_bsl_2502_copy['Calculated Distance (m)'], 3)
polynomial = np.poly1d(coefficients_poly)

# Linear (1st degree) fit
coefficients_linear = np.polyfit(filtered_bsl_2502_copy['Record Time Numeric'], filtered_bsl_2502_copy['Calculated Distance (m)'], 1)
linear = np.poly1d(coefficients_linear)

# 100 evenly spaced times across the record, for drawing the fitted curves
x_values = np.linspace(filtered_bsl_2502_copy['Record Time Numeric'].min(), filtered_bsl_2502_copy['Record Time Numeric'].max(), 100)
y_poly = polynomial(x_values)
y_linear = linear(x_values)

# Convert numeric time back to datetime for plotting
x_dates = pd.to_datetime(x_values, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Original data
plt.plot(filtered_bsl_2502_copy['Record Time'], filtered_bsl_2502_copy['Calculated Distance (m)'], '.', label="Distance 2504-2502", color='blue')
# Cubic fit (the legend previously called this degree-3 curve "Quadratic Fit")
plt.plot(x_dates, y_poly, label="Cubic Fit", color='red')
# Linear fit
plt.plot(x_dates, y_linear, label="Linear Fit", color='black')

plt.title("Distance 2504-2502 Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Fit a cubic to 'Calculated Distance (m)', discard points more than 3 residual
# standard deviations away from it, then fit a straight line to what remains.

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2502_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2502_copy['Record Time']).astype(np.int64) // 10**9

# Rows without a distance cannot be fitted
filtered_bsl_2502_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Cubic (3rd degree) fit on the full record
coefficients_poly = np.polyfit(filtered_bsl_2502_copy['Record Time Numeric'], filtered_bsl_2502_copy['Calculated Distance (m)'], 3)
polynomial = np.poly1d(coefficients_poly)

# Misfit of every observation about the cubic
residuals = filtered_bsl_2502_copy['Calculated Distance (m)'] - polynomial(filtered_bsl_2502_copy['Record Time Numeric'])

# Standard deviation of residuals
std_dev = np.std(residuals)

# Keep only points within 3 standard deviations of the cubic
filtered_data = filtered_bsl_2502_copy[np.abs(residuals) <= 3 * std_dev]

# Linear (1st degree) fit to the outlier-free data only
coefficients_linear = np.polyfit(filtered_data['Record Time Numeric'], filtered_data['Calculated Distance (m)'], 1)
linear = np.poly1d(coefficients_linear)

# 100 evenly spaced times across the filtered record, for drawing the curves
x_values = np.linspace(filtered_data['Record Time Numeric'].min(), filtered_data['Record Time Numeric'].max(), 100)
y_poly = polynomial(x_values)
y_linear = linear(x_values)

# Convert numeric time back to datetime for plotting
x_dates = pd.to_datetime(x_values, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Data with outliers removed
plt.plot(filtered_data['Record Time'], filtered_data['Calculated Distance (m)'], '.', label="Filtered Distance 2504-2502", color='blue')
# Cubic fit (the legend previously called this degree-3 curve "Quadratic Fit")
plt.plot(x_dates, y_poly, label="Cubic Fit", color='red')
# Linear fit
plt.plot(x_dates, y_linear, label="Linear Fit", color='green')

plt.title("Distance 2504-2502 Timeseries (Outliers Removed)")
plt.xlabel("Record Time")
plt.ylabel("Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
# Standard deviation (pandas default, ddof=1) of 'Calculated Distance (m)' in
# filtered_data, i.e. after the 3-standard-deviation residual filter.
filtered_data['Calculated Distance (m)'].std()

In [None]:
# Standard deviation (pandas default, ddof=1) of 'Calculated Distance (m)' in
# filtered_bsl_2502_copy.
filtered_bsl_2502_copy['Calculated Distance (m)'].std()

In [None]:
# Standard deviation of 'Calculated Distance (m)' in filtered_bsl_2502_copy.
# NOTE(review): this cell repeats the same expression as the cell before it.
filtered_bsl_2502_copy['Calculated Distance (m)'].std()

In [None]:
# Spread of the calculated distance relative to the baseline length.
# ddof=0 matches what np.std applies by default.
std_dev_distance = filtered_bsl_2502_copy['Calculated Distance (m)'].std(ddof=0)
mean_distance = filtered_bsl_2502_copy['Calculated Distance (m)'].mean()

# Unit changes: spread m -> cm, mean m -> km
std_dev_cm = 100 * std_dev_distance
mean_distance_km = mean_distance / 1000

# Centimetres of scatter per kilometre of mean distance
scatter_cm_per_km = std_dev_cm / mean_distance_km

print(f"Scatter: {scatter_cm_per_km} cm/km")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Quadratic trend of 'Calculated Distance (m)' over filtered_bsl_2502_copy
coefficients = np.polyfit(
    filtered_bsl_2502_copy['Record Time Numeric'],
    filtered_bsl_2502_copy['Calculated Distance (m)'],
    2,
)
polynomial = np.poly1d(coefficients)

# Trend evaluated at every observation time
y_poly = polynomial(filtered_bsl_2502_copy['Record Time Numeric'])

# Misfits about that trend for both data sets.
# These are computed here but the histograms below show the distances themselves.
residuals_bsl = filtered_bsl_2502_copy['Calculated Distance (m)'] - y_poly
residuals_filtered = (
    filtered_data['Calculated Distance (m)']
    - polynomial(filtered_data['Record Time Numeric'])
)

# Overlaid, semi-transparent histograms of the distances
plt.figure(figsize=(12, 6))
plt.hist(
    filtered_bsl_2502_copy['Calculated Distance (m)'],
    bins=30, color='blue', alpha=0.5, label='Original Data',
)
plt.hist(
    filtered_data['Calculated Distance (m)'],
    bins=30, color='green', alpha=0.5, label='Filtered Data',
)

plt.xlabel("Distance (m)")
plt.ylabel("Frequency")
plt.title("Histogram of Distances for 2504-2502 (Original and Filtered)")
plt.legend()
plt.grid(True)
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
# Standard deviation (pandas default, ddof=1) of 'Calculated Distance (m)' in
# filtered_bsl_2502_copy; displayed as the cell output.
filtered_bsl_2502_copy['Calculated Distance (m)'].std()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Fit a cubic to 'Calculated Distance (m)' of filtered_bsl_2503_copy against
# Unix time and draw it over the raw points.

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2503_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2503_copy['Record Time']).astype(np.int64) // 10**9

# Rows without a distance cannot be fitted
filtered_bsl_2503_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Cubic (3rd degree) fit
coefficients = np.polyfit(filtered_bsl_2503_copy['Record Time Numeric'], filtered_bsl_2503_copy['Calculated Distance (m)'], 3)
polynomial = np.poly1d(coefficients)

# 100 evenly spaced times across the record, for drawing the fitted curve
x_poly = np.linspace(filtered_bsl_2503_copy['Record Time Numeric'].min(), filtered_bsl_2503_copy['Record Time Numeric'].max(), 100)
y_poly = polynomial(x_poly)

# Convert numeric time back to datetime for plotting
x_poly_dates = pd.to_datetime(x_poly, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Original data
plt.plot(filtered_bsl_2503_copy['Record Time'], filtered_bsl_2503_copy['Calculated Distance (m)'], '.', label="Distance 2504-2503", color='blue')

# Cubic fit (the legend previously called this degree-3 curve "Quadratic Fit")
plt.plot(x_poly_dates, y_poly, label="Cubic Fit", color='red')

plt.title("Distance 2504-2503 Timeseries")
plt.xlabel("Record Time")
# The plotted series is the raw distance, not a mean-removed one
plt.ylabel("Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 'Record Time' must be datetime64 for the time-based rolling window below
filtered_bsl_2503_copy['Record Time'] = pd.to_datetime(filtered_bsl_2503_copy['Record Time'])

# Discard rows that have no calculated distance
filtered_bsl_2503_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Chronological order with time as the index, as rolling('30D') requires
filtered_bsl_2503_copy = (
    filtered_bsl_2503_copy
    .sort_values(by='Record Time')
    .set_index('Record Time')
)

# 30-day rolling mean, kept in metres
filtered_bsl_2503_copy['Calculated Distance Moving Average'] = (
    filtered_bsl_2503_copy['Calculated Distance (m)'].rolling('30D').mean()
)

# Bring 'Record Time' back as an ordinary column
filtered_bsl_2503_copy = filtered_bsl_2503_copy.reset_index()

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2503_copy['Record Time Numeric'] = (
    filtered_bsl_2503_copy['Record Time'].astype(np.int64) // 10**9
)

# Cubic (degree 3) trend; it is computed here but its plot call is disabled below
coefficients = np.polyfit(
    filtered_bsl_2503_copy['Record Time Numeric'],
    filtered_bsl_2503_copy['Calculated Distance (m)'],
    3,
)
polynomial = np.poly1d(coefficients)

# 100 evenly spaced times across the record for the fitted curve
x_poly = np.linspace(
    filtered_bsl_2503_copy['Record Time Numeric'].min(),
    filtered_bsl_2503_copy['Record Time Numeric'].max(),
    100,
)
y_poly = polynomial(x_poly)

# Numeric time back to datetime for plotting
x_poly_dates = pd.to_datetime(x_poly, unit='s')

plt.figure(figsize=(12, 7))

# Individual observations, metres converted to centimetres
plt.plot(
    filtered_bsl_2503_copy['Record Time'],
    filtered_bsl_2503_copy['Calculated Distance (m)']*100,
    '.', label="Distance 2504-2503", color='salmon', alpha=0.5,
)

# Rolling mean, metres converted to centimetres
plt.plot(
    filtered_bsl_2503_copy['Record Time'],
    filtered_bsl_2503_copy['Calculated Distance Moving Average']*100,
    'r.', label="30-day Moving Average",
)

# Cubic fit (disabled)
#plt.plot(x_poly_dates, y_poly, label="Cubic Fit", color='red', linewidth=2)

plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.title("Distance west-central Timeseries")
#plt.ylim([1765.4,1765.9])
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams['font.size'] = 14
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2503_copy['Record Time Numeric'] = (
    pd.to_datetime(filtered_bsl_2503_copy['Record Time']).astype(np.int64) // 10**9
)

# Rows without a distance cannot be fitted
filtered_bsl_2503_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Quadratic (2nd degree) trend
coefficients = np.polyfit(
    filtered_bsl_2503_copy['Record Time Numeric'],
    filtered_bsl_2503_copy['Calculated Distance (m)'],
    2,
)
polynomial = np.poly1d(coefficients)

# Trend evaluated at every observation time
y_poly = polynomial(filtered_bsl_2503_copy['Record Time Numeric'])

# Misfit of each observation about the trend
residuals = filtered_bsl_2503_copy['Calculated Distance (m)'] - y_poly

# Distribution of the misfits
plt.figure(figsize=(12, 6))
plt.hist(residuals, bins=30, color='red', alpha=0.7)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Histogram of Residuals from Polynomial Fit for 2504-2503")
plt.grid(True)
plt.rcParams['font.size'] = 14
plt.show()


### 2503

In [None]:
# Baseline (BSL) records taken from df_dict['2503'], split by the address of the
# ranged unit, IQR-filtered on travel time and converted to distances.
bsl_df = df_dict['2503']['BSL']
bsl_df['RangeAddress'] = bsl_df['RangeAddress'].astype(int).astype(str)
# Coerce to numeric here, before the per-address copies are taken, so the copies
# need no further conversion
bsl_df['Range(ms)'] = pd.to_numeric(bsl_df['Range(ms)'], errors='coerce')
bsl_df['TAT(ms)'] = pd.to_numeric(bsl_df['TAT(ms)'], errors='coerce')
bsl_2502_df = bsl_df[bsl_df['RangeAddress'] == '2502'].copy()
bsl_2504_df = bsl_df[bsl_df['RangeAddress'] == '2504'].copy()

# IQR of the 'Range(ms)' column in bsl_2502_df
Q1 = bsl_2502_df['Range(ms)'].quantile(0.25)
Q3 = bsl_2502_df['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1

# Outlier fences at 1.5 * IQR beyond the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows inside the fences, and the rejected rows kept separately
filtered_bsl_2502_df = bsl_2502_df[(bsl_2502_df['Range(ms)'] >= lower_bound) & (bsl_2502_df['Range(ms)'] <= upper_bound)]
outside_values_df1 = bsl_2502_df[(bsl_2502_df['Range(ms)'] < lower_bound) | (bsl_2502_df['Range(ms)'] > upper_bound)]

# IQR of the 'Range(ms)' column in bsl_2504_df (Q1/Q3/IQR and the bounds are
# overwritten, so after this cell they describe the 2504 subset)
Q1 = bsl_2504_df['Range(ms)'].quantile(0.25)
Q3 = bsl_2504_df['Range(ms)'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

filtered_bsl_2504_df = bsl_2504_df[(bsl_2504_df['Range(ms)'] >= lower_bound) & (bsl_2504_df['Range(ms)'] <= upper_bound)]
outside_values_df2 = bsl_2504_df[(bsl_2504_df['Range(ms)'] < lower_bound) | (bsl_2504_df['Range(ms)'] > upper_bound)]

# Explicit copies so that columns can be added without SettingWithCopyWarning
filtered_bsl_2502_copy = filtered_bsl_2502_df.copy()
filtered_bsl_2504_copy = filtered_bsl_2504_df.copy()

# Every 'Record Time' column that is reinterpreted as int64 below is normalised
# first, including the velocimeter table, which was previously left unchecked.
ensure_datetime(filtered_bsl_2502_copy, 'Record Time')
ensure_datetime(harmonic_df_dict['2502_2503'], 'Record Time')
ensure_datetime(harmonic_mean_dfs['2502_2504'], 'Record Time')

# Interpolate the 'HMean' sound speed onto the filtered_bsl_2502_copy timestamps
# NOTE(review): Series.view is deprecated in recent pandas releases — confirm
# whether these calls should become .astype(np.int64).
filtered_bsl_2502_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2502_copy['Record Time'].astype(np.int64),
    harmonic_df_dict['2502_2503']['Record Time'].view(np.int64),
    harmonic_df_dict['2502_2503']['HMean']
)

# Distance = speed * (range - turn-around time) / 2000
# (dividing by 2000 converts ms to seconds and accounts for two-way travel)
filtered_bsl_2502_copy['Calculated Distance (m)'] = filtered_bsl_2502_copy['Interpolated Sound Speed'] * ((filtered_bsl_2502_copy['Range(ms)']-filtered_bsl_2502_copy['TAT(ms)']) / 2000)

##############
# Same computation using the 'Harmonic Mean' velocity table.
# NOTE(review): this baseline is 2503-2502 but the table key is '2502_2504' —
# confirm this is the intended velocimeter pair.
filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] = np.interp(
    filtered_bsl_2502_copy['Record Time'].astype(np.int64),
    harmonic_mean_dfs['2502_2504']['Record Time'].view(np.int64),
    harmonic_mean_dfs['2502_2504']['Harmonic Mean']
)

filtered_bsl_2502_copy['Harmonic Distance (m)'] = filtered_bsl_2502_copy['Interpolated Harmonic Velocity'] * ((filtered_bsl_2502_copy['Range(ms)']-filtered_bsl_2502_copy['TAT(ms)']) / 2000)
##############
# Second name for the same frame (no copy): in-place edits made to
# filtered_bsl_2502_copy afterwards are visible through west_east.
west_east=filtered_bsl_2502_copy


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Unix timestamp in whole seconds as the regressor; rows missing either
# distance are removed in place.
filtered_bsl_2502_copy['Record Time Numeric'] = (
    pd.to_datetime(filtered_bsl_2502_copy['Record Time']).astype(np.int64) // 10**9
)
filtered_bsl_2502_copy.dropna(subset=['Calculated Distance (m)', 'Harmonic Distance (m)'], inplace=True)

# Degree-2 and degree-1 fits of 'Calculated Distance (m)'
coefficients_poly_calc, coefficients_linear_calc = (
    np.polyfit(
        filtered_bsl_2502_copy['Record Time Numeric'],
        filtered_bsl_2502_copy['Calculated Distance (m)'],
        degree,
    )
    for degree in (2, 1)
)
polynomial_calc, linear_calc = map(np.poly1d, (coefficients_poly_calc, coefficients_linear_calc))

# Degree-2 and degree-1 fits of 'Harmonic Distance (m)'
coefficients_poly_harmonic, coefficients_linear_harmonic = (
    np.polyfit(
        filtered_bsl_2502_copy['Record Time Numeric'],
        filtered_bsl_2502_copy['Harmonic Distance (m)'],
        degree,
    )
    for degree in (2, 1)
)
polynomial_harmonic, linear_harmonic = map(np.poly1d, (coefficients_poly_harmonic, coefficients_linear_harmonic))

# 100 evenly spaced times across the record, and each fit evaluated on them
x_values = np.linspace(
    filtered_bsl_2502_copy['Record Time Numeric'].min(),
    filtered_bsl_2502_copy['Record Time Numeric'].max(),
    100,
)
y_poly_calc, y_linear_calc, y_poly_harmonic, y_linear_harmonic = (
    curve(x_values)
    for curve in (polynomial_calc, linear_calc, polynomial_harmonic, linear_harmonic)
)

# Numeric time back to datetime for plotting
x_dates = pd.to_datetime(x_values, unit='s')


In [None]:
# Reject points lying more than 3 residual standard deviations from the
# quadratic fits produced by the fitting cell (polynomial_calc / polynomial_harmonic).

# Residuals and their spread for 'Calculated Distance (m)'
filtered_bsl_2502_copy['Residuals Calc'] = (
    filtered_bsl_2502_copy['Calculated Distance (m)']
    - polynomial_calc(filtered_bsl_2502_copy['Record Time Numeric'])
)
std_dev_calc = filtered_bsl_2502_copy['Residuals Calc'].std()

# .copy() detaches the filtered frame from filtered_bsl_2502_copy so that later
# cells can assign columns or drop rows on it without SettingWithCopyWarning.
filtered_bsl_2502_no_outliers_calc = filtered_bsl_2502_copy[
    filtered_bsl_2502_copy['Residuals Calc'].abs() <= 3 * std_dev_calc
].copy()

# Residuals and their spread for 'Harmonic Distance (m)'
filtered_bsl_2502_copy['Residuals Harmonic'] = (
    filtered_bsl_2502_copy['Harmonic Distance (m)']
    - polynomial_harmonic(filtered_bsl_2502_copy['Record Time Numeric'])
)
std_dev_harmonic = filtered_bsl_2502_copy['Residuals Harmonic'].std()

filtered_bsl_2502_no_outliers_harmonic = filtered_bsl_2502_copy[
    filtered_bsl_2502_copy['Residuals Harmonic'].abs() <= 3 * std_dev_harmonic
].copy()


In [None]:
# Standard deviation (pandas default, ddof=1) of 'Calculated Distance (m)' after
# the 3-standard-deviation residual filter; displayed as the cell output.
filtered_bsl_2502_no_outliers_calc['Calculated Distance (m)'].std()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day rolling mean of 'Calculated Distance (m)', plotted in centimetres over
# the individual points.

# Work on an explicit copy: this frame is derived from filtered_bsl_2502_copy by
# boolean indexing, and assigning columns or dropping rows in place on such a
# selection can emit SettingWithCopyWarning.
filtered_bsl_2502_no_outliers_calc = filtered_bsl_2502_no_outliers_calc.copy()

# 'Record Time' must be datetime64 for the time-based rolling window
filtered_bsl_2502_no_outliers_calc['Record Time'] = pd.to_datetime(
    filtered_bsl_2502_no_outliers_calc['Record Time']
)

# Drop rows without a distance, order chronologically and index by time
# (rolling('30D') requires a sorted datetime index)
filtered_bsl_2502_no_outliers_calc = (
    filtered_bsl_2502_no_outliers_calc
    .dropna(subset=['Calculated Distance (m)'])
    .sort_values(by='Record Time')
    .set_index('Record Time')
)

# Rolling mean, stored already converted from metres to centimetres
filtered_bsl_2502_no_outliers_calc['Calculated Distance Moving Average'] = (
    filtered_bsl_2502_no_outliers_calc['Calculated Distance (m)']
    .rolling('30D')
    .mean()
) * 100

# Reset index to keep 'Record Time' as a column
filtered_bsl_2502_no_outliers_calc = filtered_bsl_2502_no_outliers_calc.reset_index()

plt.figure(figsize=(12, 7))
# Individual observations, metres converted to centimetres
plt.plot(filtered_bsl_2502_no_outliers_calc['Record Time'],
         filtered_bsl_2502_no_outliers_calc['Calculated Distance (m)'] * 100,
         '.', label="TEOS-10 Distance", color='salmon', alpha=0.5)

# Moving average (already in centimetres)
plt.plot(filtered_bsl_2502_no_outliers_calc['Record Time'],
         filtered_bsl_2502_no_outliers_calc['Calculated Distance Moving Average'], 'r.',
         label="30-day Moving Average")

plt.title("TEOS-10 Distance Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
#plt.ylim([326010,326030])
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:

# Outlier-free 'Calculated Distance (m)' with its quadratic fit, in centimetres
plt.figure(figsize=(12, 7))
plt.rcParams['font.size'] = 15

# Observations that passed the residual filter
plt.plot(
    filtered_bsl_2502_no_outliers_calc['Record Time'],
    filtered_bsl_2502_no_outliers_calc['Calculated Distance (m)']*100,
    '.', label="TEOS-10 Distance", color='grey',
)

# Harmonic-distance observations (disabled)
#plt.plot(filtered_bsl_2502_no_outliers_harmonic['Record Time'], filtered_bsl_2502_no_outliers_harmonic['Harmonic Distance (m)'], '.', label="Velocimeter Distance", color='grey')

# Quadratic fit of the calculated distance
plt.plot(
    x_dates, y_poly_calc*100,
    label="Quadratic Fit - TEOS10", color='black', linewidth=3,
)

# Remaining fits (disabled)
#plt.plot(x_dates, y_poly_harmonic, label="Quadratic Fit - Velocimeter", color='black')
#plt.plot(x_dates, y_linear_calc, label="Linear Fit - TEOS10", color='black')
#plt.plot(x_dates, y_linear_harmonic, label="Linear Fit - Velocimeter", color='purple')

plt.xlabel("Record Time")
plt.ylabel("Distance (cm)")
plt.title("Distance 2503-2502 Timeseries")
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.show()



In [None]:
# Distances for the rows of filtered_bsl_2504_df, computed with two
# interpolated sound-speed tables.

# Explicit copy of filtered_bsl_2504_df to avoid the SettingWithCopyWarning
filtered_bsl_2504_copy = filtered_bsl_2504_df.copy()

# Every 'Record Time' column that is reinterpreted as int64 below is normalised
# first; harmonic_df_dict['2503_2504'] was previously used without this step.
ensure_datetime(filtered_bsl_2504_copy, 'Record Time')
ensure_datetime(harmonic_df_dict['2503_2504'], 'Record Time')
ensure_datetime(harmonic_mean_dfs['2503_2504'], 'Record Time')

# Interpolate the 'HMean' sound speed onto the filtered_bsl_2504_copy timestamps
# NOTE(review): Series.view is deprecated in recent pandas releases — confirm
# whether these calls should become .astype(np.int64).
filtered_bsl_2504_copy['Interpolated Sound Speed'] = np.interp(
    filtered_bsl_2504_copy['Record Time'].view(np.int64),
    harmonic_df_dict['2503_2504']['Record Time'].view(np.int64),
    harmonic_df_dict['2503_2504']['HMean']
)

# Distance = speed * (range - turn-around time) / 2000
# (dividing by 2000 converts ms to seconds and accounts for two-way travel)
filtered_bsl_2504_copy['Calculated Distance (m)'] = filtered_bsl_2504_copy['Interpolated Sound Speed'] * ((filtered_bsl_2504_copy['Range(ms)']-filtered_bsl_2504_copy['TAT(ms)']) / 2000)

##############
# Same computation using the 'SoundSpeed' column of harmonic_mean_dfs
filtered_bsl_2504_copy['Interpolated Harmonic Velocity'] = np.interp(
    filtered_bsl_2504_copy['Record Time'].view(np.int64),
    harmonic_mean_dfs['2503_2504']['Record Time'].view(np.int64),
    harmonic_mean_dfs['2503_2504']['SoundSpeed']
)

filtered_bsl_2504_copy['Harmonic Distance (m)'] = filtered_bsl_2504_copy['Interpolated Harmonic Velocity'] * ((filtered_bsl_2504_copy['Range(ms)']-filtered_bsl_2504_copy['TAT(ms)']) / 2000)
##############

# Calculate centered distances by subtracting the mean
centered_distance_2503_2502 = filtered_bsl_2502_copy['Calculated Distance (m)'] - filtered_bsl_2502_copy['Calculated Distance (m)'].mean()
centered_distance_2503_2504 = filtered_bsl_2504_copy['Calculated Distance (m)'] - filtered_bsl_2504_copy['Calculated Distance (m)'].mean()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Fit a cubic to 'Calculated Distance (m)' of filtered_bsl_2502_copy against
# Unix time and draw it over the raw points.

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2502_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2502_copy['Record Time']).astype(np.int64) // 10**9

# Rows without a distance cannot be fitted
filtered_bsl_2502_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Cubic (3rd degree) fit
coefficients = np.polyfit(filtered_bsl_2502_copy['Record Time Numeric'], filtered_bsl_2502_copy['Calculated Distance (m)'], 3)
polynomial = np.poly1d(coefficients)

# 100 evenly spaced times across the record, for drawing the fitted curve
x_poly = np.linspace(filtered_bsl_2502_copy['Record Time Numeric'].min(), filtered_bsl_2502_copy['Record Time Numeric'].max(), 100)
y_poly = polynomial(x_poly)

# Convert numeric time back to datetime for plotting
x_poly_dates = pd.to_datetime(x_poly, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Original data
plt.plot(filtered_bsl_2502_copy['Record Time'], filtered_bsl_2502_copy['Calculated Distance (m)'], '.', label="Distance 2503-2502", color='blue')

# Cubic fit (the legend previously called this degree-3 curve "Quadratic Fit")
plt.plot(x_poly_dates, y_poly, label="Cubic Fit", color='red')

plt.title("Distance 2503-2502 Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2502_copy['Record Time Numeric'] = (
    pd.to_datetime(filtered_bsl_2502_copy['Record Time']).astype(np.int64) // 10**9
)

# Rows without a distance cannot be fitted
filtered_bsl_2502_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Quadratic (2nd degree) trend
coefficients = np.polyfit(
    filtered_bsl_2502_copy['Record Time Numeric'],
    filtered_bsl_2502_copy['Calculated Distance (m)'],
    2,
)
polynomial = np.poly1d(coefficients)

# Trend evaluated at every observation time
y_poly = polynomial(filtered_bsl_2502_copy['Record Time Numeric'])

# Misfit of each observation about the trend
residuals = filtered_bsl_2502_copy['Calculated Distance (m)'] - y_poly

# Distribution of the misfits
plt.figure(figsize=(12, 6))
plt.hist(residuals, bins=30, color='blue', alpha=0.7)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Histogram of Residuals from Polynomial Fit for 2503-2502")
plt.grid(True)
plt.rcParams['font.size'] = 14
plt.show()

In [None]:
# Standard deviation (pandas default, ddof=1) of 'Calculated Distance (m)' in
# filtered_bsl_2502_copy; displayed as the cell output.
filtered_bsl_2502_copy['Calculated Distance (m)'].std()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Fit a cubic to 'Calculated Distance (m)' of filtered_bsl_2504_copy against
# Unix time and draw it, in centimetres, over the raw points.

# Unix timestamp in whole seconds, used as the regressor
filtered_bsl_2504_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2504_copy['Record Time']).astype(np.int64) // 10**9

# Rows without a distance cannot be fitted
filtered_bsl_2504_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Cubic (3rd degree) fit
coefficients = np.polyfit(filtered_bsl_2504_copy['Record Time Numeric'], filtered_bsl_2504_copy['Calculated Distance (m)'], 3)
polynomial = np.poly1d(coefficients)

# 100 evenly spaced times across the record, for drawing the fitted curve
x_poly = np.linspace(filtered_bsl_2504_copy['Record Time Numeric'].min(), filtered_bsl_2504_copy['Record Time Numeric'].max(), 100)
y_poly = polynomial(x_poly)

# Convert numeric time back to datetime for plotting
x_poly_dates = pd.to_datetime(x_poly, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Original data, metres converted to centimetres
plt.plot(filtered_bsl_2504_copy['Record Time'], filtered_bsl_2504_copy['Calculated Distance (m)']*100, '.', label="Distance 2503-2504", color='grey')

# Cubic fit (the legend previously called this degree-3 curve "Quadratic Fit")
plt.plot(x_poly_dates, y_poly*100, label="Cubic Fit", color='black',linewidth=3)

plt.title("Distance 2503-2504 Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.rcParams.update({'font.size': 14})
plt.show()


In [None]:
# Standard deviation (pandas default, ddof=1) of 'Calculated Distance (m)' in
# filtered_bsl_2504_copy; displayed as the cell output.
filtered_bsl_2504_copy['Calculated Distance (m)'].std()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Histogram of the residuals of filtered_bsl_2504_copy's 'Calculated Distance (m)'
# about a quadratic trend in time.

# Apply the font size first: rcParams only affect text created afterwards.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to Unix seconds (numeric) for fitting
# NOTE(review): the // 10**9 assumes nanosecond-resolution datetimes — confirm.
filtered_bsl_2504_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2504_copy['Record Time']).astype(np.int64) // 10**9

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2504_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Fit a quadratic polynomial (2nd degree). Polynomial.fit rescales the time axis to
# [-1, 1] internally, avoiding the poor conditioning of np.polyfit on raw Unix seconds.
time_s = filtered_bsl_2504_copy['Record Time Numeric'].to_numpy(dtype=float)
distance_m = filtered_bsl_2504_copy['Calculated Distance (m)'].to_numpy(dtype=float)
polynomial = np.polynomial.Polynomial.fit(time_s, distance_m, 2)
# Highest-power-first coefficients in raw seconds, kept in np.polyfit's layout
coefficients = polynomial.convert().coef[::-1]

# Trend value at every sample time
y_poly = polynomial(time_s)

# Residuals (misfits): observed minus fitted, in metres
residuals = filtered_bsl_2504_copy['Calculated Distance (m)'] - y_poly

# Plotting the histogram of residuals
plt.figure(figsize=(12, 6))
plt.hist(residuals, bins=30, color='green', alpha=0.7)
plt.title("Histogram of Residuals from Polynomial Fit for 2503-2504")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

In [None]:
# Spread of the calculated distances: sample standard deviation (ddof=1, the pandas default).
filtered_bsl_2504_copy['Calculated Distance (m)'].std(ddof=1)

In [None]:
import pandas as pd
import numpy as np

# Build sound-speed-corrected distances for the 2503 and 2504 ranges stored in
# df_dict['2502']['BSL'].


def _iqr_bounds(values):
    """Return (Q1, Q3, IQR, lower, upper) for a Series using the 1.5*IQR rule."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _add_calculated_distance(ranges_df, harmonic_df):
    """Add 'Interpolated Sound Speed' and 'Calculated Distance (m)' to ranges_df in place.

    ranges_df must provide 'Record Time', 'Range(ms)' and 'TAT(ms)'; harmonic_df must
    provide 'Record Time' and 'HMean'. Both 'Record Time' columns are passed through
    ensure_datetime before being used as the interpolation axis.
    """
    ensure_datetime(ranges_df, 'Record Time')
    ensure_datetime(harmonic_df, 'Record Time')

    # NOTE(review): np.interp expects harmonic_df['Record Time'] to be increasing — confirm upstream.
    # astype(np.int64) replaces the deprecated Series.view and matches the x argument.
    ranges_df['Interpolated Sound Speed'] = np.interp(
        ranges_df['Record Time'].astype(np.int64),
        harmonic_df['Record Time'].astype(np.int64),
        harmonic_df['HMean'],
    )

    # Range minus turn-around time, divided by 2000 — presumably two-way milliseconds
    # to one-way seconds; TODO confirm the units of 'HMean'.
    travel_ms = ranges_df['Range(ms)'] - pd.to_numeric(ranges_df['TAT(ms)'])
    ranges_df['Calculated Distance (m)'] = ranges_df['Interpolated Sound Speed'] * (travel_ms / 2000)


# Hand-picked acceptance windows for 'Range(ms)'
RANGE_2503_MIN_MS = 4636.2
RANGE_2503_MAX_MS = 4638
RANGE_2504_MIN_MS = 2400

# Normalise dtypes on the shared BSL table (this mutates df_dict['2502']['BSL'])
bsl_dfs = df_dict['2502']['BSL']
bsl_dfs['RangeAddress'] = bsl_dfs['RangeAddress'].astype(int).astype(str)
bsl_dfs['Range(ms)'] = pd.to_numeric(bsl_dfs['Range(ms)'], errors='coerce')
# Fixed: this used to assign to `bsl_df`, a name this cell never defines, so it
# only worked when an out-of-order cell had already created it.
bsl_dfs['TAT(ms)'] = pd.to_numeric(bsl_dfs['TAT(ms)'], errors='coerce')

# Split by the address that was ranged to
bsl_2503_dfs = bsl_dfs[bsl_dfs['RangeAddress'] == '2503'].copy()
bsl_2504_dfs = bsl_dfs[bsl_dfs['RangeAddress'] == '2504'].copy()

# --- 2503 ---
# IQR bounds are informational only; the filter uses the fixed window above.
Q1, Q3, IQR, lower_bound, upper_bound = _iqr_bounds(bsl_2503_dfs['Range(ms)'])

range_2503_ms = bsl_2503_dfs['Range(ms)']
filtered_bsl_2503_dfs = bsl_2503_dfs[(range_2503_ms >= RANGE_2503_MIN_MS) & (range_2503_ms <= RANGE_2503_MAX_MS)]
# Fixed: rejected rows lie below OR above the window; the former `&` selected nothing.
outside_values_2503_df = bsl_2503_dfs[(range_2503_ms < RANGE_2503_MIN_MS) | (range_2503_ms > RANGE_2503_MAX_MS)]

# --- 2504 ---
Q1, Q3, IQR, lower_bound, upper_bound = _iqr_bounds(bsl_2504_dfs['Range(ms)'])

range_2504_ms = bsl_2504_dfs['Range(ms)']
filtered_bsl_2504_dfs = bsl_2504_dfs[range_2504_ms >= RANGE_2504_MIN_MS]
# Ends up holding the 2504 rejects, as it did before
outside_values_df2 = bsl_2504_dfs[range_2504_ms < RANGE_2504_MIN_MS]

# Work on copies so the filtered frames stay untouched
filtered_bsl_2503_copy = filtered_bsl_2503_dfs.copy()
filtered_bsl_2504_copy = filtered_bsl_2504_dfs.copy()

# Fixed: the 2503 branch used to call ensure_datetime on harmonic_mean_dfs while
# interpolating from harmonic_df_dict; both now use harmonic_df_dict, as the 2504 branch does.
_add_calculated_distance(filtered_bsl_2503_copy, harmonic_df_dict['2502_2503'])
_add_calculated_distance(filtered_bsl_2504_copy, harmonic_df_dict['2502_2504'])

# Calculate centered distances
#centered_distance_2502_2503 = filtered_bsl_2503_copy['Calculated Distance (m)'] - filtered_bsl_2503_copy['Calculated Distance (m)'].mean()
#centered_distance_2502_2504 = filtered_bsl_2504_copy['Calculated Distance (m)'] - filtered_bsl_2504_copy['Calculated Distance (m)'].mean()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Cubic trend of filtered_bsl_2503_copy's 'Calculated Distance (m)' against time.
# Requires filtered_bsl_2503_copy with 'Record Time' and 'Calculated Distance (m)' columns.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to Unix seconds (numeric) for fitting
# NOTE(review): the // 10**9 assumes nanosecond-resolution datetimes — confirm.
filtered_bsl_2503_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2503_copy['Record Time']).astype(np.int64) // 10**9

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2503_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Degree of the trend polynomial. The legend label is derived from it so the two
# cannot drift apart (the curve used to be labelled "Quadratic" while fitting degree 3).
fit_degree = 3
fit_label = {1: "Linear", 2: "Quadratic", 3: "Cubic"}.get(fit_degree, f"Degree-{fit_degree}") + " Fit"

# Polynomial.fit rescales the time axis to [-1, 1] before solving, which keeps the
# least-squares problem well conditioned; powers of raw Unix seconds make
# np.polyfit's design matrix nearly singular.
time_s = filtered_bsl_2503_copy['Record Time Numeric'].to_numpy(dtype=float)
distance_m = filtered_bsl_2503_copy['Calculated Distance (m)'].to_numpy(dtype=float)
polynomial = np.polynomial.Polynomial.fit(time_s, distance_m, fit_degree)
# Highest-power-first coefficients in raw seconds, kept in np.polyfit's layout
coefficients = polynomial.convert().coef[::-1]

# Evaluate the fit on 100 evenly spaced times spanning the data
x_poly = np.linspace(time_s.min(), time_s.max(), 100)
y_poly = polynomial(x_poly)

# Convert numeric time back to datetime for plotting
x_poly_dates = pd.to_datetime(x_poly, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Original data
plt.plot(filtered_bsl_2503_copy['Record Time'], filtered_bsl_2503_copy['Calculated Distance (m)'], '.', label="Distance 2502-2503", color='blue')

# Polynomial fit
plt.plot(x_poly_dates, y_poly, label=fit_label, color='red')

plt.title("Distance 2502-2503 Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Histogram of the residuals of filtered_bsl_2503_copy's 'Calculated Distance (m)'
# about a quadratic trend in time.

# Apply the font size first: rcParams only affect text created afterwards.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to Unix seconds (numeric) for fitting
# NOTE(review): the // 10**9 assumes nanosecond-resolution datetimes — confirm.
filtered_bsl_2503_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2503_copy['Record Time']).astype(np.int64) // 10**9

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2503_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Fit a quadratic polynomial (2nd degree). Polynomial.fit rescales the time axis to
# [-1, 1] internally, avoiding the poor conditioning of np.polyfit on raw Unix seconds.
time_s = filtered_bsl_2503_copy['Record Time Numeric'].to_numpy(dtype=float)
distance_m = filtered_bsl_2503_copy['Calculated Distance (m)'].to_numpy(dtype=float)
polynomial = np.polynomial.Polynomial.fit(time_s, distance_m, 2)
# Highest-power-first coefficients in raw seconds, kept in np.polyfit's layout
coefficients = polynomial.convert().coef[::-1]

# Trend value at every sample time
y_poly = polynomial(time_s)

# Residuals (misfits): observed minus fitted, in metres
residuals = filtered_bsl_2503_copy['Calculated Distance (m)'] - y_poly

# Plotting the histogram of residuals
plt.figure(figsize=(12, 6))
plt.hist(residuals, bins=30, color='red', alpha=0.7)
plt.title("Histogram of Residuals from Polynomial Fit for 2502-2503")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

In [None]:
# Spread of the calculated distances: sample standard deviation (ddof=1, the pandas default).
filtered_bsl_2503_copy['Calculated Distance (m)'].std(ddof=1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Cubic trend of filtered_bsl_2504_copy's 'Calculated Distance (m)' against time.
# Requires filtered_bsl_2504_copy with 'Record Time' and 'Calculated Distance (m)' columns.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to Unix seconds (numeric) for fitting
# NOTE(review): the // 10**9 assumes nanosecond-resolution datetimes — confirm.
filtered_bsl_2504_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2504_copy['Record Time']).astype(np.int64) // 10**9

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2504_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Degree of the trend polynomial. The legend label is derived from it so the two
# cannot drift apart (the curve used to be labelled "Quadratic" while fitting degree 3).
fit_degree = 3
fit_label = {1: "Linear", 2: "Quadratic", 3: "Cubic"}.get(fit_degree, f"Degree-{fit_degree}") + " Fit"

# Polynomial.fit rescales the time axis to [-1, 1] before solving, which keeps the
# least-squares problem well conditioned; powers of raw Unix seconds make
# np.polyfit's design matrix nearly singular.
time_s = filtered_bsl_2504_copy['Record Time Numeric'].to_numpy(dtype=float)
distance_m = filtered_bsl_2504_copy['Calculated Distance (m)'].to_numpy(dtype=float)
polynomial = np.polynomial.Polynomial.fit(time_s, distance_m, fit_degree)
# Highest-power-first coefficients in raw seconds, kept in np.polyfit's layout
coefficients = polynomial.convert().coef[::-1]

# Evaluate the fit on 100 evenly spaced times spanning the data
x_poly = np.linspace(time_s.min(), time_s.max(), 100)
y_poly = polynomial(x_poly)

# Convert numeric time back to datetime for plotting
x_poly_dates = pd.to_datetime(x_poly, unit='s')

# Plotting
plt.figure(figsize=(12, 7))

# Original data
plt.plot(filtered_bsl_2504_copy['Record Time'], filtered_bsl_2504_copy['Calculated Distance (m)'], '.', label="Distance 2502-2504", color='blue')

# Polynomial fit
plt.plot(x_poly_dates, y_poly, label=fit_label, color='red')

plt.title("Distance 2502-2504 Timeseries")
plt.xlabel("Record Time")
plt.ylabel("Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Histogram of the residuals of filtered_bsl_2504_copy's 'Calculated Distance (m)'
# about a quadratic trend in time.

# Apply the font size first: rcParams only affect text created afterwards.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to Unix seconds (numeric) for fitting
# NOTE(review): the // 10**9 assumes nanosecond-resolution datetimes — confirm.
filtered_bsl_2504_copy['Record Time Numeric'] = pd.to_datetime(filtered_bsl_2504_copy['Record Time']).astype(np.int64) // 10**9

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2504_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Fit a quadratic polynomial (2nd degree). Polynomial.fit rescales the time axis to
# [-1, 1] internally, avoiding the poor conditioning of np.polyfit on raw Unix seconds.
time_s = filtered_bsl_2504_copy['Record Time Numeric'].to_numpy(dtype=float)
distance_m = filtered_bsl_2504_copy['Calculated Distance (m)'].to_numpy(dtype=float)
polynomial = np.polynomial.Polynomial.fit(time_s, distance_m, 2)
# Highest-power-first coefficients in raw seconds, kept in np.polyfit's layout
coefficients = polynomial.convert().coef[::-1]

# Trend value at every sample time
y_poly = polynomial(time_s)

# Residuals (misfits): observed minus fitted, in metres
residuals = filtered_bsl_2504_copy['Calculated Distance (m)'] - y_poly

# Plotting the histogram of residuals
plt.figure(figsize=(12, 6))
plt.hist(residuals, bins=30, color='green', alpha=0.7)
plt.title("Histogram of Residuals from Polynomial Fit for 2502-2504")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()


In [None]:
# Take the BSL table stored under key '2502', normalise its dtypes in place, and
# keep an independent copy of the rows whose RangeAddress is 2503.
bsl_df = df_dict['2502']['BSL']

# RangeAddress becomes a plain integer string so it can be compared with '2503'
bsl_df['RangeAddress'] = bsl_df['RangeAddress'].astype(int).astype(str)

# Unparseable timing values become NaN rather than raising
for _numeric_col in ('Range(ms)', 'TAT(ms)'):
    bsl_df[_numeric_col] = pd.to_numeric(bsl_df[_numeric_col], errors='coerce')

_addressed_to_2503 = bsl_df['RangeAddress'] == '2503'
bsl_2503_df = bsl_df[_addressed_to_2503].copy()

# Display the timestamps of the selected rows
bsl_2503_df['Record Time']

In [None]:
# Scatter of bsl_2503_df's 'Range(ms)' values against record time.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title and labels unchanged.
plt.rcParams.update({'font.size': 13})

plt.figure(figsize=(10, 4))
# s=1 keeps individual samples visible
plt.scatter(pd.to_datetime(bsl_2503_df['Record Time']), bsl_2503_df['Range(ms)'], s=1)
plt.title('Two way travel time from 2502-2503 ')
plt.xlabel('Record Time')
plt.ylabel('Range (ms)')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Overlay the 'Range(ms)' distributions of filtered_bsl_2504_dfs and bsl_2503_df
# on one shared set of bin edges so the two histograms are directly comparable.
_range_2504_ms = filtered_bsl_2504_dfs['Range(ms)']
_range_2503_ms = bsl_2503_df['Range(ms)']

# Common span covering both datasets
min_range = min(_range_2504_ms.min(), _range_2503_ms.min())
max_range = max(_range_2504_ms.max(), _range_2503_ms.max())

# 31 evenly spaced edges -> 30 equal-width bins
bins = np.linspace(min_range, max_range, 31)

# Draw both histograms with identical edges
for _values, _legend_name in (
    (_range_2504_ms, 'filtered_bsl_2504_dfs'),
    (_range_2503_ms, 'bsl_2503_df'),
):
    plt.hist(_values, bins, alpha=0.5, label=_legend_name)

# Labels, title and legend
plt.xlabel('Range(ms)')
plt.ylabel('Frequency')
plt.title('Histograms with Same Bin Width')
plt.legend()

# Display the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean (±1 standard deviation) of filtered_bsl_2502_no_outliers_calc's
# 'Calculated Distance (m)', plotted in centimetres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
filtered_bsl_2502_no_outliers_calc['Record Time'] = pd.to_datetime(filtered_bsl_2502_no_outliers_calc['Record Time'])

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2502_no_outliers_calc.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Group data into 30-day bins and calculate mean and standard deviation
binned_data = filtered_bsl_2502_no_outliers_calc.resample('30D', on='Record Time').agg({
    'Calculated Distance (m)': ['mean', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean Distance', 'Std Dev']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Convert metres to centimetres
binned_data['Mean Distance'] *= 100
binned_data['Std Dev'] *= 100

# Plot the binned mean values with error bars (std dev)
plt.figure(figsize=(12, 7))
plt.errorbar(
    binned_data['Record Time'],
    binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-', color='blue', label="30-day Binned Mean (±1σ)", capsize=5
)

plt.title("TEOS-10 Distance (30-Day Binned) with Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean (±1 standard deviation) of filtered_bsl_2503_copy's
# 'Calculated Distance (m)', plotted in metres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
filtered_bsl_2503_copy['Record Time'] = pd.to_datetime(filtered_bsl_2503_copy['Record Time'])

# Drop rows without a calculated distance (mutates the frame in place, as before)
filtered_bsl_2503_copy.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Group data into 30-day bins and calculate mean and standard deviation
binned_data = filtered_bsl_2503_copy.resample('30D', on='Record Time').agg({
    'Calculated Distance (m)': ['mean', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean Distance', 'Std Dev']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Plot the binned mean values with error bars (std dev)
plt.figure(figsize=(12, 7))
plt.errorbar(
    binned_data['Record Time'],
    binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-', color='salmon', label="30-day Binned Mean (±1σ)", capsize=5
)

plt.title("Central-West Distance (30-Day Binned) with Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean (±1 standard deviation) of west_east's
# 'Calculated Distance (m)', plotted in metres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
west_east['Record Time'] = pd.to_datetime(west_east['Record Time'])

# Drop rows without a calculated distance (mutates the frame in place, as before)
west_east.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Group data into 30-day bins and calculate mean and standard deviation
binned_data = west_east.resample('30D', on='Record Time').agg({
    'Calculated Distance (m)': ['mean', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean Distance', 'Std Dev']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Plot the binned mean values with error bars (std dev)
plt.figure(figsize=(12, 7))
plt.errorbar(
    binned_data['Record Time'],
    binned_data['Mean Distance'],
    yerr=binned_data['Std Dev'],
    fmt='o-', color='blue', label="30-day Binned Mean (±1σ)", capsize=5
)

plt.title("West-east(30-Day Binned) with Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (m)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


## new figures

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean and median (both with ±1 standard deviation bars) of
# R2504_2502's 'Calculated Distance (m)', plotted in centimetres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
R2504_2502['Record Time'] = pd.to_datetime(R2504_2502['Record Time'])

# Drop rows without a calculated distance (mutates the frame in place, as before)
R2504_2502.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Group data into 30-day bins; calculate mean, median, and std
binned_data = R2504_2502.resample('30D', on='Record Time').agg({
    'Calculated Distance (m)': ['mean', 'median', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean (m)', 'Median (m)', 'Std Dev (m)']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Add centimetre versions of each statistic
for _stat in ('Mean', 'Median', 'Std Dev'):
    binned_data[f'{_stat} (cm)'] = binned_data[f'{_stat} (m)'] * 100

# Plot mean and median; both series use the bin's standard deviation as the error bar
plt.figure(figsize=(12, 7))
for _value_col, _fmt, _label in (
    ('Mean (cm)', 'o-', "30-day Binned Mean (±1σ)"),
    ('Median (cm)', 's--', "30-day Binned Median (±1σ)"),  # different marker/line style to distinguish from mean
):
    plt.errorbar(
        binned_data['Record Time'],
        binned_data[_value_col],
        yerr=binned_data['Std Dev (cm)'],
        fmt=_fmt,
        capsize=5,
        label=_label
    )

plt.title("Central-East Distance (30-Day Binned) with Mean and Median + Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Overlaid histograms of R2504_2502's distances (in cm) relative to the overall
# mean and to the overall median.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to datetime if needed
R2504_2502['Record Time'] = pd.to_datetime(R2504_2502['Record Time'])

# Drop rows without a calculated distance (mutates the frame in place, as before)
R2504_2502.dropna(subset=['Calculated Distance (m)'], inplace=True)

# Overall mean and median, converted from metres to centimetres
mean_val = R2504_2502['Calculated Distance (m)'].mean() * 100
median_val = R2504_2502['Calculated Distance (m)'].median() * 100

# Convert distances to cm
distances_cm = R2504_2502['Calculated Distance (m)'] * 100

# Residuals about each central value
residuals_mean = distances_cm - mean_val
residuals_median = distances_cm - median_val

# Plot the two sets of residuals on the same axes
plt.figure(figsize=(12, 7))
for _residuals, _label in (
    (residuals_mean, 'Residual: Data - Mean'),
    (residuals_median, 'Residual: Data - Median'),
):
    plt.hist(_residuals, bins=30, alpha=0.5, label=_label)

# Reference line at zero residual
plt.axvline(x=0, linestyle='--', color='k', label="Zero Residual")

plt.title("Histogram of Residuals: Data vs. Overall Mean and Median: Central-East")
plt.xlabel("Residual (cm)")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean and median (both with ±1 standard deviation bars) of
# R2503_2504's 'Harmonic Distance (m)', plotted in centimetres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
R2503_2504['Record Time'] = pd.to_datetime(R2503_2504['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2503_2504.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Group data into 30-day bins; calculate mean, median, and std dev
binned_data = R2503_2504.resample('30D', on='Record Time').agg({
    'Harmonic Distance (m)': ['mean', 'median', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean (m)', 'Median (m)', 'Std Dev (m)']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Add centimetre versions of each statistic
for _stat in ('Mean', 'Median', 'Std Dev'):
    binned_data[f'{_stat} (cm)'] = binned_data[f'{_stat} (m)'] * 100

# Plot mean and median; both series use the bin's standard deviation as the error bar
plt.figure(figsize=(12, 7))
for _value_col, _fmt, _label in (
    ('Mean (cm)', 'o-', "30-day Binned Mean (±1σ)"),
    ('Median (cm)', 's--', "30-day Binned Median (±1σ)"),
):
    plt.errorbar(
        binned_data['Record Time'],
        binned_data[_value_col],
        yerr=binned_data['Std Dev (cm)'],
        fmt=_fmt,
        capsize=5,
        label=_label
    )

plt.title("West-Central Distance (30-Day Binned) with Mean and Median + Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Overlaid histograms of R2503_2504's harmonic distances (in cm) relative to the
# overall mean and to the overall median.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format
R2503_2504['Record Time'] = pd.to_datetime(R2503_2504['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2503_2504.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Overall mean and median, converted from metres to centimetres
mean_val = R2503_2504['Harmonic Distance (m)'].mean() * 100
median_val = R2503_2504['Harmonic Distance (m)'].median() * 100

# Convert all distances to cm
distances_cm = R2503_2504['Harmonic Distance (m)'] * 100

# Residuals about each central value
residuals_mean = distances_cm - mean_val
residuals_median = distances_cm - median_val

# Plot the two sets of residuals on the same axes
plt.figure(figsize=(12, 7))
for _residuals, _label in (
    (residuals_mean, 'Residuals: Data - Mean'),
    (residuals_median, 'Residuals: Data - Median'),
):
    plt.hist(_residuals, bins=30, alpha=0.5, label=_label)

# Reference line at zero residual
plt.axvline(x=0, linestyle='--', color='k', label='Zero Residual')

plt.title("Histogram of Residuals (Data vs. Overall Mean & Median)\nWest-Central Distance")
plt.xlabel("Residual (cm)")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean and median (both with ±1 standard deviation bars) of
# R2503_2502's 'Harmonic Distance (m)', plotted in centimetres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
R2503_2502['Record Time'] = pd.to_datetime(R2503_2502['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2503_2502.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Group data into 30-day bins; calculate mean, median, and std dev
binned_data = R2503_2502.resample('30D', on='Record Time').agg({
    'Harmonic Distance (m)': ['mean', 'median', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean (m)', 'Median (m)', 'Std Dev (m)']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Add centimetre versions of each statistic
for _stat in ('Mean', 'Median', 'Std Dev'):
    binned_data[f'{_stat} (cm)'] = binned_data[f'{_stat} (m)'] * 100

# Plot mean and median; both series use the bin's standard deviation as the error bar
plt.figure(figsize=(12, 7))
for _value_col, _fmt, _label in (
    ('Mean (cm)', 'o-', "30-day Binned Mean (±1σ)"),
    ('Median (cm)', 's--', "30-day Binned Median (±1σ)"),
):
    plt.errorbar(
        binned_data['Record Time'],
        binned_data[_value_col],
        yerr=binned_data['Std Dev (cm)'],
        fmt=_fmt,
        capsize=5,
        label=_label
    )

plt.title("West-East Distance (30-Day Binned) with Mean and Median + Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Overlaid histograms of R2503_2502's harmonic distances (in cm) relative to the
# overall mean and to the overall median.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (if not already done)
R2503_2502['Record Time'] = pd.to_datetime(R2503_2502['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2503_2502.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Overall mean and median, converted from metres to centimetres
mean_val = R2503_2502['Harmonic Distance (m)'].mean() * 100
median_val = R2503_2502['Harmonic Distance (m)'].median() * 100

# Convert all distances to cm
distances_cm = R2503_2502['Harmonic Distance (m)'] * 100

# Residuals about each central value
residuals_mean = distances_cm - mean_val
residuals_median = distances_cm - median_val

# Plot the two sets of residuals on the same axes
plt.figure(figsize=(12, 7))
for _residuals, _label in (
    (residuals_mean, 'Residuals: Data - Mean'),
    (residuals_median, 'Residuals: Data - Median'),
):
    plt.hist(_residuals, bins=30, alpha=0.5, label=_label)

# Reference line at zero residual
plt.axvline(x=0, linestyle='--', color='k', label="Zero Residual")

plt.title("Histogram of Residuals (Data vs. Overall Mean & Median)\nWest-East Distance")
plt.xlabel("Residual (cm)")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Compare three datasets by plotting, for each, the 30-day binned median distance
# minus that dataset's own overall median (all in centimetres).


def _binned_median_anomaly(source_df, distance_col):
    """Return (clean_df, overall_median_cm, binned_df) for one dataset.

    clean_df is a copy of source_df with 'Record Time' as datetime and rows lacking
    distance_col removed. binned_df holds the 30-day median of distance_col plus
    'Median (cm)' and 'Diff from Overall Median (cm)' columns.
    """
    clean_df = source_df.copy()
    clean_df['Record Time'] = pd.to_datetime(clean_df['Record Time'])
    clean_df.dropna(subset=[distance_col], inplace=True)

    # Overall median, metres -> centimetres
    overall_median_cm = clean_df[distance_col].median() * 100

    # 30-day binned median, with the bin start as a regular column
    binned_df = clean_df.resample('30D', on='Record Time').agg({
        distance_col: 'median'
    }).reset_index()
    binned_df['Median (cm)'] = binned_df[distance_col] * 100
    binned_df['Diff from Overall Median (cm)'] = binned_df['Median (cm)'] - overall_median_cm
    return clean_df, overall_median_cm, binned_df


# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# 1) R2504_2502 ("Central-East") uses the 'Calculated Distance (m)' column
df_ce, overall_median_ce, df_ce_binned = _binned_median_anomaly(R2504_2502, 'Calculated Distance (m)')

# 2) R2503_2504 ("West-Central") uses the 'Harmonic Distance (m)' column
df_wc, overall_median_wc, df_wc_binned = _binned_median_anomaly(R2503_2504, 'Harmonic Distance (m)')

# 3) R2503_2502 ("West-East") uses the 'Harmonic Distance (m)' column
df_we, overall_median_we, df_we_binned = _binned_median_anomaly(R2503_2502, 'Harmonic Distance (m)')

# Plot all three "difference from median" series on one figure
plt.figure(figsize=(12, 7))

for _binned_df, _marker, _label in (
    (df_ce_binned, 'o', 'R2504_2502 (Central-East)'),
    (df_wc_binned, 's', 'R2503_2504 (West-Central)'),
    (df_we_binned, '^', 'R2503_2502 (West-East)'),
):
    plt.plot(
        _binned_df['Record Time'],
        _binned_df['Diff from Overall Median (cm)'],
        marker=_marker,
        label=_label
    )

# Zero line: a bin whose median equals the dataset's overall median
plt.axhline(y=0, color='k', linestyle='--', label='Overall Median Line')
plt.title("30-Day Binned Median Minus Overall Median for Each Dataset")
plt.xlabel("Record Time")
plt.ylabel("Difference from Own Overall Median (cm)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean and median (both with ±1 standard deviation bars) of
# R2504_2503's 'Harmonic Distance (m)', plotted in centimetres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
R2504_2503['Record Time'] = pd.to_datetime(R2504_2503['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2504_2503.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Group data into 30-day bins; calculate mean, median, and std dev
binned_data = R2504_2503.resample('30D', on='Record Time').agg({
    'Harmonic Distance (m)': ['mean', 'median', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean (m)', 'Median (m)', 'Std Dev (m)']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Add centimetre versions of each statistic
for _stat in ('Mean', 'Median', 'Std Dev'):
    binned_data[f'{_stat} (cm)'] = binned_data[f'{_stat} (m)'] * 100

# Plot mean and median; both series use the bin's standard deviation as the error bar
plt.figure(figsize=(12, 7))
for _value_col, _fmt, _label in (
    ('Mean (cm)', 'o-', "30-day Binned Mean (±1σ)"),
    ('Median (cm)', 's--', "30-day Binned Median (±1σ)"),
):
    plt.errorbar(
        binned_data['Record Time'],
        binned_data[_value_col],
        yerr=binned_data['Std Dev (cm)'],
        fmt=_fmt,
        capsize=5,
        label=_label
    )

plt.title("Central-West Distance (30-Day Binned) with Mean and Median + Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Overlaid histograms of R2504_2503's harmonic distances (in cm) relative to the
# overall mean and to the overall median.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (if not already done)
R2504_2503['Record Time'] = pd.to_datetime(R2504_2503['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2504_2503.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Overall mean and median, converted from metres to centimetres
mean_val = R2504_2503['Harmonic Distance (m)'].mean() * 100
median_val = R2504_2503['Harmonic Distance (m)'].median() * 100

# Convert all distances to cm
distances_cm = R2504_2503['Harmonic Distance (m)'] * 100

# Residuals about each central value
residuals_mean = distances_cm - mean_val
residuals_median = distances_cm - median_val

# Plot the two sets of residuals on the same axes
plt.figure(figsize=(12, 7))
for _residuals, _label in (
    (residuals_mean, 'Residuals: Data - Mean'),
    (residuals_median, 'Residuals: Data - Median'),
):
    plt.hist(_residuals, bins=30, alpha=0.5, label=_label)

# Reference line at zero residual
plt.axvline(0, linestyle='--', color='k', label='Zero Residual')

plt.title("Histogram of Residuals (Data vs. Overall Mean & Median)\nCentral-West Distance")
plt.xlabel("Residual (cm)")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 30-day binned mean and median (both with ±1 standard deviation bars) of
# R2502_2504's 'Harmonic Distance (m)', plotted in centimetres.

# Apply the font size first: rcParams only affect text created afterwards, so
# updating them after plotting leaves this figure's title/labels/legend unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format (required by resample)
R2502_2504['Record Time'] = pd.to_datetime(R2502_2504['Record Time'])

# Drop rows without a harmonic distance (mutates the frame in place, as before)
R2502_2504.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Group data into 30-day bins; calculate mean, median, and std dev
binned_data = R2502_2504.resample('30D', on='Record Time').agg({
    'Harmonic Distance (m)': ['mean', 'median', 'std']
})

# Flatten the (column, statistic) MultiIndex into plain names
binned_data.columns = ['Mean (m)', 'Median (m)', 'Std Dev (m)']

# Turn the bin-start index into a regular column for plotting
binned_data = binned_data.reset_index()

# Add centimetre versions of each statistic
for _stat in ('Mean', 'Median', 'Std Dev'):
    binned_data[f'{_stat} (cm)'] = binned_data[f'{_stat} (m)'] * 100

# Plot mean and median; both series use the bin's standard deviation as the error bar
plt.figure(figsize=(12, 7))
for _value_col, _fmt, _label in (
    ('Mean (cm)', 'o-', "30-day Binned Mean (±1σ)"),
    ('Median (cm)', 's--', "30-day Binned Median (±1σ)"),
):
    plt.errorbar(
        binned_data['Record Time'],
        binned_data[_value_col],
        yerr=binned_data['Std Dev (cm)'],
        fmt=_fmt,
        capsize=5,
        label=_label
    )

plt.title("East-Central Distance (30-Day Binned) with Mean & Median + Error Bars")
plt.xlabel("Record Time")
plt.ylabel("Calculated Distance (cm)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# East-Central pair (R2502_2504): histogram of each distance sample's
# residual (in cm) relative to the overall mean and the overall median.

# Set the font size BEFORE any artist is created. Matplotlib reads rcParams
# when text objects are built, so updating them after plotting (as this cell
# previously did) leaves the current figure unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format.
# NOTE: this assignment and the dropna below modify R2502_2504 in place.
R2502_2504['Record Time'] = pd.to_datetime(R2502_2504['Record Time'])

# Remove rows with NaN in 'Harmonic Distance (m)'
R2502_2504.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Overall mean and median, converted from meters to centimeters
mean_val = R2502_2504['Harmonic Distance (m)'].mean() * 100
median_val = R2502_2504['Harmonic Distance (m)'].median() * 100

# Convert all distances to cm
distances_cm = R2502_2504['Harmonic Distance (m)'] * 100

# Residuals with respect to the overall mean and median
residuals_mean = distances_cm - mean_val
residuals_median = distances_cm - median_val

# Overlay the two residual histograms (semi-transparent so both stay visible)
fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(residuals_mean, bins=30, alpha=0.5, label='Residuals: Data - Mean')
ax.hist(residuals_median, bins=30, alpha=0.5, label='Residuals: Data - Median')

# Reference line at zero residual
ax.axvline(x=0, linestyle='--', color='k', label='Zero Residual')

ax.set_title("Histogram of Residuals (Data vs. Overall Mean & Median)\nEast-Central Distance")
ax.set_xlabel("Residual (cm)")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# East-West pair (R2502_2503): 30-day binned mean and median of the
# harmonic distance, plotted in centimeters with ±1σ error bars.

# Set the font size BEFORE any artist is created. Matplotlib reads rcParams
# when text objects are built, so updating them after plotting (as this cell
# previously did) leaves the current figure unchanged.
plt.rcParams.update({'font.size': 14})

# Ensure 'Record Time' is in datetime format so it can drive the resampling.
# NOTE: this assignment and the dropna below modify R2502_2503 in place.
R2502_2503['Record Time'] = pd.to_datetime(R2502_2503['Record Time'])

# Remove rows with NaN in 'Harmonic Distance (m)'
R2502_2503.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Group data into 30-day bins keyed on 'Record Time'; per bin compute the
# mean, median, and standard deviation of the distance.
binned_data = R2502_2503.resample('30D', on='Record Time').agg({
    'Harmonic Distance (m)': ['mean', 'median', 'std']
})

# Flatten the (column, statistic) header produced by agg into plain names
binned_data.columns = ['Mean (m)', 'Median (m)', 'Std Dev (m)']

# Convert the time bins (index) to a regular 'Record Time' column
binned_data = binned_data.reset_index()

# Convert to centimeters
binned_data['Mean (cm)'] = binned_data['Mean (m)'] * 100
binned_data['Median (cm)'] = binned_data['Median (m)'] * 100
binned_data['Std Dev (cm)'] = binned_data['Std Dev (m)'] * 100

# Plot the binned mean and median; both series use the bin's standard
# deviation as their ±1σ error bar.
fig, ax = plt.subplots(figsize=(12, 7))
ax.errorbar(
    binned_data['Record Time'],
    binned_data['Mean (cm)'],
    yerr=binned_data['Std Dev (cm)'],
    fmt='o-',
    capsize=5,
    label="30-day Binned Mean (±1σ)"
)
ax.errorbar(
    binned_data['Record Time'],
    binned_data['Median (cm)'],
    yerr=binned_data['Std Dev (cm)'],
    fmt='s--',
    capsize=5,
    label="30-day Binned Median (±1σ)"
)

ax.set_title("East-West Distance (30-Day Binned) with Mean & Median + Error Bars")
ax.set_xlabel("Record Time")
ax.set_ylabel("Calculated Distance (cm)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# East-West pair (R2502_2503): histogram of each distance sample's
# residual (in cm) relative to the overall mean and the overall median.

# Set the font size BEFORE any artist is created. Matplotlib reads rcParams
# when text objects are built, so updating them after plotting (as this cell
# previously did) leaves the current figure unchanged.
plt.rcParams.update({'font.size': 14})

# Convert 'Record Time' to datetime if needed.
# NOTE: this assignment and the dropna below modify R2502_2503 in place.
R2502_2503['Record Time'] = pd.to_datetime(R2502_2503['Record Time'])

# Remove rows with NaN in 'Harmonic Distance (m)'
R2502_2503.dropna(subset=['Harmonic Distance (m)'], inplace=True)

# Overall mean and median, converted from meters to centimeters
mean_val = R2502_2503['Harmonic Distance (m)'].mean() * 100
median_val = R2502_2503['Harmonic Distance (m)'].median() * 100

# Convert all distances to cm
distances_cm = R2502_2503['Harmonic Distance (m)'] * 100

# Residuals with respect to the overall mean and median
residuals_mean = distances_cm - mean_val
residuals_median = distances_cm - median_val

# Overlay the two residual histograms (semi-transparent so both stay visible)
fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(residuals_mean, bins=30, alpha=0.5, label='Residuals: Data - Mean')
ax.hist(residuals_median, bins=30, alpha=0.5, label='Residuals: Data - Median')

# Reference line at zero residual
ax.axvline(0, linestyle='--', color='k', label='Zero Residual')

ax.set_title("Histogram of Residuals (Data vs. Overall Mean & Median)\nEast-West Distance")
ax.set_xlabel("Residual (cm)")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Summary table: per station pair, the mean harmonic distance (km), its
# standard deviation (cm), and their ratio as a precision figure (cm/km).
#
# Pair labels follow the R<from>_<to> pattern used by the other comments and
# plot titles in this notebook: R2502_2504 is "East-Central" and R2502_2503 is
# "East-West", i.e. 2502 = East, 2503 = West, 2504 = Central.

# 1) Drop rows with NaN distance from each DataFrame (originals are untouched)
R2502_2504_clean = R2502_2504.dropna(subset=['Harmonic Distance (m)'])  # East-Central
R2503_2502_clean = R2503_2502.dropna(subset=['Harmonic Distance (m)'])  # West-East
R2503_2504_clean = R2503_2504.dropna(subset=['Harmonic Distance (m)'])  # West-Central
R2504_2503_clean = R2504_2503.dropna(subset=['Harmonic Distance (m)'])  # Central-West
R2502_2503_clean = R2502_2503.dropna(subset=['Harmonic Distance (m)'])  # East-West
R2504_2502_clean = R2504_2502.dropna(subset=['Harmonic Distance (m)'])  # Central-East

# 2) Calculate mean in kilometers and std dev in centimeters for each pair.
#    The East/West entries were previously swapped relative to the frame
#    names (R2503_2502 was labelled "East to West" and vice versa).
clean_frames_by_label = {
    "Central to East": R2504_2502_clean,
    "East to Central": R2502_2504_clean,
    "Central to West": R2504_2503_clean,
    "West to Central": R2503_2504_clean,
    "East to West": R2502_2503_clean,
    "West to East": R2503_2502_clean,
}
distances = {
    label: (
        frame['Harmonic Distance (m)'].mean() / 1000,  # m -> km
        frame['Harmonic Distance (m)'].std() * 100,    # m -> cm
    )
    for label, frame in clean_frames_by_label.items()
}

# 3) Print results with precision (std dev in cm per km of mean distance)
print("=== Mean Distances in km, Std Dev in cm, & Precision (cm/km) ===\n")
for label, (mean_km, std_cm) in distances.items():
    precision_cm_per_km = std_cm / mean_km  # Compute precision
    print(f"{label}: Mean = {mean_km:.4f} km, Std Dev = {std_cm:.4f} cm, Precision = {precision_cm_per_km:.2f} cm/km")
