In [1]:
import pymongo
import pandas as pd
from scipy.stats import pearsonr
import plotly.express as px



# Connection settings for the monitoring database used throughout the notebook.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "Monitoring"        # alternative dataset: "SteelArena"
COLLECTION_NAME = "PRJ-16"    # alternative dataset: "Init"

# `collection` is the handle every aggregation below runs against.
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

In [4]:
def _middle_of_sorted_expr(sensor_name):
    """Return the aggregation expression that picks the middle element of
    the ascending-sorted ``<sensor_name>_values`` array."""
    sorted_var = f"sorted_{sensor_name}"
    sorted_ref = f"$${sorted_var}"
    return {
        "$let": {
            "vars": {
                sorted_var: {
                    "$sortArray": {
                        "input": f"${sensor_name}_values",
                        "sortBy": 1
                    }
                }
            },
            "in": {
                "$arrayElemAt": [
                    sorted_ref,
                    # Index floor(size / 2): the exact median for odd sizes,
                    # the upper of the two central values for even sizes.
                    {"$floor": {"$divide": [{"$size": sorted_ref}, 2]}}
                ]
            }
        }
    }


def generate_pipeline(sensor1_name, sensor2_name, bin_interval, multiplier):
    """
    Builds the MongoDB aggregation pipeline that bins two sensors' readings by
    time and reduces every bin to one representative value per sensor.

    Stages, in order:
      1. $match     - keep documents where both ``<sensor2_name>.t`` and
                      ``<sensor1_name>.s`` exist.
      2. $addFields - add ``time_bin``: ``$time.datetime`` converted to a date
                      and truncated to ``bin_interval``-minute bins.
      3. $group     - collect the ``.t`` and ``.s`` readings of each bin into
                      arrays.
      4. $project   - replace each array by the middle element of its sorted
                      form; the sensor1 value is then scaled by ``multiplier``.
      5. $sort      - order the bins by ``time_bin`` ascending.

    Note: the "median" is the element at index floor(n / 2) of the sorted
    array, so for bins with an even number of readings it is the upper of the
    two central values rather than their average.

    Parameters:
    sensor1_name (str): Document field whose ``.s`` sub-field is read
        (the notebook passes the strain sensor, e.g. 'S27').
    sensor2_name (str): Document field whose ``.t`` sub-field is read
        (the notebook passes the temperature sensor, e.g. 'T4').
    bin_interval (int): Bin width in minutes (``binSize`` of ``$dateTrunc``).
    multiplier (float): Factor applied to the sensor1 median
        (presumably a raw-reading-to-strain conversion - confirm).

    Returns:
    list: The pipeline stages, ready for ``collection.aggregate``. Output
    documents carry ``time_bin``, ``median_<sensor2_name>`` and
    ``median_<sensor1_name>``.
    """
    match_stage = {
        "$match": {
            f"{sensor2_name}.t": {"$exists": True},
            f"{sensor1_name}.s": {"$exists": True}
        }
    }
    bin_stage = {
        "$addFields": {
            "time_bin": {
                "$dateTrunc": {
                    "date": {"$toDate": "$time.datetime"},
                    "unit": "minute",
                    "binSize": bin_interval
                }
            }
        }
    }
    group_stage = {
        "$group": {
            "_id": "$time_bin",
            f"{sensor2_name}_values": {"$push": f"${sensor2_name}.t"},
            f"{sensor1_name}_values": {"$push": f"${sensor1_name}.s"}
        }
    }
    project_stage = {
        "$project": {
            "time_bin": "$_id",
            "_id": 0,
            f"median_{sensor2_name}": _middle_of_sorted_expr(sensor2_name),
            f"median_{sensor1_name}": {
                "$multiply": [_middle_of_sorted_expr(sensor1_name), multiplier]
            }
        }
    }
    sort_stage = {"$sort": {"time_bin": 1}}

    return [match_stage, bin_stage, group_stage, project_stage, sort_stage]


# Example usage
pipeline = generate_pipeline("S27", "T4", 18, 3.11e-9)

In [3]:
def calculate_pearson_correlation(df, TLeaf, SLeaf):
    """
    Calculates the Pearson correlation coefficient and p-value between the
    ``median_<TLeaf>`` and ``median_<SLeaf>`` columns of a DataFrame.

    Rows where either of the two columns is missing are ignored, so the caller
    does not have to drop NaNs first. The results are also printed.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    TLeaf (str): The name of the temperature sensor (e.g., 'T4').
    SLeaf (str): The name of the strain sensor (e.g., 'S27').

    Returns:
    tuple: A tuple containing the Pearson correlation coefficient and the p-value.

    Raises:
    KeyError: If either ``median_`` column is absent from ``df``.
    ValueError: If fewer than two complete (temperature, strain) pairs remain.
    """
    temperature = df[f'median_{TLeaf}']
    strain = df[f'median_{SLeaf}']

    # Keep only rows where both readings are present; a single NaN would
    # otherwise make the whole coefficient NaN.
    complete = temperature.notna() & strain.notna()
    n_pairs = int(complete.sum())
    if n_pairs < 2:
        raise ValueError(
            f"Need at least 2 complete pairs to correlate {TLeaf} with {SLeaf}, "
            f"got {n_pairs}.")

    # Calculate Pearson correlation and p-value
    correlation, p_value = pearsonr(temperature[complete], strain[complete])

    # Display the results
    print(f"Pearson correlation coefficient: {correlation}")
    print(f"P-value: {p_value}")

    return correlation, p_value

In [39]:
def create_scatter_plot(df, TLeaf, SLeaf):
    """
    Displays a Plotly scatter of median strain (x) against median temperature (y).

    Note: later cells in this notebook define ``create_scatter_plot`` again
    with a longer signature; once those cells run, this version is shadowed.

    Parameters:
    df (DataFrame): Frame holding the ``median_<SLeaf>`` and ``median_<TLeaf>`` columns.
    TLeaf (str): The name of the temperature sensor (e.g., 'T4').
    SLeaf (str): The name of the strain sensor (e.g., 'S27').

    Returns:
    None
    """
    strain_col, temperature_col = f'median_{SLeaf}', f'median_{TLeaf}'

    # Human-readable axis captions keyed by column name.
    axis_labels = {
        strain_col: f'Median {SLeaf}',
        temperature_col: f'Median {TLeaf}',
    }

    figure = px.scatter(
        df,
        x=strain_col,
        y=temperature_col,
        labels=axis_labels,
        title=f'Scatter Plot of {SLeaf} vs {TLeaf}',
    )
    figure.show()

# Example usage
# create_scatter_plot(df, 'T4', 'S27')

In [48]:
def create_scatter_plot(df, TLeaf, SLeaf, correlation, p_value, save=False, show=False):
    """
    Draws median strain (x) against median temperature (y), coloured by the
    DataFrame index, with the Pearson r and p-value shown in a boxed annotation.

    Note: a later cell in this notebook redefines ``create_scatter_plot`` with
    the same signature; once that cell runs, this version is shadowed.

    Parameters:
    df (DataFrame): Frame holding the ``median_<SLeaf>`` and ``median_<TLeaf>`` columns.
    TLeaf (str): The name of the temperature sensor (e.g., 'T4').
    SLeaf (str): The name of the strain sensor (e.g., 'S27').
    correlation (float): The Pearson correlation coefficient.
    p_value (float): The p-value of the Pearson correlation.
    save (bool): If True, writes ``scatter_plot_<TLeaf>_vs_<SLeaf>.png``.
    show (bool): If True, displays the figure.

    Returns:
    None
    """
    strain_col = f'median_{SLeaf}'
    temperature_col = f'median_{TLeaf}'
    row_index = df.index  # stands in for time; assumes rows are in time order

    axis_labels = {
        strain_col: f'Median {SLeaf}',
        temperature_col: f'Median {TLeaf}',
        'color': 'Time (index)',
    }

    fig = px.scatter(
        df,
        x=strain_col,
        y=temperature_col,
        color=row_index,
        color_continuous_scale=px.colors.sequential.Viridis,
        labels=axis_labels,
        title=f'Scatter Plot of {SLeaf} vs {TLeaf}',
    )

    # Statistics box, positioned in paper coordinates near the lower-right corner.
    stats_box = dict(
        text=f"r = {correlation:.2f}<br>p = {p_value:.2e}",
        xref="paper", yref="paper",
        x=0.95, y=0.05,
        showarrow=False,
        font=dict(size=12),
        align="left",
        bordercolor="black",
        borderwidth=1,
        bgcolor="rgba(255, 255, 255, 0.8)",
        opacity=0.9,
    )
    fig.add_annotation(**stats_box)

    # Colour bar only marks the first and last index values.
    colorbar_style = dict(
        title="Time",
        tickvals=[row_index.min(), row_index.max()],
        ticktext=["Start", "End"],
    )
    fig.update_layout(
        margin=dict(r=50),
        title_x=0.5,
        coloraxis_colorbar=colorbar_style,
    )

    if save:
        filename = f"scatter_plot_{TLeaf}_vs_{SLeaf}.png"
        fig.write_image(filename)
        print(f"Plot saved as {filename}")

    if show:
        fig.show()

In [92]:
def create_scatter_plot(df, TLeaf, SLeaf, correlation, p_value, save=False, show=False,
                        x_span=0.00084):
    """
    Creates a scatter plot of median strain (x) against median temperature (y),
    includes the Pearson correlation and p-value as an annotation, applies a
    color gradient based on row order, and optionally saves and/or shows the plot.

    The x-axis is fixed to a window of width ``x_span`` centred on the mean of
    the strain column (presumably so plots of different sensors share one
    scale - confirm). Points outside that window are outside the initial view.

    This definition reuses the name of earlier ``create_scatter_plot`` cells in
    the notebook and replaces them once executed.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    TLeaf (str): The name of the temperature sensor (e.g., 'T4').
    SLeaf (str): The name of the strain sensor (e.g., 'S27').
    correlation (float): The Pearson correlation coefficient.
    p_value (float): The p-value of the Pearson correlation.
    save (bool): If True, saves the plot as ``scatter_plot_<TLeaf>_vs_<SLeaf>.png``
        (uses ``fig.write_image``, which needs a Plotly static-image backend).
    show (bool): If True, displays the plot.
    x_span (float or None): Total width of the x-axis window. Defaults to
        0.00084, the value previously hard-coded. Pass None to let Plotly
        autoscale the x-axis.

    Returns:
    None
    """
    strain_col = f'median_{SLeaf}'
    temperature_col = f'median_{TLeaf}'

    # Create a scatter plot with color gradient based on time
    fig = px.scatter(df, x=strain_col, y=temperature_col,
                     color=df.index,  # Assuming the DataFrame is sorted by time
                     color_continuous_scale=px.colors.sequential.Viridis,
                     labels={strain_col: f'Median {SLeaf}',
                             temperature_col: f'Median {TLeaf}',
                             'color': 'Time (index)'},
                     title=f'Scatter Plot of {SLeaf} vs {TLeaf}')

    # Add annotation for correlation and p-value
    fig.add_annotation(
        text=f"r = {correlation:.2f}<br>p = {p_value:.2e}",
        xref="paper", yref="paper",
        x=0.95, y=0.05,
        showarrow=False,
        font=dict(size=12),
        align="left",
        bordercolor="black",
        borderwidth=1,
        bgcolor="rgba(255, 255, 255, 0.8)",  # Background with transparency
        opacity=0.9
    )

    layout_options = dict(
        margin=dict(r=50),
        title_x=0.5,
        coloraxis_colorbar=dict(
            title="Time",
            tickvals=[df.index.min(), df.index.max()],
            ticktext=["Start", "End"]
        )
    )

    # x-axis limits: mean of the strain column +/- x_span / 2
    if x_span is not None:
        mean_value = df[strain_col].mean()
        half_span = x_span / 2
        layout_options["xaxis"] = dict(
            range=[mean_value - half_span, mean_value + half_span])

    fig.update_layout(**layout_options)

    # Save the plot if save=True
    if save:
        filename = f"scatter_plot_{TLeaf}_vs_{SLeaf}.png"
        fig.write_image(filename)
        print(f"Plot saved as {filename}")

    # Show the plot
    if show:
        fig.show()

In [51]:
# Temperature sensor -> strain sensors analysed against it.
# Bound to a name (not just displayed) because the summary-statistics cell
# further down iterates over `sensor_pairs`.
sensor_pairs = {
    "T1": ["S7", "S8", "S9", "S10", "S11", "S12", "S13", "S14"],
    "T2": ["S15", "S16", "S17", "S18", "S11", "S12", "S13", "S14", "S22", "S23", "S25", "S26"],
    "T3": ["S22", "S23", "S25", "S26", "S19", "S20", "S21", "S24"]
}

sensor_pairs

{'T1': ['S7', 'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14'],
 'T2': ['S15',
  'S16',
  'S17',
  'S18',
  'S11',
  'S12',
  'S13',
  'S14',
  'S22',
  'S23',
  'S25',
  'S26'],
 'T3': ['S22', 'S23', 'S25', 'S26', 'S19', 'S20', 'S21', 'S24']}

In [93]:
import pandas as pd

# Sensor pair under study: one strain sensor against one temperature sensor.
SLeaf, TLeaf = "S23", "T3"

# Aggregate into 20-minute bins, load into pandas, and discard any row with a
# missing value.
df = pd.DataFrame(
    list(collection.aggregate(generate_pipeline(SLeaf, TLeaf, 20, 3.11e-9)))
).dropna()

# Correlate the two median series, then show (but do not save) the scatter plot.
correlation, p_value = calculate_pearson_correlation(df, TLeaf, SLeaf)
create_scatter_plot(
    df, TLeaf, SLeaf, correlation, p_value, save=False, show=True)

Pearson correlation coefficient: -0.2647628202916346
P-value: 5.83326420021707e-27


In [33]:
# Inspect the aggregated frame (the rendering elides the middle rows).
# NOTE(review): the saved output below lists median_T3 / median_S19 and carries
# execution count 33, so it appears to predate the S23 run in the cell above.
df

Unnamed: 0,time_bin,median_T3,median_S19
0,2024-07-23 16:40:00,,-0.001005
1,2024-07-23 17:00:00,,-0.000967
2,2024-07-23 17:20:00,,-0.000956
3,2024-07-23 17:40:00,,-0.000950
4,2024-07-23 18:00:00,,-0.000941
...,...,...,...
1705,2024-08-16 09:00:00,29.1875,-0.000845
1706,2024-08-16 09:20:00,29.2500,-0.000843
1707,2024-08-16 09:40:00,29.3750,-0.000848
1708,2024-08-16 10:00:00,29.5000,-0.000838


In [95]:
# 50th percentile (as reported by describe()) of the binned strain medians
# for the currently selected SLeaf.
df[f"median_{SLeaf}"].describe()["50%"]

-0.00251033913

In [73]:
# Collect one describe() column per (temperature, strain) pair and join them
# in a single concat, instead of re-concatenating onto a growing DataFrame.
# NOTE: relies on `sensor_pairs` having been defined by an earlier-executed cell.
stats_columns = []

# Iterate over the sensor pairs
for TLeaf, SLeaf_list in sensor_pairs.items():
    for SLeaf in SLeaf_list:
        # Generate the pipeline for the current pair
        pipeline = generate_pipeline(SLeaf, TLeaf, 20, 3.11e-9)

        # Execute the pipeline and convert results to a DataFrame
        df = pd.DataFrame(list(collection.aggregate(pipeline)))

        # Pairs with no matching documents contribute no column
        if df.empty:
            continue

        # Statistics of the strain column, as a one-column frame named SLeaf.
        # A strain sensor listed under several temperature sensors yields
        # several columns with the same label.
        stats_columns.append(
            df[f"median_{SLeaf}"].describe().to_frame(name=SLeaf))

# pd.concat rejects an empty list, so fall back to an empty frame
summary_df = pd.concat(stats_columns, axis=1) if stats_columns else pd.DataFrame()

# Display the summary DataFrame
summary_df

Unnamed: 0,S7,S8,S9,S10,S11,S12,S13,S14,S15,S16,...,S25,S26,S22,S23,S25.1,S26.1,S19,S20,S21,S24
count,1365.0,1365.0,1365.0,1365.0,1307.0,1307.0,1305.0,1305.0,1641.0,1640.0,...,120.0,120.0,1664.0,1593.0,1652.0,1652.0,1710.0,1710.0,1667.0,1710.0
mean,-0.001703,-0.000395,0.000285,-0.002239,-0.001727,0.002071,-0.001457,0.001075,-0.001418,0.000138,...,0.000442,-0.000506,0.000585,-0.002556,0.000491,-0.000539,-0.000855,0.00038,0.003053,-0.000299
std,4.6e-05,0.000111,2e-05,2.9e-05,4.4e-05,2.6e-05,4.5e-05,2.7e-05,8.8e-05,5.9e-05,...,4.9e-05,3.7e-05,4.2e-05,0.000131,7.7e-05,6.6e-05,3.3e-05,5.2e-05,0.000477,0.000119
min,-0.001824,-0.00083,0.000228,-0.002442,-0.00187,0.001976,-0.001582,0.000921,-0.001904,1.8e-05,...,0.000369,-0.000632,0.000482,-0.002871,0.000322,-0.000762,-0.001005,0.000234,0.001059,-0.000924
25%,-0.001734,-0.000402,0.000271,-0.002254,-0.001758,0.002057,-0.001482,0.00106,-0.00144,0.000102,...,0.0004,-0.000521,0.000559,-0.002665,0.000428,-0.000587,-0.000876,0.00035,0.002649,-0.000356
50%,-0.001704,-0.000366,0.000284,-0.002239,-0.001733,0.002075,-0.001459,0.001081,-0.001415,0.000129,...,0.000439,-0.000494,0.000581,-0.00251,0.000482,-0.00052,-0.000856,0.000394,0.002968,-0.000273
75%,-0.001671,-0.000332,0.000299,-0.002223,-0.001695,0.002092,-0.001425,0.001095,-0.001371,0.000169,...,0.000475,-0.000482,0.00061,-0.002446,0.000553,-0.000484,-0.000833,0.000423,0.003407,-0.000219
max,-0.001584,-0.000255,0.000358,-0.002133,-0.001615,0.002115,-0.001345,0.001128,-0.001276,0.000314,...,0.000574,-0.000454,0.000689,-0.002374,0.000669,-0.000452,-0.000777,0.000455,0.00414,-9.2e-05


In [78]:
# Spread of each column: its 'max' statistic minus its 'min' statistic.
row_max, row_min = summary_df.loc['max'], summary_df.loc['min']
diff_row = row_max - row_min

# Store the spread as an extra 'diff' row of summary_df.
summary_df.loc['diff'] = diff_row

In [91]:
# Rank the columns by their max-min spread, smallest first.
# Labels can repeat in the result because summary_df holds one column per
# (temperature, strain) pairing rather than one per strain sensor.
summary_df.loc['diff'].sort_values()

S11    0.000022
S12    0.000095
S13    0.000124
S14    0.000127
S9     0.000130
S12    0.000139
S22    0.000155
S26    0.000178
S25    0.000205
S14    0.000207
S22    0.000207
S20    0.000221
S18    0.000227
S19    0.000227
S17    0.000232
S13    0.000237
S7     0.000240
S11    0.000255
S16    0.000296
S10    0.000309
S26    0.000310
S25    0.000347
S23    0.000415
S23    0.000497
S8     0.000575
S15    0.000628
S24    0.000832
S21    0.003081
Name: diff, dtype: float64

In [97]:
import pandas as pd

# Temperature sensor -> strain sensors to correlate in this batch.
# Only the T4-T6 pairs are processed here.
sensor_pairs = {
    "T4": ["S27"],
    "T5": ["S28"],
    "T6": ["S29"]
}

# For every (temperature, strain) pair: aggregate, correlate, save a scatter plot.
for TLeaf, SLeaves in sensor_pairs.items():
    for SLeaf in SLeaves:
        pipeline = generate_pipeline(SLeaf, TLeaf, 20, 3.11e-9)

        # 20-minute bins loaded into pandas, rows with any missing value removed.
        df = pd.DataFrame(list(collection.aggregate(pipeline))).dropna()

        # Nothing to correlate or plot for this pair.
        if df.empty:
            continue

        correlation, p_value = calculate_pearson_correlation(df, TLeaf, SLeaf)
        create_scatter_plot(
            df, TLeaf, SLeaf, correlation, p_value, save=True, show=False)

Pearson correlation coefficient: -0.17880534372992574
P-value: 4.2648304236683555e-14
Plot saved as scatter_plot_T4_vs_S27.png
Pearson correlation coefficient: 0.015268313806292205
P-value: 0.5223288955414233
Plot saved as scatter_plot_T5_vs_S28.png
Pearson correlation coefficient: -0.3409816511904938
P-value: 1.7156998141375663e-48
Plot saved as scatter_plot_T6_vs_S29.png


In [113]:
import plotly.graph_objects as go


def plot_temperature_and_strain_over_time(df, TLeaf, SLeaf, save=False, show=True):
    """
    Plots median temperature and median strain against ``time_bin`` as two
    lines on separate y-axes (temperature left, strain right), with a
    horizontal legend above the plot area.

    Parameters:
    df (DataFrame): Frame holding ``time_bin``, ``median_<TLeaf>`` and ``median_<SLeaf>``.
    TLeaf (str): The name of the temperature sensor (e.g., 'T4').
    SLeaf (str): The name of the strain sensor (e.g., 'S27').
    save (bool): If True, writes ``temperature_strain_plot_<TLeaf>_vs_<SLeaf>.png``.
    show (bool): If True, displays the figure.

    Returns:
    None
    """
    timestamps = df['time_bin']
    temperature_axis_title = f'Temperature ({TLeaf})'

    fig = go.Figure()

    # One line per sensor: temperature on the first y-axis, strain on the second.
    for sensor, axis_id in ((TLeaf, 'y1'), (SLeaf, 'y2')):
        fig.add_trace(go.Scatter(
            x=timestamps,
            y=df[f'median_{sensor}'],
            mode='lines',
            name=f'Median {sensor}',
            yaxis=axis_id
        ))

    left_axis = dict(title=temperature_axis_title, side='left')
    # The strain axis is drawn over the temperature axis, without its own grid.
    right_axis = dict(
        title=f'Strain ({SLeaf})',
        side='right',
        overlaying='y',
        showgrid=False
    )
    # Horizontal legend, centred, anchored by its bottom edge just above the plot.
    top_legend = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5
    )

    fig.update_layout(
        title=f'Median Temperature ({TLeaf}) and Strain ({SLeaf}) Over Time',
        xaxis_title='Time',
        yaxis_title=temperature_axis_title,
        yaxis=left_axis,
        yaxis2=right_axis,
        legend=top_legend,
        legend_title='Series',
        hovermode="x unified"
    )

    if save:
        filename = f"temperature_strain_plot_{TLeaf}_vs_{SLeaf}.png"
        fig.write_image(filename)
        print(f"Plot saved as {filename}")

    if show:
        fig.show()

# Example usage
# plot_temperature_and_strain_over_time(df, 'T4', 'S27', save=True, show=True)

In [116]:
# Sensor pair under study: one strain sensor against one temperature sensor.
SLeaf, TLeaf = "S15", "T2"

# Aggregate into 60-minute bins, load into pandas, and discard any row with a
# missing value.
df = pd.DataFrame(
    list(collection.aggregate(generate_pipeline(SLeaf, TLeaf, 60, 3.11e-9)))
).dropna()

# Save and display the dual-axis time series for this pair.
plot_temperature_and_strain_over_time(
    df, TLeaf, SLeaf, save=True, show=True)

Plot saved as temperature_strain_plot_T2_vs_S15.png


In [115]:
import pandas as pd

# Temperature sensor -> strain sensors plotted against it.
sensor_pairs = {
    "T1": ["S7", "S8", "S9", "S10", "S11", "S12", "S13", "S14"],
    "T2": ["S15", "S16", "S17", "S18", "S11", "S12", "S13", "S14", "S22", "S23", "S25", "S26"],
    "T3": ["S22", "S23", "S25", "S26", "S19", "S20", "S21", "S24"]
}

# For every (temperature, strain) pair: aggregate and save a time-series plot.
for TLeaf, SLeaves in sensor_pairs.items():
    for SLeaf in SLeaves:
        pipeline = generate_pipeline(SLeaf, TLeaf, 60, 3.11e-9)

        # 60-minute bins loaded into pandas, rows with any missing value removed.
        df = pd.DataFrame(list(collection.aggregate(pipeline))).dropna()

        # Nothing to plot for this pair.
        if df.empty:
            continue

        # Write the PNG without displaying the figure.
        plot_temperature_and_strain_over_time(
            df, TLeaf, SLeaf, save=True, show=False)

Plot saved as temperature_strain_plot_T1_vs_S7.png
Plot saved as temperature_strain_plot_T1_vs_S8.png
Plot saved as temperature_strain_plot_T1_vs_S9.png
Plot saved as temperature_strain_plot_T1_vs_S10.png
Plot saved as temperature_strain_plot_T1_vs_S11.png
Plot saved as temperature_strain_plot_T1_vs_S12.png
Plot saved as temperature_strain_plot_T1_vs_S13.png
Plot saved as temperature_strain_plot_T1_vs_S14.png
Plot saved as temperature_strain_plot_T2_vs_S15.png
Plot saved as temperature_strain_plot_T2_vs_S16.png
Plot saved as temperature_strain_plot_T2_vs_S17.png
Plot saved as temperature_strain_plot_T2_vs_S18.png
Plot saved as temperature_strain_plot_T2_vs_S11.png
Plot saved as temperature_strain_plot_T2_vs_S12.png
Plot saved as temperature_strain_plot_T2_vs_S13.png
Plot saved as temperature_strain_plot_T2_vs_S14.png
Plot saved as temperature_strain_plot_T2_vs_S22.png
Plot saved as temperature_strain_plot_T2_vs_S23.png
Plot saved as temperature_strain_plot_T2_vs_S25.png
Plot saved as t