# Verifying the phenotype measurement through correlation analysis
**Pearson correlation analysis** measures the strength and direction of the **linear relationship** between two continuous variables.

* The **Pearson correlation coefficient (r)** ranges from **-1 to +1**:

  * **+1** indicates a perfect positive linear relationship,
  * **-1** indicates a perfect negative linear relationship,
  * **0** indicates no linear relationship.

The formula is:

$$
r = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum (x_i - \bar{x})^2} \sqrt{\sum (y_i - \bar{y})^2}}
$$

**Key points**:

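* The coefficient itself does not require normality, but the significance test (p-value) assumes both variables are approximately normally distributed.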
* Sensitive to outliers.
* Only captures linear relationships.

Use it when you want to test whether two variables tend to increase or decrease together in a linear fashion.


In [37]:
import json
import os
from pathlib import Path

import pandas as pd

# Directory holding both input files. Defined once so the notebook can be
# pointed at another location without editing every load call.
DATA_DIR = Path('/work')
ROOT_PIXEL_CSV = DATA_DIR / 'root_pixel_data.csv'
STALK_JSON = DATA_DIR / 'stalk_images_UNET_logistic_regression_cuda.json'

# Record the working directory in the cell output.
print(os.getcwd())

# Load the CSV measurements into a DataFrame.
root_pixel_data = pd.read_csv(ROOT_PIXEL_CSV)

# Load the JSON measurements. The encoding is passed explicitly so parsing does
# not depend on the platform's default locale encoding.
with open(STALK_JSON, 'r', encoding='utf-8') as f:
    stalk_images_data = json.load(f)

/


In [38]:
import pprint

# Preview the first rows of the CSV measurements to inspect the column names
# and typical values before pairing them with the JSON data.
csv_preview = root_pixel_data.head()
pprint.pprint(csv_preview)

                                   Plant ID    Pixels/Label Height  \
0  stalk_images/IMG_20240715_095432_703.jpg                 70.114   
1  stalk_images/IMG_20240716_084956_718.jpg                 71.021   
2  stalk_images/IMG_20240716_084925_713.jpg                 69.181   
3  stalk_images/IMG_20240715_095405_699.jpg                 73.437   
4  stalk_images/IMG_20240725_113429_675.jpg               1018.311   

     W1 roots    W2 roots    W3 roots    W4 roots      Left root Width  \
0           2           6           0           0               27.295   
1          10           9           0           0               29.547   
2           2           7           0           0               21.095   
3           2           9           0           0               21.932   
4           2           6           0           0              226.877   

     Stalk Width    Root Height Left    Root Len Left    Root ht Right  \
0        123.016               141.0            124.0       

In [39]:
# Build a DataFrame from the parsed JSON so each image can be looked up by
# its path (column label).
stalk_images_df = pd.DataFrame(stalk_images_data)

# Show the measurements stored for one example image.
example_image = "/opt/RootTaggingGUI/stalk_images/IMG_20240715_095432_703.jpg"
example_measurements = stalk_images_df[example_image]["measurements"]
pprint.pprint(example_measurements)

{'highest_emergence': [213, 276],
 'marker_width': 25.777830123901367,
 'root_angle': 41.729512076816434,
 'root_count': 11,
 'root_width': 30.363636363636363,
 'spread_center': [244.5, 313.0],
 'spread_width': 129,
 'stalk_width': 61.0}


In [44]:
# Preprocessing: collect paired values for every phenotype
import math

# Phenotypes compared below: marker width, stalk width, root height,
# root count, root width, spread width and angle.
#
# For each phenotype, 'RT' collects the value derived from root_pixel_data
# (the CSV) and 'ours' the value read from stalk_images_df (the JSON).
# Both lists are filled in the same row order, so position i in each list
# refers to the same image.
data = {
    phenotype: {'RT': [], 'ours': []}
    for phenotype in (
        'marker_width',
        'stalk_width',
        'root_height',
        'root_count',
        'root_width',
        'spread_width',
        'angle',
    )
}

for _, row in root_pixel_data.iterrows():
    # Rebuild the JSON key from the file name in the CSV 'Plant ID' path.
    plant_id = '/opt/RootTaggingGUI/stalk_images/' + row['Plant ID'].split('/')[-1]
    if plant_id not in stalk_images_df.columns:
        # Fail with the offending image named instead of a bare KeyError.
        raise KeyError(f"no measurements for image {plant_id!r} in stalk_images_df")

    # Look the measurements dict up once instead of once per field.
    ours = stalk_images_df[plant_id]["measurements"]

    # --- values from our pipeline (JSON) ---
    data['marker_width']['ours'].append(ours["marker_width"])
    data['stalk_width']['ours'].append(ours["stalk_width"])
    # Root height is the difference between the second coordinates of
    # 'spread_center' and 'highest_emergence'.
    # NOTE(review): presumably these are (x, y) pixel positions - confirm.
    data['root_height']['ours'].append(
        ours["spread_center"][1] - ours["highest_emergence"][1]
    )
    data['root_count']['ours'].append(ours["root_count"])
    data['root_width']['ours'].append(ours["root_width"])
    data['spread_width']['ours'].append(ours["spread_width"])
    data['angle']['ours'].append(ours["root_angle"])

    # --- reference values (CSV) ---
    # Column names are used exactly as they appear in the CSV header,
    # including their leading spaces.
    data['marker_width']['RT'].append(row['  Pixels/Label Height'])
    data['root_height']['RT'].append(row['  Root Height Left'])
    data['stalk_width']['RT'].append(row['  Stalk Width'])
    data['root_width']['RT'].append(row['    Left root Width'])

    # Total root count is the sum over the four W1..W4 columns.
    total_root_count = (
        row['  W1 roots'] + row['  W2 roots'] + row['  W3 roots'] + row['  W4 roots']
    )
    data['root_count']['RT'].append(total_root_count)

    # Spread width is the left and right root lengths plus the stalk width.
    spread_width = row['  Root Len Left'] + row['    Root Len Right'] + row['  Stalk Width']
    data['spread_width']['RT'].append(spread_width)

    # Angle in degrees from the left-side root length and root height only.
    # NOTE(review): confirm this matches how 'root_angle' is defined in the JSON.
    angle = math.degrees(math.atan2(row['  Root Len Left'], row["  Root Height Left"]))
    data['angle']['RT'].append(angle)


In [54]:
import numpy as np
from scipy.stats import zscore, pearsonr

# A pair is treated as an outlier, and excluded from the correlation, when the
# z-score magnitude of either measurement reaches this value.
Z_THRESHOLD = 2

for key in data:
    # Convert lists to numpy arrays
    arr_ours = np.array(data[key]['ours'])
    arr_rt = np.array(data[key]['RT'])

    # Compute z-scores of each measurement set independently
    z_ours = zscore(arr_ours)
    z_rt = zscore(arr_rt)

    # Keep only pairs with |z| < Z_THRESHOLD in both sets. NaN z-scores
    # compare False, so such pairs are dropped as well.
    mask = (np.abs(z_ours) < Z_THRESHOLD) & (np.abs(z_rt) < Z_THRESHOLD)

    # Apply the same mask to both arrays so the pairs stay aligned
    filt_ours = arr_ours[mask]
    filt_rt = arr_rt[mask]

    # pearsonr needs at least two pairs; report and skip instead of raising
    # when the filter leaves fewer than that.
    if filt_ours.size < 2:
        print(f"Pearson correlation for {key}: skipped, only {filt_ours.size} sample(s) left after outlier filtering")
        continue

    # Compute Pearson's r on the filtered data
    r, p = pearsonr(filt_ours, filt_rt)
    print(f"Pearson correlation for {key}: r = {r:.3f}, p = {p:.3f}")



Pearson correlation for marker_width: r = -0.181, p = 0.331
Pearson correlation for stalk_width: r = 0.461, p = 0.009
Pearson correlation for root_height: r = 0.467, p = 0.008
Pearson correlation for root_count: r = 0.506, p = 0.003
Pearson correlation for root_width: r = -0.156, p = 0.420
Pearson correlation for spread_width: r = 0.391, p = 0.030
Pearson correlation for angle: r = 0.288, p = 0.130
