# 1 Prepare functions

## 1.1 Load Python library functions

In [1]:
import numpy as np
import pandas as pd
import cv2
import math, os, json
import matplotlib.pyplot as plt
from PIL import Image
from glob import glob

## 1.2 Make custom functions

In [2]:
def get_scaler(img_arr,img_height,img_width):
    """
    Measure the red scale bar of an image in pixels. The bar corresponds to a
    physical length of 500 microns.

    Only the lower-right quarter of the image is searched. A pixel counts as red
    when channel 0 >= 240, channel 1 <= 20 and channel 2 <= 20
    (presumably R, G, B order -- confirm against how the image is loaded).

    img_arr: image as an array indexed [row, column, channel]
    img_height: image height, i.e. the number of pixels in the vertical direction
    img_width: image width, i.e. the number of pixels in the horizontal direction
    return: the largest number of red pixels found in a single row of that quarter
    """
    quarter = img_arr[img_height // 2:, img_width // 2:, :]
    ch0, ch1, ch2 = quarter[..., 0], quarter[..., 1], quarter[..., 2]
    is_red = (ch0 >= 240) & (ch1 <= 20) & (ch2 <= 20)
    # red pixels are counted per row; the row with the most is taken as the bar
    red_per_row = np.sum(is_red, axis=1)
    return np.max(red_per_row)

In [3]:
def get_center(cnt):
    """
    Get the center point and radius of the minimum enclosing circle of a shape.

    cnt: contour points, passed unchanged to cv2.minEnclosingCircle
    return: ((x, y), radius), with all three values truncated to int
    """
    circle_center, circle_radius = cv2.minEnclosingCircle(cnt)
    cx, cy = circle_center
    return (int(cx), int(cy)), int(circle_radius)

In [4]:
def vb_initial_process(points, shape_type=None):
    """
    Get initial data on one vascular bundle: its pixel area and its center point.

    points: contour points of an individual vascular bundle. For a "polygon" these
            are the outline points; for a "circle" the first point is the center
            and the second point lies on the circumference.
    shape_type: "polygon" or "circle". When omitted, the module-level variable
                `shape_type` is used, which keeps existing calls of the form
                vb_initial_process(points) working.
    return: (area_pixel, vb_center)
    raises: ValueError if the shape type is neither "polygon" nor "circle"
    """
    if shape_type is None:
        # Backward compatibility: older callers set a global `shape_type`
        # before calling this function with the points only.
        shape_type = globals().get("shape_type")

    if shape_type == "polygon":
        vb_center = get_center(points)[0] # center of the minimum enclosing circle
        area_pixel = cv2.contourArea(points) # area enclosed by the outline
    elif shape_type == "circle":
        vb_center, vb_round = points[0], points[1]
        r = math.dist(vb_center, vb_round) # radius = center to circumference point
        area_pixel = round(math.pi * r ** 2, 1)
    else:
        # Previously this fell through and failed with an unrelated
        # UnboundLocalError on the return statement.
        raise ValueError("unsupported shape_type for a vascular bundle: %r" % (shape_type,))

    return area_pixel, vb_center

In [5]:
def vb_more_process(area_pixel_union,vb_center_union,stem_center):
    """
    Summarise a group of vascular bundles.

    area_pixel_union: pixel areas of the vascular bundles
    vb_center_union: center points of the vascular bundles
    stem_center: center point of the stem
    return: a 7-tuple of
            (number of bundles, total area, mean area,
             list of distances from each bundle center to the stem center,
             mean distance, maximum distance, minimum distance),
            all in pixel units
    """
    # area statistics
    bundle_count = len(area_pixel_union)
    total_area = np.sum(area_pixel_union)
    mean_area = np.mean(area_pixel_union)

    # distance of every bundle center to the stem center
    dists = [math.dist(center, stem_center) for center in vb_center_union]
    mean_dist = np.mean(dists)
    farthest = np.max(dists)
    nearest = np.min(dists)

    return (bundle_count, total_area, mean_area, dists,
            mean_dist, farthest, nearest)

In [6]:
def write2excel(df,savepath,sheet_name):
    """
    Write a DataFrame as an additional sheet into an existing excel file.

    The workbook is opened in append mode, so the sheets it already contains are
    kept. If a sheet called `sheet_name` already exists, the data goes to a new
    sheet whose name is chosen by the engine (if_sheet_exists="new").

    df: DataFrame to write
    savepath: filepath of the existing excel file
    sheet_name: sheet name
    """
    # Assigning `writer.book` and calling `writer.save()` are no longer supported
    # by current pandas; append mode plus the context manager loads the existing
    # workbook and saves it exactly once on exit.
    with pd.ExcelWriter(savepath, engine="openpyxl", mode="a",
                        if_sheet_exists="new") as writer:
        df.to_excel(writer, sheet_name=sheet_name)

# 2 Process data

In [None]:
wd = "your/working/directory/" # working directory; keep the trailing "/" because paths are built by string concatenation
# NOTE(review): this pattern only matches json files located directly in wd, while the
# cells below build sample names from the last two path components ("folder/file") --
# confirm that the pattern matches the intended folder layout.
jsons = glob(wd + "*.json") # find all json files
print(len(jsons)) # number of json files found
jsons[:5] # view the first five json files

## 2.1 Check the quality of label data

In [None]:
# There should be exactly one "out" label, which represents the stem, in each image;
# the number of "in" labels is collected the same way.
# Samples that deviate need to be relabeled.

samples = [] # sample names ("folder/file")
out_ns = []  # number of "out" labels per sample
in_ns = []   # number of "in" labels per sample

for json_file in jsons:
    filename = os.path.splitext(json_file)[0]
    sample_name = filename.split("/")[-2] + "/" + filename.split("/")[-1]
    samples.append(sample_name)
    print(sample_name)

    with open(json_file) as f:
        data = json.load(f) # read the json file

    df = pd.DataFrame(data['shapes']) # transform json format into DataFrame format
    label_counts = df["label"].value_counts()
    print(label_counts) # the categories of labels, and the number of each category

    # Look the counts up on the Series itself: the name of the value_counts()
    # result differs between pandas versions, so addressing it as a DataFrame
    # column called "label" is not reliable.
    out_n = label_counts.get("out", 0)
    in_n = label_counts.get("in", 0)

    out_ns.append(out_n)
    in_ns.append(in_n)
    print("out_n:",out_n)
    print("in_n:",in_n)
    print("*************")

In [32]:
# collect the check result: one row per sample with its "out" and "in" label counts
label_columns = ["samples", "out_n", "in_n"]
label_rows = list(zip(samples, out_ns, in_ns))
label_tf = pd.DataFrame(label_rows, columns=label_columns)
label_tf.head(2)

Unnamed: 0,samples,out_n,in_n
0,HN/sc90,1,1
1,HN/sc86,1,1


In [33]:
# how many samples have each number of "out" labels
label_tf["out_n"].value_counts()

1    200
Name: out_n, dtype: int64

In [34]:
# how many samples have each number of "in" labels
label_tf["in_n"].value_counts()

1    200
Name: in_n, dtype: int64

In [None]:
# get the samples whose number of "out" labels is not exactly one (none or several)
label_tf[label_tf["out_n"] != 1] # these samples then have to be relabeled manually

In [36]:
# get the samples whose number of "in" labels is not exactly one (none or several)
label_tf[label_tf["in_n"] != 1] # these samples then have to be relabeled manually

Unnamed: 0,samples,out_n,in_n


## 2.2 Export intermediate data

In [None]:
all_res = {} # results of all samples, keyed by sample name

# NOTE: this cell is deliberately kept as flat top-level code. `shape_type` has to be
# a module-level name because vb_initial_process(points) reads it as a global.
for json_file in jsons:
    sample_res = {} # traits of the current sample
    filename = os.path.splitext(json_file)[0]
    sitename = filename.split("/")[-2]
    kindname = filename.split("/")[-1]
    sample_name = sitename + "/" + kindname # get sample name
    print(sample_name)

    # firstly, get the scale length
    # The image is looked up next to the json file (same base name), trying the
    # extensions in this order. Failing loudly here prevents the image of the
    # previous sample from being reused silently.
    img_file = None
    for ext in (".tif", ".bmp", ".jpg"):
        if os.path.exists(filename + ext):
            img_file = filename + ext
            break
    if img_file is None:
        raise FileNotFoundError("no .tif/.bmp/.jpg image found for sample " + sample_name)

    # load the image; the file handle is released as soon as the pixels are copied
    with Image.open(img_file) as img:
        img_h = img.height
        img_w = img.width
        img_arr = np.array(img)
    # get the scale length in pixels
    scaler = get_scaler(img_arr,img_h,img_w)
    print("比例尺像素长度：",scaler)
    sample_res["比例尺像素长度"] = scaler

    # secondly, process the json file to get traits data
    with open(json_file) as f:
        data = json.loads(f.read()) # read the json file

    # create lists to save traits datasets
    big_area_pixel_union, small_area_pixel_union = [], []
    big_center_union, small_center_union = [], []
    hole_area_pixel_union = []
    # reset per sample so that values of a previous sample can never leak into this one
    stem_center = stem_radius_pixel = stem_perimeter_pixel = stem_area_pixel = None
    inner_points = inner_center = inner_radius_pixel = inner_area_pixel = None

    for shape in data['shapes']:
        label = shape['label']
        shape_type = shape['shape_type']
        points = np.array(shape['points']).astype(int)

        # 2.1 data extraction
        if label == "hole":
            # 2.1.1 get the data related to holes
            hole_area_pixel = cv2.contourArea(points)
            print("hole_area_pixel:",hole_area_pixel)
            hole_area_pixel_union.append(hole_area_pixel)

        elif label == "out":
            # 2.1.2 get the data related to stems
            if shape_type == "polygon":
                stem_center, stem_radius_pixel = get_center(points)
                stem_perimeter_pixel = cv2.arcLength(points, True)
                stem_area_pixel = cv2.contourArea(points)

        elif label == "wgs":
            # 2.1.3 get the data related to vascular bundles: large vascular bundles
            area_pixel, vb_center = vb_initial_process(points)
            big_area_pixel_union.append(area_pixel)
            big_center_union.append(vb_center)

        elif label == "small wgs" or label == "smwgs":
            # small vascular bundles
            area_pixel, vb_center = vb_initial_process(points)
            small_area_pixel_union.append(area_pixel)
            small_center_union.append(vb_center)

        elif label == "in":
            # 2.1.4 get the data related to peels
            if shape_type == "polygon":
                inner_center, inner_radius_pixel = get_center(points)
                inner_area_pixel = cv2.contourArea(points)
                inner_points = points

    if stem_center is None:
        raise ValueError('sample %s has no polygon labeled "out"' % sample_name)
    if inner_points is None:
        raise ValueError('sample %s has no polygon labeled "in"' % sample_name)

    # Distances of the inner-ring outline points to the stem center. Computed after
    # the loop so it does not depend on "out" preceding "in" in the json file.
    # These values are not stored in sample_res.
    pi_dist_pixel_union = [math.dist(x,stem_center) for x in inner_points]
    pi_dist_pixel_mean = np.mean(pi_dist_pixel_union)

    # 2.2 data assignment
    # 2.2.1 first-level data assignment
    sample_res["空腔像素面积集"] = hole_area_pixel_union
    sample_res["空腔像素面积"] = np.sum(hole_area_pixel_union)

    sample_res["茎中心"] = stem_center
    sample_res["茎像素半径"] = stem_radius_pixel
    sample_res["茎像素直径"] = stem_radius_pixel * 2
    sample_res["茎像素周长"] = stem_perimeter_pixel
    sample_res["茎像素面积"] = stem_area_pixel
    sample_res["去腔茎像素面积"] = sample_res["茎像素面积"] - sample_res["空腔像素面积"]

    sample_res["内环像素半径"] = inner_radius_pixel
    sample_res["内环像素面积"] = inner_area_pixel
    sample_res["皮像素厚度"] = sample_res["茎像素半径"] - sample_res["内环像素半径"]
    sample_res["皮像素面积"] = sample_res["茎像素面积"] - sample_res["内环像素面积"]

    sample_res["大维管束像素面积集"] = big_area_pixel_union
    sample_res["大维管束中心集"] = big_center_union

    sample_res["小维管束像素面积集"] = small_area_pixel_union
    sample_res["小维管束中心集"] = small_center_union

    # large vascular bundles: the minimum distance (index 6) is not stored
    vb_more_res = vb_more_process(big_area_pixel_union,big_center_union,stem_center)
    sample_res["大维管束数目"]  = vb_more_res[0]
    sample_res["大维管束像素面积和"]  = vb_more_res[1]
    sample_res["大维管束平均像素面积"] = vb_more_res[2]
    sample_res["大维管束离心像素距集"] = vb_more_res[3]
    sample_res["大维管束离心像素距"] = vb_more_res[4]
    sample_res["大维管束最大离心像素距"] = vb_more_res[5]

    # small vascular bundles: the maximum distance (index 5) is not stored
    vb_more_res = vb_more_process(small_area_pixel_union,small_center_union,stem_center)
    sample_res["小维管束数目"]  = vb_more_res[0]
    sample_res["小维管束像素面积和"]  = vb_more_res[1]
    sample_res["小维管束平均像素面积"] = vb_more_res[2]
    sample_res["小维管束离心像素距集"] = vb_more_res[3]
    sample_res["小维管束离心像素距"] = vb_more_res[4]
    sample_res["小维管束最小离心像素距"] = vb_more_res[6]

    sample_res["维管束总数目"] = sample_res["大维管束数目"] + sample_res["小维管束数目"]
    sample_res["维管束像素面积总和"] = sample_res["大维管束像素面积和"] + sample_res["小维管束像素面积和"]

    # 2.2.2 second-level data assignment (ratios and percentages, unit-free)
    sample_res["空腔面积占茎百分比"] = 100 * sample_res["空腔像素面积"] / sample_res["茎像素面积"]

    sample_res["大维管束数目占比"] = 100 * sample_res["大维管束数目"] / sample_res["维管束总数目"]
    sample_res["大小维管束平均面积比值"] = sample_res["大维管束平均像素面积"] / sample_res["小维管束平均像素面积"]
    sample_res["大维管束面积占比"] = 100 * sample_res["大维管束像素面积和"] / sample_res["维管束像素面积总和"]

    sample_res["大维管束面积占茎百分比"] = 100 * sample_res["大维管束像素面积和"] / sample_res["去腔茎像素面积"]
    sample_res["小维管束面积占茎百分比"] = 100 * sample_res["小维管束像素面积和"] / sample_res["去腔茎像素面积"]
    sample_res["大维管束面积占内环百分比"] = 100 * sample_res["大维管束像素面积和"] / (sample_res["内环像素面积"] - sample_res["空腔像素面积"])
    sample_res["小维管束面积占皮百分比"] = 100 * sample_res["小维管束像素面积和"] / sample_res["皮像素面积"]

    sample_res["大维管束相对离心距"] = sample_res["大维管束离心像素距"] / sample_res["茎像素半径"]
    sample_res["小维管束相对离心距"] = sample_res["小维管束离心像素距"] / sample_res["茎像素半径"]

    all_res[sample_name] = sample_res

In [79]:
# view the intermediate data
all_res_df = pd.DataFrame(all_res).T # transpose so that each row is a sample and each column a trait
all_res_df.head(2)

Unnamed: 0,比例尺像素长度,茎中心,茎像素半径_外接圆法,茎像素半径_距离平均法,内环像素半径_外接圆法,内环像素半径_距离平均法,大维管束最大离心像素距,小维管束最小离心像素距,内环像素半径_维管束距离法,内环像素半径_大维管束距离
HN/sc90,259,"(1887, 1384)",963,921.917255,916,845.707567,894.767568,805.506673,850.13712,894.767568
HN/sc86,159,"(1790, 1345)",839,820.814627,796,753.68671,780.387724,740.184436,760.28608,780.387724


In [80]:
# list the column (trait) names of the intermediate data
all_res_df.columns

Index(['比例尺像素长度', '茎中心', '茎像素半径_外接圆法', '茎像素半径_距离平均法', '内环像素半径_外接圆法',
       '内环像素半径_距离平均法', '大维管束最大离心像素距', '小维管束最小离心像素距', '内环像素半径_维管束距离法',
       '内环像素半径_大维管束距离'],
      dtype='object')

In [70]:
# save the intermediate data
savepath = wd + "my.xlsx" # this workbook must already exist; write2excel writes into an existing file
sheet_name = "中间数据"
write2excel(all_res_df,savepath,sheet_name)

## 2.3 Process intermediate data to form final data

In [82]:
# Split the trait columns into pixel-based ones (to be converted with the scale)
# and the remaining ones. Columns whose name contains "集" hold per-item
# collections and are left out of both groups.
all_res_cols = all_res_df.columns
is_pixel_col = all_res_cols.str.contains("像素")
is_set_col = all_res_cols.str.contains("集")

pixel_cols = all_res_cols[is_pixel_col & ~is_set_col].tolist()
remain_cols = all_res_cols[~is_pixel_col & ~is_set_col].tolist()

In [83]:
# Take an explicit copy: a column is added below, and adding it to a slice of
# all_res_df triggers pandas' SettingWithCopyWarning.
pixel_df = all_res_df[pixel_cols].copy() # extract pixel-related traits
remain_df = all_res_df[remain_cols] # extract non-pixel-related data
# scaling = physical length per pixel; 0.5 is the length of the scale bar
# (presumably in mm, i.e. the 500 micron bar measured by get_scaler)
pixel_df["缩放比例"] = 0.5 / pixel_df["比例尺像素长度"] # calculate the scaling

pixel_df.head(2)

Unnamed: 0,比例尺像素长度,茎像素半径_外接圆法,茎像素半径_距离平均法,内环像素半径_外接圆法,内环像素半径_距离平均法,大维管束最大离心像素距,小维管束最小离心像素距,内环像素半径_维管束距离法,内环像素半径_大维管束距离,缩放比例
HN/sc90,259,963,921.917255,916,845.707567,894.767568,805.506673,850.13712,894.767568,0.001931
HN/sc86,159,839,820.814627,796,753.68671,780.387724,740.184436,760.28608,780.387724,0.003145


In [84]:
# scale transformation: multiply by the scale for the length variables,
# and multiply by the square of the scale for the area variables
physics_df = pd.DataFrame()
for col in pixel_cols:
    # The scale-bar length itself is not a trait. It is skipped by name rather
    # than by position, so the result does not depend on the column order.
    if col == "比例尺像素长度":
        continue
    col_name = col.replace('像素', '')
    print(col_name)
    if "面积" in col_name:
        physics_df[col_name] = pixel_df[col] * (pixel_df["缩放比例"] ** 2)
    else:
        physics_df[col_name] = pixel_df[col] * pixel_df["缩放比例"]

茎半径_外接圆法
茎半径_距离平均法
内环半径_外接圆法
内环半径_距离平均法
大维管束最大离心距
小维管束最小离心距
内环半径_维管束距离法
内环半径_大维管束距离


In [74]:
# merge the converted data with the remaining (non-pixel) data, column-wise
output = pd.concat([physics_df, remain_df], axis=1)

# compute the density traits: number of bundles per unit of length or area
small_vb_num = output["小维管束数目"]
large_vb_num = output["大维管束数目"]
inner_area_without_hole = output["内环面积"] - output["空腔面积"]
output["小维管束“周长密度”"] = small_vb_num / output["茎周长"]
output["小维管束“面积密度”"] = small_vb_num / output["皮面积"]
output["大维管束“面积密度”"] = large_vb_num / inner_area_without_hole

output.head(2)

Unnamed: 0,空腔面积,茎半径,茎直径,茎周长,茎面积,去腔茎面积,内环半径,内环面积,去腔内环面积,皮厚度,...,大维管束面积占比,大维管束面积占茎百分比,小维管束面积占茎百分比,大维管束面积占内环百分比,小维管束面积占皮百分比,大维管束相对离心距,小维管束相对离心距,小维管束“周长密度”,小维管束“面积密度”,大维管束“面积密度”
HN/sc90,0.0,1.859073,3.718147,11.155409,9.845748,9.845748,1.76834,8.345899,8.345899,0.090734,...,51.274179,4.68768,4.454699,5.530107,29.242842,0.733475,0.911381,6.812839,50.671765,5.751328
HN/sc86,0.895969,2.638365,5.27673,16.1915,20.763167,19.867198,2.503145,17.581489,16.68552,0.13522,...,67.490607,5.541797,2.669415,6.598534,16.6685,0.73526,0.940081,6.052558,30.801358,4.734644


In [75]:
# read the traits list from the workbook; its "性状" column defines which
# columns are exported and in which order
traits = pd.read_excel(savepath, sheet_name="导出性状表")
cols_order = traits["性状"].tolist()
output = output.loc[:, cols_order]
output.head(2)

Unnamed: 0,大维管束数目,小维管束数目,维管束总数目,大维管束面积和,小维管束面积和,维管束面积总和,大维管束平均面积,小维管束平均面积,大维管束离心距,小维管束离心距,...,大维管束面积占茎百分比,大维管束面积占内环百分比,小维管束面积占茎百分比,小维管束面积占皮百分比,大维管束相对离心距,小维管束相对离心距,小维管束“周长密度”,小维管束“面积密度”,大维管束“面积密度”,空腔面积占茎百分比
HN/sc90,48,76,124,0.461537,0.438598,0.900136,0.009615,0.005771,1.363585,1.694323,...,4.68768,5.530107,4.454699,29.242842,0.733475,0.911381,6.812839,50.671765,5.751328,0.0
HN/sc86,79,98,177,1.101,0.530338,1.631338,0.013937,0.005412,1.939884,2.480277,...,5.541797,6.598534,2.669415,16.6685,0.73526,0.940081,6.052558,30.801358,4.734644,4.315186


In [76]:
# save final data
savepath = wd + "my.xlsx" # same workbook as the intermediate data; it must already exist
sheet_name = "最终数据"
write2excel(output,savepath,sheet_name)