##  threshold
- SB > 0, 
- coverage ratio < 0.1,
- AF > 0.001.

In [1]:
threshold_AF = 0.001
threshold_SB = 0
threshold_COV = 0.1

In [2]:
import pandas as pd

## Part1_ERR_POS

In [3]:
File_for_errPos = '/nfs/research/goldman/zihao/Datas/p2_comp_viridian/Folder_3_mergeINFO/errpos_outputData.txt'

In [4]:
df_err = pd.read_csv(File_for_errPos, sep='\t')

df_err['Flag_SB'] = (df_err['SB'].astype(float) > threshold_SB).astype(int)
df_err['Flag_AF'] = (df_err['AF'].astype(float) > threshold_AF).astype(int)
df_err['Flag_COV'] = (df_err['RATIO'].astype(float) < threshold_COV).astype(int)

df_err

Unnamed: 0,ID,Position,position,nucleotide_martin,nucleotide_origin,label_masked,label_mar,label_ori,label_same,RATIO,AF,SB,Flag_SB,Flag_AF,Flag_COV
0,ERR4590003,3078,3078.0,t,t,0.0,0.0,0.0,1.0,0.735556,0.000000,0.000000e+00,0,0,0
1,ERR4590003,18425,18425.0,c,c,0.0,0.0,0.0,1.0,1.571556,0.000214,0.000000e+00,0,0,0
2,ERR4589177,29616,29616.0,c,c,0.0,0.0,0.0,1.0,0.517866,0.000000,0.000000e+00,0,0,0
3,ERR4589104,17722,17722.0,t,t,0.0,0.0,0.0,1.0,0.950682,0.000595,0.000000e+00,0,0,0
4,ERR4589145,5941,5941.0,a,a,0.0,0.0,0.0,1.0,0.964792,0.115942,1.353000e+03,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44454,ERR4423892,24144,24144.0,g,g,0.0,0.0,0.0,1.0,0.119646,0.000000,2.147484e+09,1,0,0
44455,ERR4423892,28275,28275.0,k,g,0.0,1.0,0.0,2.0,2.891230,0.330510,1.200000e+01,1,1,0
44456,ERR4425059,27008,27008.0,k,t,0.0,1.0,0.0,2.0,1.698294,0.292220,8.000000e+00,1,1,0
44457,SRR21662653,8984,8984.0,g,c,0.0,0.0,0.0,0.0,0.436244,0.290323,0.000000e+00,0,1,0


In [5]:
count_label_1_total = sum(df_err['label_mar'] == 1.0)
count_label_2_total = sum(df_err['label_same'] == 1.0)
count_label_3_total = sum(df_err['label_same'] == 0.0)

total_files = len(df_err)
total_records = count_label_1_total + count_label_2_total + count_label_3_total

print('================================= information =================================')
print("Martin version masked:", count_label_1_total, ', Total number:', (total_files))
print("Same nucleotide type:", count_label_2_total, ', Total number:', (total_files))
print("Different nucleotide type:", count_label_3_total, ', Total number:', (total_files))
print('Total records: ', total_records)
print('================================= percentage =================================')
print("Martin version masked:", round(count_label_1_total / total_records * 100, 3), '%')
print("Same nucleotide type:", round(count_label_2_total / total_records * 100, 3), '%')
print("Different nucleotide type:", round(count_label_3_total / total_records * 100, 3), '%')

Martin version masked: 11531 , Total number: 44459
Same nucleotide type: 19375 , Total number: 44459
Different nucleotide type: 13085 , Total number: 44459
Total records:  43991
Martin version masked: 26.212 %
Same nucleotide type: 44.043 %
Different nucleotide type: 29.745 %


# 1. PLOT

In [6]:
def format_number(number):
    suffixes = ['', 'K', 'M', 'B', 'T']

    for i in range(len(suffixes)):
        magnitude = number / (1000 ** i)
        if magnitude < 1000:
            if magnitude < 10:
                formatted = f"{magnitude:.1f}"
            else:
                formatted = f"{magnitude:.0f}"
            return f"{formatted}{suffixes[i]}"
    
    return f"{number:.1e}"

In [18]:
# Err pos:
numbers = [19375, 13085, 11531]

formatted_numbers_err = []

for number in numbers:
    formatted_number = format_number(number)
    formatted_numbers_err.append(formatted_number)

print(formatted_numbers_err)

['19K', '13K', '12K']


In [None]:
================================= percentage =================================
Martin version masked: 26.212 %
Same nucleotide type: 44.043 %
Different nucleotide type: 29.745 %

In [19]:
import pandas as pd
import numpy as np

# creat DataFrame
data_err = {'Category': ['Same nucleotide',
                        'Different nucleotides',
                        'Viridian masked'],
        'Percentage': [44.043, 29.745, 26.212],
           'Raw_number':formatted_numbers_err}
df_err_plot = pd.DataFrame(data_err)

In [20]:
colors = ['#D9BFCB0', '#49998B', '#C1BEE3']

In [21]:
import plotly.graph_objects as go

fig = go.Figure(go.Pie(
    labels=df_err_plot['Category'],
    values=df_err_plot['Percentage'],
    hole=0.4,
    text=df_err_plot['Category']+': <br>' + df_err_plot['Raw_number'].astype(str),
    textinfo='percent+text',
    marker={'colors': colors},
    hovertemplate='%{label}<br>Current percentage: %{percent:.1%}',
    textfont={'size': 15},  # 设置文本的字体大小
    texttemplate='%{text} (%{percent:.1%})'  # 设置文本显示的格式
))

# Add titles for subplots
fig.update_layout(
    title="Comparison of assembled and sequenced sequences regarding MAPLE marker error positions"
)

fig.show()

# 2. For Venn chart

In [12]:
# For diff
df_err_diff = df_err[df_err['label_same'] == 1.0].copy()

df_err_diff['ID_POS'] = df_err_diff['ID'].astype(str) + "_" + df_err_diff['Position'].astype(str)

df_err_diff['Flag_AF'] = df_err_diff['AF'] > threshold_AF
df_err_diff['Flag_SB'] = df_err_diff['SB'] > threshold_SB
df_err_diff['Flag_COV'] = df_err_diff['RATIO'] < threshold_COV

df_err_diff = df_err_diff[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_diff.to_csv("/nfs/research/goldman/zihao/code/EBI_INTER/A_Datas/For_COMP/COMP_ERR_diff.txt", 
                   sep='\t', index=False)
df_err_diff

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
0,ERR4590003_3078,False,False,False
1,ERR4590003_18425,False,False,False
2,ERR4589177_29616,False,False,False
3,ERR4589104_17722,False,False,False
4,ERR4589145_5941,True,True,False
...,...,...,...,...
44445,SRR19929120_29543,False,False,True
44448,SRR21135713_1346,True,False,False
44449,ERR4461486_28546,True,False,True
44450,ERR4424721_27578,True,False,True


In [13]:
# For same
df_err_same = df_err[df_err['label_same'] == 0.0].copy()

df_err_same['ID_POS'] = df_err_same['ID'].astype(str) + "_" + df_err_same['Position'].astype(str)

df_err_same['Flag_AF'] = df_err_same['AF'] > threshold_AF
df_err_same['Flag_SB'] = df_err_same['SB'] > threshold_SB
df_err_same['Flag_COV'] = df_err_same['RATIO'] < threshold_COV

df_err_same = df_err_same[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_same.to_csv("/nfs/research/goldman/zihao/code/EBI_INTER/A_Datas/For_COMP/COMP_ERR_same.txt", 
                   sep='\t', index=False)
df_err_same

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
10,ERR4437926_27784,True,True,True
13,ERR4589830_29015,True,True,False
19,ERR4637124_10999,True,True,True
21,SRR23585665_1871,True,True,False
27,ERR4640487_10764,True,True,True
...,...,...,...,...
44436,SRR23594517_5260,True,True,False
44441,SRR23599857_1871,True,True,False
44446,SRR20057108_28254,True,True,False
44457,SRR21662653_8984,True,False,False


In [14]:
# For masked
df_err_masked = df_err[df_err['label_mar'] == 1.0].copy()

df_err_masked['ID_POS'] = df_err_masked['ID'].astype(str) + "_" + df_err_masked['Position'].astype(str)

df_err_masked['Flag_AF'] = df_err_masked['AF'] > threshold_AF
df_err_masked['Flag_SB'] = df_err_masked['SB'] > threshold_SB
df_err_masked['Flag_COV'] = df_err_masked['RATIO'] < threshold_COV

df_err_masked = df_err_masked[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_masked.to_csv("/nfs/research/goldman/zihao/code/EBI_INTER/A_Datas/For_COMP/COMP_ERR_masked.txt", 
                     sep='\t', index=False)
df_err_masked

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
11,ERR4460144_7394,True,True,False
12,ERR4589830_26655,True,True,False
15,ERR4589153_28964,True,False,True
16,ERR4589157_11105,True,True,False
20,ERR4461112_25739,True,False,True
...,...,...,...,...
44451,ERR4437827_6729,True,True,False
44452,ERR4423816_8888,True,True,True
44453,ERR4423816_23895,True,False,True
44455,ERR4423892_28275,True,True,False
