##  threshold
- SB > 0, 
- coverage ratio < 0.1,
- AF > 0.001.

In [1]:
threshold_AF = 0.001
threshold_SB = 0
threshold_COV = 0.1

In [2]:
import pandas as pd

## Part1_ERR_POS

In [3]:
File_for_errPos = '/nfs/research/goldman/zihao/Datas/p2_comp_viridian/Folder_3_mergeINFO/errpos_outputData.txt'

In [4]:
df_err = pd.read_csv(File_for_errPos, sep='\t')

df_err['Flag_SB'] = (df_err['SB'].astype(float) > threshold_SB).astype(int)
df_err['Flag_AF'] = (df_err['AF'].astype(float) > threshold_AF).astype(int)
df_err['Flag_COV'] = (df_err['RATIO'].astype(float) < threshold_COV).astype(int)

df_err

Unnamed: 0,ID,Position,position,nucleotide_martin,nucleotide_origin,label_masked,label_mar,label_ori,label_same,RATIO,AF,SB,Flag_SB,Flag_AF,Flag_COV
0,ERR4590003,3078,3078.0,t,t,0.0,0.0,0.0,1.0,0.735556,0.000000,0.000000e+00,0,0,0
1,ERR4590003,18425,18425.0,c,c,0.0,0.0,0.0,1.0,1.571556,0.000214,0.000000e+00,0,0,0
2,ERR4589145,5941,5941.0,a,a,0.0,0.0,0.0,1.0,0.964792,0.115942,1.353000e+03,1,1,0
3,ERR4589114,21715,21715.0,g,g,0.0,0.0,0.0,1.0,0.431545,0.000000,0.000000e+00,0,0,0
4,ERR4589835,18436,18436.0,c,c,0.0,0.0,0.0,1.0,1.291376,0.005394,0.000000e+00,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17828,ERR4643534,28170,28170.0,a,a,0.0,0.0,0.0,1.0,1.304612,0.204040,2.147484e+09,1,1,0
17829,ERR4461486,28546,28546.0,a,a,0.0,0.0,0.0,1.0,0.005309,0.004789,0.000000e+00,0,1,1
17830,ERR4424721,27578,27578.0,t,t,0.0,0.0,0.0,1.0,0.007433,0.004512,0.000000e+00,0,1,1
17831,ERR4423181,1733,1733.0,g,g,0.0,0.0,0.0,1.0,0.002928,0.000000,0.000000e+00,0,0,1


In [5]:
count_label_1_total = sum(df_err['label_mar'] == 1.0)
count_label_2_total = sum(df_err['label_same'] == 1.0)
count_label_3_total = sum(df_err['label_same'] == 0.0)

total_files = len(df_err)
total_records = count_label_1_total + count_label_2_total + count_label_3_total

print('================================= information =================================')
print("Martin version masked:", count_label_1_total, ', Total number:', (total_files))
print("Same nucleotide type:", count_label_2_total, ', Total number:', (total_files))
print("Different nucleotide type:", count_label_3_total, ', Total number:', (total_files))
print('Total records: ', total_records)
print('================================= percentage =================================')
print("Martin version masked:", round(count_label_1_total / total_records * 100, 3), '%')
print("Same nucleotide type:", round(count_label_2_total / total_records * 100, 3), '%')
print("Different nucleotide type:", round(count_label_3_total / total_records * 100, 3), '%')

Martin version masked: 23 , Total number: 17833
Same nucleotide type: 15343 , Total number: 17833
Different nucleotide type: 1461 , Total number: 17833
Total records:  16827
Martin version masked: 0.137 %
Same nucleotide type: 91.181 %
Different nucleotide type: 8.682 %


# 1. PLOT

In [6]:
def format_number(number):
    suffixes = ['', 'K', 'M', 'B', 'T']

    for i in range(len(suffixes)):
        magnitude = number / (1000 ** i)
        if magnitude < 1000:
            if magnitude < 10:
                formatted = f"{magnitude:.1f}"
            else:
                formatted = f"{magnitude:.0f}"
            return f"{formatted}{suffixes[i]}"
    
    return f"{number:.1e}"

In [7]:
# Err pos:
numbers = [16933, 12715, 10764]

formatted_numbers_err = []

for number in numbers:
    formatted_number = format_number(number)
    formatted_numbers_err.append(formatted_number)

print(formatted_numbers_err)

['17K', '13K', '11K']


In [8]:
import pandas as pd
import numpy as np

# creat DataFrame
data_err = {'Category': ['Same nucleotide',
                        'Different nucleotides',
                        'Viridian masked'],
        'Percentage': [41.901, 31.463, 26.636],
           'Raw_number':formatted_numbers_err}
df_err_plot = pd.DataFrame(data_err)

In [16]:
colors = ['#D9BFCB0', '#49998B', '#C1BEE3']

In [10]:
# colors = ['rgb(128, 0, 0)', 'rgb(0, 255, 0)', 'rgb(0, 0, 255)']

In [17]:
import plotly.graph_objects as go

fig = go.Figure(go.Pie(
    labels=df_err_plot['Category'],
    values=df_err_plot['Percentage'],
    hole=0.4,
    text=df_err_plot['Category']+': <br>' + df_err_plot['Raw_number'].astype(str),
    textinfo='percent+text',
    marker={'colors': colors},
    hovertemplate='%{label}<br>Current percentage: %{percent:.1%}',
    textfont={'size': 15},  # 设置文本的字体大小
    texttemplate='%{text} (%{percent:.1%})'  # 设置文本显示的格式
))

# Add titles for subplots
fig.update_layout(
    title="Comparison of assembled and sequenced sequences regarding MAPLE marker error positions"
)

fig.show()

# 2. For Venn chart

In [13]:
# For diff
df_err_diff = df_err[df_err['label_same'] == 1.0].copy()

df_err_diff['ID_POS'] = df_err_diff['ID'].astype(str) + "_" + df_err_diff['Position'].astype(str)

df_err_diff['Flag_AF'] = df_err_diff['AF'] > threshold_AF
df_err_diff['Flag_SB'] = df_err_diff['SB'] > threshold_SB
df_err_diff['Flag_COV'] = df_err_diff['RATIO'] < threshold_COV

df_err_diff = df_err_diff[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_diff.to_csv("/nfs/research/goldman/zihao/code/EBI_INTER/A_Datas/For_COMP/COMP_ERR_diff.txt", 
                   sep='\t', index=False)
df_err_diff

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
0,ERR4590003_3078,False,False,False
1,ERR4590003_18425,False,False,False
2,ERR4589145_5941,True,True,False
3,ERR4589114_21715,False,False,False
4,ERR4589835_18436,True,False,False
...,...,...,...,...
17828,ERR4643534_28170,True,True,False
17829,ERR4461486_28546,True,False,True
17830,ERR4424721_27578,True,False,True
17831,ERR4423181_1733,False,False,True


In [14]:
# For same
df_err_same = df_err[df_err['label_same'] == 0.0].copy()

df_err_same['ID_POS'] = df_err_same['ID'].astype(str) + "_" + df_err_same['Position'].astype(str)

df_err_same['Flag_AF'] = df_err_same['AF'] > threshold_AF
df_err_same['Flag_SB'] = df_err_same['SB'] > threshold_SB
df_err_same['Flag_COV'] = df_err_same['RATIO'] < threshold_COV

df_err_same = df_err_same[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_same.to_csv("/nfs/research/goldman/zihao/code/EBI_INTER/A_Datas/For_COMP/COMP_ERR_same.txt", 
                   sep='\t', index=False)
df_err_same

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
25,ERR7904349_8318,True,True,False
42,SRR19473500_23854,True,False,True
53,ERR7878099_8835,False,False,True
78,ERR7874433_8835,True,True,False
92,ERR7852330_3357,True,True,False
...,...,...,...,...
17706,ERR4891263_28362,False,False,True
17707,ERR4891263_28400,False,False,True
17718,ERR4876783_27467,True,True,False
17783,SRR19473987_5260,True,True,True


In [15]:
# For masked
df_err_masked = df_err[df_err['label_mar'] == 1.0].copy()

df_err_masked['ID_POS'] = df_err_masked['ID'].astype(str) + "_" + df_err_masked['Position'].astype(str)

df_err_masked['Flag_AF'] = df_err_masked['AF'] > threshold_AF
df_err_masked['Flag_SB'] = df_err_masked['SB'] > threshold_SB
df_err_masked['Flag_COV'] = df_err_masked['RATIO'] < threshold_COV

df_err_masked = df_err_masked[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_masked.to_csv("/nfs/research/goldman/zihao/code/EBI_INTER/A_Datas/For_COMP/COMP_ERR_masked.txt", 
                     sep='\t', index=False)
df_err_masked

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
1340,SRR20881680_21212,True,False,False
2926,SRR21024191_28254,False,False,False
3278,SRR21023942_28254,False,False,False
3549,SRR22549436_28254,False,False,False
4158,SRR22577691_28254,False,False,False
4229,SRR22470641_28254,False,False,False
4293,SRR21704252_28254,False,False,False
4308,SRR22470411_28254,False,False,False
5205,SRR21682246_28254,False,False,False
5542,SRR21508619_28254,False,False,False
