##  threshold
- SB > 0, 
- coverage ratio < 0.1,
- AF > 0.001.

In [1]:
threshold_AF = 0.001
threshold_SB = 0
threshold_COV = 0.1

In [2]:
import pandas as pd

## Part1_ERR_POS

In [3]:
df_err = pd.read_csv("/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/err_pos_output_data_june.txt",
           sep='\t')

df_err['Flag_SB'] = (df_err['SB'].astype(float) > threshold_SB).astype(int)

df_err['Flag_AF'] = (df_err['AF'].astype(float) > threshold_AF).astype(int)

df_err['Flag_COV'] = (df_err['RATIO'].astype(float) < threshold_COV).astype(int)

df_err

Unnamed: 0,ID,Position,position,nucleotide_martin,nucleotide_origin,label_masked,label_mar,label_ori,label_same,RATIO,AF,SB,Flag_SB,Flag_AF,Flag_COV
0,ERR4423031,8755,8755.0,m,c,0.0,0.0,0.0,0.0,0.042336,0.450757,5.000000e+00,1,1,1
1,ERR4421647,11812,11812.0,m,a,0.0,0.0,0.0,0.0,0.326993,0.331522,0.000000e+00,0,1,0
2,ERR4421647,16174,16174.0,w,t,0.0,0.0,0.0,0.0,1.012681,0.264922,4.000000e+00,1,1,0
3,ERR4422806,10764,10764.0,a,t,0.0,0.0,0.0,0.0,0.007743,0.343750,7.000000e+01,1,1,1
4,ERR4424882,7071,7071.0,g,t,0.0,0.0,0.0,0.0,2.189053,0.388591,2.147484e+09,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48011,ERR4640670,20714,20714.0,s,c,0.0,0.0,0.0,0.0,0.684224,0.284420,0.000000e+00,0,1,0
48012,ERR4633273,10999,10999.0,t,a,0.0,0.0,0.0,0.0,0.009642,0.461538,2.300000e+01,1,1,1
48013,ERR4459789,26532,26532.0,g,g,0.0,0.0,0.0,1.0,1.617389,0.463502,2.147484e+09,1,1,0
48014,ERR4459789,28717,28717.0,a,a,0.0,0.0,0.0,1.0,1.616577,0.002441,0.000000e+00,0,1,0


In [4]:
# 初始化计数变量
count_label_1_total = sum(df_err['label_mar'] == 1.0)
count_label_2_total = sum(df_err['label_same'] == 1.0)
count_label_3_total = sum(df_err['label_same'] == 0.0)

# 打印总计数
total_files = len(df_err)
total_records = count_label_1_total + count_label_2_total + count_label_3_total

print('================================= information =================================')
print("Martin version masked:", count_label_1_total, ', Total number:', (total_files))
print("Same nucleotide type:", count_label_2_total, ', Total number:', (total_files))
print("Different nucleotide type:", count_label_3_total, ', Total number:', (total_files))
print('Total records: ', total_records)
print('================================= percentage =================================')
print("Martin version masked:", round(count_label_1_total / total_records * 100, 3), '%')
print("Same nucleotide type:", round(count_label_2_total / total_records * 100, 3), '%')
print("Different nucleotide type:", round(count_label_3_total / total_records * 100, 3), '%')

Martin version masked: 3503 , Total number: 48016
Same nucleotide type: 8788 , Total number: 48016
Different nucleotide type: 8732 , Total number: 48016
Total records:  21023
Martin version masked: 16.663 %
Same nucleotide type: 41.802 %
Different nucleotide type: 41.535 %


# 1. PLOT

In [3]:
def format_number(number):
    suffixes = ['', 'K', 'M', 'B', 'T']

    for i in range(len(suffixes)):
        magnitude = number / (1000 ** i)
        if magnitude < 1000:
            if magnitude < 10:
                formatted = f"{magnitude:.1f}"
            else:
                formatted = f"{magnitude:.0f}"
            return f"{formatted}{suffixes[i]}"
    
    return f"{number:.1e}"

In [4]:
# Err pos:
numbers = [8788, 8732, 3503]

formatted_numbers_err = []

for number in numbers:
    formatted_number = format_number(number)
    formatted_numbers_err.append(formatted_number)

print(formatted_numbers_err)

['8.8K', '8.7K', '3.5K']


In [5]:
import pandas as pd
import numpy as np

# creat DataFrame
data_err = {'Category': ['Same nucleotide',
                        'Different nucleotides',
                        'Viridian masked'],
        'Percentage': [41.802, 41.535, 16.663],
           'Raw_number':formatted_numbers_err}
df_err_plot = pd.DataFrame(data_err)

In [6]:
colors = ['rgb(128, 0, 0)', 'rgb(0, 255, 0)', 'rgb(0, 0, 255)']

In [16]:
import plotly.graph_objects as go

fig = go.Figure(go.Pie(
    labels=df_err_plot['Category'],
    values=df_err_plot['Percentage'],
    hole=0.4,
    text=df_err_plot['Category']+': <br>' + df_err_plot['Raw_number'].astype(str),
    textinfo='percent+text',
    marker={'colors': colors},
    hovertemplate='%{label}<br>Current percentage: %{percent:.1%}',
    textfont={'size': 15},  # 设置文本的字体大小
    texttemplate='%{text} (%{percent:.1%})'  # 设置文本显示的格式
))

# Add titles for subplots
fig.update_layout(
    title="Comparison of assembled and sequenced sequences regarding MAPLE marker error positions"
)

fig.show()

# 2. For Venn

In [11]:
# For diff
df_err_diff = df_err[df_err['label_same'] == 1.0].copy()

df_err_diff['ID_POS'] = df_err_diff['ID'].astype(str) + "_" + df_err_diff['Position'].astype(str)

df_err_diff['Flag_AF'] = df_err_diff['AF'] > threshold_AF
df_err_diff['Flag_SB'] = df_err_diff['SB'] > threshold_SB
df_err_diff['Flag_COV'] = df_err_diff['RATIO'] < threshold_COV

df_err_diff = df_err_diff[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_diff.to_csv("/homes/zihao/EBI_INTER/A_Datas/For_COMP/COMP_ERR_diff.txt", sep='\t', index=False)
df_err_diff

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
14,ERR4460173_960,True,True,False
17,ERR4905383_27855,False,False,True
19,ERR4461609_4228,False,False,False
22,ERR4639973_28336,False,False,False
23,ERR4603981_28877,False,False,False
...,...,...,...,...
48009,ERR4460819_27750,False,True,False
48010,ERR4640670_18686,False,True,False
48013,ERR4459789_26532,True,True,False
48014,ERR4459789_28717,True,False,False


In [12]:
# For same
df_err_same = df_err[df_err['label_same'] == 0.0].copy()

df_err_same['ID_POS'] = df_err_same['ID'].astype(str) + "_" + df_err_same['Position'].astype(str)

df_err_same['Flag_AF'] = df_err_same['AF'] > threshold_AF
df_err_same['Flag_SB'] = df_err_same['SB'] > threshold_SB
df_err_same['Flag_COV'] = df_err_same['RATIO'] < threshold_COV

df_err_same = df_err_same[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_same.to_csv("/homes/zihao/EBI_INTER/A_Datas/For_COMP/COMP_ERR_same.txt", sep='\t', index=False)
df_err_same

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
0,ERR4423031_8755,True,True,True
1,ERR4421647_11812,True,False,False
2,ERR4421647_16174,True,True,False
3,ERR4422806_10764,True,True,True
4,ERR4424882_7071,True,True,False
...,...,...,...,...
48004,ERR4637423_26336,True,False,False
48005,ERR4637624_10999,True,True,True
48006,ERR4641300_10999,True,True,True
48011,ERR4640670_20714,True,False,False


In [13]:
# For masked
df_err_masked = df_err[df_err['label_mar'] == 1.0].copy()

df_err_masked['ID_POS'] = df_err_masked['ID'].astype(str) + "_" + df_err_masked['Position'].astype(str)

df_err_masked['Flag_AF'] = df_err_masked['AF'] > threshold_AF
df_err_masked['Flag_SB'] = df_err_masked['SB'] > threshold_SB
df_err_masked['Flag_COV'] = df_err_masked['RATIO'] < threshold_COV

df_err_masked = df_err_masked[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_err_masked.to_csv("/homes/zihao/EBI_INTER/A_Datas/For_COMP/COMP_ERR_masked.txt", sep='\t', index=False)
df_err_masked

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
15,ERR4638391_10764,True,True,True
42,ERR4421447_13502,True,True,True
55,ERR4597585_28394,True,True,True
64,ERR4638460_10764,True,True,True
76,ERR4638344_10999,True,True,True
...,...,...,...,...
47884,ERR4424881_10764,True,True,True
47919,ERR4423996_8888,True,True,True
47944,ERR4460424_21225,True,False,True
47970,ERR4638472_10999,True,True,True


## Part2_ALL_POS

In [14]:
df_all = pd.read_csv("/nfs/research/goldman/zihao/Datas/p1/Part1_2_for_assemble_data/all_sampled_positions.txt",
                     sep='\t', names=['ID', 'Position', 'nucleotide_martin', 'nucleotide_origin',
                                      'label_masked', 'label_mar', 'label_ori', 'label_same', 'RATIO', 'AF',
                                      'SB', 'Flag_SB', 'Flag_AF', 'Flag_COV'], low_memory=False)
df_all = df_all[df_all['SB']!='SB']

df_all['Flag_SB'] = (df_all['SB'].astype(float) > threshold_SB).astype(int)

df_all['Flag_AF'] = (df_all['AF'].astype(float) > threshold_AF).astype(int)

df_all['Flag_COV'] = (df_all['RATIO'].astype(float) < threshold_COV).astype(int)

df_all

Unnamed: 0,ID,Position,nucleotide_martin,nucleotide_origin,label_masked,label_mar,label_ori,label_same,RATIO,AF,SB,Flag_SB,Flag_AF,Flag_COV
0,ERR6613398,19189,t,t,0,0,0,1,1.6667150983545793,0.0,0,0,0,0
1,ERR6613398,21081,a,a,0,0,0,1,1.5661837584873357,0.0,0,0,0,0
2,ERR6613398,19859,c,c,0,0,0,1,1.7032328927508,0.0,0,0,0,0
3,ERR7740049,15247,a,a,0,0,0,1,0.05438757595442053,0.0,0,0,0,1
4,ERR6535196,23907,t,t,0,0,0,1,0.1754793275265588,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299025,ERR7104629,19547,c,c,0,0,0,1,0.31813452241575485,0.0,0,0,0,0
299026,ERR6456404,9731,t,t,0,0,0,1,0.01891440745354535,0.0,0,0,0,1
299027,ERR6613398,26685,t,t,0,0,0,1,0.15358954701939992,0.0,0,0,0,0
299028,ERR7740049,12936,a,a,0,0,0,1,0.9704338159825402,0.0,0,0,0,0


In [15]:
# 初始化计数变量
count_label_1_total = sum(df_all['label_mar'] == '1')
count_label_2_total = sum(df_all['label_same'] == '1')
count_label_3_total = sum(df_all['label_same'] == '0')

# 打印总计数
total_files = len(df_all)
total_records = count_label_1_total + count_label_2_total + count_label_3_total

print('================================= information =================================')
print("Martin version masked:", count_label_1_total, ', Total number:', (total_files))
print("Same nucleotide type:", count_label_2_total, ', Total number:', (total_files))
print("Different nucleotide type:", count_label_3_total, ', Total number:', (total_files))
print('Total records: ', total_records)
print('================================= percentage =================================')
print("Martin version masked:", round(count_label_1_total / total_records * 100, 3), '%')
print("Same nucleotide type:", round(count_label_2_total / total_records * 100, 3), '%')
print("Different nucleotide type:", round(count_label_3_total / total_records * 100, 3), '%')

Martin version masked: 5792 , Total number: 299018
Same nucleotide type: 265541 , Total number: 299018
Different nucleotide type: 20 , Total number: 299018
Total records:  271353
Martin version masked: 2.134 %
Same nucleotide type: 97.858 %
Different nucleotide type: 0.007 %


In [17]:
# Err pos:
numbers = [265541, 20, 5792]

formatted_numbers_all = []

for number in numbers:
    formatted_number = format_number(number)
    formatted_numbers_all.append(formatted_number)

print(formatted_numbers_all)

['266K', '20', '5.8K']


In [21]:
# creat DataFrame
data_all = {'Category': ['Same nucleotide',
                        'Different nucleotides',
                        'Viridian masked'],
        'Percentage': [97.858, 0.007, 2.134],
           'Raw_number':formatted_numbers_all}
df_all_plot = pd.DataFrame(data_all)

In [22]:
fig = go.Figure(go.Pie(labels=df_all_plot['Category'], values=df_all_plot['Percentage'], hole=0.4,
                       text=df_all_plot['Category']+': <br>' + df_all_plot['Raw_number'].astype(str),
                       textinfo='percent+text',
                       marker={'colors': colors},
                       hovertemplate='%{label}<br>Current percentage: %{percent:.1%}<br>: %{text}',
                       textfont={'size': 15},
                       texttemplate='%{text} (%{percent:.1%})'  # 设置文本显示的格式
                      ))

# Add titles for subplots

fig.show()

In [19]:
# For same
df_all_same = df_all[df_all['label_same'] == '1'].copy()

df_all_same['ID_POS'] = df_all_same['ID'].astype(str) + "_" + df_all_same['Position'].astype(str)

df_all_same['Flag_SB'] = (df_all_same['SB'].astype(float) > threshold_SB).astype(int)
df_all_same['Flag_AF'] = (df_all_same['AF'].astype(float) > threshold_AF).astype(int)
df_all_same['Flag_COV'] = (df_all_same['RATIO'].astype(float) < threshold_COV).astype(int)

df_all_same = df_all_same[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_all_same.to_csv("/homes/zihao/EBI_INTER/A_Datas/For_COMP/COMP_ALL_same.txt", sep='\t', index=False)
df_all_same

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
0,ERR6613398_19189,0,0,0
1,ERR6613398_21081,0,0,0
2,ERR6613398_19859,0,0,0
3,ERR7740049_15247,0,0,1
4,ERR6535196_23907,0,0,0
...,...,...,...,...
299025,ERR7104629_19547,0,0,0
299026,ERR6456404_9731,0,0,1
299027,ERR6613398_26685,0,0,0
299028,ERR7740049_12936,0,0,0


In [20]:
# For diff
df_all_diff = df_all[df_all['label_same'] == '0'].copy()

df_all_diff['ID_POS'] = df_all_diff['ID'].astype(str) + "_" + df_all_diff['Position'].astype(str)

df_all_diff['Flag_SB'] = (df_all_diff['SB'].astype(float) > threshold_SB).astype(int)
df_all_diff['Flag_AF'] = (df_all_diff['AF'].astype(float) > threshold_AF).astype(int)
df_all_diff['Flag_COV'] = (df_all_diff['RATIO'].astype(float) < threshold_COV).astype(int)

df_all_diff = df_all_diff[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_all_diff.to_csv("/homes/zihao/EBI_INTER/A_Datas/For_COMP/COMP_ALL_diff.txt", sep='\t', index=False)
df_all_diff

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
24350,ERR6456404_11576,1,1,0
28830,ERR6534503_7101,1,1,0
69111,ERR6534503_25171,1,1,1
82885,ERR10023861_234,0,0,1
94611,ERR10024357_4084,1,1,0
100192,ERR6456404_24410,1,1,0
113101,ERR6076386_28086,1,1,0
132814,ERR10023861_29360,1,1,0
137534,ERR6534503_6352,1,0,0
151506,ERR7027015_24410,1,1,0


In [22]:
# For masked
df_all_masked = df_all[df_all['label_mar'] == '1'].copy()

df_all_masked['ID_POS'] = df_all_masked['ID'].astype(str) + "_" + df_all_masked['Position'].astype(str)

df_all_masked['Flag_SB'] = (df_all_masked['SB'].astype(float) > threshold_SB).astype(int)
df_all_masked['Flag_AF'] = (df_all_masked['AF'].astype(float) > threshold_AF).astype(int)
df_all_masked['Flag_COV'] = (df_all_masked['RATIO'].astype(float) < threshold_COV).astype(int)

df_all_masked = df_all_masked[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

df_all_masked.to_csv("/homes/zihao/EBI_INTER/A_Datas/For_COMP/COMP_ALL_masked.txt", sep='\t', index=False)
df_all_masked

Unnamed: 0,ID_POS,Flag_AF,Flag_SB,Flag_COV
108,ERR6456404_41,0,0,0
116,ERR6535196_29903,0,0,1
171,ERR6613398_8,0,0,1
179,ERR7740049_29754,0,0,1
201,ERR6456404_13870,0,0,1
...,...,...,...,...
298754,ERR7740049_1044,0,0,0
298756,ERR7740049_1157,0,0,1
298885,ERR7853675_21011,0,0,0
298958,ERR7809955_29903,0,0,1
