## Thresholds
- SB > 0
- coverage ratio < 0.1
- AF > 0.001

In [1]:
# Cut-offs for the per-row flags computed further down:
# a flag is set when SB > threshold_SB, RATIO < threshold_COV, or AF > threshold_AF.
threshold_SB = 0
threshold_COV = 0.1
threshold_AF = 0.001

In [2]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.offline import iplot, plot
import plotly.io as pio

def format_number(number):
    """Return *number* as a short string with a K/M/B/T suffix.

    The value is scaled by successive powers of 1000 until its absolute
    value drops below 1000.  Scaled values below 10 keep one decimal
    place; larger ones are rendered without decimals.  Values too large
    for the 'T' suffix fall back to scientific notation.

    Args:
        number: An int or float to abbreviate.

    Returns:
        The abbreviated string, e.g. ``"1.5K"``, ``"12K"``, ``"-2.5M"``
        or ``"1.0e+15"``.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']

    # Scale on the absolute value so negative inputs pick the same suffix
    # as their positive counterpart; the sign is re-attached afterwards.
    sign = '-' if number < 0 else ''
    size = abs(number)

    for power, suffix in enumerate(suffixes):
        magnitude = size / (1000 ** power)
        if magnitude < 1000:
            decimals = 1 if magnitude < 10 else 0
            return f"{sign}{magnitude:.{decimals}f}{suffix}"

    # Beyond the largest suffix: scientific notation of the original value.
    return f"{number:.1e}"

# <center>Catalog:</center>
- 0. [Input data](#0.input_data)

&nbsp;

- 1. [For Pie Chart](#1.For_Pie_Chart)

&nbsp;

- 2. [For Venn chart](#2.For_Venn_chart)
    - 2.1 [Different nucleotide](#2.1_Diff_nucleotide)
    - 2.2 [Same nucleotide](#2.2_Same_nucleotide)
    - 2.3 [Viridian masked](#2.3_Viridian_masked)

# 0.input_data
[Return to Catalog](#Catalog:)

In [3]:
# Path of the tab-separated table that is loaded into `df_err` with pd.read_csv.
File_for_errPos = '/nfs/research/goldman/zihao/Datas/p2_compViridian_P2/Folder_3_calculForPLOT/errPos/errpos_outputData.txt'
# Alternative input path (disabled):
# File_for_errPos = '/nfs/research/goldman/zihao/Datas/p2_compViridian_P3/Folder_mapleOutput/COL_output_modified.txt'

In [4]:
# Load the tab-separated table of error positions.
df_err = pd.read_csv(File_for_errPos, sep='\t')

# Cast the three measured columns to float once, then derive one 0/1
# indicator column per threshold rule.
sb_values = df_err['SB'].astype(float)
af_values = df_err['AF'].astype(float)
ratio_values = df_err['RATIO'].astype(float)

df_err['Flag_SB'] = sb_values.gt(threshold_SB).astype(int)
df_err['Flag_AF'] = af_values.gt(threshold_AF).astype(int)
df_err['Flag_COV'] = ratio_values.lt(threshold_COV).astype(int)

# Bare expression: displays the table when run as a notebook cell.
df_err

In [5]:
# Count the rows in each category.  The boolean masks are summed with the
# vectorised Series.sum() (instead of the builtin sum(), which walks the
# Series element by element) and converted to plain Python ints.
is_masked = (df_err['label_mar'] == 1.0) & (df_err['label_ori'] == 0.0)
count_label_1_total = int(is_masked.sum())
count_label_2_total = int((df_err['label_same'] == 1.0).sum())
count_label_3_total = int((df_err['label_same'] == 0.0).sum())

total_files = len(df_err)
total_records = count_label_1_total + count_label_2_total + count_label_3_total


def _share_of_records(count):
    """Return *count* as a percentage of total_records, rounded to 3 decimals.

    Returns 0.0 when there are no records at all, so an empty input table
    does not trigger a division by zero.
    """
    if total_records == 0:
        return 0.0
    return round(count / total_records * 100, 3)


percentage_label_1_total = _share_of_records(count_label_1_total)
percentage_label_2_total = _share_of_records(count_label_2_total)
percentage_label_3_total = _share_of_records(count_label_3_total)

# Raw counts first, then the same three categories as percentages.
print('================================= information =================================')
print("Martin version masked:", count_label_1_total, ', Total number:', (total_files))
print("Same nucleotide type:", count_label_2_total, ', Total number:', (total_files))
print("Different nucleotide type:", count_label_3_total, ', Total number:', (total_files))
print('Total records: ', total_records)
print('================================= percentage =================================')
print("Martin version masked:", percentage_label_1_total, '%')
print("Same nucleotide type:", percentage_label_2_total, '%')
print("Different nucleotide type:", percentage_label_3_total, '%')

# 1.For_Pie_Chart

In [6]:
# One entry per pie slice: (category label, raw count, percentage).
slices = [
    ('Same nucleotide', count_label_2_total, percentage_label_2_total),
    ('Different nucleotides', count_label_3_total, percentage_label_3_total),
    ('Viridian masked', count_label_1_total, percentage_label_1_total),
]

numbers = [count for _, count, _ in slices]

# Abbreviated count strings (via format_number) for the 'Raw_number' column.
formatted_numbers_err = [format_number(count) for count in numbers]

print(formatted_numbers_err)

# Create the DataFrame that holds the pie-chart data.
data_err = {
    'Category': [label for label, _, _ in slices],
    'Percentage': [percentage for _, _, percentage in slices],
    'Raw_number': formatted_numbers_err,
}
df_err_plot = pd.DataFrame(data_err)

# NOTE(review): '#D9BFCB0' has seven hex digits, which is not a standard
# hex colour length -- confirm the intended value.
colors = ['#D9BFCB0', '#49998B', '#C1BEE3']

In [7]:
# Text shown on each slice: category name, a line break, then the
# abbreviated raw count.
slice_text = df_err_plot['Category'] + ': <br>' + df_err_plot['Raw_number'].astype(str)

# Donut-style pie (hole=0.4) of the category percentages.
pie_trace = go.Pie(
    labels=df_err_plot['Category'],
    values=df_err_plot['Percentage'],
    hole=0.4,
    text=slice_text,
    textinfo='percent+text',
    marker=dict(colors=colors),
    hovertemplate='%{label}<br>Current percentage: %{percent:.1%}',
    textfont=dict(size=15),  # font size of the slice text
    texttemplate='%{text} (%{percent:.2%})',  # display format of the slice text
)
fig = go.Figure(pie_trace)

# Set the figure title.
fig.update_layout(
    title="Comparison of assembled and sequenced sequences regarding MAPLE marker error positions"
)

fig.show()

# Save the figure as a standalone HTML file.
pio.write_html(fig, '/nfs/research/goldman/zihao/Code/Figure/P2/errPIE.html')

# 2.For_Venn_chart
[Return to Catalog](#Catalog:)

### 2.1_Diff_nucleotide

In [8]:
# For diff: keep only the rows with label_same == 0.0.
diff_rows = df_err['label_same'] == 0.0
df_err_diff = df_err.loc[diff_rows].copy()

# Row key of the form "<ID>_<Position>".
id_text = df_err_diff['ID'].astype(str)
position_text = df_err_diff['Position'].astype(str)
df_err_diff['ID_POS'] = id_text + "_" + position_text

# Boolean flags, one per threshold rule (these replace the 0/1 columns
# carried over from df_err).
df_err_diff['Flag_AF'] = df_err_diff['AF'].gt(threshold_AF)
df_err_diff['Flag_SB'] = df_err_diff['SB'].gt(threshold_SB)
df_err_diff['Flag_COV'] = df_err_diff['RATIO'].lt(threshold_COV)

# Keep only the key and the three flags, then export as a TSV file.
kept_columns = ['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']
df_err_diff = df_err_diff[kept_columns]

df_err_diff.to_csv(
    "/nfs/research/goldman/zihao/Code/IntermediateDATA/P2-ERR_diff.txt",
    sep='\t',
    index=False,
)
# Bare expression: displays the table when run as a notebook cell.
df_err_diff

In [9]:
# Row count of the subset (not used again in this cell).
same_count = len(df_err_diff)

# "on"/"off" masks for each flag, combined below into the eight possible
# flag combinations.
af_on = df_err_diff['Flag_AF'] == True
af_off = df_err_diff['Flag_AF'] == False
sb_on = df_err_diff['Flag_SB'] == True
sb_off = df_err_diff['Flag_SB'] == False
cov_on = df_err_diff['Flag_COV'] == True
cov_off = df_err_diff['Flag_COV'] == False

# Exactly one flag set.
Flag_SB_PASSED = int((af_off & sb_on & cov_off).sum())
Flag_AF_PASSED = int((af_on & sb_off & cov_off).sum())
Flag_COV_PASSED = int((af_off & sb_off & cov_on).sum())
# All three flags set / no flag set.
both_PASSED = int((af_on & sb_on & cov_on).sum())
both_UNPASSED = int((af_off & sb_off & cov_off).sum())
# Exactly two flags set.
Flag_SB_Flag_AF_PASSED = int((af_on & sb_on & cov_off).sum())
Flag_SB_Flag_COV_PASSED = int((af_off & sb_on & cov_on).sum())
Flag_AF_Flag_COV_PASSED = int((af_on & sb_off & cov_on).sum())

pass_1 = Flag_SB_PASSED + Flag_AF_PASSED + Flag_COV_PASSED
pass_2 = Flag_SB_Flag_AF_PASSED + Flag_SB_Flag_COV_PASSED + Flag_AF_Flag_COV_PASSED
pass_3 = both_PASSED + both_UNPASSED
nb_pass = pass_1 + pass_2 + pass_3

# Report "<percentage> % <count>" per combination, grouped by how many
# flags are set, with a separator line between groups.
separator = '======================================='
report_groups = [
    [both_UNPASSED],
    [Flag_AF_PASSED, Flag_COV_PASSED, Flag_SB_PASSED],
    [Flag_SB_Flag_AF_PASSED, Flag_SB_Flag_COV_PASSED, Flag_AF_Flag_COV_PASSED],
    [both_PASSED],
]
for group_index, group in enumerate(report_groups):
    if group_index:
        print(separator)
    for count in group:
        print(round(count / nb_pass * 100, 3), '%', count)

### 2.2_Same_nucleotide

In [10]:
# For same: keep only the rows with label_same == 1.0.
same_subset = df_err[df_err['label_same'] == 1.0]

# Add the "<ID>_<Position>" key and the three boolean threshold flags in
# one step (assign works on a copy, so df_err itself is not modified),
# then keep just those four columns.
df_err_same = same_subset.assign(
    ID_POS=same_subset['ID'].astype(str) + "_" + same_subset['Position'].astype(str),
    Flag_AF=same_subset['AF'] > threshold_AF,
    Flag_SB=same_subset['SB'] > threshold_SB,
    Flag_COV=same_subset['RATIO'] < threshold_COV,
)[['ID_POS', 'Flag_AF', 'Flag_SB', 'Flag_COV']]

# Export as a TSV file.
df_err_same.to_csv(
    "/nfs/research/goldman/zihao/Code/IntermediateDATA/P2-ERR_same.txt",
    sep='\t',
    index=False,
)
# Bare expression: displays the table when run as a notebook cell.
df_err_same

In [11]:
# Row count of the subset (not used again in this cell).
same_count = len(df_err_same)

# The three flag columns as one boolean matrix, one row per record.
flag_matrix = df_err_same[['Flag_AF', 'Flag_SB', 'Flag_COV']].to_numpy()


def _rows_with_flags(af, sb, cov):
    """Count rows whose (Flag_AF, Flag_SB, Flag_COV) equal the given values."""
    wanted = np.array([af, sb, cov])
    return int((flag_matrix == wanted).all(axis=1).sum())


# Exactly one flag set.
Flag_SB_PASSED = _rows_with_flags(False, True, False)
Flag_AF_PASSED = _rows_with_flags(True, False, False)
Flag_COV_PASSED = _rows_with_flags(False, False, True)
# All three flags set / no flag set.
both_PASSED = _rows_with_flags(True, True, True)
both_UNPASSED = _rows_with_flags(False, False, False)
# Exactly two flags set.
Flag_SB_Flag_AF_PASSED = _rows_with_flags(True, True, False)
Flag_SB_Flag_COV_PASSED = _rows_with_flags(False, True, True)
Flag_AF_Flag_COV_PASSED = _rows_with_flags(True, False, True)

pass_1 = sum([Flag_SB_PASSED, Flag_AF_PASSED, Flag_COV_PASSED])
pass_2 = sum([Flag_SB_Flag_AF_PASSED, Flag_SB_Flag_COV_PASSED, Flag_AF_Flag_COV_PASSED])
pass_3 = both_PASSED + both_UNPASSED
nb_pass = pass_1 + pass_2 + pass_3


def _print_share(count):
    """Print "<percentage> % <count>" with the percentage relative to nb_pass."""
    print(round(count / nb_pass * 100, 3), '%', count)


_print_share(both_UNPASSED)
print('=======================================')

_print_share(Flag_AF_PASSED)
_print_share(Flag_COV_PASSED)
_print_share(Flag_SB_PASSED)
print('=======================================')

_print_share(Flag_SB_Flag_AF_PASSED)
_print_share(Flag_SB_Flag_COV_PASSED)
_print_share(Flag_AF_Flag_COV_PASSED)

print('=======================================')
_print_share(both_PASSED)

### 2.3_Viridian_masked

In [12]:
# For masked: keep only the rows with label_mar == 1.0.
# NOTE(review): the category count computed earlier additionally requires
# label_ori == 0.0; confirm whether that condition should apply here too.
masked_rows = df_err[df_err['label_mar'] == 1.0]

# Assemble the output table directly: the "<ID>_<Position>" key plus one
# boolean flag per threshold rule.
df_err_masked = pd.DataFrame({
    'ID_POS': masked_rows['ID'].astype(str) + "_" + masked_rows['Position'].astype(str),
    'Flag_AF': masked_rows['AF'] > threshold_AF,
    'Flag_SB': masked_rows['SB'] > threshold_SB,
    'Flag_COV': masked_rows['RATIO'] < threshold_COV,
})

# Export as a TSV file.
df_err_masked.to_csv(
    "/nfs/research/goldman/zihao/Code/IntermediateDATA/P2-ERR_masked.txt",
    sep='\t',
    index=False,
)
# Bare expression: displays the table when run as a notebook cell.
df_err_masked

In [13]:
# Row count of the subset (not used again in this cell).
same_count = len(df_err_masked)

# Element-wise "is True" / "is False" tables for the three flag columns.
flag_columns = df_err_masked[['Flag_AF', 'Flag_SB', 'Flag_COV']]
is_set = flag_columns == True
is_clear = flag_columns == False

# Exactly one flag set.
Flag_SB_PASSED = int(np.count_nonzero(is_clear['Flag_AF'] & is_set['Flag_SB'] & is_clear['Flag_COV']))
Flag_AF_PASSED = int(np.count_nonzero(is_set['Flag_AF'] & is_clear['Flag_SB'] & is_clear['Flag_COV']))
Flag_COV_PASSED = int(np.count_nonzero(is_clear['Flag_AF'] & is_clear['Flag_SB'] & is_set['Flag_COV']))
# All three flags set / no flag set.
both_PASSED = int(np.count_nonzero(is_set['Flag_AF'] & is_set['Flag_SB'] & is_set['Flag_COV']))
both_UNPASSED = int(np.count_nonzero(is_clear['Flag_AF'] & is_clear['Flag_SB'] & is_clear['Flag_COV']))
# Exactly two flags set.
Flag_SB_Flag_AF_PASSED = int(np.count_nonzero(is_set['Flag_AF'] & is_set['Flag_SB'] & is_clear['Flag_COV']))
Flag_SB_Flag_COV_PASSED = int(np.count_nonzero(is_clear['Flag_AF'] & is_set['Flag_SB'] & is_set['Flag_COV']))
Flag_AF_Flag_COV_PASSED = int(np.count_nonzero(is_set['Flag_AF'] & is_clear['Flag_SB'] & is_set['Flag_COV']))

pass_1 = Flag_SB_PASSED + Flag_AF_PASSED + Flag_COV_PASSED
pass_2 = Flag_SB_Flag_AF_PASSED + Flag_SB_Flag_COV_PASSED + Flag_AF_Flag_COV_PASSED
pass_3 = both_PASSED + both_UNPASSED
nb_pass = pass_1 + pass_2 + pass_3

# Report "<percentage> % <count>" per combination; None marks where a
# separator line is printed.
divider = '======================================='
report_sequence = [
    both_UNPASSED,
    None,
    Flag_AF_PASSED,
    Flag_COV_PASSED,
    Flag_SB_PASSED,
    None,
    Flag_SB_Flag_AF_PASSED,
    Flag_SB_Flag_COV_PASSED,
    Flag_AF_Flag_COV_PASSED,
    None,
    both_PASSED,
]
for entry in report_sequence:
    if entry is None:
        print(divider)
    else:
        print(round(entry / nb_pass * 100, 3), '%', entry)