## <center>Comparison of Viridian Alignment and COVID-19 Data Portal Alignment.</center>


| **Label** | **start time** | **finish time** | **last modified** |
|:--------------:|:-----------:|:-----------:|:----------------:|
|   Project 3   |  2023-07-10 |  2023-07-30 |   2023-08-22     |

In [1]:
import numpy as np
import pandas as pd

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
def format_number(number):
    """Abbreviate a number with a K/M/B/T suffix for use in chart labels.

    The value is scaled by powers of 1000 until it drops below 1000, then
    rendered with one decimal place when the scaled value is below 10 and
    with no decimals otherwise (e.g. 9581 -> '9.6K', 15905 -> '16K',
    3 -> '3.0').

    Parameters
    ----------
    number : int or float
        Value to abbreviate. Negative values are abbreviated by magnitude
        and keep their sign.

    Returns
    -------
    str
        The abbreviated text. Values too large for the 'T' suffix fall back
        to scientific notation with one decimal (e.g. '1.0e+15').
    """
    suffixes = ['', 'K', 'M', 'B', 'T']

    # Work on the magnitude so that negative inputs pick the same unit as
    # their positive counterpart instead of always matching "< 1000".
    sign = '-' if number < 0 else ''
    absolute = abs(number)

    for i, suffix in enumerate(suffixes):
        magnitude = absolute / (1000 ** i)
        if magnitude >= 1000:
            continue

        formatted = f"{magnitude:.1f}" if magnitude < 10 else f"{magnitude:.0f}"

        # Rounding can push the text across a formatting boundary:
        # 9.96 -> '10.0' should read '10', and 999.999 -> '1000' belongs to
        # the next unit as '1.0'.
        if formatted == "10.0":
            formatted = "10"
        elif formatted == "1000":
            if i + 1 < len(suffixes):
                return f"{sign}1.0{suffixes[i + 1]}"
            break  # no larger suffix available; use scientific notation

        return f"{sign}{formatted}{suffix}"

    return f"{number:.1e}"

====================== Info ======================  
Total masked positions in Viridian assembly:  8979822821  
Total masked positions in Data-Portal assembly:  24449689050  
====================== Info ======================  
Total unmasked positions in Viridian assembly, without errors identified by MAPLE:  382248974784  
Total unmasked positions in Data-Portal assembly, without errors identified by MAPLE:  366779091136  
====================== Info ======================  
Total positions in Viridian assembly identified as errors by MAPLE (Viridian's errors):  15905  
Total positions in Data-Portal assembly identified as errors by MAPLE (Data-Portal's errors):  33303  
====================== For VIR error ======================  
Viridian's assemblies are masked:  725  
Same nucleotide, Viridian's assembly error:  9581  
Same nucleotide, Viridian's assembly not error:  4227  
Diff nucleotide, Viridian's assembly error:  3  
Diff nucleotide, Viridian's assembly not error:  1349  
====================== For COL error ======================  
Data-Portal's assemblies are masked:  8466  
Same nucleotide, Data-Portal's assembly error:  9581  
Same nucleotide, Data-Portal's assembly not error:  6416  
Diff nucleotide, Data-Portal's assembly error:  3  
Diff nucleotide, Data-Portal's assembly not error:  8837

## 1. input data

In [3]:
# Position counts for the Viridian alignment, copied from the "Info" summary
# at the top of the notebook.
VIR_masked = 8_979_822_821
VIR_unmasked = 382_248_974_784
VIR_error = 15_905
VIR_both = sum((VIR_masked, VIR_unmasked, VIR_error))  # total over the three classes

# Breakdown of the Viridian error positions, copied from the "For VIR error"
# summary at the top of the notebook.
# NOTE(review): these five categories add up to 15885, which is 20 fewer than
# VIR_error (15905) -- confirm whether some positions are intentionally left out.
VIR_error_othMasked = 725
VIR_error_sameError = 9_581
VIR_error_sameCorrect = 4_227
VIR_error_diffError = 3
VIR_error_diffCorrect = 1_349
VIR_error_both = sum((
    VIR_error_othMasked,
    VIR_error_sameError,
    VIR_error_sameCorrect,
    VIR_error_diffError,
    VIR_error_diffCorrect,
))

In [4]:
# Position counts for the COVID-19 Data Portal alignment, copied from the
# "Info" summary at the top of the notebook.
COL_masked = 24_449_689_050
COL_unmasked = 366_779_091_136
COL_error = 33_303
COL_both = sum((COL_masked, COL_unmasked, COL_error))  # total over the three classes

# Breakdown of the Data Portal error positions, copied from the
# "For COL error" summary at the top of the notebook.
COL_error_othMasked = 8_466
COL_error_sameError = 9_581
COL_error_sameCorrect = 6_416
COL_error_diffError = 3
COL_error_diffCorrect = 8_837
COL_error_both = sum((
    COL_error_othMasked,
    COL_error_sameError,
    COL_error_sameCorrect,
    COL_error_diffError,
    COL_error_diffCorrect,
))

## 2. data processing
### 2.1 for Viridian Alignment

In [5]:
# Abbreviated labels for the three top-level Viridian counts.
numbers = [VIR_masked, VIR_unmasked, VIR_error]
formatted_numbers_VIR = [format_number(value) for value in numbers]
print(formatted_numbers_VIR)

# Abbreviated labels for the five Viridian error categories.
numbers = [
    VIR_error_othMasked,
    VIR_error_sameError,
    VIR_error_sameCorrect,
    VIR_error_diffError,
    VIR_error_diffCorrect,
]
formatted_numbers_VIR_ERROR = [format_number(value) for value in numbers]
print(formatted_numbers_VIR_ERROR)

['9.0B', '382B', '16K']
['725', '9.6K', '4.2K', '3.0', '1.3K']


In [6]:
# Create the DataFrame behind the Viridian overview pie chart.
# 'Percentage' is NOT a true share: it is the natural log of each count
# divided by the total, and it is the column the pie chart uses for slice
# sizes (presumably so the tiny 'Error' slice stays visible -- confirm).
# 'Raw_Percentage' holds the real fraction of the total.
data_VIR = {
    'Category': ['Masked', 'Not Error', 'Error'],
    'Percentage': [
        np.log(count) / VIR_both
        for count in (VIR_masked, VIR_unmasked, VIR_error)
    ],
    'Raw_Percentage': [
        count / VIR_both
        for count in (VIR_masked, VIR_unmasked, VIR_error)
    ],
    'Raw_number': formatted_numbers_VIR,
}
df_VIR_plot = pd.DataFrame(data_VIR)

In [7]:
# Create the DataFrame behind the Viridian error-breakdown pie chart.
# 'Percentage' is each category's fraction of VIR_error_both.
data_VIR_ERR = {
    'Category': [
        'Vir_otherMasked',
        'Vir_sameError',
        'Vir_sameCorrect',
        'Vir_diffError',
        'Vir_diff',
    ],
    'Percentage': [
        count / VIR_error_both
        for count in (
            VIR_error_othMasked,
            VIR_error_sameError,
            VIR_error_sameCorrect,
            VIR_error_diffError,
            VIR_error_diffCorrect,
        )
    ],
    'Raw_number': formatted_numbers_VIR_ERROR,
}
df_VIR_ERR_plot = pd.DataFrame(data_VIR_ERR)

### 2.2 for COVID-19 Data Portal Alignment

In [8]:
# Abbreviated labels for the three top-level Data Portal counts.
numbers = [COL_masked, COL_unmasked, COL_error]
formatted_numbers_COL = [format_number(value) for value in numbers]
print(formatted_numbers_COL)

# Abbreviated labels for the five Data Portal error categories.
numbers = [
    COL_error_othMasked,
    COL_error_sameError,
    COL_error_sameCorrect,
    COL_error_diffError,
    COL_error_diffCorrect,
]
formatted_numbers_COL_ERROR = [format_number(value) for value in numbers]
print(formatted_numbers_COL_ERROR)

['24B', '367B', '33K']
['8.5K', '9.6K', '6.4K', '3.0', '8.8K']


In [9]:
# Create the DataFrame behind the Data Portal overview pie chart.
# 'Percentage' is NOT a true share: it is the natural log of each count
# divided by the total, and it is the column the pie chart uses for slice
# sizes (presumably so the tiny 'Error' slice stays visible -- confirm).
# 'Raw_Percentage' holds the real fraction of the total.
data_COL = {
    'Category': ['Masked', 'Not Error', 'Error'],
    'Percentage': [
        np.log(count) / COL_both
        for count in (COL_masked, COL_unmasked, COL_error)
    ],
    'Raw_Percentage': [
        count / COL_both
        for count in (COL_masked, COL_unmasked, COL_error)
    ],
    'Raw_number': formatted_numbers_COL,
}
df_COL_plot = pd.DataFrame(data_COL)

In [10]:
# Create the DataFrame behind the Data Portal error-breakdown pie chart.
# 'Percentage' is each category's fraction of COL_error_both.
data_COL_ERR = {
    'Category': [
        'Col_otherMasked',
        'Col_sameError',
        'Col_sameCorrect',
        'Col_diffError',
        'Col_diff',
    ],
    'Percentage': [
        count / COL_error_both
        for count in (
            COL_error_othMasked,
            COL_error_sameError,
            COL_error_sameCorrect,
            COL_error_diffError,
            COL_error_diffCorrect,
        )
    ],
    'Raw_number': formatted_numbers_COL_ERROR,
}
df_COL_ERR_plot = pd.DataFrame(data_COL_ERR)

## 3. data visualization

In [11]:
# Slice colours for the overview pies, in category order
# (Masked, Not Error, Error).
colors = [
    '#EBD67E',
    '#7FB7D1',
    '#89A78E',
]
# Slice colours for the five error-breakdown categories.
colors_ERROR = [
    '#C9DFCC',
    '#9AC2A0',
    '#7AAC9A',
    '#688462',
    '#AAB07C',
]
# Legend labels shared by both error-breakdown pies.
label_err = [
    'otherMasked',
    'sameError',
    'sameCorrect',
    'diffError',
    'diff',
]

In [15]:
# Overview figure: one donut chart per alignment showing the Masked /
# Not Error / Error classes side by side.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Viridian Alignment', 'COVID-19 Data Portal Alignment'],
                    specs=[[{'type':'pie'}, {'type':'pie'}]])

# Decimal places used for the true percentage shown in the slice labels.
# Both panels must use the same precision: the 'Error' share is on the order
# of 1e-06 %, so rounding to 4 places would print it as 0.0% in one panel
# while the other panel shows a non-zero value.
pct_decimals = 7

# Create the pie chart for VIR.
# Slice sizes come from the log-scaled 'Percentage' column; the label text
# carries the abbreviated count and the true percentage ('Raw_Percentage').
pie_VIR = go.Pie(labels=df_VIR_plot['Category'], values=df_VIR_plot['Percentage'], hole=0.4,
                 text=df_VIR_plot['Category']+': <br>' + df_VIR_plot['Raw_number'].astype(str)+
                 ' ('+(df_VIR_plot['Raw_Percentage']*100).round(pct_decimals).astype(str)+'%)',
                 textinfo='percent+text',
                 marker={'colors': colors},
                 hovertemplate='%{label}<br>Current percentage: %{percent:.1%}<br>: %{text}',
                 textfont={'size': 15},
                 texttemplate='%{text}',# (%{percent:.1%})',
                 name='VIR')

# Add pie chart for VIR to subplot
fig.add_trace(pie_VIR, row=1, col=1)

# Create the pie chart for COL (same construction and precision as VIR).
pie_COL = go.Pie(labels=df_COL_plot['Category'], values=df_COL_plot['Percentage'], hole=0.4,
                 text=df_COL_plot['Category']+': <br>' + df_COL_plot['Raw_number'].astype(str)+
                 ' ('+(df_COL_plot['Raw_Percentage']*100).round(pct_decimals).astype(str)+'%)',
                 textinfo='percent+text',
                 marker={'colors': colors},
                 hovertemplate='%{label}<br>Current percentage: %{percent:.1%}<br>: %{text}',
                 textfont={'size': 15},
                 texttemplate='%{text}',# (%{percent:.1%})',
                 name='COL')

# Add pie chart for COL to subplot
fig.add_trace(pie_COL, row=1, col=2)


fig.show()
# The string literal below is disabled export code; it is kept as a string
# (not executed) and is what the cell echoes as its output.
"""
# Set titles for the two subplots
# fig.update_layout(title_text="subplots_error")  # Increase figure size

# Save the figure as an HTML file
pio.write_html(fig, 'Figure/subplots_all.html')
"""

'\n# Set titles for the two subplots\n# fig.update_layout(title_text="subplots_error")  # Increase figure size\n\n# Save the figure as an HTML file\npio.write_html(fig, \'Figure/subplots_all.html\')\n'

In [13]:
# Error-breakdown figure: one donut chart per alignment, each splitting that
# alignment's error positions into the five categories in label_err.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Viridian Alignment', 'COVID-19 Data Portal Alignment'],
    specs=[[{'type':'domain'}, {'type':'domain'}]],
)

# Both panels are built identically; only the source frame and the subplot
# column differ (Viridian in column 1, Data Portal in column 2).
for panel_col, err_frame in enumerate((df_VIR_ERR_plot, df_COL_ERR_plot), start=1):
    error_pie = go.Pie(
        labels=label_err,
        values=err_frame['Percentage'],
        hole=0.35,
        text=err_frame['Raw_number'].astype(str),
        textinfo='percent+text',
        marker={'colors': colors_ERROR},
        hovertemplate='%{label}<br>Current percentage: %{percent:.1%}<br>: %{text}',
        textfont={'size': 15},
        texttemplate='%{text} (%{percent:.1%})',
    )
    fig.add_trace(error_pie, row=1, col=panel_col)

fig.show()
# The string literal below is disabled export code; it is kept as a string
# (not executed) and is what the cell echoes as its output.
"""
# Set titles for the two subplots
# fig.update_layout(title_text="subplots_error")  # Increase figure size

# Save the figure as an HTML file
pio.write_html(fig, 'Figure/subplots_error.html')
"""

'\n# Set titles for the two subplots\n# fig.update_layout(title_text="subplots_error")  # Increase figure size\n\n# Save the figure as an HTML file\npio.write_html(fig, \'Figure/subplots_error.html\')\n'