For example, at each position that MAPLE estimates to be an error in the Viridian alignment, we would like to know whether
- Colman’s assembly is masked at that position,
- Colman’s assembly has a different nucleotide there,
- and MAPLE also estimates an error there in Colman’s alignment.

In other words, each such position should be assigned to one of the following categories:
- “masked”,
- “same nucleotide, error”,
- “same nucleotide, not error”,
- “different nucleotide, error”,
- “different nucleotide, not error”.

In [1]:
import pandas as pd

In [2]:
# Load the annotated table of a single sample (ERR7269308) for inspection.
# NOTE: despite its name, Folder_path points at one file, not at a folder.
Folder_path = "/nfs/research/goldman/zihao/Datas/p2_compViridian_P3/O_folderData_addError/ERR7269308.txt"
df_v0 = pd.read_csv(Folder_path, sep="\t")

# Missing values (in every column) become 0, then the VIR/COL flags are
# stored as integers.
df_v1 = df_v0.fillna(0).astype({"VIR": int, "COL": int})
df_v1

Unnamed: 0,position,nucleotide_martin,nucleotide_origin,label_mar,label_ori,VIR,COL
0,1,-,n,1,1,0,0
1,2,-,n,1,1,0,0
2,3,-,n,1,1,0,0
3,4,-,n,1,1,0,0
4,5,-,n,1,1,0,0
...,...,...,...,...,...,...,...
29898,29899,-,n,1,1,0,0
29899,29900,-,n,1,1,0,0
29900,29901,-,n,1,1,0,0
29901,29902,-,n,1,1,0,0


In [None]:
import os
import pandas as pd

# ==================Requires modification==================
# folder_path: the folder containing the tab-separated .txt files to which the error-position columns (VIR, COL) have been added.
# ==================Requires modification==================
def process_files(folder_path):
    """Print masking and MAPLE-error totals over every table in a folder.

    Each file in ``folder_path`` whose name ends with ``.txt`` is read as a
    tab-separated table. The columns ``label_mar``, ``VIR`` and
    ``nucleotide_martin`` are reported as the Viridian assembly, and
    ``label_ori``, ``COL`` and ``nucleotide_origin`` as the Colman assembly.
    A label equal to 1 is counted as a masked position and a ``VIR``/``COL``
    value equal to 1 as an error identified by MAPLE. All counts are summed
    over the files.

    Args:
        folder_path: Directory containing the ``.txt`` tables.

    Returns:
        None; the report is printed to standard output.
    """
    totals = dict.fromkeys(
        (
            "vir_masked",
            "col_masked",
            "vir_clean",
            "col_clean",
            "vir_error",
            "col_error",
            "vir_error_col_masked",
            "vir_error_col_diff",
            "col_error_vir_masked",
            "col_error_vir_diff",
            "both_error",
        ),
        0,
    )

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".txt"):
            continue

        # Missing values (in any column) become 0 before the flags are cast.
        table = pd.read_csv(os.path.join(folder_path, file_name), sep='\t').fillna(0)

        vir_flag = table["VIR"].astype(int)
        col_flag = table["COL"].astype(int)
        vir_error = vir_flag == 1
        col_error = col_flag == 1
        vir_masked = table["label_mar"] == 1
        col_masked = table["label_ori"] == 1
        # Unmasked in both assemblies, but the two nucleotides disagree.
        unmasked_and_differ = (
            (table["label_ori"] == 0)
            & (table["label_mar"] == 0)
            & (table["nucleotide_martin"] != table["nucleotide_origin"])
        )

        per_file = {
            "vir_masked": vir_masked,
            "col_masked": col_masked,
            "vir_clean": (table["label_mar"] == 0) & (vir_flag == 0),
            "col_clean": (table["label_ori"] == 0) & (col_flag == 0),
            "vir_error": vir_error,
            "col_error": col_error,
            "vir_error_col_masked": vir_error & col_masked,
            "vir_error_col_diff": vir_error & unmasked_and_differ,
            "col_error_vir_masked": col_error & vir_masked,
            "col_error_vir_diff": col_error & unmasked_and_differ,
            "both_error": vir_error & col_error,
        }
        for name, mask in per_file.items():
            # Series.sum() counts the True entries in one vectorised pass
            # instead of iterating over the Series in Python.
            totals[name] += int(mask.sum())

    print('====================== Info ======================')
    print('Total masked positions in Viridian assembly: ', totals["vir_masked"])
    print('Total masked positions in Colman assembly: ', totals["col_masked"])

    print('====================== Info ======================')
    print('Total unmasked positions in Viridian assembly, without errors '
          'identified by MAPLE: ', totals["vir_clean"])
    print('Total unmasked positions in Colman assembly, without errors '
          'identified by MAPLE: ', totals["col_clean"])

    print('====================== Info ======================')
    print('Total positions in Viridian assembly identified as errors by MAPLE '
          '(Viridian\'s errors): ', totals["vir_error"])
    print('Total positions in Colman assembly identified as errors by MAPLE '
          '(Colman\'s errors): ', totals["col_error"])

    print('====================== For VIR error ======================')
    print('Colman’s assemblies are masked: ', totals["vir_error_col_masked"])
    print('They have a different nucleotide: ', totals["vir_error_col_diff"])
    print('MAPLE also estimates an error there in Colman’s alignment: ', totals["both_error"])
    print('====================== For COL error ======================')
    print('Viridian’s assemblies are masked: ', totals["col_error_vir_masked"])
    print('They have a different nucleotide: ', totals["col_error_vir_diff"])
    # For Colman errors the counterpart alignment is Viridian's.
    print('MAPLE also estimates an error there in Viridian’s alignment: ', totals["both_error"])

# Run the summary over every .txt table found in this folder.
folder_path = "/nfs/research/goldman/zihao/Datas/p2_compViridian_P3/folderData_addError"
process_files(folder_path)

In [None]:
# Breakdown of the positions flagged as errors in Colman's alignment
# (COL == 1) for the table currently held in df_v1, according to what the
# Viridian assembly shows at the same position. The counts are gathered in
# one dict so that the notebook displays all five of them, not only the
# value of the last expression.
col_err_mask = df_v1['COL'] == 1
both_unmasked_mask = (df_v1['label_ori'] == 0) & (df_v1['label_mar'] == 0)
same_nt_mask = df_v1['nucleotide_martin'] == df_v1['nucleotide_origin']
diff_nt_mask = df_v1['nucleotide_martin'] != df_v1['nucleotide_origin']

col_error_breakdown = {
    # “masked”
    "masked": int((col_err_mask & (df_v1['label_mar'] == 1)).sum()),
    # “same nucleotide, error”
    "same nucleotide, error": int(
        (col_err_mask & (df_v1['VIR'] == 1) & both_unmasked_mask & same_nt_mask).sum()
    ),
    # “same nucleotide, not error”
    "same nucleotide, not error": int(
        (col_err_mask & (df_v1['VIR'] == 0) & both_unmasked_mask & same_nt_mask).sum()
    ),
    # “different nucleotide, error”
    "different nucleotide, error": int(
        (col_err_mask & (df_v1['VIR'] == 1) & both_unmasked_mask & diff_nt_mask).sum()
    ),
    # “different nucleotide, not error”
    "different nucleotide, not error": int(
        (col_err_mask & (df_v1['VIR'] == 0) & both_unmasked_mask & diff_nt_mask).sum()
    ),
}
col_error_breakdown

In [None]:
# Breakdown of the positions flagged as errors in the Viridian alignment
# (VIR == 1) for the table currently held in df_v1, according to what
# Colman's assembly shows at the same position. The counts are gathered in
# one dict so that the notebook displays all five of them, not only the
# value of the last expression.
vir_err_mask = df_v1['VIR'] == 1
both_unmasked_mask = (df_v1['label_ori'] == 0) & (df_v1['label_mar'] == 0)
same_nt_mask = df_v1['nucleotide_martin'] == df_v1['nucleotide_origin']
diff_nt_mask = df_v1['nucleotide_martin'] != df_v1['nucleotide_origin']

vir_error_breakdown = {
    # “masked”
    "masked": int((vir_err_mask & (df_v1['label_ori'] == 1)).sum()),
    # “same nucleotide, error”
    "same nucleotide, error": int(
        (vir_err_mask & (df_v1['COL'] == 1) & both_unmasked_mask & same_nt_mask).sum()
    ),
    # “same nucleotide, not error”
    "same nucleotide, not error": int(
        (vir_err_mask & (df_v1['COL'] == 0) & both_unmasked_mask & same_nt_mask).sum()
    ),
    # “different nucleotide, error”
    "different nucleotide, error": int(
        (vir_err_mask & (df_v1['COL'] == 1) & both_unmasked_mask & diff_nt_mask).sum()
    ),
    # “different nucleotide, not error”
    "different nucleotide, not error": int(
        (vir_err_mask & (df_v1['COL'] == 0) & both_unmasked_mask & diff_nt_mask).sum()
    ),
}
vir_error_breakdown

In [None]:
import os
import pandas as pd

# ==================Requires modification==================
# folder_path: the folder containing the tab-separated .txt files to which the error-position columns (VIR, COL) have been added.
# ==================Requires modification==================
def process_files(folder_path):
    """Print masking totals and a per-category breakdown of MAPLE errors.

    Each file in ``folder_path`` whose name ends with ``.txt`` is read as a
    tab-separated table. The columns ``label_mar``, ``VIR`` and
    ``nucleotide_martin`` are reported as the Viridian assembly, and
    ``label_ori``, ``COL`` and ``nucleotide_origin`` as the Colman assembly.
    A label equal to 1 is counted as a masked position and a ``VIR``/``COL``
    value equal to 1 as an error identified by MAPLE.

    For the errors of each assembly, the other assembly is classified at the
    same position as: masked; or, where both assemblies are unmasked, same or
    different nucleotide combined with error / not error. All counts are
    summed over the files.

    Args:
        folder_path: Directory containing the ``.txt`` tables.

    Returns:
        None; the report is printed to standard output.
    """
    totals = dict.fromkeys(
        (
            "vir_masked",
            "col_masked",
            "vir_clean",
            "col_clean",
            "vir_error",
            "col_error",
            "vir_err_masked",
            "vir_err_same_error",
            "vir_err_same",
            "vir_err_diff_error",
            "vir_err_diff",
            "col_err_masked",
            "col_err_same_error",
            "col_err_same",
            "col_err_diff_error",
            "col_err_diff",
        ),
        0,
    )

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".txt"):
            continue

        # Missing values (in any column) become 0 before the flags are cast.
        table = pd.read_csv(os.path.join(folder_path, file_name), sep='\t').fillna(0)

        vir_flag = table["VIR"].astype(int)
        col_flag = table["COL"].astype(int)
        vir_error = vir_flag == 1
        vir_ok = vir_flag == 0
        col_error = col_flag == 1
        col_ok = col_flag == 0
        vir_masked = table["label_mar"] == 1
        col_masked = table["label_ori"] == 1

        both_unmasked = (table["label_ori"] == 0) & (table["label_mar"] == 0)
        same_nt = both_unmasked & (
            table["nucleotide_martin"] == table["nucleotide_origin"]
        )
        diff_nt = both_unmasked & (
            table["nucleotide_martin"] != table["nucleotide_origin"]
        )

        per_file = {
            "vir_masked": vir_masked,
            "col_masked": col_masked,
            "vir_clean": (table["label_mar"] == 0) & vir_ok,
            "col_clean": (table["label_ori"] == 0) & col_ok,
            "vir_error": vir_error,
            "col_error": col_error,
            # Viridian errors, classified by Colman's state.
            "vir_err_masked": vir_error & col_masked,
            "vir_err_same_error": vir_error & col_error & same_nt,
            "vir_err_same": vir_error & col_ok & same_nt,
            "vir_err_diff_error": vir_error & col_error & diff_nt,
            "vir_err_diff": vir_error & col_ok & diff_nt,
            # Colman errors, classified by Viridian's state.
            "col_err_masked": col_error & vir_masked,
            "col_err_same_error": col_error & vir_error & same_nt,
            "col_err_same": col_error & vir_ok & same_nt,
            "col_err_diff_error": col_error & vir_error & diff_nt,
            "col_err_diff": col_error & vir_ok & diff_nt,
        }
        for name, mask in per_file.items():
            # Series.sum() counts the True entries in one vectorised pass
            # instead of iterating over the Series in Python.
            totals[name] += int(mask.sum())

    print('====================== Info ======================')
    print('Total masked positions in Viridian assembly: ', totals["vir_masked"])
    print('Total masked positions in Colman assembly: ', totals["col_masked"])

    print('====================== Info ======================')
    print('Total unmasked positions in Viridian assembly, without errors '
          'identified by MAPLE: ', totals["vir_clean"])
    print('Total unmasked positions in Colman assembly, without errors '
          'identified by MAPLE: ', totals["col_clean"])

    print('====================== Info ======================')
    print('Total positions in Viridian assembly identified as errors by MAPLE '
          '(Viridian\'s errors): ', totals["vir_error"])
    print('Total positions in Colman assembly identified as errors by MAPLE '
          '(Colman\'s errors): ', totals["col_error"])

    print('====================== For VIR error ======================')
    print('Colman’s assemblies are masked: ', totals["vir_err_masked"])
    print('Same nucleotide, Colman’s assembly error: ', totals["vir_err_same_error"])
    print('Same nucleotide, Colman’s assembly not error: ', totals["vir_err_same"])
    print('Diff nucleotide, Colman’s assembly error: ', totals["vir_err_diff_error"])
    print('Diff nucleotide, Colman’s assembly not error: ', totals["vir_err_diff"])
    print('====================== For COL error ======================')
    print('Viridian’s assemblies are masked: ', totals["col_err_masked"])
    print('Same nucleotide, Viridian\'s assembly error: ', totals["col_err_same_error"])
    print('Same nucleotide, Viridian\'s assembly not error: ', totals["col_err_same"])
    print('Diff nucleotide, Viridian\'s assembly error: ', totals["col_err_diff_error"])
    print('Diff nucleotide, Viridian\'s assembly not error: ', totals["col_err_diff"])

# Run the per-category breakdown over every .txt table found in this folder.
folder_path = "/nfs/research/goldman/zihao/Datas/p2_compViridian_P3/folderData_addError"
process_files(folder_path)

## Test - Final version

In [54]:
import os
import pandas as pd

def process_files(folder_path):
    """Print mutually exclusive masking and MAPLE-error totals for a folder.

    Each file in ``folder_path`` whose name ends with ``.txt`` is read as a
    tab-separated table. The columns ``label_mar`` and ``VIR`` are reported
    as the Viridian assembly, and ``label_ori`` and ``COL`` as the Colman
    assembly. A label equal to 1 is counted as a masked position and a
    ``VIR``/``COL`` value equal to 1 as an error identified by MAPLE.

    Masked positions and errors are each split into "Viridian only",
    "Colman only" and "both"; the unmasked, error-free positions are counted
    per assembly. All counts are summed over the files.

    Args:
        folder_path: Directory containing the ``.txt`` tables.

    Returns:
        None; the report is printed to standard output.
    """
    totals = dict.fromkeys(
        (
            "vir_only_masked",
            "col_only_masked",
            "both_masked",
            "vir_clean",
            "col_clean",
            "vir_only_error",
            "col_only_error",
            "both_error",
        ),
        0,
    )

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".txt"):
            continue

        # Missing values (in any column) become 0 before the flags are cast.
        table = pd.read_csv(os.path.join(folder_path, file_name), sep='\t').fillna(0)

        vir_flag = table["VIR"].astype(int)
        col_flag = table["COL"].astype(int)
        vir_error = vir_flag == 1
        vir_ok = vir_flag == 0
        col_error = col_flag == 1
        col_ok = col_flag == 0
        vir_masked = table["label_mar"] == 1
        vir_unmasked = table["label_mar"] == 0
        col_masked = table["label_ori"] == 1
        col_unmasked = table["label_ori"] == 0

        per_file = {
            "vir_only_masked": vir_masked & col_unmasked,
            "col_only_masked": vir_unmasked & col_masked,
            "both_masked": vir_masked & col_masked,
            "vir_clean": vir_unmasked & vir_ok,
            "col_clean": col_unmasked & col_ok,
            "vir_only_error": vir_error & col_ok,
            "col_only_error": col_error & vir_ok,
            "both_error": col_error & vir_error,
        }
        for name, mask in per_file.items():
            # Series.sum() counts the True entries in one vectorised pass
            # instead of iterating over the Series in Python.
            totals[name] += int(mask.sum())

    print('====================== Info ======================')
    print('Total masked positions in Viridian assembly (excludes Colman\'s): ', totals["vir_only_masked"])
    print('Total masked positions in Colman assembly (excludes Viridian\'s): ', totals["col_only_masked"])
    print('Total positions masked in both assemblies: ', totals["both_masked"])
    print('====================== Info ======================')
    print('Total unmasked positions in Viridian assembly, without errors '
          'identified by MAPLE: ', totals["vir_clean"])
    print('Total unmasked positions in Colman assembly, without errors '
          'identified by MAPLE: ', totals["col_clean"])
    print('====================== Info ======================')
    print('Total positions in Viridian assembly identified as errors by MAPLE '
          '(excludes Colman\'s errors): ', totals["vir_only_error"])
    print('Total positions in Colman assembly identified as errors by MAPLE '
          '(excludes Viridian\'s errors): ', totals["col_only_error"])
    print('Total positions identified as errors in both assemblies by MAPLE: ',
          totals["both_error"])

# Run the summary over the .txt tables in the folderData_test directory.
folder_path = "/nfs/research/goldman/zihao/Datas/p2_compViridian_P3/folderData_test"
process_files(folder_path)

Total masked positions in Virdian assembly (excludes Colman's):  19070
Total positions in Colman assembly (excludes Virdian's):  168689
Total positions masked in both assemblies:  43755
Total unmasked positions in Virdian assembly, without errors identified by MAPLE:  2927438
Total unmasked positions in Colman assembly, without errors identified by MAPLE:  2777767
Total positions in Virdian assembly identified as errors by MAPLE (excludes Colman's errors):  11
Total positions in Colman assembly identified as errors by MAPLE (excludes Virdian's errors):  63
Total positions identified as errors in both assemblies by MAPLE:  26
