In [2]:
def parse_contig_number(header):
    """
    Extract the contig serial number from an NCBI WGS-style FASTA header.

    The accession is the first whitespace-delimited token of the header. Its
    ".version" suffix and any underscore-separated prefix such as "NZ_" are
    dropped; what remains is read as
    <letters project prefix><2-digit assembly version><contig serial>.

    Examples:
        ">NZ_JAEPPV010000006.1" -> 6
        ">AWFD01000005.1 some description" -> 5

    Parameters:
    - header: FASTA header line, with or without the leading ">".

    Returns:
    - The contig serial number as an int.

    Raises:
    - ValueError: if the header does not contain an accession of that shape.
    """
    fields = header.strip().lstrip(">").split()
    if not fields:
        raise ValueError(f"FASTA header has no accession: {header!r}")

    # "NZ_JAEPPV010000006.1" -> "JAEPPV010000006"
    accession = fields[0].split(".")[0].split("_")[-1]

    # Measure the alphabetic project prefix instead of assuming six letters,
    # so 4-letter (AWFD...) and 6-letter (JAEPPV...) projects both work.
    prefix_len = 0
    while prefix_len < len(accession) and accession[prefix_len].isalpha():
        prefix_len += 1
    digits = accession[prefix_len:]

    # Need the two version digits plus at least one serial digit.
    if prefix_len == 0 or len(digits) < 3 or not digits.isdigit():
        raise ValueError(f"Cannot parse contig number from header: {header!r}")

    # Skip the 2-digit assembly version; the rest is the contig serial.
    return int(digits[2:])

def process_fna_file(input_fna, concatenated_fna, sorted_concatenated_fna):
    """
    Merge every contig of a multi-FASTA file into single-record FASTA files.

    Two files are written, each holding one header line followed by the whole
    sequence on a single, unwrapped line:

    - concatenated_fna: contigs joined in the order they appear in input_fna.
    - sorted_concatenated_fna: contigs joined in ascending order of
      parse_contig_number(header).

    Parameters:
    - input_fna: Path to the input multi-FASTA file.
    - concatenated_fna: Output path for the file-order concatenation.
    - sorted_concatenated_fna: Output path for the number-sorted concatenation.

    Contigs are joined with no separator between them. Text before the first
    header is ignored. If the same header line occurs more than once, the
    sequence read last replaces the earlier one.
    """
    def _write_single_record(path, header_line, sequence):
        # One header line, then the full sequence on one line.
        with open(path, "w") as out:
            out.write(header_line)
            out.write(sequence + "\n")

    records = {}  # header line -> joined sequence, in first-seen order
    active_header = None
    pieces = []

    with open(input_fna, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith(">"):
                pieces.append(text)
                continue
            # A new header closes the record collected so far.
            if active_header:
                records[active_header] = ''.join(pieces)
            active_header, pieces = text, []

    # Flush the record that was still open at end of file.
    if active_header:
        records[active_header] = ''.join(pieces)

    # File-order concatenation.
    _write_single_record(
        concatenated_fna,
        ">concatenated_sequence\n",
        ''.join(records.values()),
    )

    # Concatenation ordered by contig number.
    ordered_headers = sorted(records, key=parse_contig_number)
    _write_single_record(
        sorted_concatenated_fna,
        ">sorted_concatenated_sequence\n",
        ''.join(records[name] for name in ordered_headers),
    )

# Example usage: write the file-order and the number-sorted concatenation of
# the contigs in 10.fna. These names become notebook-level globals.
input_fna = "10.fna"  # Replace with your input file name
concatenated_fna = "concatenated_contigs.fna"  # contigs joined in file order
sorted_concatenated_fna = "sorted_concatenated_contigs.fna"  # contigs joined by contig number

process_fna_file(input_fna, concatenated_fna, sorted_concatenated_fna)


In [4]:
def extract_first_n_contigs(input_fna, output_fna, n=5):
    """
    Copy the leading n records of a FASTA file into a new file.

    Lines are copied verbatim until the (n+1)-th header line (a line starting
    with ">") is reached; that header and everything after it are left out.

    Parameters:
    - input_fna: Path to the input .fna file.
    - output_fna: Path to the output .fna file (created or overwritten).
    - n: Number of contigs to keep (default is 5).
    """
    def _lines_until_limit(handle):
        # Yield lines until one more header than allowed shows up.
        headers_seen = 0
        for text in handle:
            if text.startswith(">"):
                headers_seen += 1
                if headers_seen > n:
                    return
            yield text

    with open(input_fna, "r") as source, open(output_fna, "w") as sink:
        sink.writelines(_lines_until_limit(source))

# Example usage: copy the first five contigs of 38.fna into 5.fna.
# NOTE: this rebinds the notebook-level input_fna set by the earlier
# process_fna_file example cell.
input_fna = "38.fna"  # Replace with your actual input file
output_fna = "5.fna"

extract_first_n_contigs(input_fna, output_fna, n=5)



In [7]:
import pandas as pd

# Names for the 16 tab-separated fields of the BLAST tables loaded below.
# NOTE(review): a plain `-outfmt 6` run emits 12 columns, not 16; these files
# were presumably produced with a custom field list - confirm against the
# blastn command that generated them.
column_names = [
    "query_id", "subject_id", "perc_identity", "alignment_length",
    "mismatches", "gaps", "q_start", "q_end", "s_start", "s_end",
    "evalue", "bit_score",
    "qcov", "qcovhsp", "qlen", "slen",
]

# One BLAST result file per subject layout. Replace with your own files.
blast_output_file1 = "output_5_to_10.blast"
blast_output_file2 = "output_5_to_concat10.blast"
blast_output_file3 = "output_5_to_sorted_concat10.blast"

# The files carry no header row, so the names above are supplied explicitly.
default_df, concat_df, sorted_concat_df = (
    pd.read_csv(path, sep="\t", header=None, names=column_names)
    for path in (blast_output_file1, blast_output_file2, blast_output_file3)
)

# Display the table (pandas abbreviates the middle rows).
default_df


Unnamed: 0,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,AWFD01000001,NZ_JAEPPV010000003.1,75.690,11493,2460,334,394896,406227,27515,38834,0.000000e+00,5437.0,6,2,646567,460968
1,AWFD01000001,NZ_JAEPPV010000003.1,85.645,2271,288,38,372787,375043,15346,17592,0.000000e+00,2353.0,6,0,646567,460968
2,AWFD01000001,NZ_JAEPPV010000003.1,80.080,2505,476,23,557989,560480,351254,353748,0.000000e+00,1840.0,6,0,646567,460968
3,AWFD01000001,NZ_JAEPPV010000003.1,74.885,3699,819,110,433985,437629,61383,65025,0.000000e+00,1583.0,6,1,646567,460968
4,AWFD01000001,NZ_JAEPPV010000003.1,77.516,2415,523,20,440547,442951,69678,72082,0.000000e+00,1434.0,6,0,646567,460968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,AWFD01000005,NZ_JAEPPV010000001.1,77.025,605,120,19,42712,43309,571136,570544,2.010000e-88,329.0,27,1,117204,1120828
205,AWFD01000005,NZ_JAEPPV010000001.1,74.571,641,157,6,39189,39826,575763,575126,2.660000e-72,276.0,27,1,117204,1120828
206,AWFD01000005,NZ_JAEPPV010000001.1,80.460,348,43,25,2444,2769,612714,612370,2.700000e-62,243.0,27,0,117204,1120828
207,AWFD01000005,NZ_JAEPPV010000001.1,100.000,78,0,0,114083,114160,508082,508005,7.830000e-33,145.0,27,0,117204,1120828


In [8]:
# Hits loaded from output_5_to_concat10.blast.
concat_df

Unnamed: 0,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,AWFD01000001,concatenated_sequence,75.692,11494,2458,336,394896,406227,488111,499430,0.000000e+00,5437.0,30,2,646567,3327926
1,AWFD01000001,concatenated_sequence,74.691,8973,2046,225,151122,159977,94669,103533,0.000000e+00,3781.0,30,1,646567,3327926
2,AWFD01000001,concatenated_sequence,82.442,3292,510,68,45310,48571,176321,179574,0.000000e+00,2815.0,30,1,646567,3327926
3,AWFD01000001,concatenated_sequence,80.768,3567,612,74,140848,144384,86886,90408,0.000000e+00,2719.0,30,1,646567,3327926
4,AWFD01000001,concatenated_sequence,80.213,3376,581,87,86964,90299,218117,221445,0.000000e+00,2453.0,30,1,646567,3327926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,AWFD01000005,concatenated_sequence,77.025,605,120,19,42712,43309,1903186,1902594,2.010000e-88,329.0,27,1,117204,3327926
205,AWFD01000005,concatenated_sequence,74.571,641,157,6,39189,39826,1907813,1907176,2.660000e-72,276.0,27,1,117204,3327926
206,AWFD01000005,concatenated_sequence,80.460,348,43,25,2444,2769,1944764,1944420,2.700000e-62,243.0,27,0,117204,3327926
207,AWFD01000005,concatenated_sequence,100.000,78,0,0,114083,114160,1840132,1840055,7.830000e-33,145.0,27,0,117204,3327926


In [9]:
# Hits loaded from output_5_to_sorted_concat10.blast.
sorted_concat_df

Unnamed: 0,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,AWFD01000001,sorted_concatenated_sequence,75.690,11493,2460,334,394896,406227,1787348,1798667,0.000000e+00,5437.0,30,2,646567,3327926
1,AWFD01000001,sorted_concatenated_sequence,74.691,8973,2046,225,151122,159977,3132916,3141780,0.000000e+00,3781.0,30,1,646567,3327926
2,AWFD01000001,sorted_concatenated_sequence,82.442,3292,510,68,45310,48571,2753972,2757225,0.000000e+00,2815.0,30,1,646567,3327926
3,AWFD01000001,sorted_concatenated_sequence,80.768,3567,612,74,140848,144384,3125133,3128655,0.000000e+00,2719.0,30,1,646567,3327926
4,AWFD01000001,sorted_concatenated_sequence,80.213,3376,581,87,86964,90299,2795768,2799096,0.000000e+00,2453.0,30,1,646567,3327926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,AWFD01000005,sorted_concatenated_sequence,77.025,605,120,19,42712,43309,571136,570544,2.010000e-88,329.0,27,1,117204,3327926
205,AWFD01000005,sorted_concatenated_sequence,74.571,641,157,6,39189,39826,575763,575126,2.660000e-72,276.0,27,1,117204,3327926
206,AWFD01000005,sorted_concatenated_sequence,80.460,348,43,25,2444,2769,612714,612370,2.700000e-62,243.0,27,0,117204,3327926
207,AWFD01000005,sorted_concatenated_sequence,100.000,78,0,0,114083,114160,508082,508005,7.830000e-33,145.0,27,0,117204,3327926


In [20]:
# Rank hits best-first by identity, then by alignment length. reset_index()
# (without drop=True) keeps each hit's pre-sort row number in an "index" column.
default_df_sorted = (
    default_df
    .sort_values(by=["perc_identity", "alignment_length"], ascending=False)
    .reset_index()
)

In [21]:
# Rank hits best-first by identity, then by alignment length. reset_index()
# (without drop=True) keeps each hit's pre-sort row number in an "index" column.
concat_df_sorted = (
    concat_df
    .sort_values(by=["perc_identity", "alignment_length"], ascending=False)
    .reset_index()
)

In [22]:
# Rank hits best-first by identity, then by alignment length. reset_index()
# (without drop=True) keeps each hit's pre-sort row number in an "index" column.
concat_sorted_df_sorted = (
    sorted_concat_df
    .sort_values(by=["perc_identity", "alignment_length"], ascending=False)
    .reset_index()
)

In [23]:
# Compare, rank by rank, the pre-sort row numbers kept in the "index" column.
default_df_sorted['index'].equals(concat_df_sorted['index'])

False

In [24]:
# Element-wise inequality of the two ranked tables; True marks a differing cell.
# NOTE(review): rows are paired purely by rank position, so this assumes both
# sorts place the same hit at each position - confirm, since hits tied on
# (perc_identity, alignment_length) could be ordered differently.
mask = default_df_sorted != concat_df_sorted

In [39]:
# Show the difference mask (True = the cell differs between the two tables).
mask

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,True
1,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
2,False,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
3,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
4,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
205,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
206,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True
207,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True


In [26]:
# Ranked hits from output_5_to_10.blast.
default_df_sorted

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,207,AWFD01000005,NZ_JAEPPV010000001.1,100.000,78,0,0,114083,114160,508082,508005,7.830000e-33,145.0,27,0,117204,1120828
1,78,AWFD01000001,NZ_JAEPPV010000007.1,98.795,83,0,1,590225,590306,59361,59443,1.200000e-32,147.0,2,0,646567,125019
2,142,AWFD01000002,NZ_JAEPPV010000003.1,98.039,51,1,0,1,51,460887,460837,6.100000e-16,89.8,1,0,191994,460968
3,182,AWFD01000004,NZ_JAEPPV010000004.1,94.505,91,5,0,59501,59591,171942,172032,1.190000e-31,141.0,12,0,137817,239583
4,137,AWFD01000002,NZ_JAEPPV010000005.1,93.443,122,8,0,122392,122513,218470,218591,9.780000e-44,182.0,21,0,191994,236043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,55,AWFD01000001,NZ_JAEPPV010000006.1,72.333,900,217,32,24603,25486,39638,40521,6.880000e-65,254.0,7,0,646567,216801
205,132,AWFD01000002,NZ_JAEPPV010000005.1,71.229,1505,379,54,122807,124284,218893,220370,9.170000e-89,331.0,21,1,191994,236043
206,114,AWFD01000001,NZ_JAEPPV010000001.1,71.128,523,118,33,477021,477530,273001,273503,3.410000e-18,99.0,8,0,646567,1120828
207,56,AWFD01000001,NZ_JAEPPV010000006.1,70.676,1006,230,65,93256,94242,105620,106579,1.190000e-37,163.0,7,0,646567,216801


In [27]:
# Ranked hits from output_5_to_concat10.blast.
concat_df_sorted

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,207,AWFD01000005,concatenated_sequence,100.000,78,0,0,114083,114160,1840132,1840055,7.830000e-33,145.0,27,0,117204,3327926
1,110,AWFD01000001,concatenated_sequence,98.795,83,0,1,590225,590306,394938,395020,1.200000e-32,147.0,30,0,646567,3327926
2,142,AWFD01000002,concatenated_sequence,98.039,51,1,0,1,51,921483,921433,6.100000e-16,89.8,26,0,191994,3327926
3,186,AWFD01000004,concatenated_sequence,94.505,91,5,0,59501,59591,1175939,1176029,1.190000e-31,141.0,21,0,137817,3327926
4,140,AWFD01000002,concatenated_sequence,93.443,122,8,0,122392,122513,3310353,3310474,9.780000e-44,182.0,26,0,191994,3327926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,103,AWFD01000001,concatenated_sequence,72.333,900,217,32,24603,25486,158414,159297,6.880000e-65,254.0,30,0,646567,3327926
205,135,AWFD01000002,concatenated_sequence,71.229,1505,379,54,122807,124284,3310776,3312253,9.170000e-89,331.0,26,1,191994,3327926
206,115,AWFD01000001,concatenated_sequence,71.128,523,118,33,477021,477530,1605051,1605553,3.410000e-18,99.0,30,0,646567,3327926
207,108,AWFD01000001,concatenated_sequence,70.676,1006,230,65,93256,94242,224396,225355,1.190000e-37,163.0,30,0,646567,3327926


In [28]:
# Do the alignment lengths agree at every rank position?
default_df_sorted['alignment_length'].equals(concat_df_sorted['alignment_length'])

False

In [29]:
# Keep only the rank positions whose alignment_length differs.
mask[mask['alignment_length']==True]

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
82,True,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True
149,False,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True


In [31]:
# Hit at rank 82 in default_df_sorted (labels equal rank positions after reset_index()).
default_df_sorted.loc[82]

index                                 83
query_id                    AWFD01000001
subject_id          NZ_JAEPPV010000001.1
perc_identity                     79.899
alignment_length                    2378
mismatches                           423
gaps                                  55
q_start                           204143
q_end                             206487
s_start                            14036
s_end                              16391
evalue                               0.0
bit_score                         1692.0
qcov                                   8
qcovhsp                                0
qlen                              646567
slen                             1120828
Name: 82, dtype: object

In [32]:
# Hit at rank 82 in concat_df_sorted, for side-by-side comparison.
concat_df_sorted.loc[82]

index                                  15
query_id                     AWFD01000001
subject_id          concatenated_sequence
perc_identity                      79.882
alignment_length                     2376
mismatches                            427
gaps                                   51
q_start                            204143
q_end                              206487
s_start                           1346086
s_end                             1348441
evalue                                0.0
bit_score                          1692.0
qcov                                   30
qcovhsp                                 0
qlen                               646567
slen                              3327926
Name: 82, dtype: object

In [33]:
# Hit at rank 149 in default_df_sorted.
default_df_sorted.loc[149]

index                                  0
query_id                    AWFD01000001
subject_id          NZ_JAEPPV010000003.1
perc_identity                      75.69
alignment_length                   11493
mismatches                          2460
gaps                                 334
q_start                           394896
q_end                             406227
s_start                            27515
s_end                              38834
evalue                               0.0
bit_score                         5437.0
qcov                                   6
qcovhsp                                2
qlen                              646567
slen                              460968
Name: 149, dtype: object

In [35]:
# Hit at rank 149 in concat_df_sorted, for side-by-side comparison.
concat_df_sorted.loc[149]

index                                   0
query_id                     AWFD01000001
subject_id          concatenated_sequence
perc_identity                      75.692
alignment_length                    11494
mismatches                           2458
gaps                                  336
q_start                            394896
q_end                              406227
s_start                            488111
s_end                              499430
evalue                                0.0
bit_score                          5437.0
qcov                                   30
qcovhsp                                 2
qlen                               646567
slen                              3327926
Name: 149, dtype: object

In [40]:
# Keep only the rank positions whose perc_identity differs.
mask[mask['perc_identity']==True]

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
82,True,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True
149,False,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True


In [41]:
# Keep only the rank positions whose mismatches count differs.
mask[mask['mismatches']==True]

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
82,True,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True
149,False,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True


In [42]:
# Keep only the rank positions whose gaps count differs.
mask[mask['gaps']==True]

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
82,True,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True
149,False,False,True,True,True,True,True,False,False,True,True,False,False,True,False,False,True


In [43]:
# Element-wise inequality between default_df_sorted and concat_sorted_df_sorted.
# NOTE(review): rows are paired by rank position only - confirm both tables
# hold the same hit at each position before reading too much into the mask.
mask2 = default_df_sorted != concat_sorted_df_sorted

In [45]:
# Keep only the rank positions whose alignment_length differs.
mask2[mask2['alignment_length']==True]


Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
82,True,False,True,True,True,True,True,False,False,False,False,False,False,True,False,False,True


In [46]:
# Hit at rank 82 in concat_sorted_df_sorted.
concat_sorted_df_sorted.loc[82]

index                                         15
query_id                            AWFD01000001
subject_id          sorted_concatenated_sequence
perc_identity                             79.882
alignment_length                            2376
mismatches                                   427
gaps                                          51
q_start                                   204143
q_end                                     206487
s_start                                    14036
s_end                                      16391
evalue                                       0.0
bit_score                                 1692.0
qcov                                          30
qcovhsp                                        0
qlen                                      646567
slen                                     3327926
Name: 82, dtype: object

In [47]:
# Hit at rank 149 in concat_sorted_df_sorted.
concat_sorted_df_sorted.loc[149]

index                                          0
query_id                            AWFD01000001
subject_id          sorted_concatenated_sequence
perc_identity                              75.69
alignment_length                           11493
mismatches                                  2460
gaps                                         334
q_start                                   394896
q_end                                     406227
s_start                                  1787348
s_end                                    1798667
evalue                                       0.0
bit_score                                 5437.0
qcov                                          30
qcovhsp                                        2
qlen                                      646567
slen                                     3327926
Name: 149, dtype: object

In [71]:
def concatenate_fasta(input_fasta, output_fasta, output_header="concat_sequence", line_width=80):
    """
    Concatenate all contigs from a FASTA file into a single sequence and write to a new FASTA file.

    Header lines of the input are dropped, and the remaining lines are stripped
    of surrounding whitespace and joined with no separator. The result is
    written as one record wrapped at line_width characters per line.

    Parameters:
    - input_fasta: Path to the input FASTA file containing multiple contigs.
    - output_fasta: Path to the output FASTA file where the concatenated sequence will be saved.
      May be the same path as input_fasta; the input is read completely first.
    - output_header: Header for the concatenated sequence in the output file.
    - line_width: Number of sequence characters per output line (default is 80).

    Raises:
    - ValueError: if line_width is not positive.
    """
    if line_width <= 0:
        raise ValueError("line_width must be a positive integer")

    # Read everything before the output is opened: mode 'w' truncates on open,
    # which would wipe the data first when output_fasta == input_fasta.
    with open(input_fasta, 'r') as infile:
        # Join once instead of growing a string with repeated +=.
        sequence = "".join(
            line.strip() for line in infile if not line.startswith(">")
        )

    wrapped = "\n".join(
        sequence[i:i + line_width] for i in range(0, len(sequence), line_width)
    )

    # Write the concatenated sequence to the output file
    with open(output_fasta, 'w') as outfile:
        outfile.write(f">{output_header}\n")
        outfile.write(wrapped + '\n')

# Example usage: merge the contigs of 5.fna into one record written to
# concatenated_sequence.fna, using the default header "concat_sequence".
input_fasta = "5.fna"  # Replace with your actual input file path
output_fasta = "concatenated_sequence.fna"

concatenate_fasta(input_fasta, output_fasta)


In [72]:
import pandas as pd

# Names for the 16 tab-separated fields of the BLAST table loaded below.
# NOTE(review): a plain `-outfmt 6` run emits 12 columns, not 16; this file was
# presumably produced with a custom field list - confirm against the blastn command.
column_names = [
    "query_id", "subject_id", "perc_identity", "alignment_length", "mismatches",
    "gaps", "q_start", "q_end", "s_start", "s_end", "evalue", "bit_score", "qcov", "qcovhsp", "qlen", "slen"
]

# Read the BLAST output file into a pandas DataFrame
blast_output_file = "all_new.blast"
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of
# the kernel session. It is kept because the sorting cell refers to it.
all = pd.read_csv(blast_output_file, sep="\t", header=None, names=column_names)

# Display the table (pandas abbreviates the middle rows).
all

Unnamed: 0,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,concat_sequence,concatenated_sequence,75.692,11494,2458,336,394896,406227,488111,499430,0.000000e+00,5437.0,28,1,1240583,3327926
1,concat_sequence,concatenated_sequence,82.279,5186,873,46,994229,999392,2701780,2696619,0.000000e+00,4444.0,28,0,1240583,3327926
2,concat_sequence,concatenated_sequence,81.186,5108,887,74,749899,754961,3290594,3295672,0.000000e+00,4041.0,28,0,1240583,3327926
3,concat_sequence,concatenated_sequence,74.691,8973,2046,225,151122,159977,94669,103533,0.000000e+00,3781.0,28,1,1240583,3327926
4,concat_sequence,concatenated_sequence,78.603,4767,935,85,659559,664286,3106571,3111291,0.000000e+00,3073.0,28,0,1240583,3327926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,concat_sequence,concatenated_sequence,83.471,121,16,4,568589,568707,390401,390519,3.020000e-21,110.0,28,0,1240583,3327926
205,concat_sequence,concatenated_sequence,89.873,79,5,3,920303,920378,741961,742039,6.550000e-18,99.0,28,0,1240583,3327926
206,concat_sequence,concatenated_sequence,71.128,523,118,33,477021,477530,1605051,1605553,6.550000e-18,99.0,28,0,1240583,3327926
207,concat_sequence,concatenated_sequence,98.039,51,1,0,646568,646618,921483,921433,3.940000e-15,89.8,28,0,1240583,3327926


In [73]:
# Rank hits best-first by identity, then by alignment length. reset_index()
# (without drop=True) keeps each hit's pre-sort row number in an "index" column.
all_sorted = (
    all
    .sort_values(by=["perc_identity", "alignment_length"], ascending=False)
    .reset_index()
)

In [74]:
# Element-wise inequality between default_df_sorted and all_sorted.
# NOTE: this rebinds mask2, replacing the mask built against concat_sorted_df_sorted.
# NOTE(review): rows are paired by rank position only - confirm both tables
# hold the same hit at each position.
mask2 = default_df_sorted != all_sorted

In [75]:
# Ranked hits from output_5_to_10.blast, shown again for comparison with all_sorted.
default_df_sorted

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,207,AWFD01000005,NZ_JAEPPV010000001.1,100.000,78,0,0,114083,114160,508082,508005,7.830000e-33,145.0,27,0,117204,1120828
1,78,AWFD01000001,NZ_JAEPPV010000007.1,98.795,83,0,1,590225,590306,59361,59443,1.200000e-32,147.0,2,0,646567,125019
2,142,AWFD01000002,NZ_JAEPPV010000003.1,98.039,51,1,0,1,51,460887,460837,6.100000e-16,89.8,1,0,191994,460968
3,182,AWFD01000004,NZ_JAEPPV010000004.1,94.505,91,5,0,59501,59591,171942,172032,1.190000e-31,141.0,12,0,137817,239583
4,137,AWFD01000002,NZ_JAEPPV010000005.1,93.443,122,8,0,122392,122513,218470,218591,9.780000e-44,182.0,21,0,191994,236043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,55,AWFD01000001,NZ_JAEPPV010000006.1,72.333,900,217,32,24603,25486,39638,40521,6.880000e-65,254.0,7,0,646567,216801
205,132,AWFD01000002,NZ_JAEPPV010000005.1,71.229,1505,379,54,122807,124284,218893,220370,9.170000e-89,331.0,21,1,191994,236043
206,114,AWFD01000001,NZ_JAEPPV010000001.1,71.128,523,118,33,477021,477530,273001,273503,3.410000e-18,99.0,8,0,646567,1120828
207,56,AWFD01000001,NZ_JAEPPV010000006.1,70.676,1006,230,65,93256,94242,105620,106579,1.190000e-37,163.0,7,0,646567,216801


In [76]:
# Ranked hits from all_new.blast.
all_sorted

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,196,concat_sequence,concatenated_sequence,100.000,78,0,0,1237462,1237539,1840132,1840055,8.290000e-32,145.0,28,0,1240583,3327926
1,195,concat_sequence,concatenated_sequence,98.795,83,0,1,590225,590306,394938,395020,2.310000e-32,147.0,28,0,1240583,3327926
2,207,concat_sequence,concatenated_sequence,98.039,51,1,0,646568,646618,921483,921433,3.940000e-15,89.8,28,0,1240583,3327926
3,198,concat_sequence,concatenated_sequence,94.505,91,5,0,1045063,1045153,1175939,1176029,1.070000e-30,141.0,28,0,1240583,3327926
4,188,concat_sequence,concatenated_sequence,93.443,122,8,0,768959,769080,3310353,3310474,6.320000e-43,182.0,28,0,1240583,3327926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,176,concat_sequence,concatenated_sequence,72.333,900,217,32,24603,25486,158414,159297,1.320000e-64,254.0,28,0,1240583,3327926
205,169,concat_sequence,concatenated_sequence,71.229,1505,379,54,769374,770851,3310776,3312253,5.930000e-88,331.0,28,0,1240583,3327926
206,206,concat_sequence,concatenated_sequence,71.128,523,118,33,477021,477530,1605051,1605553,6.550000e-18,99.0,28,0,1240583,3327926
207,191,concat_sequence,concatenated_sequence,70.676,1006,230,65,93256,94242,224396,225355,2.290000e-37,163.0,28,0,1240583,3327926


In [77]:
# Show the difference mask between default_df_sorted and all_sorted.
mask2

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
0,True,True,True,False,False,False,False,True,True,True,True,True,False,True,False,True,True
1,True,True,True,False,False,False,False,False,False,True,True,True,False,True,False,True,True
2,True,True,True,False,False,False,False,True,True,True,True,True,False,True,False,True,True
3,True,True,True,False,False,False,False,True,True,True,True,True,False,True,False,True,True
4,True,True,True,False,False,False,False,True,True,True,True,True,False,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,True,True,True,False,False,False,False,False,False,True,True,True,False,True,False,True,True
205,True,True,True,False,False,False,False,True,True,True,True,True,False,True,True,True,True
206,True,True,True,False,False,False,False,False,False,True,True,True,False,True,False,True,True
207,True,True,True,False,False,False,False,False,False,True,True,True,False,True,False,True,True


In [78]:
# Keep only the rank positions whose alignment_length differs.
mask2[mask2['alignment_length']==True]

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
65,True,True,True,True,True,True,True,True,True,True,True,False,False,True,True,True,True
82,True,True,True,True,True,True,True,False,False,True,True,False,False,True,False,True,True
149,False,True,True,True,True,True,True,False,False,True,True,False,False,True,True,True,True


In [79]:
# Same alignment_length filter as the previous cell (duplicate cell).
mask2[mask2['alignment_length']==True]

Unnamed: 0,index,query_id,subject_id,perc_identity,alignment_length,mismatches,gaps,q_start,q_end,s_start,s_end,evalue,bit_score,qcov,qcovhsp,qlen,slen
65,True,True,True,True,True,True,True,True,True,True,True,False,False,True,True,True,True
82,True,True,True,True,True,True,True,False,False,True,True,False,False,True,False,True,True
149,False,True,True,True,True,True,True,False,False,True,True,False,False,True,True,True,True


In [81]:
# Hit at rank 65 in default_df_sorted.
default_df_sorted.loc[65]

index                                116
query_id                    AWFD01000002
subject_id          NZ_JAEPPV010000005.1
perc_identity                     81.179
alignment_length                    5106
mismatches                           891
gaps                                  70
q_start                           103332
q_end                             108394
s_start                           198711
s_end                             203789
evalue                               0.0
bit_score                         4041.0
qcov                                  21
qcovhsp                                3
qlen                              191994
slen                              236043
Name: 65, dtype: object

In [82]:
# Hit at rank 65 in all_sorted, for side-by-side comparison.
all_sorted.loc[65]

index                                   2
query_id                  concat_sequence
subject_id          concatenated_sequence
perc_identity                      81.186
alignment_length                     5108
mismatches                            887
gaps                                   74
q_start                            749899
q_end                              754961
s_start                           3290594
s_end                             3295672
evalue                                0.0
bit_score                          4041.0
qcov                                   28
qcovhsp                                 0
qlen                              1240583
slen                              3327926
Name: 65, dtype: object

In [83]:
# Hit at rank 82 in default_df_sorted.
default_df_sorted.loc[82]

index                                 83
query_id                    AWFD01000001
subject_id          NZ_JAEPPV010000001.1
perc_identity                     79.899
alignment_length                    2378
mismatches                           423
gaps                                  55
q_start                           204143
q_end                             206487
s_start                            14036
s_end                              16391
evalue                               0.0
bit_score                         1692.0
qcov                                   8
qcovhsp                                0
qlen                              646567
slen                             1120828
Name: 82, dtype: object

In [84]:
# Hit at rank 82 in all_sorted, for side-by-side comparison.
all_sorted.loc[82]

index                                  31
query_id                  concat_sequence
subject_id          concatenated_sequence
perc_identity                      79.882
alignment_length                     2376
mismatches                            427
gaps                                   51
q_start                            204143
q_end                              206487
s_start                           1346086
s_end                             1348441
evalue                                0.0
bit_score                          1692.0
qcov                                   28
qcovhsp                                 0
qlen                              1240583
slen                              3327926
Name: 82, dtype: object

In [85]:
# Hit at rank 149 in default_df_sorted.
default_df_sorted.loc[149]

index                                  0
query_id                    AWFD01000001
subject_id          NZ_JAEPPV010000003.1
perc_identity                      75.69
alignment_length                   11493
mismatches                          2460
gaps                                 334
q_start                           394896
q_end                             406227
s_start                            27515
s_end                              38834
evalue                               0.0
bit_score                         5437.0
qcov                                   6
qcovhsp                                2
qlen                              646567
slen                              460968
Name: 149, dtype: object

In [86]:
# Hit at rank 149 in all_sorted, for side-by-side comparison.
all_sorted.loc[149]

index                                   0
query_id                  concat_sequence
subject_id          concatenated_sequence
perc_identity                      75.692
alignment_length                    11494
mismatches                           2458
gaps                                  336
q_start                            394896
q_end                              406227
s_start                            488111
s_end                              499430
evalue                                0.0
bit_score                          5437.0
qcov                                   28
qcovhsp                                 1
qlen                              1240583
slen                              3327926
Name: 149, dtype: object