<a href="https://colab.research.google.com/github/NehaSontakk/BATH-Prokka-Comparison/blob/main/BATH_file_deduplication_(Positive_Negative)_Oct724.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Strand-specific Genomic Data Deduplication Using E-value and Overlap Metrics


A method for filtering and deduplicating genomic data is designed to enhance the accuracy and reliability of sequence alignments by leveraging E-value thresholds and alignment overlap metrics. Initially, an E-value threshold of 0.000001 is applied to exclude low-confidence alignments. Subsequently, the DNA strand (positive or negative) is identified based on alignment positions. For each strand, a series of deduplication steps are performed:

*   100% Deduplication: Exact duplicates are identified and removed by comparing E-values and scores, retaining only the highest quality alignments.
*   70% Deduplication: Alignments with significant overlap (≥70%) are addressed by comparing E-values and sequence lengths, with the more reliable alignment being retained.
*  <70% Deduplication: For alignments with minor overlap (0.01% to <70%), adjustments are made to the alignment positions to resolve conflicts, ensuring non-redundancy of the retained alignments.

Each deduplication step is applied to ensure that the final dataset is both comprehensive and non-redundant, providing high-confidence data for subsequent analyses. This method is applied separately to positive and negative strands, allowing for tailored processing and accurate strand-specific results.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Input file specification

bathsearch_op_path = "/content/drive/MyDrive/Lab Work/Parkinsons_Data/Iteration_Sep30/BATH_Deduplication/BATHOUTPUT/*.tbl"
deduplication_op_path = "/content/drive/MyDrive/Lab Work/Parkinsons_Data/Iteration_Sep30/dedup_combinned_binned_unbinned_sep30.xlsx"

In [4]:
e_value_threshold = 0.000001

In [5]:
import pandas as pd
from glob import glob
import os

In [6]:
class filtering_operations:

  @staticmethod
  def e_value_filtering(df):
    return df.loc[df['E-value']<=e_value_threshold]

  @staticmethod
  def pos_neg_strand_filtering(df_filtered):
    #df should be post e_value_filteration
    df = filtering_operations.e_value_filtering(df_filtered)
    #Strand identification
    df['ali from'] = df['ali from'].astype(int)
    df['ali to'] = df['ali to'].astype(int)
    strand = []

    for index,row in df.iterrows():
      if row['ali from'] < row['ali to']:
        strand.append("+")
      elif row['ali from'] > row['ali to']:
        strand.append("-")

    df['strand'] = strand
    return df

  @staticmethod
  def get_specific_strand(df_raw,strand_info):
    df = filtering_operations.pos_neg_strand_filtering(df_raw)
    return df.loc[df['strand']==strand_info]


In [7]:
class overlap_deduplications:

  ali_to_column = ""
  ali_from_column = ""

  # Initialize variables to hold counts
  initial_count = 0
  count_after_100 = 0
  count_after_70 = 0
  count_after_less_than_70 = 0

  #METHODS FOR 100% DEDUPLICATION

  @staticmethod
  def handle_group(group):
    #Now for each element in group
    min_e_value = group['E-value'].min()
    #find elements with lowest e-values
    group_min_e = group.loc[group['E-value'] == min_e_value]
    #if group contains a single row save it
    if len(group_min_e) == 1:
      return group_min_e
    #if group contains multiple rows with same e-values check min score
    elif len(group_min_e) > 1:
      print("Group contains multiple rows")
      return group_min_e.iloc[0:1]

  @staticmethod
  def deduplicate_full_overlaps(df):
    #100% Overlap Dedup
    overlap_deduplications.initial_count = df.shape[0]
    print("Number of elements in relevant strand: ",df.shape)
    #Find all the duplicates
    duplicates = df.duplicated(subset=['target name',overlap_deduplications.ali_from_column, overlap_deduplications.ali_to_column], keep=False)
    #make a duplicates df
    duplicates_df = df[duplicates]
    #make a non duplicates df
    not_duplicate_df = df[~duplicates]
    print("Number of exact duplicates or homologs: ",duplicates_df.shape)
    #Sort by E-value and score
    duplicates_df1 = duplicates_df.sort_values(by=['E-value', 'score'], ascending=[True, False])
    deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)
    #rejion after deduplication
    dedup_step1 = pd.concat([not_duplicate_df,deduplicated_remhomologs]).sort_values(['target name','ali from','E-value'])
    overlap_deduplications.count_after_100 = dedup_step1.shape[0]
    print("Number of elements after removing exact duplicates or homologs: ",dedup_step1.shape)
    print("Elements removed: ",df.shape[0] - dedup_step1.shape[0])
    return dedup_step1

  #METHODS FOR 70% DEDUPLICATION

  @staticmethod
  def calculate_overlap(hit_a, hit_b):
    #print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
    start_a, end_a = hit_a[overlap_deduplications.ali_from_column], hit_a[overlap_deduplications.ali_to_column]
    start_b, end_b = hit_b[overlap_deduplications.ali_from_column], hit_b[overlap_deduplications.ali_to_column]

    overlap_length = max(0, min(end_a, end_b) - max(start_a, start_b) + 1)
    #print(min(end_a, end_b))
    #print(max(start_a, start_b))
    #print("Overlap length",overlap_length)

    length_a = end_a - start_a + 1
    length_b = end_b - start_b + 1

    #print("Length",length_a,length_b)
    if length_a > 0 and length_b > 0:
        overlap_perc_a = (overlap_length / length_a) * 100
        overlap_perc_b = (overlap_length / length_b) * 100
        return overlap_perc_a, overlap_perc_b
    else:
        return 0, 0

  @staticmethod
  def calculate_winner(hit_a, hit_b):
      overlap1, overlap2 = overlap_deduplications.calculate_overlap(hit_a, hit_b)
      print(f"Comparing:\nHit A: {hit_a}\nHit B: {hit_b}\nOverlap1: {overlap1}%, Overlap2: {overlap2}%")

      # Check for significant overlap
      if overlap1 >= 70 or overlap2 >= 70:
          # Determine which hit to retain based on E-value or sequence length on tie
          if hit_a['E-value'] < hit_b['E-value']:
              print("Choosing Hit A based on E-value")
              return hit_a
          elif hit_a['E-value'] > hit_b['E-value']:
              print("Choosing Hit B based on E-value")
              return hit_b
          else:  # E-values are tied, check the sequence length
              if hit_a['seq len'] > hit_b['seq len']:
                  print("E-values tied. Choosing Hit A based on larger sequence length")
                  return hit_a
              else:
                  print("E-values tied. Choosing Hit B based on larger sequence length")
                  return hit_b
      else:
          print("No significant overlap or both hits retained")
          return hit_a, hit_b

  @staticmethod
  def handle_70_overlaps(group):
    # Check if the DataFrame has 1 or fewer rows
    initial_group_count = len(group)
    if len(group) <= 1:
        return group

    items = []
    print("HIIII")
    print(group)

    for index, row in group.iterrows():
      row_tuple = row.to_dict()
      items.append(row_tuple)

    i = 0
    all_winners = []

    while i<len(items)-1:
      current = items[i]
      print(current)
      next_item = items[i+1]
      print(next_item)
      print("Begining Comparison: ",i)
      #print(current,"\n",next_item)
      result = overlap_deduplications.calculate_winner(current, next_item)

      #If result is two items append the current item to winners list since overlap of current item with next item is less than 70% so we want to keep both
      if isinstance(result,tuple):
        if result[0] not in all_winners:
          all_winners.append(result[0])
          #print("Tuple returned!")
          #print("Appending: ",result[0])
          #print("All winners currently: ",all_winners)
        #else:
        #  print("Result already in all winners, no appending!")
        i += 1
      else:
        if result == current:
          #print("Result is ",i," element and not ",i+1," popping: ",items[i+1])
          items.pop(i+1)
        else:
          i += 1

      if i >= len(items) - 1 and items[-1] not in all_winners:
        all_winners.append(items[-1])

      final_group_count = len(pd.DataFrame(all_winners))
      overlap_deduplications.count_after_70 += final_group_count

    return pd.DataFrame(all_winners)

  #METHODS FOR LESS THAN 70% DEDUPLICATION
  @staticmethod
  def calculate_below_70_changes(hit_a, hit_b):
    overlap1, overlap2 = overlap_deduplications.calculate_overlap(hit_a, hit_b)
    #print("Overlap 1: ",overlap1)
    if (0.01 <= overlap1 < 70) and (0.01 <= overlap2 < 70):
      if hit_a['E-value'] < hit_b['E-value']:
        print("Case A: A dominates")
        print(hit_a)
        print(hit_b)
        hit_b['ali from'] = hit_a['ali to'] + 1
        #print(hit_b)
        return hit_a, hit_b
      else:
        print("Case B: B dominates")
        print(hit_a)
        print(hit_b)
        hit_a['ali to'] = hit_b['ali from'] - 1
        #print(hit_a)
        #print(hit_b)
        return hit_a, hit_b
    else:
      return hit_a,hit_b

  @staticmethod
  def below_70_overlap(group):
    initial_group_count = len(group)
    if len(group) <= 1:
        print("Length of group less than 1.")
        return group

    items = []
    for index, row in group.iterrows():
      row_tuple = row.to_dict()
      items.append(row_tuple)

    for i in range(0,len(items)-1):
      element1, element2 = overlap_deduplications.calculate_below_70_changes(items[i],items[i+1])
      items[i] = element1
      items[i+1] = element2

    final_group_count = len(pd.DataFrame(items))
    overlap_deduplications.count_after_less_than_70 += final_group_count
    return pd.DataFrame(items)


  # FINAL METHOD TO CALL
  @staticmethod
  def choose_strand_operations(df):
    if df['strand'].values[0] == '-':
      print("Processing negative strand data...")
      #create the flip columns
      df['ali from flip'] = df['ali to']
      df['ali to flip'] = df['ali from']
      overlap_deduplications.ali_to_column = 'ali to flip'
      overlap_deduplications.ali_from_column = 'ali from flip'
      #100% dedup
      df = df.sort_values(by=['ali from flip','E-value'], ascending=[True, True])
      #print(df)
      step1_dedup = overlap_deduplications.deduplicate_full_overlaps(df)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("100% deduplication done.")
      #70% Overlap
      step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("70% deduplication done.")
      #Less than 70% overlap
      step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
      print("Less than 70% deduplication done.")
      #return something final
      return step3_dedup
    elif df['strand'].values[0] == '+':
      print("Processing positive strand data...")
      overlap_deduplications.ali_to_column = 'ali to'
      overlap_deduplications.ali_from_column = 'ali from'
      #100% dedup
      df = df.sort_values(by=['ali from','E-value'], ascending=[True, True])
      #print(df)
      step1_dedup = overlap_deduplications.deduplicate_full_overlaps(df)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("100% deduplication done.")
      #70% Overlap
      step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("70% deduplication done.")
      #Less than 70% overlap
      print(step2_dedup)
      step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
      print("Less than 70% deduplication done.")
      #return something final
      print(step3_dedup)
      return step3_dedup




In [8]:
dataframes = []
col_names = [
    'target name', 'accession', 'query name', 'accession1', 'hmm len',
    'hmm from', 'hmm to', 'seq len', 'ali from', 'ali to',
    'env from', 'env to', 'E-value', 'score', 'bias', 'shifts',
    'stops', 'pipe', 'description of target', 'extra'
]
for file_path in glob(bathsearch_op_path):
    # Check if the file is not empty by checking its size
    if os.path.getsize(file_path) == 0:
        print(f"Skipping completely empty file: {file_path}")
        continue

    try:
        # Read the table
        bathout = pd.read_table(file_path, sep="\s+", header=None, skiprows=2, skipfooter=8, engine='python')
    except pd.errors.EmptyDataError:
        print(f"No data to parse from file (possibly headers only): {file_path}")
        continue

    # Check if the DataFrame is empty after reading
    if bathout.empty:
        print(f"Skipping empty file with headers but no data: {file_path}")
        continue

    # Rename the columns
    bathout.rename(columns=dict(zip(bathout.columns, col_names)), inplace=True)

    # Append the DataFrame to the list
    dataframes.append(bathout)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

In [9]:
df_neg = filtering_operations.get_specific_strand(combined_df,"-")
neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)
df_pos = filtering_operations.get_specific_strand(combined_df,"+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)
complete_bath_deduplication = pd.concat([pos_deduped,neg_deduped])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = strand


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group c

  deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Overlap1: 100.0%, Overlap2: 69.0537084398977%
Choosing Hit B based on E-value
{'target name': 'NODE_9716_length_3377_cov_2.582480_bin.9', 'accession': '-', 'query name': 'Q976E4', 'accession1': '-', 'hmm len': 455, 'hmm from': 58, 'hmm to': 455, 'seq len': 3377, 'ali from': 3364, 'ali to': 2192, 'env from': 3373, 'env to': 2192, 'E-value': 3.3e-18, 'score': 80.3, 'bias': 0.4, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 2192, 'ali to flip': 3364}
{'target name': 'NODE_9716_length_3377_cov_2.582480_bin.9', 'accession': '-', 'query name': 'Q68BJ6', 'accession1': '-', 'hmm len': 456, 'hmm from': 60, 'hmm to': 455, 'seq len': 3377, 'ali from': 3370, 'ali to': 2186, 'env from': 3376, 'env to': 2186, 'E-value': 3e-34, 'score': 133.1, 'bias': 0.3, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 2186, 'ali to flip': 3370

  step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)


Overlap calculations done on columns: ali from flip ali to flip
70% deduplication done.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Case A: A dominates
{'target name': 'NODE_1036_length_15001_cov_4.179580_bin.5', 'accession': '-', 'query name': 'MF_02019', 'accession1': '-', 'hmm len': 460, 'hmm from': 286, 'hmm to': 458, 'seq len': 15001, 'ali from': 10744, 'ali to': 10232,

  step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)


Less than 70% deduplication done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = strand


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group contains multiple rows
Group c

  deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'target name': 'NODE_969_length_24535_cov_4.143423_bin.31', 'accession': '-', 'query name': 'Q58206', 'accession1': '-', 'hmm len': 235, 'hmm from': 1, 'hmm to': 224, 'seq len': 24535, 'ali from': 20895, 'ali to': 21551, 'env from': 20895, 'env to': 21578, 'E-value': 1.1e-54, 'score': 201.4, 'bias': 0.1, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '+'}
{'target name': 'NODE_969_length_24535_cov_4.143423_bin.31', 'accession': '-', 'query name': 'P0AAF6', 'accession1': '-', 'hmm len': 242, 'hmm from': 15, 'hmm to': 235, 'seq len': 24535, 'ali from': 20946, 'ali to': 21596, 'env from': 20898, 'env to': 21611, 'E-value': 1.5e-33, 'score': 131.5, 'bias': 0.0, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '+'}
Begining Comparison:  17
Comparing:
Hit A: {'target name': 'NODE_969_length_24535_cov_4.143423_bin.31', 'accession': '-', 'query name': 'Q58206', 'acc

  step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)


Overlap calculations done on columns: ali from ali to
70% deduplication done.
                                      target name accession query name  \
0      NODE_10005_length_3878_cov_2.511117_bin.33         -   MF_01952   
1      NODE_10021_length_4245_cov_2.538425_bin.94         -     P0A9J8   
2      NODE_10022_length_2530_cov_2.013737_bin.68         -   MF_00387   
3      NODE_10034_length_3867_cov_1.920776_bin.10         -   MF_00847   
4       NODE_10038_length_3272_cov_2.444514_bin.3         -     Q58505   
...                                           ...       ...        ...   
10891   NODE_9997_length_2187_cov_1.946529_bin.12         -   MF_00819   
10892   NODE_9997_length_2187_cov_1.946529_bin.12         -   MF_00819   
10893   NODE_9997_length_2187_cov_1.946529_bin.12         -   MF_00015   
10894   NODE_9998_length_2081_cov_2.067127_bin.40         -   MF_01395   
10895   NODE_9998_length_2081_cov_2.067127_bin.40         -   MF_00823   

      accession1  hmm len  hmm fr

  step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)


In [10]:
complete_bath_deduplication.shape

(22004, 22)

In [23]:
#mark with bin interval
coverage_intervals = {
    "1.4-1.6": (1.4, 1.6),
    "1.9-2.1": (1.9, 2.1),
    "2.4-2.6": (2.4, 2.6),
    "3.8-4.2": (3.8, 4.2),
    "7.5-8.5": (7.5, 8.5)
}


def get_coverage_interval(coverage):
    for interval, (low, high) in coverage_intervals.items():
        if low <= coverage <= high:
            return interval
    return "Outside defined intervals"


def categorize_row(row):
    parts = row['target name'].split('_')
    coverage = float(parts[5])  # Assuming the format is always the same

    # Check for bin status
    if "bin" == row['target name'].split("_")[-1]:
        bin_status = "binned"
    elif "unbinned" == row['target name'].split("_")[-1]:
        bin_status = "unbinned"
    else:
        bin_status = "unknown"

    interval = get_coverage_interval(coverage)

    return pd.Series([interval, bin_status])

complete_bath_deduplication[['coverage interval', 'bin status']] = complete_bath_deduplication.apply(categorize_row, axis=1)


In [24]:
complete_bath_deduplication[complete_bath_deduplication['target name'].str.contains("unbinned")]

Unnamed: 0,target name,accession,query name,accession1,hmm len,hmm from,hmm to,seq len,ali from,ali to,...,bias,shifts,stops,pipe,description of target,strand,ali from flip,ali to flip,coverage interval,bin status
2699,NODE_18550_length_1000_cov_3.887831_unbinned,-,P96710,-,464,59,172,1000,14,391,...,1.7,0,0,std,-,+,,,3.8-4.2,unbinned
2700,NODE_18550_length_1000_cov_3.887831_unbinned,-,P96710,-,464,177,344,1000,458,943,...,1.8,0,0,std,-,+,,,3.8-4.2,unbinned
2701,NODE_18559_length_1000_cov_1.919577_unbinned,-,MF_01140,-,311,11,266,1000,126,935,...,5.8,0,0,std,-,+,,,1.9-2.1,unbinned
2752,NODE_18852_length_1000_cov_3.845503_unbinned,-,MF_02055,-,289,144,284,1000,3,431,...,0.4,0,0,std,-,+,,,3.8-4.2,unbinned
2753,NODE_18852_length_1000_cov_3.845503_unbinned,-,ISAur1,-,449,88,191,1000,598,921,...,0.0,0,0,std,-,+,,,3.8-4.2,unbinned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10980,NODE_95368_length_1000_cov_1.924868_unbinned,-,MF_01110,-,314,1,248,1000,748,8,...,0.0,0,0,std,-,-,8.0,748.0,1.9-2.1,unbinned
10981,NODE_95369_length_1000_cov_1.921693_unbinned,-,MF_01251,-,615,7,70,1000,196,5,...,0.0,0,0,std,-,-,5.0,196.0,1.9-2.1,unbinned
10982,NODE_95388_length_1000_cov_1.559788_unbinned,-,MF_01314,-,506,313,414,1000,343,38,...,0.0,0,0,fs,-,-,38.0,343.0,1.4-1.6,unbinned
10983,NODE_95391_length_1000_cov_1.476190_unbinned,-,MF_01281,-,437,174,431,1000,993,220,...,0.0,0,0,std,-,-,220.0,993.0,1.4-1.6,unbinned


In [25]:
complete_bath_deduplication.to_excel(deduplication_op_path)

In [13]:
@staticmethod
def get_statistics():
    return {
        "Initial Count": overlap_deduplications.initial_count,
        "Count After 100% Deduplication": overlap_deduplications.count_after_100,
        "Count After 70% Deduplication": overlap_deduplications.count_after_70,
        "Count After <70% Deduplication": overlap_deduplications.count_after_less_than_70
    }

stats = get_statistics()
print(stats)

{'Initial Count': 148588, 'Count After 100% Deduplication': 62438, 'Count After 70% Deduplication': 1037566, 'Count After <70% Deduplication': 16161}


### Single File Test

In [14]:
bathout = pd.read_table('/content/test_case.tbl', sep="\s+",skiprows =1, header=None)
bathout
# Define column names
col_names = ['target name', 'accession', 'query name', 'accession1', 'hmm len', 'hmm from', 'hmm to',
              'seq len', 'ali from', 'ali to', 'env from', 'env to', 'E-value', 'score', 'bias',
              'shifts', 'stops', 'pipe', 'description of target', 'extra']

# Rename columns
bathout.rename(columns=dict(zip(bathout.columns, col_names)), inplace=True)

# Process negative strand
#df_neg = filtering_operations.get_specific_strand(bathout, "-")
#neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)

# Process positive strand
df_pos = filtering_operations.get_specific_strand(bathout, "+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)


FileNotFoundError: [Errno 2] No such file or directory: '/content/test_case.tbl'

In [None]:
pos_deduped