<a href="https://colab.research.google.com/github/NehaSontakk/BATH-Prokka-Comparison/blob/main/BATH_file_deduplication_(Positive_Negative_Strand)_Oct324.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Strand-specific Genomic Data Deduplication Using E-value and Overlap Metrics


A method for filtering and deduplicating genomic data is designed to enhance the accuracy and reliability of sequence alignments by leveraging E-value thresholds and alignment overlap metrics. Initially, an E-value threshold of 0.000001 is applied to exclude low-confidence alignments. Subsequently, the DNA strand (positive or negative) is identified based on alignment positions. For each strand, a series of deduplication steps are performed:

*   100% Deduplication: Exact duplicates are identified and removed by comparing E-values and scores, retaining only the highest quality alignments.
*   70% Deduplication: Alignments with significant overlap (≥70%) are addressed by comparing E-values and sequence lengths, with the more reliable alignment being retained.
*  <70% Deduplication: For alignments with minor overlap (0.01% to <70%), adjustments are made to the alignment positions to resolve conflicts, ensuring non-redundancy of the retained alignments.

Each deduplication step is applied to ensure that the final dataset is both comprehensive and non-redundant, providing high-confidence data for subsequent analyses. This method is applied separately to positive and negative strands, allowing for tailored processing and accurate strand-specific results.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Input file specification

bathsearch_op_path = "/content/drive/MyDrive/Lab Work/Parkinsons_Data/Iteration_Sep30/BATH_Deduplication/BATHOUTPUT/*.tbl"
deduplication_op_path = "/content/drive/MyDrive/Lab Work/Parkinsons_Data/Iteration_Sep30/dedup_combinned_binned_unbinned_sep30.xlsx"

In [3]:
e_value_threshold = 0.000001

In [4]:
import pandas as pd
from glob import glob
import os

In [5]:
class filtering_operations:

  @staticmethod
  def e_value_filtering(df):
    return df.loc[df['E-value']<=e_value_threshold]

  @staticmethod
  def pos_neg_strand_filtering(df_filtered):
    #df should be post e_value_filteration
    df = filtering_operations.e_value_filtering(df_filtered)
    #Strand identification
    df['ali from'] = df['ali from'].astype(int)
    df['ali to'] = df['ali to'].astype(int)
    strand = []

    for index,row in df.iterrows():
      if row['ali from'] < row['ali to']:
        strand.append("+")
      elif row['ali from'] > row['ali to']:
        strand.append("-")

    df['strand'] = strand
    return df

  @staticmethod
  def get_specific_strand(df_raw,strand_info):
    df = filtering_operations.pos_neg_strand_filtering(df_raw)
    return df.loc[df['strand']==strand_info]


In [9]:
class overlap_deduplications:

  ali_to_column = ""
  ali_from_column = ""

  # Initialize variables to hold counts
  initial_count = 0
  count_after_100 = 0
  count_after_70 = 0
  count_after_less_than_70 = 0

  #METHODS FOR 100% DEDUPLICATION

  @staticmethod
  def handle_group(group):
    #Now for each element in group
    min_e_value = group['E-value'].min()
    #find elements with lowest e-values
    group_min_e = group.loc[group['E-value'] == min_e_value]
    #if group contains a single row save it
    if len(group_min_e) == 1:
      return group_min_e
    #if group contains multiple rows with same e-values check min score
    elif len(group_min_e) > 1:
      print("Group contains multiple rows")
      return group_min_e.iloc[0:1]

  @staticmethod
  def deduplicate_full_overlaps(df):
    #100% Overlap Dedup
    overlap_deduplications.initial_count = df.shape[0]
    print("Number of elements in relevant strand: ",df.shape)
    #Find all the duplicates
    duplicates = df.duplicated(subset=['target name',overlap_deduplications.ali_from_column, overlap_deduplications.ali_to_column], keep=False)
    #make a duplicates df
    duplicates_df = df[duplicates]
    #make a non duplicates df
    not_duplicate_df = df[~duplicates]
    print("Number of exact duplicates or homologs: ",duplicates_df.shape)
    #Sort by E-value and score
    duplicates_df1 = duplicates_df.sort_values(by=['E-value', 'score'], ascending=[True, False])
    deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)
    #rejion after deduplication
    dedup_step1 = pd.concat([not_duplicate_df,deduplicated_remhomologs]).sort_values(['target name','ali from','E-value'])
    overlap_deduplications.count_after_100 = dedup_step1.shape[0]
    print("Number of elements after removing exact duplicates or homologs: ",dedup_step1.shape)
    print("Elements removed: ",df.shape[0] - dedup_step1.shape[0])
    return dedup_step1

  #METHODS FOR 70% DEDUPLICATION

  @staticmethod
  def calculate_overlap(hit_a, hit_b):
    #print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
    start_a, end_a = hit_a[overlap_deduplications.ali_from_column], hit_a[overlap_deduplications.ali_to_column]
    start_b, end_b = hit_b[overlap_deduplications.ali_from_column], hit_b[overlap_deduplications.ali_to_column]

    overlap_length = max(0, min(end_a, end_b) - max(start_a, start_b) + 1)
    #print(min(end_a, end_b))
    #print(max(start_a, start_b))
    #print("Overlap length",overlap_length)

    length_a = end_a - start_a + 1
    length_b = end_b - start_b + 1

    #print("Length",length_a,length_b)
    if length_a > 0 and length_b > 0:
        overlap_perc_a = (overlap_length / length_a) * 100
        overlap_perc_b = (overlap_length / length_b) * 100
        return overlap_perc_a, overlap_perc_b
    else:
        return 0, 0

  @staticmethod
  def calculate_winner(hit_a, hit_b):
      overlap1, overlap2 = overlap_deduplications.calculate_overlap(hit_a, hit_b)
      print(f"Comparing:\nHit A: {hit_a}\nHit B: {hit_b}\nOverlap1: {overlap1}%, Overlap2: {overlap2}%")

      # Check for significant overlap
      if overlap1 >= 70 or overlap2 >= 70:
          # Determine which hit to retain based on E-value or sequence length on tie
          if hit_a['E-value'] < hit_b['E-value']:
              print("Choosing Hit A based on E-value")
              return hit_a
          elif hit_a['E-value'] > hit_b['E-value']:
              print("Choosing Hit B based on E-value")
              return hit_b
          else:  # E-values are tied, check the sequence length
              if hit_a['seq len'] > hit_b['seq len']:
                  print("E-values tied. Choosing Hit A based on larger sequence length")
                  return hit_a
              else:
                  print("E-values tied. Choosing Hit B based on larger sequence length")
                  return hit_b
      else:
          print("No significant overlap or both hits retained")
          return hit_a, hit_b

  @staticmethod
  def handle_70_overlaps(group):
    # Check if the DataFrame has 1 or fewer rows
    initial_group_count = len(group)
    if len(group) <= 1:
        return group


    items = []
    #print("HIIII")
    #print(group)

    for index, row in group.iterrows():
      row_tuple = row.to_dict()
      items.append(row_tuple)

    i = 0
    all_winners = []

    while i<len(items)-1:
      current = items[i]
      print(current)
      next_item = items[i+1]
      print(next_item)
      print("Begining Comparison: ",i)
      #print(current,"\n",next_item)
      result = overlap_deduplications.calculate_winner(current, next_item)

      #If result is two items append the current item to winners list since overlap of current item with next item is less than 70% so we want to keep both
      if isinstance(result,tuple):
        if result[0] not in all_winners:
          all_winners.append(result[0])
          #print("Tuple returned!")
          #print("Appending: ",result[0])
          #print("All winners currently: ",all_winners)
        #else:
        #  print("Result already in all winners, no appending!")
        i += 1
      else:
        if result == current:
          #print("Result is ",i," element and not ",i+1," popping: ",items[i+1])
          items.pop(i+1)
        else:
          i += 1

      if i >= len(items) - 1 and items[-1] not in all_winners:
        all_winners.append(items[-1])

      final_group_count = len(pd.DataFrame(all_winners))
      overlap_deduplications.count_after_70 += final_group_count


    return pd.DataFrame(all_winners)

  #METHODS FOR LESS THAN 70% DEDUPLICATION
  @staticmethod
  def calculate_below_70_changes(hit_a, hit_b):
    overlap1, overlap2 = overlap_deduplications.calculate_overlap(hit_a, hit_b)
    #print("Overlap 1: ",overlap1)
    if (0.01 <= overlap1 < 70) and (0.01 <= overlap2 < 70):
      if hit_a['E-value'] < hit_b['E-value']:
        print("Case A: A dominates")
        print(hit_a)
        print(hit_b)
        hit_b['ali from'] = hit_a['ali to'] + 1
        #print(hit_b)
        return hit_a, hit_b
      else:
        print("Case B: B dominates")
        print(hit_a)
        print(hit_b)
        hit_a['ali to'] = hit_b['ali from'] - 1
        #print(hit_a)
        #print(hit_b)
        return hit_a, hit_b
    else:
      return hit_a,hit_b

  @staticmethod
  def below_70_overlap(group):
    initial_group_count = len(group)
    if len(group) <= 1:
        print("Length of group less than 1.")
        return group

    items = []
    for index, row in group.iterrows():
      row_tuple = row.to_dict()
      items.append(row_tuple)

    for i in range(0,len(items)-1):
      element1, element2 = overlap_deduplications.calculate_below_70_changes(items[i],items[i+1])
      items[i] = element1
      items[i+1] = element2

    final_group_count = len(pd.DataFrame(items))
    overlap_deduplications.count_after_less_than_70 += final_group_count
    return pd.DataFrame(items)


  # FINAL METHOD TO CALL
  @staticmethod
  def choose_strand_operations(df):
    if df['strand'].values[0] == '-':
      print("Processing negative strand data...")
      #create the flip columns
      df['ali from flip'] = df['ali to']
      df['ali to flip'] = df['ali from']
      overlap_deduplications.ali_to_column = 'ali to flip'
      overlap_deduplications.ali_from_column = 'ali from flip'
      #100% dedup
      df = df.sort_values(by=['ali from flip','E-value'], ascending=[True, True])
      print(df)
      step1_dedup = overlap_deduplications.deduplicate_full_overlaps(df)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("100% deduplication done.")
      #70% Overlap
      step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("70% deduplication done.")
      #Less than 70% overlap
      step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
      print("Less than 70% deduplication done.")
      #return something final
      return step3_dedup
    elif df['strand'].values[0] == '+':
      print("Processing positive strand data...")
      overlap_deduplications.ali_to_column = 'ali to'
      overlap_deduplications.ali_from_column = 'ali from'
      #100% dedup
      df = df.sort_values(by=['ali from','E-value'], ascending=[True, True])
      print(df)
      step1_dedup = overlap_deduplications.deduplicate_full_overlaps(df)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("100% deduplication done.")
      #70% Overlap
      step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("70% deduplication done.")
      #Less than 70% overlap
      print(step2_dedup)
      step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
      print("Less than 70% deduplication done.")
      #return something final
      print(step3_dedup)
      return step3_dedup




In [10]:
dataframes = []
col_names = [
    'target name', 'accession', 'query name', 'accession1', 'hmm len',
    'hmm from', 'hmm to', 'seq len', 'ali from', 'ali to',
    'env from', 'env to', 'E-value', 'score', 'bias', 'shifts',
    'stops', 'pipe', 'description of target', 'extra'
]
for file_path in glob(bathsearch_op_path):
    # Check if the file is not empty by checking its size
    if os.path.getsize(file_path) == 0:
        print(f"Skipping completely empty file: {file_path}")
        continue

    try:
        # Read the table
        bathout = pd.read_table(file_path, sep="\s+", header=None, skiprows=2, skipfooter=8, engine='python')
    except pd.errors.EmptyDataError:
        print(f"No data to parse from file (possibly headers only): {file_path}")
        continue

    # Check if the DataFrame is empty after reading
    if bathout.empty:
        print(f"Skipping empty file with headers but no data: {file_path}")
        continue

    # Rename the columns
    bathout.rename(columns=dict(zip(bathout.columns, col_names)), inplace=True)

    # Append the DataFrame to the list
    dataframes.append(bathout)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

In [11]:
df_neg = filtering_operations.get_specific_strand(combined_df,"-")
neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)
df_pos = filtering_operations.get_specific_strand(combined_df,"+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)
complete_bath_deduplication = pd.concat([pos_deduped,neg_deduped])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = strand


Processing negative strand data...
                                       target name accession  query name  \
75019   NODE_23970_length_2341_cov_2.082240_bin.21         -    MF_00205   
74122   NODE_17259_length_2428_cov_2.027813_bin.10         -    MF_00096   
63630    NODE_8878_length_2197_cov_1.924370_bin.46         -      A0QQC8   
80755   NODE_21234_length_2613_cov_2.062158_bin.31         -  MF_01210_B   
82490  NODE_12356_length_3284_cov_2.077733_bin.107         -    MF_01543   
...                                            ...       ...         ...   
79181    NODE_9145_length_4125_cov_2.076167_bin.54         -    MF_00958   
55630    NODE_9145_length_4125_cov_2.076167_bin.54         -      Q9ZEP4   
54942    NODE_9145_length_4125_cov_2.076167_bin.54         -      A7N6S2   
49013    NODE_9145_length_4125_cov_2.076167_bin.54         -      Q9HV27   
74218    NODE_9145_length_4125_cov_2.076167_bin.54         -    MF_00099   

      accession1  hmm len  hmm from  hmm to  seq len

  deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'target name': 'NODE_9125_length_2270_cov_1.988262_bin.19', 'accession': '-', 'query name': 'NG_048278.1', 'accession1': '-', 'hmm len': 651, 'hmm from': 3, 'hmm to': 53, 'seq len': 2270, 'ali from': 155, 'ali to': 3, 'env from': 161, 'env to': 3, 'E-value': 1.2e-09, 'score': 49.8, 'bias': 0.1, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 3, 'ali to flip': 155}
{'target name': 'NODE_9125_length_2270_cov_1.988262_bin.19', 'accession': '-', 'query name': 'NG_048319.1', 'accession1': '-', 'hmm len': 652, 'hmm from': 2, 'hmm to': 54, 'seq len': 2270, 'ali from': 161, 'ali to': 3, 'env from': 164, 'env to': 3, 'E-value': 6.8e-09, 'score': 47.6, 'bias': 0.0, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 3, 'ali to flip': 161}
Begining Comparison:  0
Comparing:
Hit A: {'target name': 'NODE_9125_length_2270_cov_1.9882

  step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)


Overlap calculations done on columns: ali from flip ali to flip
70% deduplication done.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of group less than 1.
Length of gro

  step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)


Less than 70% deduplication done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = strand


Processing positive strand data...
                                       target name accession   query name  \
70862    NODE_8893_length_2192_cov_2.069724_bin.48         -       P37661   
81801     NODE_9823_length_2570_cov_2.078330_bin.8         -     MF_01451   
75546   NODE_21797_length_2090_cov_1.907617_bin.42         -   MF_00253_B   
80821  NODE_27377_length_2090_cov_2.053071_bin.122         -     MF_01221   
85829   NODE_10154_length_2050_cov_2.062657_bin.35         -     MF_03134   
...                                            ...       ...          ...   
23137   NODE_10034_length_3867_cov_1.920776_bin.10         -  NG_056048.1   
34981   NODE_10034_length_3867_cov_1.920776_bin.10         -       O65934   
84146    NODE_6955_length_4358_cov_1.961887_bin.23         -     MF_01965   
45029   NODE_22471_length_3905_cov_2.028571_bin.32         -       P48982   
57312    NODE_8215_length_3976_cov_2.010712_bin.18         -       P54450   

      accession1  hmm len  hmm from  hmm

  deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'target name': 'NODE_9310_length_3514_cov_2.085285_bin.3', 'accession': '-', 'query name': 'P0AAG0', 'accession1': '-', 'hmm len': 327, 'hmm from': 79, 'hmm to': 315, 'seq len': 3514, 'ali from': 9, 'ali to': 716, 'env from': 1, 'env to': 745, 'E-value': 9.5e-57, 'score': 209.5, 'bias': 0.0, 'shifts': 0, 'stops': 0, 'pipe': 'fs', 'description of target': '-', 'strand': '+'}
{'target name': 'NODE_9310_length_3514_cov_2.085285_bin.3', 'accession': '-', 'query name': 'P16676', 'accession1': '-', 'hmm len': 365, 'hmm from': 70, 'hmm to': 249, 'seq len': 3514, 'ali from': 27, 'ali to': 587, 'env from': 5, 'env to': 647, 'E-value': 2.8e-10, 'score': 56.6, 'bias': 0.0, 'shifts': 0, 'stops': 0, 'pipe': 'fs', 'description of target': '-', 'strand': '+'}
Begining Comparison:  2
Comparing:
Hit A: {'target name': 'NODE_9310_length_3514_cov_2.085285_bin.3', 'accession': '-', 'query name': 'P0AAG0', 'accession1': '-', 'hmm len': 327, 

  step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)


Overlap calculations done on columns: ali from ali to
70% deduplication done.
                                     target name accession query name  \
0     NODE_10022_length_2530_cov_2.013737_bin.68         -     P77211   
1     NODE_10022_length_2530_cov_2.013737_bin.68         -   MF_00387   
2     NODE_10034_length_3867_cov_1.920776_bin.10         -   MF_00847   
3     NODE_10056_length_2262_cov_1.996375_bin.40         -   MF_00685   
4     NODE_10068_length_2520_cov_2.095740_bin.42         -   MF_00705   
...                                          ...       ...        ...   
2222   NODE_9997_length_2187_cov_1.946529_bin.12         -   MF_00819   
2223   NODE_9997_length_2187_cov_1.946529_bin.12         -   MF_00819   
2224   NODE_9997_length_2187_cov_1.946529_bin.12         -   MF_00015   
2225   NODE_9998_length_2081_cov_2.067127_bin.40         -   MF_01395   
2226   NODE_9998_length_2081_cov_2.067127_bin.40         -   MF_00823   

     accession1  hmm len  hmm from  hmm to  s

  step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)


In [12]:
complete_bath_deduplication.shape

(4811, 22)

In [13]:
complete_bath_deduplication.to_excel(deduplication_op_path)

In [19]:
@staticmethod
def get_statistics():
    return {
        "Initial Count": overlap_deduplications.initial_count,
        "Count After 100% Deduplication": overlap_deduplications.count_after_100,
        "Count After 70% Deduplication": overlap_deduplications.count_after_70,
        "Count After <70% Deduplication": overlap_deduplications.count_after_less_than_70
    }

stats = get_statistics()
print(stats)

{'Initial Count': 23257, 'Count After 100% Deduplication': 15982, 'Count After 70% Deduplication': 17367, 'Count After <70% Deduplication': 3079}


### Single File Test

In [None]:
bathout = pd.read_table('/content/test_case.tbl', sep="\s+",skiprows =1, header=None)
bathout
# Define column names
col_names = ['target name', 'accession', 'query name', 'accession1', 'hmm len', 'hmm from', 'hmm to',
              'seq len', 'ali from', 'ali to', 'env from', 'env to', 'E-value', 'score', 'bias',
              'shifts', 'stops', 'pipe', 'description of target', 'extra']

# Rename columns
bathout.rename(columns=dict(zip(bathout.columns, col_names)), inplace=True)

# Process negative strand
#df_neg = filtering_operations.get_specific_strand(bathout, "-")
#neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)

# Process positive strand
df_pos = filtering_operations.get_specific_strand(bathout, "+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)


In [None]:
pos_deduped