<a href="https://colab.research.google.com/github/NehaSontakk/BATH-Prokka-Comparison/blob/main/BATH_file_deduplication_(Positive_Negative).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd

In [38]:
e_value_threshold = 0.000001

In [39]:
class filtering_operations:

  @staticmethod
  def e_value_filtering(df):
    return df.loc[df['E-value']<=e_value_threshold]

  @staticmethod
  def pos_neg_strand_filtering(df_filtered):
    #df should be post e_value_filteration
    df = filtering_operations.e_value_filtering(df_filtered)
    #Strand identification
    df['ali from'] = df['ali from'].astype(int)
    df['ali to'] = df['ali to'].astype(int)
    strand = []

    for index,row in df.iterrows():
      if row['ali from'] < row['ali to']:
        strand.append("+")
      elif row['ali from'] > row['ali to']:
        strand.append("-")

    df['strand'] = strand
    return df

  @staticmethod
  def get_specific_strand(df_raw,strand_info):
    df = filtering_operations.pos_neg_strand_filtering(df_raw)
    return df.loc[df['strand']==strand_info]


In [40]:
class overlap_deduplications:

  ali_to_column = ""
  ali_from_column = ""

  #METHODS FOR 100% DEDUPLICATION

  @staticmethod
  def handle_group(group):
    #Now for each element in group
    min_e_value = group['E-value'].min()
    #find elements with lowest e-values
    group_min_e = group.loc[group['E-value'] == min_e_value]
    #if group contains a single row save it
    if len(group_min_e) == 1:
      return group_min_e
    #if group contains multiple rows with same e-values check min score
    elif len(group_min_e) > 1:
      print("Group contains multiple rows")
      return group_min_e.iloc[0:1]

  @staticmethod
  def deduplicate_full_overlaps(df):
    #100% Overlap Dedup
    print("Number of elements in relevant strand: ",df.shape)
    #Find all the duplicates
    duplicates = df.duplicated(subset=['target name',overlap_deduplications.ali_from_column, overlap_deduplications.ali_to_column], keep=False)
    #make a duplicates df
    duplicates_df = df[duplicates]
    #make a non duplicates df
    not_duplicate_df = df[~duplicates]
    print("Number of exact duplicates or homologs: ",duplicates_df.shape)
    #Sort by E-value and score
    duplicates_df1 = duplicates_df.sort_values(by=['E-value', 'score'], ascending=[True, False])
    deduplicated_remhomologs = duplicates_df1.groupby(['target name',overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column], group_keys=False).apply(overlap_deduplications.handle_group).reset_index(drop=True)
    #rejion after deduplication
    dedup_step1 = pd.concat([not_duplicate_df,deduplicated_remhomologs]).sort_values(['target name','ali from','E-value'])
    print("Number of elements after removing exact duplicates or homologs: ",dedup_step1.shape)
    print("Elements removed: ",df.shape[0] - dedup_step1.shape[0])
    return dedup_step1

  #METHODS FOR 70% DEDUPLICATION

  @staticmethod
  def calculate_overlap(hit_a, hit_b):
    #print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
    start_a, end_a = hit_a[overlap_deduplications.ali_from_column], hit_a[overlap_deduplications.ali_to_column]
    start_b, end_b = hit_b[overlap_deduplications.ali_from_column], hit_b[overlap_deduplications.ali_to_column]

    overlap_length = max(0, min(end_a, end_b) - max(start_a, start_b) + 1)
    #print(min(end_a, end_b))
    #print(max(start_a, start_b))
    #print("Overlap length",overlap_length)

    length_a = end_a - start_a + 1
    length_b = end_b - start_b + 1

    #print("Length",length_a,length_b)
    if length_a > 0 and length_b > 0:
        overlap_perc_a = (overlap_length / length_a) * 100
        overlap_perc_b = (overlap_length / length_b) * 100
        return overlap_perc_a, overlap_perc_b
    else:
        return 0, 0

  @staticmethod
  def calculate_winner(hit_a, hit_b):
      overlap1, overlap2 = overlap_deduplications.calculate_overlap(hit_a, hit_b)
      print(f"Comparing:\nHit A: {hit_a}\nHit B: {hit_b}\nOverlap1: {overlap1}%, Overlap2: {overlap2}%")

      # Check for significant overlap
      if overlap1 >= 70 or overlap2 >= 70:
          # Determine which hit to retain based on E-value or sequence length on tie
          if hit_a['E-value'] < hit_b['E-value']:
              print("Choosing Hit A based on E-value")
              return hit_a
          elif hit_a['E-value'] > hit_b['E-value']:
              print("Choosing Hit B based on E-value")
              return hit_b
          else:  # E-values are tied, check the sequence length
              if hit_a['seq len'] > hit_b['seq len']:
                  print("E-values tied. Choosing Hit A based on larger sequence length")
                  return hit_a
              else:
                  print("E-values tied. Choosing Hit B based on larger sequence length")
                  return hit_b
      else:
          print("No significant overlap or both hits retained")
          return hit_a, hit_b

  @staticmethod
  def handle_70_overlaps(group):
    # Check if the DataFrame has 1 or fewer rows
    if len(group) <= 1:
        return group


    items = []
    print("HIIII")
    print(group)

    for index, row in group.iterrows():
      row_tuple = row.to_dict()
      items.append(row_tuple)

    i = 0
    all_winners = []

    while i<len(items)-1:
      current = items[i]
      print(current)
      next_item = items[i+1]
      print(next_item)
      print("Begining Comparison: ",i)
      #print(current,"\n",next_item)
      result = overlap_deduplications.calculate_winner(current, next_item)

      #If result is two items append the current item to winners list since overlap of current item with next item is less than 70% so we want to keep both
      if isinstance(result,tuple):
        if result[0] not in all_winners:
          all_winners.append(result[0])
          #print("Tuple returned!")
          #print("Appending: ",result[0])
          #print("All winners currently: ",all_winners)
        #else:
        #  print("Result already in all winners, no appending!")
        i += 1
      else:
        if result == current:
          #print("Result is ",i," element and not ",i+1," popping: ",items[i+1])
          items.pop(i+1)
        else:
          i += 1

      if i >= len(items) - 1 and items[-1] not in all_winners:
        all_winners.append(items[-1])


    return pd.DataFrame(all_winners)

  #METHODS FOR LESS THAN 70% DEDUPLICATION
  @staticmethod
  def calculate_below_70_changes(hit_a, hit_b):
    overlap1, overlap2 = overlap_deduplications.calculate_overlap(hit_a, hit_b)
    #print("Overlap 1: ",overlap1)
    if (0.01 <= overlap1 < 70) and (0.01 <= overlap2 < 70):
      if hit_a['E-value'] < hit_b['E-value']:
        print("Case A: A dominates")
        print(hit_a)
        print(hit_b)
        hit_b['ali from'] = hit_a['ali to'] + 1
        #print(hit_b)
        return hit_a, hit_b
      else:
        print("Case B: B dominates")
        print(hit_a)
        print(hit_b)
        hit_a['ali to'] = hit_b['ali from'] - 1
        #print(hit_a)
        #print(hit_b)
        return hit_a, hit_b
    else:
      return hit_a,hit_b

  @staticmethod
  def below_70_overlap(group):
    items = []
    for index, row in group.iterrows():
      row_tuple = row.to_dict()
      items.append(row_tuple)

    for i in range(0,len(items)-1):
      element1, element2 = overlap_deduplications.calculate_below_70_changes(items[i],items[i+1])
      items[i] = element1
      items[i+1] = element2

    return pd.DataFrame(items)

  # FINAL METHOD TO CALL
  @staticmethod
  def choose_strand_operations(df):
    if df['strand'].values[0] == '-':
      print("Processing negative strand data...")
      #create the flip columns
      df['ali from flip'] = df['ali to']
      df['ali to flip'] = df['ali from']
      overlap_deduplications.ali_to_column = 'ali to flip'
      overlap_deduplications.ali_from_column = 'ali from flip'
      #100% dedup
      df = df.sort_values(by=['ali from flip','E-value'], ascending=[True, True])
      print(df)
      step1_dedup = overlap_deduplications.deduplicate_full_overlaps(df)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("100% deduplication done.")
      #70% Overlap
      step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("70% deduplication done.")
      #Less than 70% overlap
      step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
      print("Less than 70% deduplication done.")
      #return something final
      return step3_dedup
    elif df['strand'].values[0] == '+':
      print("Processing positive strand data...")
      overlap_deduplications.ali_to_column = 'ali to'
      overlap_deduplications.ali_from_column = 'ali from'
      #100% dedup
      df = df.sort_values(by=['ali from','E-value'], ascending=[True, True])
      print(df)
      step1_dedup = overlap_deduplications.deduplicate_full_overlaps(df)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("100% deduplication done.")
      #70% Overlap
      step2_dedup = step1_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.handle_70_overlaps).reset_index(drop=True)
      print("Overlap calculations done on columns:",overlap_deduplications.ali_from_column,overlap_deduplications.ali_to_column)
      print("70% deduplication done.")
      #Less than 70% overlap
      print(step2_dedup)
      step3_dedup = step2_dedup.groupby(['target name'], group_keys=False).apply(overlap_deduplications.below_70_overlap).reset_index(drop=True)
      print("Less than 70% deduplication done.")
      #return something final
      return step3_dedup




In [22]:
#Do it for all BATH files we have
from glob import glob
#SPECIFY LOCATION
for i in glob("/content/drive/MyDrive/Lab Work/Parkinsons_Data/Deduplication Testing 31Jul2024/bathoutprededup/*.tbl"):
  bathout = pd.read_table("/content/drive/MyDrive/Lab Work/Parkinsons_Data/Bath_Output/DNA_Bacteria_kingdom_sprot.tbl",sep="\s+",header=None,skiprows=2,skipfooter=8)
  col_names = ['target name', 'accession', 'query name', 'accession1', 'hmm len', 'hmm from', 'hmm to', 'seq len', 'ali from', 'ali to', 'env from', 'env to', 'E-value', 'score', 'bias', 'shifts', 'stops', 'pipe', 'description of target', 'extra']
  bathout.rename(columns=dict(zip(bathout.columns, col_names)), inplace=True)
  df_neg = filtering_operations.get_specific_strand(bathout,"-")
  neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)
  df_pos = filtering_operations.get_specific_strand(bathout,"+")
  pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)
  bath_file_deduplication = pd.concat([pos_deduped,neg_deduped])
  bath_file_deduplication['origin'] = str(i.split("/")[-1].split(".tbl")[0])
  bath_file_deduplication.to_excel(str(i.split("/")[-1].split(".tbl")[0]+".xlsx"),index=False)

  bathout = pd.read_table("/content/drive/MyDrive/Lab Work/Parkinsons_Data/Bath_Output/DNA_Bacteria_kingdom_sprot.tbl",sep="\s+",header=None,skiprows=2,skipfooter=8)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Overlap1: 0.0%, Overlap2: 0.0%
No significant overlap or both hits retained
Begining Comparison:  65
Comparing:
Hit A: {'target name': 'k127_533492', 'accession': '-', 'query name': 'P9WID7', 'accession1': '-', 'hmm len': 343, 'hmm from': 3, 'hmm to': 328, 'seq len': 131788, 'ali from': 127508, 'ali to': 126594, 'env from': 127514, 'env to': 126576, 'E-value': 7e-46, 'score': 169.9, 'bias': 0.3, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 126594, 'ali to flip': 127508}
Hit B: {'target name': 'k127_533492', 'accession': '-', 'query name': 'Q9L1L8', 'accession1': '-', 'hmm len': 341, 'hmm from': 3, 'hmm to': 329, 'seq len': 131788, 'ali from': 127508, 'ali to': 126588, 'env from': 127514, 'env to': 126567, 'E-value': 4.8e-40, 'score': 151.2, 'bias': 0.2, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 126588, 'ali

KeyboardInterrupt: 

In [30]:
list_of_dfs = []
for i in glob("/content/*.xlsx"):
  list_of_dfs.append(pd.read_excel(i))

In [31]:
bathout = pd.concat(list_of_dfs)
df_neg = filtering_operations.get_specific_strand(bathout,"-")
neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)
df_pos = filtering_operations.get_specific_strand(bathout,"+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)
complete_bath_deduplication = pd.concat([pos_deduped,neg_deduped])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Overlap1: 0.0%, Overlap2: 0.0%
No significant overlap or both hits retained
Begining Comparison:  2
Comparing:
Hit A: {'Unnamed: 0': 58.0, 'target name': 'k127_1768858', 'accession': '-', 'query name': 'P0AFQ2', 'accession1': '-', 'hmm len': 377, 'hmm from': 115, 'hmm to': 354, 'seq len': 164989, 'ali from': 10048, 'ali to': 9368, 'env from': 10078, 'env to': 9308, 'E-value': 2.5e-26, 'score': 106.2, 'bias': 11.1, 'shifts': 0, 'stops': 0, 'pipe': 'std', 'description of target': '-', 'strand': '-', 'ali from flip': 9368, 'ali to flip': 10048, 'origin': 'DNA_Bacteria_kingdom_sprot', 'Unnamed: 0.2': nan, 'Unnamed: 0.1': nan}
Hit B: {'Unnamed: 0': nan, 'target name': 'k127_1768858', 'accession': '-', 'query name': 'P37624', 'accession1': '-', 'hmm len': 911, 'hmm from': 274, 'hmm to': 510, 'seq len': 164989, 'ali from': 11083, 'ali to': 10376, 'env from': 11145, 'env to': 10190, 'E-value': 1.4e-53, 'score': 197.2, 'bias': 0.0

In [34]:
complete_bath_deduplication.to_excel("JUL31bin82_BATH_deduplicated.xlsx")

In [33]:
complete_bath_deduplication.shape

(890, 26)

In [41]:
bathout = pd.read_table('/content/test_case1.tbl', sep="\s+",skiprows =1, header=None)
bathout
# Define column names
col_names = ['target name', 'accession', 'query name', 'accession1', 'hmm len', 'hmm from', 'hmm to',
              'seq len', 'ali from', 'ali to', 'env from', 'env to', 'E-value', 'score', 'bias',
              'shifts', 'stops', 'pipe', 'description of target', 'extra']

# Rename columns
bathout.rename(columns=dict(zip(bathout.columns, col_names)), inplace=True)

# Process negative strand
#df_neg = filtering_operations.get_specific_strand(bathout, "-")
#neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)

# Process positive strand
df_pos = filtering_operations.get_specific_strand(bathout, "+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)

# Combine results
bath_file_deduplication = pd.concat([pos_deduped, neg_deduped])
bath_file_deduplication['origin'] = str(i.split("/")[-1].split(".tbl")[0])


Processing positive strand data...
    target name accession query name accession1  hmm len  hmm from  hmm to  \
0  k127_1013475         -   MF_01398          -      178        12     175   

   seq len  ali from  ali to  env from  env to       E-value  score  bias  \
0   183529      5724    6218      5703    6224  6.500000e-25   97.6   1.6   

   shifts  stops pipe description of target strand  
0       0      0  std                     -      +  
Number of elements in relevant strand:  (1, 20)
Number of exact duplicates or homologs:  (0, 20)
Number of elements after removing exact duplicates or homologs:  (1, 20)
Elements removed:  0
Overlap calculations done on columns: ali from ali to
100% deduplication done.
Overlap calculations done on columns: ali from ali to
70% deduplication done.
    target name accession query name accession1  hmm len  hmm from  hmm to  \
0  k127_1013475         -   MF_01398          -      178        12     175   

   seq len  ali from  ali to  env from  en

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = strand


In [43]:
pos_deduped

Unnamed: 0,target name,accession,query name,accession1,hmm len,hmm from,hmm to,seq len,ali from,ali to,env from,env to,E-value,score,bias,shifts,stops,pipe,description of target,strand
0,k127_1013475,-,MF_01398,-,178,12,175,183529,5724,6218,5703,6224,6.5e-25,97.6,1.6,0,0,std,-,+


In [23]:
import pandas as pd

# Define the path to the newly uploaded file
file_path = '/content/test_case1.tbl'

df = pd.read_csv(file_path, header=None, skiprows=1, sep="\t")
df.head()
col_names = [
    'target name', 'accession', 'query name', 'accession1', 'hmm len', 'hmm from',
    'hmm to', 'seq len', 'ali from', 'ali to', 'env from', 'env to', 'E-value',
    'score', 'bias', 'shifts', 'stops', 'pipe', 'description of target'
]

# Rename the columns in the DataFrame
df.columns = col_names

# Perform strand specific filtering and deduplication
df_neg = filtering_operations.get_specific_strand(df, "-")
#neg_deduped = overlap_deduplications.choose_strand_operations(df_neg)

df_pos = filtering_operations.get_specific_strand(df, "+")
pos_deduped = overlap_deduplications.choose_strand_operations(df_pos)

# Combine negative and positive strand results
bath_file_deduplication = pd.concat([pos_deduped, neg_deduped])
bath_file_deduplication['origin'] = file_path.split("/")[-1].split(".txt")[0]

# Save the deduplicated results to an Excel file
output_path = '/content/' + file_path.split("/")[-1].split(".txt")[0] + "_deduplicated.xlsx"
bath_file_deduplication.to_excel(output_path, index=False)

print(f"Data processed and saved to {output_path}")


Processing positive strand data...
    target name accession query name accession1  hmm len  hmm from  hmm to  \
0  k127_1013475         -   MF_01398          -      178        12     175   

   seq len  ali from  ali to  env from  env to       E-value  score  bias  \
0   183529      5724    6218      5703    6224  6.500000e-25   97.6   1.6   

   shifts  stops pipe description of target strand  
0       0      0  std                     -      +  
Number of elements in relevant strand:  (1, 20)
Number of exact duplicates or homologs:  (0, 20)
Number of elements after removing exact duplicates or homologs:  (1, 20)
Elements removed:  0
Overlap calculations done on columns: ali from ali to
100% deduplication done.
Overlap calculations done on columns: ali from ali to
70% deduplication done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali from'] = df['ali from'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ali to'] = df['ali to'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strand'] = strand
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

KeyError: 'target name'

In [9]:
df_pos


Unnamed: 0,target name,accession,query name,accession1,hmm len,hmm from,hmm to,seq len,ali from,ali to,env from,env to,E-value,score,bias,shifts,stops,pipe,description of target,strand
0,k127_1013475,-,MF_01398,-,178,12,175,183529,5724,6218,5703,6224,6.5e-25,97.6,1.6,0,0,std,-,+


In [12]:
df

Unnamed: 0,target name,accession,query name,accession1,hmm len,hmm from,hmm to,seq len,ali from,ali to,env from,env to,E-value,score,bias,shifts,stops,pipe,description of target
0,k127_1013475,-,MF_01398,-,178,12,175,183529,5724,6218,5703,6224,6.5e-25,97.6,1.6,0,0,std,-
1,k127_1013475,-,MF_01399,-,160,21,143,183529,5739,6110,5703,6161,3.4e-06,37.5,0.2,0,0,std,-
2,k127_1013475,-,Q7A4E7,-,173,17,170,183529,5751,6215,5733,6224,0.0044,27.9,0.1,0,0,std,-
3,k127_1013475,-,P80285,-,184,24,176,183529,5763,6221,5709,6239,2.4e-05,35.2,0.1,0,0,std,-
4,k127_1013475,-,A0R203,-,445,6,156,183529,5769,6221,5754,6239,0.76,18.6,0.0,0,0,std,-
5,k127_1013475,-,P0A2Z3,-,164,14,161,183529,5772,6215,5745,6224,0.0017,29.3,0.1,0,0,std,-
6,k127_1013475,-,P0ABA0,-,156,11,150,183529,5775,6197,5757,6215,0.048,24.8,0.2,0,0,std,-
7,k127_1013475,-,MF_00311,-,198,11,69,183529,5934,6110,5925,6224,0.36,18.4,5.7,0,0,std,-
