# Curate training data for promiscuity model

Here, only `EnzymeCAGE_train.csv` is included, not `EnzymeCAGE_valid.csv`. **TODO:** decide whether both files should be included in the future.

In [None]:
# Data from EnzymeCAGE

import pandas as pd

# Input locations: the EnzymeCAGE training CSV and the protein annotation pickle.
cage_train_csv = 'data/EnzymeCAGE_train.csv'
protein_pickle = '/home/helen/cec_degrader/generalize/data/protein.pkl'

# Read both tables into DataFrames used by the cells below.
df_CAGE = pd.read_csv(cage_train_csv)
df_protein = pd.read_pickle(protein_pickle)

### All promiscuous enzymes

In [None]:
# Positive samples only. `.copy()` makes `df` an independent DataFrame, so the
# column assignments below no longer raise SettingWithCopyWarning.
df = df_CAGE[df_CAGE['Label'] == 1].copy()

# Add EC numbers from protein.pkl: collect every 'EC number' value listed for
# an Entry and join them into one ', '-separated string per UniprotID.
# This assignment is unconditional, i.e. it overwrites any 'EC number' column
# already present; UniprotIDs missing from protein.pkl end up as NaN.
# NOTE(review): ', '.join assumes every collected EC value is a string - confirm
# that protein.pkl has no missing 'EC number' values.
ec_mapping = df_protein.groupby('Entry')['EC number'].apply(list).to_dict()
df['EC number'] = df['UniprotID'].map(ec_mapping)
df['EC number'] = df['EC number'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Filter to (catalytically) promiscuous enzymes (i.e. present more than once in df)
filtered_df = df[df['UniprotID'].duplicated(keep=False)]
df = filtered_df.sort_values(by='UniprotID')
df = df.reset_index(drop=True)

# Split reaction SMILES ("substrates>>products") into substrates and products
df[['substrates', 'products']] = df['SMILES'].str.split('>>', expand=True)

# Remove H2O, H+ and bare [H] fragments from a '.'-separated SMILES string
def remove_water_and_protons(smiles):
    """Drop water, proton and bare-hydrogen fragments from a '.'-separated SMILES string.

    Missing values (anything `pd.isna` reports as missing) are returned unchanged.
    The remaining fragments keep their original order; if every fragment is
    dropped the result is an empty string.
    """
    if pd.isna(smiles):
        return smiles
    excluded_fragments = ('[H]O[H]', '[H+]', '[H]')
    kept_fragments = []
    for fragment in smiles.split('.'):
        if fragment in excluded_fragments:
            continue
        kept_fragments.append(fragment)
    return '.'.join(kept_fragments)

# Apply the filter defined above so that water and protons are actually removed
# from the substrate side before the substrates are counted.
df['substrates'] = df['substrates'].apply(remove_water_and_protons)

# Number of substrates and products per entry: number of '.' separators + 1.
# The pattern is a raw string so that the regex `\.` (a literal dot) is not
# parsed as an invalid string escape.
df['number_substrates'] = df['substrates'].str.count(r'\.') + 1
df['number_products'] = df['products'].str.count(r'\.') + 1

#df.to_pickle('data/EnzymeCAGE_train_all_promiscuous.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EC number'] = df['UniprotID'].map(ec_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EC number'] = df['EC number'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)


### Substrate promiscuous enzymes

In [None]:

# Create canonical substrate strings to detect duplicate substrates
def canonical_substrates(substrates):
    """Return an order-independent form of a '.'-separated substrate string.

    Each fragment is stripped of surrounding whitespace, the fragments are
    sorted, and they are joined back together with '.'.
    """
    fragments = [fragment.strip() for fragment in substrates.split('.')]
    fragments.sort()
    return '.'.join(fragments)

# Trim surrounding whitespace, then bring every substrate list into its
# canonical (sorted) form so identical substrate sets compare equal.
trimmed_substrates = df['substrates'].str.strip()
df['substrates_canonical'] = trimmed_substrates.apply(canonical_substrates)

# For each UniprotID, remove catalytically promiscuous enzymes by filtering out rows with identical substrates
def filter_unique_substrates(group):
    """Keep the rows of `group` whose 'substrates_canonical' value occurs exactly once.

    Rows whose canonical substrate string appears more than once in the group
    are all dropped; the remaining rows keep their original index and order.
    """
    tally = group['substrates_canonical'].value_counts()
    singletons = tally.loc[tally.eq(1)].index
    keep_mask = group['substrates_canonical'].isin(singletons)
    return group[keep_mask]

# Keep, per UniprotID, only the rows whose canonical substrate string occurs
# exactly once. Counting (UniprotID, substrate) pairs with transform('size')
# selects the same rows as applying filter_unique_substrates to every
# UniprotID group, but does not rely on DataFrameGroupBy.apply handing the
# grouping column to the function (deprecated in pandas 2.2).
pair_counts = df.groupby(['UniprotID', 'substrates_canonical'])['substrates_canonical'].transform('size')
df_unique = df[pair_counts == 1].reset_index(drop=True)
df_unique = df_unique.drop(columns=['substrates_canonical'])
# Enzymes left with a single row are no longer promiscuous: keep only
# UniprotIDs that still occur more than once.
df_unique = df_unique[df_unique['UniprotID'].duplicated(keep=False)]
df_unique = df_unique.sort_values(by='UniprotID')

# Split substrate SMILES into separate entries (one row per substrate)
df_unique['substrates_split'] = df_unique['substrates'].str.split('.')
df_unique = df_unique.explode('substrates_split').reset_index(drop=True)

# For each UniprotID, remove the substrates that are common to all RHEA_IDs within the same UniprotID.
# The logic being that these do not contribute to promiscuity because they stay constant in all reactions.
substrate_rhea_counts = (
    df_unique.groupby(['UniprotID', 'substrates_split'])['RHEA_ID']
    .nunique()
    .reset_index(name='rhea_count')
)
# Total number of unique RHEA_IDs for each UniprotID
total_rhea_counts = (
    df_unique.groupby('UniprotID')['RHEA_ID']
    .nunique()
    .reset_index(name='total_rhea_count')
)
merged = pd.merge(substrate_rhea_counts, total_rhea_counts, on='UniprotID')
merged['appear_in_all_rhea'] = merged['rhea_count'] == merged['total_rhea_count']
substrates_to_remove = merged[merged['appear_in_all_rhea']]

# Anti-join: keep only the (UniprotID, substrate) rows that are NOT listed in
# substrates_to_remove ('left_only' in the merge indicator column).
df_filtered = df_unique.merge(substrates_to_remove[['UniprotID', 'substrates_split']],
                       on=['UniprotID', 'substrates_split'], how='left', indicator=True)

df_filtered = df_filtered[df_filtered['_merge'] == 'left_only'].drop(columns=['_merge'])

#df_filtered.to_pickle('data/EnzymeCAGE_train_promiscuous_substrates.pkl')
# 132'620 entries

  df_unique = df.groupby('UniprotID', group_keys=False).apply(filter_unique_substrates).reset_index(drop=True)


### Promiscuous esterases (EC 3.1.X.Y)

In [None]:
import pandas as pd

df = pd.read_pickle('data/EnzymeCAGE_train_promiscuous_substrates.pkl')

# Filter to only promiscuous esterases.
# 'EC number' can hold several EC numbers in one string (they are joined with
# ', ' when the column is built), so search for a 3.1.x.y entry anywhere in the
# list instead of only testing how the string starts. A ';' separator is
# accepted as well - NOTE(review): confirm which separators occur in protein.pkl.
ec_3_1_pattern = r'(?:^|[,;])\s*3\.1\.'
has_ec_3_1 = df['EC number'].fillna('').str.contains(ec_3_1_pattern, regex=True)
uniprot_with_3_1 = df.loc[has_ec_3_1, 'UniprotID'].unique()
df_filtered = df[df['UniprotID'].isin(uniprot_with_3_1)].copy()
df_filtered = df_filtered.reset_index(drop=True)
#df_filtered.to_pickle('data/EnzymeCAGE_train_promiscuous_substrates_EC3p1.pkl')

# 9'573 entries were recorded when only the first listed EC number was checked;
# re-count after re-running with the any-position match above.
# Contains all substrate promiscuous enzymes with EC 3.1 having removed catalytically promiscuous enzymes.

### Thermophilic esterases (EC 3.1.X.Y)

In [None]:
df_extremophiles = pd.read_pickle('/home/helen/cec_degrader/generalize/data/extremeophiles.pkl')
df = pd.read_csv('data/EnzymeCAGE_train.csv')
df_protein = pd.read_pickle('/home/helen/cec_degrader/generalize/data/protein.pkl')

# Positive samples only. `.copy()` makes `df` an independent DataFrame, so the
# column assignments below do not raise SettingWithCopyWarning.
df = df[df['Label'] == 1].copy()

# Add EC numbers from protein.pkl: collect every 'EC number' value listed for
# an Entry and join them into one ', '-separated string per UniprotID.
# This assignment is unconditional, i.e. it overwrites any existing 'EC number' column.
ec_mapping = df_protein.groupby('Entry')['EC number'].apply(list).to_dict()
df['EC number'] = df['UniprotID'].map(ec_mapping)
df['EC number'] = df['EC number'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Filter to only extremophilic/thermophilic enzymes
df = df[df['UniprotID'].isin(df_extremophiles['Entry'])]

# Filter to only esterases (EC 3.1).
# 'EC number' can list several EC numbers in one string, so search for a
# 3.1.x.y entry anywhere in the list instead of only testing how the string
# starts. A ';' separator is accepted as well - NOTE(review): confirm which
# separators occur in protein.pkl.
ec_3_1_pattern = r'(?:^|[,;])\s*3\.1\.'
has_ec_3_1 = df['EC number'].fillna('').str.contains(ec_3_1_pattern, regex=True)
uniprot_with_3_1 = df.loc[has_ec_3_1, 'UniprotID'].unique()
df = df[df['UniprotID'].isin(uniprot_with_3_1)].copy()
df = df.reset_index(drop=True)

# Split reaction SMILES ("substrates>>products") into substrates and products
df[['substrates', 'products']] = df['SMILES'].str.split('>>', expand=True)

# Remove H2O as a substrate
def remove_water(smiles):
    """Drop every '[H]O[H]' fragment from a '.'-separated SMILES string.

    Missing values (anything `pd.isna` reports as missing) are returned
    unchanged. All other fragments are kept in their original order.
    """
    if pd.isna(smiles):
        return smiles
    water = '[H]O[H]'
    return '.'.join(fragment for fragment in smiles.split('.') if fragment != water)
# Drop water from the substrate side of every reaction.
df['substrates'] = df['substrates'].apply(remove_water)

# Number of substrates and products per entry: number of '.' separators + 1.
# The pattern is a raw string so that the regex `\.` (a literal dot) is not
# parsed as an invalid string escape.
df['number_substrates'] = df['substrates'].str.count(r'\.') + 1
df['number_products'] = df['products'].str.count(r'\.') + 1

#df.to_pickle('data/EnzymeCAGE_train_extremophiles_EC3p1.pkl')

# 332 entries (as recorded when this cell was last run)
# Contains all thermophilic esterases irrespective of promiscuity from the EnzymeCAGE training data.
# TODO: this is fewer entries than the number of esterases in the whole extremophiles_df, which is 2232.
#       Can we also use all of those?