# Imports

In [None]:
import requests
import re
import numpy as np
import pandas as pd

import torch

from transformers import AutoTokenizer

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [None]:
# Specify your path to Capstone folder.

main_path = "/content/drive/MyDrive/Capstone_Diana/Capstone/"

In [None]:
if "drive" in main_path:
    from google.colab import drive
    drive.mount("/content/drive")

In [None]:
# Load the ProThermDB export (tab-separated, first column used as the index).
db_path = f"{main_path}Data/ProThermDB.tsv"
db_load_orig = pd.read_csv(db_path, sep='\t', index_col=0)
# Cells that contain exactly '-' are treated as missing values.
db_load_orig.replace('-', np.nan, inplace=True)

In [None]:
db_load_orig.isna().sum()

PROTEIN                   0
UniProt_ID             2575
MUTATION                855
SOURCE                    1
PDB_wild               4324
PDB_Chain_Mutation    13912
SEC_STR               15271
ASA                   17585
pH                       84
T_(C)                 16500
Tm_(C)                16233
∆Tm_(C)               25528
∆∆G_(kcal/mol)        29083
∆∆G_H2O_(kcal/mol)    26190
STATE                 25997
REVERSIBILITY          1638
PubMed_ID               519
KEY_WORDS                 0
REFERENCE                 0
AUTHOR                    0
dtype: int64

# Start of Preprocessing

In [None]:
# Removing all data entries that are missing the target column (melting temperature, Tm).

db_load_temp = db_load_orig[~db_load_orig["Tm_(C)"].isna()]
db_load_temp.shape

(16047, 20)

In [None]:
# First, unnecessary columns were dropped.

unused_cols = ["PubMed_ID", "KEY_WORDS", "REFERENCE", "AUTHOR", "STATE"]
db_load = db_load_temp.drop(unused_cols, axis = 1)

In [None]:
# Dropping columns related to difference between wild type of protein and mutation - we are not interested in relative data.

unused_cols = ["∆∆G_(kcal/mol)", "∆∆G_H2O_(kcal/mol)", "T_(C)", "∆Tm_(C)", "REVERSIBILITY", "ASA"]
db_load = db_load.drop(unused_cols, axis = 1)

In [None]:
# Removing synthetic data as we do not know the sequences

db_load = db_load[db_load["SOURCE"] != "synthetic"]
db_load

Unnamed: 0_level_0,PROTEIN,UniProt_ID,MUTATION,SOURCE,PDB_wild,PDB_Chain_Mutation,SEC_STR,pH,Tm_(C)
NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27,NAD+ -dependent formate dehydrogenases (FDHs),A0A1U8PF82,wild-type,Gossypium hirsutum,,,,7.0,72
28,NAD+ -dependent formate dehydrogenases (FDHs),A0A1U8PF82,M225L (Based on Paper),Gossypium hirsutum,,,,7.0,67
29,NAD+ -dependent formate dehydrogenases (FDHs),A0A1U8PF82,M234L (Based on Paper),Gossypium hirsutum,,,,7.0,66
30,NAD+ -dependent formate dehydrogenases (FDHs),A0A1U8PF82,M243L (Based on Paper),Gossypium hirsutum,,,,7.0,77
31,Major prion protein,P04156,wild-type,Homo sapiens,1E1G,,,7.0,71.3
...,...,...,...,...,...,...,...,...,...
32246,Phosphoglycerate kinase 1,P00558,,Homo sapiens,2XE7,,Coil,7.4,49.2 (0.7)
32247,Phosphoglycerate kinase 1,P00558,wild-type,Homo sapiens,2XE7,,,7.4,46.3 (0.9)
32248,Phosphoglycerate kinase 1,P00558,wild-type,Homo sapiens,2XE7,,,7.4,43 (1.5)
32258,"Cytochrome c oxidase subunit 12, mitochondrial",Q01519,wild-type,Saccharomyces cerevisiae,6GIQ,,,6.8,76.3 (4.2)


In [None]:
# Show the rows that have a PDB_wild ID and whose MUTATION is 'wild-type'.
# The comparison must be parenthesised: `&` binds tighter than `==`, so without the
# parentheses the mask is `(notna & MUTATION) == 'wild-type'`, which never matches.

db_load[db_load["PDB_wild"].notna() & (db_load["MUTATION"] == 'wild-type')]

Unnamed: 0_level_0,PROTEIN,UniProt_ID,MUTATION,SOURCE,PDB_wild,PDB_Chain_Mutation,SEC_STR,pH,Tm_(C)
NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [None]:
# Keep only rows that have at least one identifier (PDB_wild or UniProt_ID),
# then remove all rows where MUTATION is not specified.

db_load = db_load[(db_load["PDB_wild"].notna()) | (db_load["UniProt_ID"].notna())]
db_load = db_load[db_load["MUTATION"].notna()]
len(db_load)

14741

In [None]:
# This dataset contains some mistakes in the IDs, but with 14741 rows in total
# it is not feasible to check them all manually, so only the one row that was noticed is fixed here.

db_load.loc[db_load["PROTEIN"] == "Equine Heart Myoglobin", "PDB_wild"] = "1WLA"

# Extracting Sequence (from UniProt, PDB)

In [None]:
# Seconds to wait for a server before treating the request as failed.
_REQUEST_TIMEOUT_SECONDS = 30

# Sequences already downloaded, keyed by FASTA URL, so that the many rows sharing
# an ID trigger a single HTTP request. Failures are not cached and can be retried.
_FASTA_CACHE = {}


def _first_fasta_sequence(fasta_text):
    """
    Return the residues of the first record in a FASTA document ('' if there is none).
    """
    residue_lines = []
    inside_record = False
    for line in fasta_text.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if inside_record:
                # A second header starts the next chain/record: stop here.
                break
            inside_record = True
            continue
        if inside_record:
            residue_lines.append(line)
    return "".join(residue_lines)


def _fetch_fasta_sequence(url, database, identifier):
    """
    Download a FASTA file and return its first sequence, or None on failure.

    Network errors, non-200 responses and responses without any sequence data
    are all reported with a printed message and return None.
    """
    if url in _FASTA_CACHE:
        return _FASTA_CACHE[url]

    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        sequence = _first_fasta_sequence(response.text)
        if sequence:
            _FASTA_CACHE[url] = sequence
            return sequence

    print(f"Failed to retrieve {database} sequence for {identifier}")
    return None


def get_uniprot_sequence(uniprot_id):
    """
    Fetch UniPROT FASTA file and retrieve sequence.

    Returns the sequence string, or None if it could not be retrieved.
    """
    # Stray whitespace around the ID (e.g. "Q8GJ67 ") would end up in the URL.
    uniprot_id = str(uniprot_id).strip()
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    return _fetch_fasta_sequence(url, "UniProt", uniprot_id)


def get_pdb_sequence(pdb_id):
    """
    Fetch PDB FASTA file and retrieve sequence.

    For entries with several chains only the first record is returned.
    Returns the sequence string, or None if it could not be retrieved.
    """
    pdb_id = str(pdb_id).strip()
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"
    return _fetch_fasta_sequence(url, "PDB", pdb_id)

def is_uniprot_id(x):
    """
    Heuristic UniProt check: at least four characters, one of which is a digit.
    """
    if len(x) < 4:
        return False
    for char in x:
        if char.isdigit():
            return True
    return False

def is_pdb_id(x):
    """
    Heuristic PDB check: exactly four characters, all of them alphanumeric.
    """
    if len(x) != 4:
        return False
    return x.isalnum()

def fetch_sequence(row):
    """
    Retrieve the wild-type sequence for one data row.

    The PDB_wild value is tried first and the UniProt_ID value second. Each value
    is routed to PDB or UniProt according to what it looks like, because either
    column may hold an ID of the other database. If a lookup fails or returns
    nothing, the next candidate is tried instead of giving up.

    Returns the sequence string, or np.nan when no candidate yields a sequence.
    """
    for column in ("PDB_wild", "UniProt_ID"):
        candidate = row[column]
        if pd.isna(candidate):
            continue

        # Stray whitespace (e.g. "Q8GJ67 ") would otherwise break both the
        # ID classification and the request URL.
        candidate = str(candidate).strip()

        if is_pdb_id(candidate):
            sequence = get_pdb_sequence(candidate)
        elif is_uniprot_id(candidate):
            sequence = get_uniprot_sequence(candidate)
        else:
            continue

        # None or an empty string both mean "nothing usable was retrieved".
        if sequence:
            return sequence

    return np.nan

In [None]:
db_load["Sequence"] = db_load.apply(fetch_sequence, axis=1)

Failed to retrieve UniProt sequence for Q8GJ67 
Failed to retrieve UniProt sequence for Q8GJ67 
Failed to retrieve PDB sequence for 1YGV
Failed to retrieve PDB sequence for 1YGV
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Failed to retrieve PDB sequence for 1UOX
Fa

In [None]:
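# Collecting the rows whose IDs failed above so they can be re-fetched once the IDs are corrected.
# These IDs are no longer present in PDB and are not re-fetched: '1SUK', '1W0Q', '1SEE', '2G6S'.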

missed_ids = ["1UOX", "1YGV", "1E21;", "2GJ1", "1BGL"]
missed_indexes = db_load[db_load["PDB_wild"].isin(missed_ids) | db_load["UniProt_ID"].isin(missed_ids)].index
len(missed_indexes)

54

 Some IDs were removed from the databases, so their replacements were entered manually:

 1) 1UOX --> changed to 1R51

 2) 1YGV --> changed to 3HQV

 3) 1E21; was not found because the ID contains a stray ';' - corrected manually to 1E21

 4) 1SUK --> https://www.modelarchive.org/doi/10.5452/ma-cnpyn

 5) 1W0Q --> 10.2210/pdb1W0Q/pdb

 6) 2GJ1 --> 5GH0

 7) 1BGL --> 4V40

 8) 1SEE -->  10.2210/pdb1SEE/pdb

 9) 2G6S --> 10.2210/pdb2G6S/pdb


In [None]:
# Manually substituting PDB IDs.

db_load.loc[db_load["PDB_wild"] == "1UOX", "PDB_wild"] = "1R51"
db_load.loc[db_load["PDB_wild"] == "1YGV", "PDB_wild"] = "3HQV"
db_load.loc[db_load["PDB_wild"] == "1E21;", "PDB_wild"] = "1E21"
db_load.loc[db_load["PDB_wild"] == "2GJ1", "PDB_wild"] = "5GH0"
db_load.loc[db_load["PDB_wild"] == "1BGL", "PDB_wild"] = "4V40"

In [None]:
# Fetching sequences for those missed rows.

db_load.loc[missed_indexes, "Sequence"] = db_load.loc[missed_indexes].apply(fetch_sequence, axis=1)

In [None]:
# Clearing sequences from additional elements.

db_load["Sequence"] = db_load["Sequence"].str.replace(r"[>|].*", "", regex=True)

In [None]:
# Checking the appearance of sequences.

db_load["Sequence"].unique()

array(['MAMKQVANSAIKAIANSGSSSLLTRQLHASPGSKKIVGVFYKANEYFTKNPNFVGCVEGALGLRPWLESQGHQYIVTDDKEGPDCELEKHIPDLHVLISTPFHPAYVTAERIKKAKNLQLLLTAGIGSDHVDLKAAAEAGLTVAEVTGSNVVSVAEDELMRILILVRNFVPGYHQVITGDWNVAGIAYRAYDLEGKTVGTIGAGRIGKLLLQRLKPFNCNLLYHDRVKIDPELEKQTGAKFEEDLDAMLPKCDIIVINMPLTEKTRGMFDKDRIAKMKKGVLIVNNARGAIMDTQAVADACSSGHIAGYSGDVWYPQPAPKDHPWRYMPNQAMTPHISGTTIDAQLRYAAGVKDMLERYFKGEDFPEQNYIVKAGELAPQYR',
       'LGGYMLGSAMSRPIIHFGSDYEDRYYRENMHRYPNQVYYRPVDEYSNQNNFVHDCVNITIKQHTVTTTTKGENFTETDVKMMERVVEQMCITQYERESQAYYQR',
       '',
       'SINGGIRAATSQEINELTYYTTLSANSYCRTVIPGATWDCIHCDATEDLKIIKTWSTLIYDTNAMVARGDSEKTIYIVFRGSSSIRNWIADLTFVPVSYPPVSGTKVHKGFLDSYGEVQNELVATVLDQFKQYPSYKVAVTGHSLGGATVLLCALDLYQREEGLSSSNLFLYTQGQPRVGDPAFANYVVSTGIPYRRTVNERDIVPHLPPAAFGFLHAGEEYWITDNSPETVQVCTSDLETSDCSNSIVPFTSVLDHLSYFGINTGLCT',
       'MGNVMEGKSVEELSSTECHQWYKKFMTECPSGQLTLYEFRQFFGLKNLSPSASQYVEQMFETFDFNKDGYIDFMEYVAALSLVLKGKVEQKLRWYFKLYDVDGNGCIDRDELLTIIQAIRAINPCSDTTMTAEEFTDTVFSKIDVNGDGELSLEEFIEGVQKDQMLLDTLTRSLDLTRIVRRLQNGEQDEEGA

# Applying Mutations to Sequences

In [None]:
def apply_mutations_with_correction(mutation_str, sequence):
    """
    Apply point mutations such as "M225L" to a wild-type sequence.

    Every "<original residue><1-based position><new residue>" token found in
    mutation_str is applied when the residue at that position matches the stated
    original one. Tokens that point outside the sequence, or whose original
    residue does not match, are reported with a warning and skipped.

    Returns a tuple (sequence, correction) where correction is
      * 1      - at least one mutation was applied,
      * 0      - mutations were listed but none of them could be applied,
      * np.nan - no mutation token was found (e.g. "wild-type").
    """
    mutation_pattern = r"([A-Z])(\d+)([A-Z])"
    matches = re.findall(mutation_pattern, mutation_str or '')

    sequence_list = list(sequence)
    any_applied = False

    for original_aa, pos_str, new_aa in matches:
        pos = int(pos_str) - 1
        # Position "0" would give index -1 and silently hit the last residue,
        # so it is rejected together with positions past the end.
        if pos < 0 or pos >= len(sequence_list):
            print(f"Warning: position {pos+1} is out of range for the sequence.")
            continue

        if sequence_list[pos] != original_aa:
            print(f"Warning: at position {pos+1}, found {sequence_list[pos]}, expected {original_aa}.")
            continue

        sequence_list[pos] = new_aa
        any_applied = True

    if not matches:
        correction = np.nan
    elif any_applied:
        correction = 1
    else:
        correction = 0

    return ''.join(sequence_list), correction

def correct_sequences_in_dataframe(df):
    """
    Replace every sequence in a DataFrame with its mutated form.

    Rows lacking a sequence or a mutation keep their sequence and receive NaN in
    the "Correction" column. The frame is modified in place and also returned.
    """
    new_sequences, flags = [], []

    for seq, mutations in zip(df["Sequence"], df["MUTATION"]):
        if pd.isna(seq) or pd.isna(mutations):
            # Nothing to apply: carry the value over unchanged.
            new_sequences.append(seq)
            flags.append(np.nan)
            continue

        mutated_seq, flag = apply_mutations_with_correction(mutations, seq)
        new_sequences.append(mutated_seq)
        flags.append(flag)

    df["Sequence"] = new_sequences
    df["Correction"] = flags

    return df

In [None]:
# Applying mutations to original sequences.
# Saving Correction: 1.0 when at least one point mutation was applied, 0.0 when residue mismatches occurred and no mutation was applied.

db_load["Correction"] = np.nan
db_load = correct_sequences_in_dataframe(db_load)



In [None]:
# Most of the sequences were correctly changed to mutated versions.

db_load["Correction"].value_counts()

Correction
1.0    4209
0.0    2937
Name: count, dtype: int64

In [None]:
# Get only those rows that have sequences.

db_load_final = db_load[db_load.Sequence.notna()]
len(db_load_final)

14721

In [None]:
# Dropping the MUTATION column as it is no longer useful.
# PDB_Chain_Mutation duplicates MUTATION with even less data --> redundant.
# Dropping Correction as it was an artificial column used only for checking.

db_load_final = db_load_final.drop("MUTATION", axis = 1)
db_load_final = db_load_final.drop("PDB_Chain_Mutation", axis = 1)
db_load_final = db_load_final.drop("Correction", axis = 1)

# Processing Text Columns

In [None]:
# Specifying columns to use in further analysis.

prot_col = ["Sequence"]
sci_cols = ["PROTEIN", "UniProt_ID", "SOURCE", "PDB_wild", "SEC_STR"]
numeric_cols = ["pH", "Tm_(C)"]

In [None]:
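# Filling in missing categorical values with the placeholder string "<UNK>".
# NOTE(review): BERT-style tokenizers such as SciBERT use "[UNK]" as their unknown token, so "<UNK>"
# is tokenized as ordinary text - confirm that this is intended.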

db_load_final.loc[:, sci_cols] = db_load_final[sci_cols].fillna("<UNK>")

In [None]:
# Dropping data entries with missing pH values.

db_load_final = db_load_final[db_load_final.pH.notna()]

In [None]:
# Initializing models for text tokenization.

protein_model_name = "Rostlab/prot_bert_bfd"
text_model_name = "allenai/scibert_scivocab_uncased"

protein_tokenizer = AutoTokenizer.from_pretrained(protein_model_name)
text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)

In [None]:
# Identifying the maximum lengths used for padding (so that all inputs have the same length).
# Lengths are measured in characters, not in tokens.

prot_max_length = db_load_final["Sequence"].apply(len).max()
text_max_length = db_load_final[sci_cols].apply(lambda row: row.apply(len).max(), axis=1).max()

In [None]:
def tokenize_protein(text):
    """
    Tokenizes protein sequences using the protein tokenizer.

    The sequence is first brought into the input format described for ProtBert:
    residues separated by single spaces, with the rare residues U, Z, O and B
    replaced by X. Without the spaces the whole sequence is one word for the
    tokenizer instead of one token per residue.

    Returns the tokenizer output, padded/truncated to prot_max_length.
    """
    # Remove any whitespace already present so residues are not double-spaced.
    residues = "".join(str(text).split())
    residues = re.sub(r"[UZOB]", "X", residues)
    text = " ".join(residues)
    # NOTE(review): prot_max_length is a residue count, while the tokenizer presumably
    # adds special tokens ([CLS]/[SEP]); the longest sequences may therefore lose their
    # last residues to truncation - confirm whether prot_max_length should be increased.
    return protein_tokenizer(text, padding="max_length", truncation=True, max_length=prot_max_length, return_tensors=None)

def tokenize_text(text):
    """
    Encodes a free-text field with the text tokenizer, padded/truncated to text_max_length.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": text_max_length,
        "return_tensors": None,
    }
    return text_tokenizer(str(text), **encoding_options)

In [None]:
# Tokenizing sequences by their corresponding tokenizers.

db_load_final.loc[:, "tokenized_Sequence"] = db_load_final["Sequence"].apply(tokenize_protein)
for col in sci_cols:
    db_load_final[f"tokenized_{col}"] = db_load_final[col].apply(tokenize_text)

# Create targets, numeric_embeddings columns

In [None]:
# Tm values are reported as free text (e.g. "49.2 (0.7)" or with "<" / ">" signs):
# keep the part before any parenthesis, drop inequality signs and spaces, then cast to float.

tm_text = db_load_final["Tm_(C)"].str.split("(").str[0].str.strip()
tm_text = tm_text.str.replace(r"[<>]", "", regex=True).str.strip()
tm_text = tm_text.str.replace(" ", "")
db_load_final["Tm_(C)"] = tm_text.astype(float)

In [None]:
# Creating targets column for consistency during models employment.

target_columns = ["Tm_(C)"]
db_load_final.loc[:, "targets"] = db_load_final[target_columns].apply(lambda row: torch.tensor(row.values, dtype=torch.float32), axis=1)

In [None]:
# Creating numeric_embeddings column for consistency during models employment.

numeric_columns_new = ["pH"]
db_load_final.loc[:, "numeric_embeddings"] = db_load_final[numeric_columns_new].apply(lambda row: torch.tensor(row.values, dtype=torch.float), axis=1)
db_load_final["numeric_embeddings"].unique()

array([tensor([7.]), tensor([7.]), tensor([7.]), ..., tensor([7.4000]),
       tensor([6.8000]), tensor([6.8000])], shape=(13359,), dtype=object)

# Saving

In [None]:
db_final = db_load_final.copy()

In [None]:
# Specifying the methods used for handling the data.

numeric_method = "none"
sequence_method = "protbert"
text_method = "scibert"
database = "protherm"

In [None]:
# Saving tokenized and preprocessed dataset.

import os

dataset_name = f"tokenized_dataset_{numeric_method}_{sequence_method}_{text_method}_{len(db_final)}_{database}"
pickle_path = f"{main_path}Tokenized_results/{dataset_name}.pkl"

# Create the output folder first: a missing folder would otherwise make the save
# fail only after the whole (slow) preprocessing has already run.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

db_final.to_pickle(pickle_path)
print(f"Dataset is successfully saved as pickle: {pickle_path}")

***

# End of Preprocessing