In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from pandasai.llm.local_llm import LocalLLM
from data.constants import SYSTEM, PREPROMPT, POSTPROMPT, TEMP, LOCAL_SERVER_URL, PROMPT_OUTPUT_FILE_PATH, TEST_DF_PATH, LOCAL_MODEL
import requests
import sys


system_file_path = str(SYSTEM)
preprompt_file_path = str(PREPROMPT)
postprompt_file_path = str(POSTPROMPT)
temp = TEMP

try:

    with open(system_file_path, 'r') as file:
        system = file.read()

    with open(preprompt_file_path, 'r') as file:
        preprompt = file.read()

    with open(postprompt_file_path, 'r') as file:
        postprompt = file.read()

except FileNotFoundError as e:
    print(f"Error: The file '{e.filename}' does not exist.")
    sys.exit(1)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)

class CustomLocalLLM(LocalLLM):
    def __init__(self, api_base, model):
        super().__init__(api_base=api_base, model=model)
        self.api_base = api_base
        self.model = model

    def call(self, messages):
        """
        Отправляет запрос к локальной модели с использованием списка сообщений.
        
        :param messages: Список словарей, где каждый словарь содержит "role" и "content".
        :return: Ответ модели.
        """

        data = {
            "model": self.model,
            "messages": messages
        }

        headers = {"Content-Type": "application/json"}
        response = requests.post(f"{self.api_base}/v1/chat/completions", json=data, headers=headers)

        if response.status_code != 200:
            raise ValueError(f"LLM returned an invalid response: {response.text}")

        response_data = response.json()
        if "choices" not in response_data or len(response_data["choices"]) == 0:
            raise ValueError("LLM returned an invalid response format.")

        return response_data["choices"][0]["message"]["content"]

local_model = CustomLocalLLM(api_base=LOCAL_SERVER_URL, model=LOCAL_MODEL)


In [2]:
df_test = pd.read_parquet(TEST_DF_PATH)

In [None]:
df_test

Unnamed: 0,Index,Drug_ID,Drug,Target_ID,Target,Y
0,1689,190816,O=C1CC2(C(=O)N1)C(=O)N(Cc1ccc(Br)cc1F)C(=O)c1c...,P15121,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...,8.251812
1,3308,138805809,COc1nc2ccc([C@@](O)(c3ccccc3)C3CCN(C(C)=O)CC3)...,P51449,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,6.958213
2,3375,118022643,COc1nc2ccc(C(O)(c3ccc(C)nc3C)c3cnnn3C)cc2c(Cl)...,P51449,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,6.221776
3,1175,50943321,Cc1ccc(CN2CCC(N3Cc4cccc(C(N)=O)c4C3=O)CC2)cc1,P11103,MAEASERLYRVQYAKSGRASCKKCSESIPKDSLRMAIMVQSPMFDG...,7.521433
4,818,91899443,CCCC(=O)O[C@@H]1[C@H](C)[C@]2(O)C(CC(=O)C[C@]3...,P17252,MADVFPGNDSTASQDVANRFARKGALRQKNVHEVKDHKFIARFFKQ...,8.455932
5,109,24860409,CCCCCCCOc1ccc(CC(=O)[O-])cc1,O67648,MGLEKTVKEKLSFEGVGIHTGEYSKLIIHPEKEGTGIRFFKNGVYI...,5.58501
6,670,6539085,NS(=O)(=O)c1ccc(NN=C2C(=O)Nc3ccc(C(=O)NCCc4cnc...,P24941,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPS...,6.600153
7,3188,118022426,COc1nc2ccc(C(O)(c3cnc(C)n3C)c3cnc(C)n3C)cc2c(C...,P51449,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,6.08087
8,3133,138805723,COc1nc2ccc([C@@](O)(c3ccccc3)C3CCN(C(C)=O)CC3)...,P51449,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,6.823619
9,1791,121426383,Cc1cccc(Cc2c(C)nc3c(C(=O)O)cc(N4CCOCC4)nn23)c1C,P48736,MELENYKQPVVLREDNCRRRRRMKPRSAAASLSSMELIPIEFVLPT...,5.20999


In [4]:
sequences = df_test.Target.tolist()
smiles_references = df_test.Drug.tolist()

In [5]:
results = []

output_text_path = PROMPT_OUTPUT_FILE_PATH / 'prompt_result_qwen2.5-coder-14b-instruct.txt'

output_text_path.touch(exist_ok=True)

try:
    with open(output_text_path, "w") as output_file:
        for i, sequence in enumerate(sequences, start=1):
            print(f"Processing sequence {i}/{len(sequences)}...")

            messages = [
                {"role": "system", "content": system},
                {"role": "user", "content": preprompt},
                {"role": "user", "content": sequence},
                {"role": "user", "content": postprompt}
            ]

            try:
                response = local_model.call(messages)
                print(f"Response from Local LLM for sequence {i}:")
                print(response)

                smiles_start = response.find("**") + 2
                smiles_end = response.find("**", smiles_start)
                smiles = response[smiles_start:smiles_end].strip() if smiles_start != -1 and smiles_end != -1 else "N/A"
                explanation = response[smiles_end + 2:].strip() if smiles_end != -1 else response.strip()

                results.append({
                    "Sequence": sequence,
                    "SMILES_references": smiles_references[i],
                    "SMILES": smiles,
                    "Explanation": explanation
                })

                output_file.write(f"Sequence {i}:\n")
                output_file.write(f"Input: {sequence}\n")
                output_file.write(f"Output (SMILES): {smiles}\n")
                output_file.write(f"Explanation: {explanation}\n")
                output_file.write("-" * 80 + "\n")
                output_file.write(f"Results {results}:\n")
            
            except Exception as e:
                print(f"Error processing sequence {i}: {e}")

                results.append({
                    "Sequence": sequence,
                    "SMILES_references": smiles_references[i],
                    "SMILES": "Error",
                    "Explanation": str(e)
                })

                output_file.write(f"Sequence {i}: Error - {e}\n")
                output_file.write("-" * 80 + "\n")


 
except Exception as e:
    print(f"Error while processing sequences: {e}")

Processing sequence 1/10...
Response from Local LLM for sequence 1:
To generate a SMILES representation of a ligand that potentially interacts with the given protein sequence, we need to consider several factors such as the amino acid composition, potential binding sites, and functional groups that could interact favorably with the protein. For simplicity, let's assume a basic ligand design focusing on hydrogen bonding and hydrophobic interactions.

### Step 1: Analyze the Protein Sequence
The provided sequence is quite long, so we will focus on identifying key features:
- **Hydrophilic residues**: Asp (D), Glu (E), His (H), Lys (K), Arg (R).
- **Hydrophobic residues**: Leu (L), Met (M), Phe (F), Ile (I), Val (V).

### Step 2: Design the Ligand
Given the presence of hydrophilic and hydrophobic residues, a ligand with both aromatic rings for hydrophobic interactions and polar groups for hydrogen bonding would be suitable.

#### Proposed Ligand:
- **Aromatic ring**: To interact with hydr

In [6]:
df = pd.DataFrame(results)
output_csv_path = PROMPT_OUTPUT_FILE_PATH / 'results.csv'
df.to_csv(output_csv_path, index=False)
print(f"All results have been saved to '{output_csv_path}' and '{output_text_path}'.")

df = pd.read_csv(output_csv_path)

All results have been saved to '/Users/jerzy/Documents/Projects/BioMol-Generator/model_prompting/results.csv' and '/Users/jerzy/Documents/Projects/BioMol-Generator/model_prompting/prompt_result_qwen2.5-coder-14b-instruct.txt'.


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sequence           9 non-null      object
 1   SMILES_references  9 non-null      object
 2   SMILES             8 non-null      object
 3   Explanation        9 non-null      object
dtypes: object(4)
memory usage: 416.0+ bytes


In [8]:
df

Unnamed: 0,Sequence,SMILES_references,SMILES,Explanation
0,MASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCA...,COc1nc2ccc([C@@](O)(c3ccccc3)C3CCN(C(C)=O)CC3)...,Hydrophilic residues,": Asp (D), Glu (E), His (H), Lys (K), Arg (R)...."
1,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,COc1nc2ccc(C(O)(c3ccc(C)nc3C)c3cnnn3C)cc2c(Cl)...,Aromatic Ring,": Forms π-π stacking interactions, which are c..."
2,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,Cc1ccc(CN2CCC(N3Cc4cccc(C(N)=O)c4C3=O)CC2)cc1,Protein Sequence Analysis,: \n - The sequence contains both hydrophobi...
3,MAEASERLYRVQYAKSGRASCKKCSESIPKDSLRMAIMVQSPMFDG...,CCCC(=O)O[C@@H]1[C@H](C)[C@]2(O)C(CC(=O)C[C@]3...,Identify Potential Binding Sites,: Analyze the protein sequence to identify reg...
4,MADVFPGNDSTASQDVANRFARKGALRQKNVHEVKDHKFIARFFKQ...,CCCCCCCOc1ccc(CC(=O)[O-])cc1,Identify Potential Binding Sites:,"- The sequence contains several prolines, whic..."
5,MGLEKTVKEKLSFEGVGIHTGEYSKLIIHPEKEGTGIRFFKNGVYI...,NS(=O)(=O)c1ccc(NN=C2C(=O)Nc3ccc(C(=O)NCCc4cnc...,Identify Key Residues,":\n - Hydrophobic residues: Ala (A), Val (V)..."
6,MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPS...,COc1nc2ccc(C(O)(c3cnc(C)n3C)c3cnc(C)n3C)cc2c(C...,,For the given protein sequence MSHHWGYGKHNGPEH...
7,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,COc1nc2ccc([C@@](O)(c3ccccc3)C3CCN(C(C)=O)CC3)...,SMILES Notation:,`C1=CC(=CC=C1[NH]C2=NC(=CC2=N)N)C3=CC(=CC=C3)C...
8,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,Cc1cccc(Cc2c(C)nc3c(C(=O)O)cc(N4CCOCC4)nn23)c1C,SMILES: `CCOC(=O)C1=CC=C(C=C1)[NH+]CCC`,**Explanation:**\n- **Phenyl ring (C1=CC=C(C=C...
