In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from pandasai.llm.local_llm import LocalLLM
from data.constants import SYSTEM, PREPROMPT, POSTPROMPT, TEMP, LOCAL_SERVER_URL, PROMPT_OUTPUT_FILE_PATH, DATASET_PATH
import requests
import sys


# Resolve the three prompt-fragment locations to plain string paths.
system_file_path, preprompt_file_path, postprompt_file_path = (
    str(SYSTEM),
    str(PREPROMPT),
    str(POSTPROMPT),
)
temp = TEMP

# Read the fragments in a fixed order (system, pre-prompt, post-prompt).
# Any failure is reported on stdout and stops execution with exit status 1.
try:
    prompt_texts = []
    for prompt_path in (system_file_path, preprompt_file_path, postprompt_file_path):
        with open(prompt_path, 'r') as file:
            prompt_texts.append(file.read())
    system, preprompt, postprompt = prompt_texts
except Exception as e:
    # A missing file gets a dedicated message naming the file; everything
    # else is reported with the exception text.
    if isinstance(e, FileNotFoundError):
        print(f"Error: The file '{e.filename}' does not exist.")
    else:
        print(f"Error: {e}")
    sys.exit(1)

class CustomLocalLLM(LocalLLM):
    """LocalLLM subclass whose ``call`` sends a chat request over plain HTTP.

    ``call`` POSTs a JSON payload to ``{api_base}/v1/chat/completions`` and
    returns the text of the first choice in the reply.
    """

    def __init__(self, api_base, model, timeout=None):
        """
        :param api_base: Base URL of the local server; ``/v1/chat/completions``
            is appended to it for every request.
        :param model: Model identifier sent in the ``"model"`` field of each request.
        :param timeout: Optional timeout forwarded to ``requests.post`` (seconds,
            or a ``(connect, read)`` tuple). The default ``None`` waits
            indefinitely, which is what happened before this argument existed.
        """
        super().__init__(api_base=api_base, model=model)
        self.api_base = api_base
        self.model = model
        self.timeout = timeout

    def call(self, messages):
        """
        Send a request to the local model using a list of messages.

        :param messages: List of dicts, each containing "role" and "content".
        :return: The content of the first choice's message in the model reply.
        :raises ValueError: If the HTTP status is not 200, or the reply has no
            usable ``choices[0]["message"]["content"]`` entry.

        Exceptions raised by ``requests.post`` itself (connection errors,
        timeouts) are not caught here and propagate to the caller.
        """
        data = {
            "model": self.model,
            "messages": messages
        }
        headers = {"Content-Type": "application/json"}

        # Strip a trailing slash so a base URL like "http://host:1234/" does
        # not produce "//v1/chat/completions".
        base_url = str(self.api_base).rstrip("/")
        response = requests.post(
            f"{base_url}/v1/chat/completions",
            json=data,
            headers=headers,
            timeout=self.timeout,
        )

        if response.status_code != 200:
            raise ValueError(f"LLM returned an invalid response: {response.text}")

        response_data = response.json()
        choices = response_data.get("choices") if isinstance(response_data, dict) else None
        if not choices:
            raise ValueError("LLM returned an invalid response format.")

        # A malformed first choice is reported with the same ValueError as a
        # missing "choices" list instead of leaking KeyError/TypeError.
        try:
            return choices[0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            raise ValueError("LLM returned an invalid response format.") from exc

# Model client used by the generation loop further down (local_model.call(...)).
# NOTE(review): the model id ends with a trailing "-", while the output file name
# used later is "...-instruct.txt" — confirm this is the identifier the local
# server actually expects.
local_model = CustomLocalLLM(api_base=LOCAL_SERVER_URL, model="qwen2.5-coder-14b-instruct-")


In [2]:
# Load the parquet dataset, then hold out a shuffled 20% as the test split.
# The fixed random_state keeps the split identical across runs.
sequence_df = pd.read_parquet(DATASET_PATH)
df_train, df_test = train_test_split(
    sequence_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [3]:
# Display the held-out split as the cell's output.
df_test

Unnamed: 0,Index,Drug_ID,Drug,Target_ID,Target,Y
16348,32553,16038120,COc1cc(N2CCC(N3CCN(C)CC3)CC2)ccc1Nc1ncc(Cl)c(N...,Q8NI60,MAAILGDTIMVAKGLVKLTQAAVETHLQHLGIGGELIMAARALQST...,5.769526
12311,18110,5329102,CCN(CC)CCNC(=O)c1c(C)[nH]c(/C=C2\C(=O)Nc3ccc(F...,Q9NQU5,MFRKKKKKRPEISAPQNFQHRVHTSFDPKEGKFVGLPPQWQNILDT...,5.619771
13229,20206,447077,CSc1cccc(Nc2ncc3cc(-c4c(Cl)cccc4Cl)c(=O)n(C)c3...,P51955,MPSRAEDYEVLYTIGTGSYGRCQKIRRKSDGKILVWKELDYGSMTE...,6.958213
4421,5309,11723916,CC1=C(/C=C/c2occc2/C=C/C(C)=C/C(=O)O)C(C)(C)CCC1,P19793,MDTKHFLPLDFSTQVNSSLTSPTGRGSMAAPSLHPSLGPGIGSPGQ...,8.148742
1456,1637,126962341,Cc1cc(-n2nc(C(=O)NCC3COC(C)(C)O3)cc2C)nc(N)n1,P08238,MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISN...,4.193820
...,...,...,...,...,...,...
18521,39790,3636,CCCC[N+]1(C)C2CC(OC(=O)C(CO)c3ccccc3)CC1C1OC12,P08483,MTLHSNSTTSPLFPNISSSWVHSPSEAGLPLGTVTQLGSYNISQET...,6.657380
11808,16586,121426227,Cc1nc2c(N)cc(N3CCOCC3)nn2c1Nc1cccc(C(F)(F)F)c1...,P42336,MPPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIK...,5.369969
22858,46015,44589582,CC(C)OC(=O)c1ccc(NC(=O)N[C@@H](Cc2ccc(O)cc2)C(...,P11229,MNTSAPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,7.398918
1608,1799,121426215,Cc1c(Cc2c(CO)nc3c(N)cc(N4CCOCC4)nn23)cccc1C(F)...,O00750,MSSTQGNGEHWKSLESVGISRKELAMAEALQMEYDALSRLRHDKEE...,8.853872


In [4]:
# Two parallel lists taken row-by-row from the test split:
# the "Target" column (sent to the model) and the "Drug" column (reference SMILES).
sequences = df_test["Target"].tolist()
smiles_references = df_test["Drug"].tolist()

In [None]:
results = []

# Both output locations are defined before the try block so that later cells
# can refer to them even if generation aborts part-way.
output_text_path = PROMPT_OUTPUT_FILE_PATH / 'prompt_result_qwen2.5-coder-14b-instruct.txt'
output_csv_path = PROMPT_OUTPUT_FILE_PATH / 'results.csv'

# Make sure the output directory exists; opening the log in "w" mode below
# creates (or truncates) the file itself.
PROMPT_OUTPUT_FILE_PATH.mkdir(parents=True, exist_ok=True)


def _extract_smiles(response):
    """Split a model reply into ``(smiles, explanation)``.

    The SMILES string is taken from the first ``**...**`` span of the reply and
    the explanation is everything after that span. When no complete span is
    present, the SMILES is ``"N/A"`` and the whole reply is the explanation.
    A leading ``SMILES:`` label inside the span is dropped so that only the
    molecule string is stored.
    """
    opening = response.find("**")
    if opening == -1:
        return "N/A", response.strip()

    start = opening + 2
    end = response.find("**", start)
    if end == -1:
        return "N/A", response.strip()

    smiles = response[start:end].strip()
    label = "SMILES:"
    if smiles.upper().startswith(label):
        smiles = smiles[len(label):].strip()

    return smiles, response[end + 2:].strip()


try:
    # Explicit UTF-8: model replies can contain non-ASCII characters, which a
    # locale-dependent default encoding may be unable to write.
    with open(output_text_path, "w", encoding="utf-8") as output_file:
        total = len(sequences)

        # zip keeps every sequence paired with its own reference SMILES; `i` is
        # only a 1-based counter for messages and must not be used as an index.
        for i, (sequence, smiles_reference) in enumerate(zip(sequences, smiles_references), start=1):
            print(f"Processing sequence {i}/{total}...")

            messages = [
                {"role": "system", "content": system},
                {"role": "user", "content": preprompt},
                {"role": "user", "content": sequence},
                {"role": "user", "content": postprompt}
            ]

            try:
                response = local_model.call(messages)
                print(f"Response from Local LLM for sequence {i}:")
                print(response)

                smiles, explanation = _extract_smiles(response)
                log_entry = (
                    f"Sequence {i}:\n"
                    f"Input: {sequence}\n"
                    f"Output (SMILES): {smiles}\n"
                    f"Explanation: {explanation}\n"
                )
            except Exception as e:
                # Best effort: a failed request is recorded as an "Error" row
                # and the loop moves on to the next sequence.
                print(f"Error processing sequence {i}: {e}")
                smiles, explanation = "Error", str(e)
                log_entry = f"Sequence {i}: Error - {e}\n"

            # Exactly one result row and one log entry per sequence.
            results.append({
                "Sequence": sequence,
                "SMILES_references": smiles_reference,
                "SMILES": smiles,
                "Explanation": explanation
            })

            output_file.write(log_entry)
            output_file.write("-" * 80 + "\n")

    df = pd.DataFrame(results)
    df.to_csv(output_csv_path, index=False)
    print(f"All results have been saved to '{output_csv_path}' and '{output_text_path}'.")

except Exception as e:
    print(f"Error while processing sequences: {e}")

Processing sequence 1/4940...
Response from Local LLM for sequence 1:
To generate a small molecule ligand that could potentially interact with the given protein sequence, we need to consider the properties of the protein and design a ligand that can form hydrogen bonds, hydrophobic interactions, or other binding mechanisms. The provided protein sequence is rich in charged residues like glutamate (E), aspartate (D), and lysine (K), suggesting areas where positively or negatively charged ligands could interact.

Here's an example of a molecular structure in SMILES format that might serve as a potential ligand:

**SMILES: COC1=CC2=C(C=C1)OC(C3=CC=CC=C3)=C2**

This is the SMILES representation for **benzyl alcohol**, which has a hydroxyl group (-OH) capable of forming hydrogen bonds with charged residues on the protein. Additionally, it has an aromatic ring that can participate in π-π stacking interactions if the protein has aromatic residues.

For more specific targeting and better bindin

In [None]:
# Reload the saved results from disk. This relies on `pd` (imports cell) and
# `output_csv_path` (generation cell) already existing in the current kernel.
df = pd.read_csv(output_csv_path)

NameError: name 'pd' is not defined

In [2]:
# Display the results frame as the cell's output.
df

NameError: name 'df' is not defined