# Install

In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


In [2]:
# Mount Google Drive at /content/drive; the file paths used by the later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# MAIN

In [3]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
import numpy as np
import pandas as pd

def get_matrix(smiles_file):
  """Build a binary fingerprint matrix from a SMILES file.

  Every non-blank line is expected to contain a SMILES string followed by a
  molecule name. Lines containing a tab are split on tabs; all other lines are
  split on runs of whitespace, so repeated or leading spaces no longer yield
  an empty SMILES or name. Blank lines are skipped, and a line holding only a
  SMILES uses the SMILES itself as the row label instead of raising IndexError.
  Molecules that RDKit cannot parse are skipped.

  Args:
    smiles_file: Path of the text file to read.

  Returns:
    pandas.DataFrame with one row per parsed molecule, indexed by molecule
    name (index name "name"). Columns are the 0/1 bits of the RDKit
    topological fingerprint followed by the bits of the MACCS keys.
  """
  matrix = []
  names = []

  with open(smiles_file) as in_file:
    for line in in_file:
      # Trailing or interleaved blank lines are common in SMILES files.
      if not line.strip():
        continue

      # Tab-separated lines keep their tab semantics (names may contain spaces);
      # everything else is split on arbitrary whitespace.
      if '\t' in line:
        fields = line.split('\t')
      else:
        fields = line.split()

      smiles = fields[0].strip()
      # A missing name column falls back to the SMILES as identifier.
      name = fields[1].strip() if len(fields) > 1 else smiles

      molecule = Chem.MolFromSmiles(smiles)
      if molecule is None:
        # Unparseable SMILES: leave the molecule out of the matrix.
        continue

      # Topological fingerprint (2048 bits)
      fp1 = Chem.RDKFingerprint(molecule)
      # MACCS keys (167 bits)
      fp2 = MACCSkeys.GenMACCSKeys(molecule)
      # Concatenate both bit strings and convert each character to an int.
      bits = fp1.ToBitString() + fp2.ToBitString()
      matrix.append([int(bit) for bit in bits])
      names.append(name)

  df = pd.DataFrame(matrix, index=names)
  df.index.name = "name"
  return df

# CHARLES

In [None]:
# pandas is imported here as well so this cell can run on its own
import pandas as pd

# Input and output locations inside the mounted Drive folder
dir_experiment = "BACES/"
meu_drive = "/content/drive/MyDrive/smiles/" + dir_experiment
input_file_path = meu_drive + "ligantes_parcial.txt"
output_file_path = meu_drive + "ligantes_filtrado.txt"

# Load the tab-delimited ligand table
df = pd.read_csv(input_file_path, sep="\t")

# Keep every row whose source organism is not exactly "Bos taurus"
organism_column = "Target Source Organism According to Curator or DataSource"
not_bos_taurus = df[organism_column].ne("Bos taurus")
df_filtered = df.loc[not_bos_taurus]

# Write the filtered table back out in the same tab-delimited layout
df_filtered.to_csv(output_file_path, sep="\t", index=False)

print(f"Arquivo filtrado salvo em: {output_file_path}")


Arquivo filtrado salvo em: /content/drive/MyDrive/smiles/BACES/ligantes_filtrado.txt


In [4]:
import random
import string

# Names already handed out by gerar_nome_aleatorio; shared across calls so that
# every generated name is unique. Expected to contain only names produced by
# that function.
used_names = set()

def gerar_nome_aleatorio():
    """Return a random 4-letter uppercase name that has not been returned before.

    The name is recorded in the module-level ``used_names`` set before being
    returned, so later calls never repeat it.

    Returns:
        str: Four characters drawn from ``string.ascii_uppercase``.

    Raises:
        RuntimeError: If ``used_names`` already holds as many entries as there
            are possible 4-letter names, in which case the retry loop could
            never terminate.
    """
    alphabet = string.ascii_uppercase
    name_length = 4

    # Without this guard the loop below would spin forever once the name
    # space is exhausted.
    if len(used_names) >= len(alphabet) ** name_length:
        raise RuntimeError("No unused 4-letter names are left")

    while True:
        nome = ''.join(random.choices(alphabet, k=name_length))
        if nome not in used_names:
            used_names.add(nome)
            return nome

# Locate the decoy list to read and the .smi file to produce
dir_experiment = "BACES/"
meu_drive = "/content/drive/MyDrive/smiles/" + dir_experiment
input_file_path = meu_drive + "decoys_only.txt"
output_file_path = meu_drive + "decoys.smi"

# Read every input line up front
with open(input_file_path, 'r') as source:
    lines = source.readlines()

# For each non-blank line, take the first whitespace-separated token as the
# SMILES (adjust the index if the SMILES sits in another column) and pair it
# with a fresh unique random name.
processed_lines = [
    f"{tokens[0]} {gerar_nome_aleatorio()}\n"
    for tokens in map(str.split, lines)
    if tokens
]

# Write the renamed molecules out
with open(output_file_path, 'w') as target:
    target.write(''.join(processed_lines))

print(f"Arquivo processado e salvo em: {output_file_path}")
print(f"Total de moléculas processadas: {len(processed_lines)}")


Arquivo processado e salvo em: /content/drive/MyDrive/smiles/BACES/decoys.smi
Total de moléculas processadas: 1225


In [None]:
# Folder with the SMILES files. Written with "MyDrive" (no space) so it is the
# same location the other cells use, in particular the one that later reads
# smiles_clean.smiles.
smiles_dir = "/content/drive/MyDrive/smiles/BACES/"
smiles_file = smiles_dir + "conjunto_validacao_ligands_com_nome.smiles"
clean_smiles_file = smiles_dir + "smiles_clean.smiles"

# Read the space-separated "<smiles> <name>" file into a DataFrame
smiles_raw = pd.read_csv(smiles_file, sep=" ", header=None, names=["smiles", "name"])

# Drop repeated SMILES strings, keeping the first occurrence of each; the raw
# frame is left untouched so the cell can be re-run or inspected safely.
smiles_data = smiles_raw.drop_duplicates(subset="smiles", keep="first")

# Save the de-duplicated table in the same headerless, space-separated format
smiles_data.to_csv(clean_smiles_file, sep=" ", index=False, header=False)

# Display the de-duplicated DataFrame
smiles_data


Unnamed: 0,smiles,name
0,Cc1cc(sc1CCC(=O)N[C@@H](CC(O)=O)C(O)=O)C(=O)Oc...,YEPH
1,NC(=N)c1ccc(OC(=O)c2ccc(CC3(CCC3)C(=O)N[C@@H](...,VRIT
4,CC(C)(Cc1ccc(s1)C(=O)Oc1ccc(cc1F)C(N)=N)C(=O)N...,BHGZ
5,NC(=N)c1ccc(OC(=O)c2ccc(CN3CSC[C@H]3C(=O)N[C@@...,JDSR
6,CCn1c2ccc(cc2s\c1=N/CC(C)(C)C(O)=O)C(=O)Oc1ccc...,GMKQ
...,...,...
541,NC(=N)c1ccc(OC(=O)c2cnc(s2)N2CCC(CC2)C(O)=O)cc1,ZLHN
542,CC(Cc1ccc(o1)C(=O)Oc1ccc(cc1)C(N)=N)C(=O)N1CCCC1,DJUP
543,CCC(CC)(Cc1ccc(s1)C(=O)Oc1ccc(cc1F)C(N)=N)C(=O...,LFPG
545,Cl.CCC(CC)(Cc1ccc(s1)C(=O)Oc1ccc(cc1F)C(N)=N)C...,VYUS


In [None]:
## CHARLES ##
# Experiment directory holding the ligand and decoy files
# (alternative experiment, currently disabled: dir_experiment = "LIT/TP53/")
dir_experiment = "BACES/"
meu_drive = "/content/drive/MyDrive/smiles/" + dir_experiment
# Path of the ligand SMILES file that is passed to get_matrix
ligand_smiles = meu_drive + "smiles_clean.smiles"

In [None]:
# Build the fingerprint matrix for the ligands, store it as CSV next to the
# input files, and display it.
ligands_df = get_matrix(ligand_smiles)
ligands_matrix_path = meu_drive + "ligands_matrix.csv"
ligands_df.to_csv(ligands_matrix_path)
ligands_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2205,2206,2207,2208,2209,2210,2211,2212,2213,2214
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YEPH,1,0,1,1,1,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
VRIT,1,0,0,1,1,0,0,1,1,0,...,1,1,1,0,1,1,1,1,1,0
BHGZ,1,0,0,1,1,0,0,0,1,0,...,1,1,1,1,1,1,1,1,1,0
JDSR,1,1,0,1,0,1,0,1,0,1,...,1,1,1,0,1,1,1,1,1,0
GMKQ,1,1,0,1,0,1,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZLHN,1,0,0,0,0,1,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
DJUP,1,0,0,0,1,0,0,0,1,0,...,1,1,1,1,1,1,1,1,1,0
LFPG,1,0,0,1,1,0,0,1,1,0,...,1,1,1,1,1,1,1,1,1,0
VYUS,1,0,0,1,1,0,0,1,1,0,...,1,1,1,1,1,1,1,1,1,1
