## Descriptores

### descriptores ligandos

In [53]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdmolfiles import MolFromMol2File, MolFromPDBFile
from mordred import Calculator, descriptors

# Lista de descriptores a calcular

# Ruta de la carpeta que contiene los archivos MOL2
carpeta = '/home/nfernandez/unit/LigandosPDB'

# Lista para almacenar los descriptores
calc = Calculator(descriptors, ignore_3D=False)

data = []

# Función para calcular descriptores
for archivo in os.listdir(carpeta):
    if archivo.endswith('.pdb'):
        ruta_archivo = os.path.join(carpeta, archivo)

        # Cargar la molécula del archivo PDB
        mol = MolFromPDBFile(ruta_archivo, sanitize=True)
        if mol:  # Verifica que la molécula se haya leído correctamente
            descriptores = {'Nombre': archivo}  # Etiqueta con el nombre del archivo
            try:
                # Calcula todos los descriptores de Mordred para la molécula
                descriptores.update(calc(mol).asdict())
                data.append(descriptores)
            except Exception as e:
                print(f"Error calculando descriptores para {archivo}: {e}")

# Itera sobre cada archivo MOL2 en la carpeta
for archivo in os.listdir(carpeta):
    if archivo.endswith('.mol2'):
        ruta_archivo = os.path.join(carpeta, archivo)

        # Cargar la molécula del archivo MOL2
        mol1 = MolFromMol2File(ruta_archivo, sanitize=True)
        if mol1:  # Verifica que la molécula se haya leído correctamente
            # Calcula los descriptores y los almacena en un diccionario
            descriptores = {'Nombre': archivo}  # Etiqueta con el nombre del archivo
            try:
                # Calcula todos los descriptores de Mordred para la molécula
                descriptores.update(calc(mol1).asdict())
                data.append(descriptores)
            except Exception as e:
                print(f"Error calculando descriptores para {archivo}: {e}")

# Convertimos la lista de descriptores en un DataFrame de Pandas
df_descriptores = pd.DataFrame(data)

# Guardamos el DataFrame en un archivo CSV
df_descriptores.to_excel('descriptores_ligandos.xlsx', index=False)
print("Descriptores calculados y guardados en 'descriptores_ligandos.xlsx'")
df_descriptores


Descriptores calculados y guardados en 'descriptores_ligandos.xlsx'


Unnamed: 0,Nombre,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,60.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,9.517541,1.879385,3.758770,9.517541,1.189693,...,7.126891,32.187603,116.120115,4.838338,84,5,26.0,24.0,3.500000,2.250000
1,251.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,8.472136,2.000000,4.000000,8.472136,1.059017,...,7.738488,33.811160,118.099380,5.368154,74,5,30.0,28.0,4.722222,1.916667
2,18.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,20.860105,1.969616,3.939231,20.860105,1.227065,...,8.164226,45.799808,238.229666,5.068716,816,14,62.0,60.0,5.750000,4.500000
3,71.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,24.136411,1.993169,3.986338,24.136411,1.206821,...,8.474077,50.019056,282.255880,5.226961,1313,17,76.0,74.0,7.111111,5.083333
4,25.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,9.517541,1.879385,3.758770,9.517541,1.189693,...,7.126891,32.187603,114.104465,5.186567,84,5,26.0,24.0,3.500000,2.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,136.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,6.987918,1.801938,3.603875,6.987918,1.164653,...,6.608001,28.105124,86.073165,5.379573,35,3,18.0,16.0,3.000000,1.750000
250,229.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,15.750491,1.949856,3.899712,15.750491,1.211576,...,7.826044,40.245095,182.167065,5.204773,364,10,46.0,44.0,4.750000,3.500000
251,171.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,8.054679,1.847759,3.695518,8.054679,1.150668,...,6.900731,30.257210,100.088815,5.267832,56,4,22.0,20.0,3.250000,2.000000
252,94.pdb,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,23.412409,1.975377,3.950753,23.412409,1.232232,...,8.298291,48.426412,266.260966,5.023792,1140,16,70.0,68.0,6.250000,5.000000


### Descriptores proteinas

In [54]:
import pandas as pd
from propy import PyPro

# Leer el archivo Excel
df = protein_sequences

# Asegúrate de que la columna de secuencias de aminoácidos está correctamente identificada
# Por ejemplo, la columna que contiene las secuencias puede llamarse 'AA_Sequence'
# Si no sabes el nombre exacto, puedes ver las columnas con: df.columns

# Función para calcular los descriptores con ProPy
def calculate_descriptors(sequence):
    if len(sequence) == 0:
        print("Secuencia vacía, no se calcularán descriptores.")
        return {}  # Si la secuencia está vacía, devolver un diccionario vacío

    # Calcular los descriptores de la proteína usando ProPy
    protein_descriptor = PyPro.GetProDes(sequence)
    descriptors = protein_descriptor.GetALL()  # Obtener todos los descriptores
    return descriptors

# Lista para almacenar los resultados de los descriptores
descriptor_data = []

# Procesar cada secuencia de aminoácidos en el DataFrame
for _, row in df.iterrows():
    proteina = row['proteina']  # Asumiendo que el nombre de la proteína está en esta columna
    sequence = row['AA Sequence W/O signal peptide']      # Asumiendo que la secuencia está en esta columna

    # Calcular descriptores
    descriptors = calculate_descriptors(sequence)

    # Agregar el nombre de la proteína y los descriptores como una fila del DataFrame
    descriptor_data.append({"proteina": proteina, **descriptors})

# Crear el DataFrame con los descriptores calculados
df_descriptors1 = pd.DataFrame(descriptor_data)

# Mostrar el DataFrame con los descriptores
df_descriptors1

Unnamed: 0,proteina,A,R,N,D,C,E,Q,G,H,...,QSOgrant41,QSOgrant42,QSOgrant43,QSOgrant44,QSOgrant45,QSOgrant46,QSOgrant47,QSOgrant48,QSOgrant49,QSOgrant50
0,CmedPBP4,6.122,2.721,1.361,5.442,4.082,11.565,4.762,9.524,2.721,...,0.029493,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341
1,CpunPBP2,8.451,2.113,2.113,7.746,4.225,11.972,2.817,4.225,4.225,...,0.029432,0.031559,0.031934,0.032102,0.026964,0.029060,0.033730,0.031201,0.029511,0.032615
2,CpunPBP5,8.392,4.196,2.797,6.294,4.196,11.888,2.797,4.196,4.196,...,0.031201,0.030126,0.031509,0.030460,0.027874,0.028269,0.031272,0.030974,0.032708,0.031396
3,CsinGOBP1,4.861,4.167,1.389,6.250,4.861,13.194,4.167,3.472,6.944,...,0.031101,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562
4,CsinGOBP2,8.511,4.255,2.128,7.801,4.255,10.638,2.128,4.255,7.092,...,0.031391,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,DkikPBP2,6.993,2.797,2.797,7.692,4.196,9.091,3.497,5.594,4.196,...,0.029841,0.033287,0.032367,0.031716,0.029321,0.028513,0.031184,0.029517,0.030908,0.032506
106,DhouPBP1,7.143,2.857,5.000,7.857,4.286,8.571,2.857,3.571,5.000,...,0.033032,0.030671,0.032426,0.028554,0.028250,0.031033,0.030718,0.031505,0.032773,0.028406
107,ScerGOBP1,6.207,6.207,2.069,4.138,4.828,13.793,2.759,4.138,8.276,...,0.032404,0.033225,0.033919,0.029509,0.024662,0.030385,0.032298,0.030258,0.028813,0.033485
108,ScerGOBP2,7.801,4.255,2.837,7.092,4.255,12.057,1.418,4.965,5.674,...,0.031197,0.033450,0.031377,0.032264,0.026384,0.031429,0.030621,0.029995,0.031577,0.031415


# Descriptores Docking (SDF y TXT)

## SDF

In [55]:
import os
from openbabel import pybel
import pandas as pd

# Define la ruta principal donde están tus carpetas y archivos SDF
ruta_principal = "/home/nfernandez/unit/DockingResults"

# Lista para almacenar todos los descriptores de cada archivo
datos_descriptores = []

# Recorrer todas las subcarpetas y archivos
for carpeta_raiz, carpetas, archivos in os.walk(ruta_principal):
    for archivo in archivos:
        # Procesar solo archivos con extensión .sdf
        if archivo.endswith(".sdf"):
            ruta_archivo = os.path.join(carpeta_raiz, archivo)

            # Cargar la molécula usando pybel, manejando posibles errores
            try:
                # Intenta leer la primera molécula del archivo SDF
                mol = next(pybel.readfile("sdf", ruta_archivo))
            except StopIteration:
                print(f"Advertencia: El archivo {ruta_archivo} está vacío o no contiene moléculas válidas. Se omitirá.")
                continue  # Salta al siguiente archivo si este está vacío o es inválido

            # Extraer todos los descriptores disponibles
            descriptores = mol.calcdesc()  # Calcula todos los descriptores automáticamente
            descriptores["Masa molecular"] = mol.molwt
            descriptores["Carga"] = mol.charge
            descriptores["Archivo"] = archivo  # Para identificar el archivo en el DataFrame

            # Añadir los descriptores a la lista de datos
            datos_descriptores.append(descriptores)

# Crear un DataFrame con los datos recopilados
df_descriptores_sdf = pd.DataFrame(datos_descriptores)

# Mostrar los primeros registros del DataFrame
df_descriptores_sdf

Advertencia: El archivo /home/nfernandez/unit/DockingResults/CbuqPBP1_254_docking.sdf está vacío o no contiene moléculas válidas. Se omitirá.


Unnamed: 0,abonds,atoms,bonds,cansmi,cansmiNS,dbonds,formula,HBA1,HBA2,HBD,...,rotors,s,sbonds,smarts,tbonds,title,TPSA,Masa molecular,Carga,Archivo
0,0.0,20.0,19.0,,,0.0,,2.0,1.0,1.0,...,16.0,,19.0,,0.0,,20.23,233.19994,0,LbotPBP1_179_docking.sdf
1,0.0,15.0,15.0,,,3.0,,0.0,0.0,0.0,...,4.0,,12.0,,0.0,,0.00,180.16050,0,CsinGOBP2_243_docking.sdf
2,0.0,10.0,10.0,,,2.0,,0.0,0.0,0.0,...,1.0,,8.0,,0.0,,0.00,120.10700,0,CsupGOBP2_62_docking.sdf
3,0.0,10.0,9.0,,,2.0,,2.0,2.0,0.0,...,5.0,,7.0,,0.0,,26.30,128.08440,0,SexiPBP3_7_docking.sdf
4,0.0,9.0,9.0,,,4.0,,1.0,1.0,0.0,...,2.0,,5.0,,0.0,,17.07,112.08500,0,LstiGOBP2_47_docking.sdf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,0.0,10.0,11.0,,,1.0,,0.0,0.0,0.0,...,0.0,,10.0,,0.0,,0.00,120.10700,0,OachPBP1_42_docking.sdf
1454,0.0,17.0,16.0,,,2.0,,1.0,1.0,0.0,...,13.0,,14.0,,0.0,,17.07,208.17060,0,AipsGOBP2_34_docking.sdf
1455,0.0,13.0,12.0,,,1.0,,1.0,1.0,0.0,...,10.0,,11.0,,0.0,,17.07,160.12780,0,CsupGOBP1_17_docking.sdf
1456,0.0,18.0,17.0,,,1.0,,2.0,1.0,1.0,...,13.0,,16.0,,0.0,,20.23,209.17854,0,SinfPBP1_29_docking.sdf


In [56]:
# Reemplazar NaN con 0 en df_descriptores_sdf
df_descriptores_sdf = df_descriptores_sdf.fillna(0)
df_descriptores_sdf

df_descriptores_sdf = df_descriptores_sdf.dropna(axis=1, how='all')  # Eliminar columnas con todos los valores NaN

# Alternatively, if you want to remove columns with only 0s:
df_descriptores_sdf = df_descriptores_sdf.loc[:, (df_descriptores_sdf != 0).any(axis=0)]

df_descriptores_sdf

df_descriptores_sdf['Archivo'] = df_descriptores_sdf['Archivo'].str.replace('_docking', '')
df_descriptores_sdf['Archivo'] = df_descriptores_sdf['Archivo'].str.replace('.sdf', '')
df_descriptores_sdf['proteina'] = df_descriptores_sdf['Archivo'].str.rsplit('_', n=1).str[-2]
df_descriptores_sdf['id'] = df_descriptores_sdf['Archivo'].str.rsplit('_', n=1).str[-1]
cols = list(df_descriptores_sdf.columns)
cols.remove('proteina')
cols.remove('id')
cols.remove('Archivo')
df_descriptores_sdf = df_descriptores_sdf[['proteina', 'id'] + cols]
df_descriptores_sdf

Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,MR,MW,rotors,sbonds,tbonds,TPSA,Masa molecular
0,LbotPBP1,179,20.0,19.0,0.0,2.0,1.0,1.0,1.2866,615.9384,60.3098,233.19994,16.0,19.0,0.0,20.23,233.19994
1,CsinGOBP2,243,15.0,15.0,3.0,0.0,0.0,0.0,1.3671,482.7244,49.1850,180.16050,4.0,12.0,0.0,0.00,180.16050
2,CsupGOBP2,62,10.0,10.0,2.0,0.0,0.0,0.0,0.8868,326.9092,32.7000,120.10700,1.0,8.0,0.0,0.00,120.10700
3,SexiPBP3,7,10.0,9.0,2.0,2.0,2.0,0.0,0.0698,251.3088,28.7930,128.08440,5.0,7.0,0.0,26.30,128.08440
4,LstiGOBP2,47,9.0,9.0,4.0,1.0,1.0,0.0,0.2458,255.3129,28.6645,112.08500,2.0,5.0,0.0,17.07,112.08500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,OachPBP1,42,10.0,11.0,1.0,0.0,0.0,0.0,0.8055,333.2910,31.8900,120.10700,0.0,10.0,0.0,0.00,120.10700
1454,AipsGOBP2,34,17.0,16.0,2.0,1.0,1.0,0.0,0.8224,544.3421,54.3385,208.17060,13.0,14.0,0.0,17.07,208.17060
1455,CsupGOBP1,17,13.0,12.0,1.0,1.0,1.0,0.0,0.4972,407.4669,41.3665,160.12780,10.0,11.0,0.0,17.07,160.12780
1456,SinfPBP1,29,18.0,17.0,1.0,2.0,1.0,1.0,1.1240,547.5008,53.8238,209.17854,13.0,16.0,0.0,20.23,209.17854


## TXT

In [57]:
import os
import re
import pandas as pd

# Función para listar los archivos en una carpeta
def listar_rutas_archivos(ruta_carpeta):
    rutas_archivos = []
    for root, dirs, files in os.walk(ruta_carpeta):
        for archivo in files:
            rutas_archivos.append(os.path.join(root, archivo))
    return rutas_archivos

# Función para crear el DataFrame a partir de los archivos logs
def create_dataframe_log_from_path_list(rutasLogs):
    data_log = []

    # Expresión regular para extraer las columnas: mode, affinity, RMSD l.b., RMSD u.b.
    mode_pattern = re.compile(r'^\s*(\d+)\s+(-?\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

    # Iterar sobre las rutas de los archivos log
    for log_path in rutasLogs:
        list_path = listar_rutas_archivos(log_path)

        for file_path in list_path:
            # Variables para almacenar datos
            modes = []
            affinities = []
            rmsd_lb = []
            rmsd_ub = []

            # Lectura del archivo línea por línea
            with open(file_path, 'r') as file:
                for line in file:
                    mode_match = mode_pattern.match(line)
                    if mode_match:
                        modes.append(int(mode_match.group(1)))
                        affinities.append(float(mode_match.group(2)))
                        rmsd_lb.append(float(mode_match.group(3)))
                        rmsd_ub.append(float(mode_match.group(4)))

            if modes and affinities and rmsd_lb and rmsd_ub:
                df = pd.DataFrame({
                    'Mode': modes,
                    'Affinity (kcal/mol)': affinities,
                    'RMSD l.b.': rmsd_lb,
                    'RMSD u.b.': rmsd_ub
                })

                # Transformar los datos en un formato apropiado y agregar el nombre del archivo
                transformed_data = {}
                for index, row in df.iterrows():
                    prefix = f"Fila {index + 1} - "
                    transformed_data[prefix + 'Mode'] = row['Mode']
                    transformed_data[prefix + 'Affinity (kcal/mol)'] = row['Affinity (kcal/mol)']
                    transformed_data[prefix + 'RMSD l.b.'] = row['RMSD l.b.']
                    transformed_data[prefix + 'RMSD u.b.'] = row['RMSD u.b.']

                # Obtener el nombre del ligando (nombre del archivo sin la extensión)
                ligand_name = os.path.splitext(os.path.basename(file_path))[0]
                df_transformed = pd.DataFrame(transformed_data, index=[0])
                df_transformed['Archivo'] = ligand_name
                df_transformed = pd.concat([df_transformed['Archivo'], df_transformed.drop('Archivo', axis=1)], axis=1)
                data_log.append(df_transformed)

    # Combinar todos los DataFrames en uno solo
    log_df = pd.concat(data_log, ignore_index=True)
    df_cleaned = log_df.dropna(how='all')

    return df_cleaned

# Ruta a la carpeta que contiene los archivos txt
carpeta_txt = '/home/nfernandez/unit/DockingLogs'  # Cambia esto por la ruta de tu carpeta

# Listar las rutas de los archivos logs
rutasLogs = [carpeta_txt]  # Si tienes más carpetas, agrégalas aquí

# Crear el DataFrame
df_final = create_dataframe_log_from_path_list(rutasLogs)
df_final['Archivo'] = df_final['Archivo'].str.replace('log_', '')
df_final['proteina'] = df_final['Archivo'].str.rsplit('_', n=1).str[-2]
df_final['id'] = df_final['Archivo'].str.rsplit('_', n=1).str[-1]
cols = list(df_final.columns)
cols.remove('proteina')
cols.remove('id')
cols.remove('Archivo')
df_final = df_final[['proteina', 'id'] + cols]
df_final = df_final.fillna(0)

# Guardar el DataFrame en un archivo xlsx
df_final.to_excel('dataset_extraido.xlsx', index=False)

# Mostrar el DataFrame
df_final

Unnamed: 0,proteina,id,Fila 1 - Mode,Fila 1 - Affinity (kcal/mol),Fila 1 - RMSD l.b.,Fila 1 - RMSD u.b.,Fila 2 - Mode,Fila 2 - Affinity (kcal/mol),Fila 2 - RMSD l.b.,Fila 2 - RMSD u.b.,...,Fila 48 - RMSD l.b.,Fila 48 - RMSD u.b.,Fila 49 - Mode,Fila 49 - Affinity (kcal/mol),Fila 49 - RMSD l.b.,Fila 49 - RMSD u.b.,Fila 50 - Mode,Fila 50 - Affinity (kcal/mol),Fila 50 - RMSD l.b.,Fila 50 - RMSD u.b.
0,SlitPBP1,55,1.0,-7.2,0.0,0.0,2.0,-7.1,1.081,4.706,...,3.247,4.544,49.0,-6.0,3.803,7.998,50.0,-6.0,2.860,3.788
1,SexiPBP2,195,1.0,-7.2,0.0,0.0,2.0,-7.2,1.293,3.186,...,5.576,9.155,49.0,-6.1,2.676,3.942,50.0,-6.1,5.997,9.166
2,AipsPBP1,113,1.0,-6.7,0.0,0.0,2.0,-6.6,1.553,2.897,...,3.901,6.539,49.0,-5.8,4.164,6.016,50.0,-5.8,4.635,6.176
3,SexiPBP1,78,1.0,-6.5,0.0,0.0,2.0,-6.3,1.632,3.181,...,2.856,6.316,49.0,-5.5,4.070,6.519,50.0,-5.5,2.978,5.257
4,HassPBP1,17,1.0,-5.3,0.0,0.0,2.0,-5.2,1.361,2.087,...,3.714,5.705,49.0,-4.5,4.728,7.087,50.0,-4.5,2.613,6.066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,AipsGOBP2,15,1.0,-4.5,0.0,0.0,2.0,-4.5,1.038,1.329,...,4.635,5.672,49.0,-3.5,4.698,5.444,0.0,0.0,0.000,0.000
1454,CpinPBP2,48,1.0,-5.6,0.0,0.0,2.0,-5.6,1.191,1.601,...,3.346,4.699,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
1455,LbotPBP1,27,1.0,-8.4,0.0,0.0,2.0,-8.4,0.872,1.671,...,2.913,6.643,49.0,-7.1,2.476,4.522,50.0,-7.1,1.927,6.583
1456,LbotPBP1,197,1.0,-6.7,0.0,0.0,2.0,-6.7,3.283,6.238,...,1.729,2.325,49.0,-5.8,2.630,5.562,50.0,-5.8,3.083,5.878


In [58]:
df_final_merged = pd.merge(df_descriptores_sdf, df_final, on=['proteina', 'id'], how='left')

# Display the merged DataFrame
df_final_merged.info()
df_final_merged

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Columns: 217 entries, proteina to Fila 50 - RMSD u.b.
dtypes: float64(215), object(2)
memory usage: 2.4+ MB


Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,Fila 48 - RMSD l.b.,Fila 48 - RMSD u.b.,Fila 49 - Mode,Fila 49 - Affinity (kcal/mol),Fila 49 - RMSD l.b.,Fila 49 - RMSD u.b.,Fila 50 - Mode,Fila 50 - Affinity (kcal/mol),Fila 50 - RMSD l.b.,Fila 50 - RMSD u.b.
0,LbotPBP1,179,20.0,19.0,0.0,2.0,1.0,1.0,1.2866,615.9384,...,1.981,6.813,49.0,-6.7,2.549,6.795,50.0,-6.6,1.847,6.872
1,CsinGOBP2,243,15.0,15.0,3.0,0.0,0.0,0.0,1.3671,482.7244,...,2.006,3.919,49.0,-5.8,2.159,5.299,0.0,0.0,0.000,0.000
2,CsupGOBP2,62,10.0,10.0,2.0,0.0,0.0,0.0,0.8868,326.9092,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
3,SexiPBP3,7,10.0,9.0,2.0,2.0,2.0,0.0,0.0698,251.3088,...,5.215,6.842,49.0,-4.2,4.425,6.552,50.0,-4.2,4.474,6.069
4,LstiGOBP2,47,9.0,9.0,4.0,1.0,1.0,0.0,0.2458,255.3129,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,OachPBP1,42,10.0,11.0,1.0,0.0,0.0,0.0,0.8055,333.2910,...,2.492,4.377,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
1454,AipsGOBP2,34,17.0,16.0,2.0,1.0,1.0,0.0,0.8224,544.3421,...,1.851,5.436,49.0,-6.0,2.762,6.664,50.0,-5.9,1.920,3.403
1455,CsupGOBP1,17,13.0,12.0,1.0,1.0,1.0,0.0,0.4972,407.4669,...,4.934,7.402,49.0,-5.4,3.148,6.469,50.0,-5.4,3.848,5.166
1456,SinfPBP1,29,18.0,17.0,1.0,2.0,1.0,1.0,1.1240,547.5008,...,3.267,7.109,49.0,-5.7,3.221,5.679,50.0,-5.6,4.221,6.153


In [59]:
resultate=result_df.copy()
resultate

Unnamed: 0,Compound Name,id,Smiles,Protein,UniProtID,AA Sequence W/O signal peptide,proteina,Binding_Affinity
0,ionone (beta),1,CC(=O)/C=C/C1=C(C)CCCC1(C)C,CmedPBP4,M4XYS4,MEVEMLPEGMKQLTGGFIKVFEACKTELGLKDGMLTDMYHLWREEY...,CmedPBP4,7.13
1,ionone (beta),1,CC(=O)/C=C/C1=C(C)CCCC1(C)C,CpunPBP2,A0A0M4F9J4,MMKDMTKNFLKAYGECQQELHLTDDTARDLMFFWKEDYEVTSREAG...,CpunPBP2,10.06
2,ionone (beta),1,CC(=O)/C=C/C1=C(C)CCCC1(C)C,CpunPBP5,A0A0M4FIS7,SQEVMKKMSATFFKLLEECKKELSVTDDMIQGLVRFWLEDSALGER...,CpunPBP5,9.85
3,ionone (beta),1,CC(=O)/C=C/C1=C(C)CCCC1(C)C,CsinGOBP1,A0A0K0MNP5,KVEVMKDVTLGFGEALQHCREQSQLTEEKMEEFFHFWRDDFKFEHR...,CsinGOBP1,12.93
4,ionone (beta),1,CC(=O)/C=C/C1=C(C)CCCC1(C)C,CsinGOBP2,A0A0K0MN53,TAEIMSHVTAHFGKLLEECRQESGLTTDILEEFQHFWREDFEVVHR...,CsinGOBP2,30
...,...,...,...,...,...,...,...,...
1454,2-methyl-3-pentanol,252,CCC(O)C(C)C,CsinGOBP1,A0A0K0MNP5,KVEVMKDVTLGFGEALQHCREQSQLTEEKMEEFFHFWRDDFKFEHR...,CsinGOBP1,30
1455,2-methyl-3-pentanol,252,CCC(O)C(C)C,CsinGOBP2,A0A0K0MN53,TAEIMSHVTAHFGKLLEECRQESGLTTDILEEFQHFWREDFEVVHR...,CsinGOBP2,9.57
1456,Methyl benzyl ether,253,COCc1ccccc1,CsinGOBP1,A0A0K0MNP5,KVEVMKDVTLGFGEALQHCREQSQLTEEKMEEFFHFWRDDFKFEHR...,CsinGOBP1,24.11
1457,Methyl benzyl ether,253,COCc1ccccc1,CsinGOBP2,A0A0K0MN53,TAEIMSHVTAHFGKLLEECRQESGLTTDILEEFQHFWREDFEVVHR...,CsinGOBP2,30


In [60]:
print(f"Columns in df_final_merged: {df_final_merged.columns.tolist()}")
print(f"Columns in resulate: {resultate.columns.tolist()}")

Columns in df_final_merged: ['proteina', 'id', 'atoms', 'bonds', 'dbonds', 'HBA1', 'HBA2', 'HBD', 'logP', 'MP', 'MR', 'MW', 'rotors', 'sbonds', 'tbonds', 'TPSA', 'Masa molecular', 'Fila 1 - Mode', 'Fila 1 - Affinity (kcal/mol)', 'Fila 1 - RMSD l.b.', 'Fila 1 - RMSD u.b.', 'Fila 2 - Mode', 'Fila 2 - Affinity (kcal/mol)', 'Fila 2 - RMSD l.b.', 'Fila 2 - RMSD u.b.', 'Fila 3 - Mode', 'Fila 3 - Affinity (kcal/mol)', 'Fila 3 - RMSD l.b.', 'Fila 3 - RMSD u.b.', 'Fila 4 - Mode', 'Fila 4 - Affinity (kcal/mol)', 'Fila 4 - RMSD l.b.', 'Fila 4 - RMSD u.b.', 'Fila 5 - Mode', 'Fila 5 - Affinity (kcal/mol)', 'Fila 5 - RMSD l.b.', 'Fila 5 - RMSD u.b.', 'Fila 6 - Mode', 'Fila 6 - Affinity (kcal/mol)', 'Fila 6 - RMSD l.b.', 'Fila 6 - RMSD u.b.', 'Fila 7 - Mode', 'Fila 7 - Affinity (kcal/mol)', 'Fila 7 - RMSD l.b.', 'Fila 7 - RMSD u.b.', 'Fila 8 - Mode', 'Fila 8 - Affinity (kcal/mol)', 'Fila 8 - RMSD l.b.', 'Fila 8 - RMSD u.b.', 'Fila 9 - Mode', 'Fila 9 - Affinity (kcal/mol)', 'Fila 9 - RMSD l.b.', 'Fila

In [61]:
resultate['id'] = resultate['id'].astype(str)
df_final_merged['id'] = df_final_merged['id'].astype(str)

# Realizar una combinación para asegurar que solo se mantengan las filas coincidentes
df_final_merged = resultate[['proteina', 'id']].merge(df_final_merged, on=['proteina', 'id'], how='left')
df_final_merged = df_final_merged.dropna(subset=['atoms'])

df_final_merged

Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,Fila 48 - RMSD l.b.,Fila 48 - RMSD u.b.,Fila 49 - Mode,Fila 49 - Affinity (kcal/mol),Fila 49 - RMSD l.b.,Fila 49 - RMSD u.b.,Fila 50 - Mode,Fila 50 - Affinity (kcal/mol),Fila 50 - RMSD l.b.,Fila 50 - RMSD u.b.
0,CmedPBP4,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
1,CpunPBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
2,CpunPBP5,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
3,CsinGOBP1,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
4,CsinGOBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,CmedPBP4,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,3.116,4.677,49.0,-3.7,3.743,4.686,50.0,-3.7,2.937,3.323
1454,CsinGOBP1,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,2.835,3.286,49.0,-3.9,6.512,6.919,50.0,-3.9,4.373,5.530
1455,CsinGOBP2,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,2.066,2.486,49.0,-3.8,3.263,5.034,50.0,-3.7,2.464,3.373
1456,CsinGOBP1,253,9.0,9.0,3.0,1.0,1.0,0.0,0.6558,249.5221,...,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000


In [62]:
# Fusionar los DataFrames resultate y df_final_merged usando 'id' y 'proteina' como claves
df_final_merged = pd.merge(df_final_merged, resultate[['id', 'proteina', 'Binding_Affinity']], on=['id', 'proteina'], how='left')

# Mostrar el DataFrame actualizado
df_final_merged

Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,Fila 48 - RMSD u.b.,Fila 49 - Mode,Fila 49 - Affinity (kcal/mol),Fila 49 - RMSD l.b.,Fila 49 - RMSD u.b.,Fila 50 - Mode,Fila 50 - Affinity (kcal/mol),Fila 50 - RMSD l.b.,Fila 50 - RMSD u.b.,Binding_Affinity
0,CmedPBP4,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,7.13
1,CpunPBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,10.06
2,CpunPBP5,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,9.85
3,CsinGOBP1,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,12.93
4,CsinGOBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,CmedPBP4,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,4.677,49.0,-3.7,3.743,4.686,50.0,-3.7,2.937,3.323,50
1454,CsinGOBP1,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,3.286,49.0,-3.9,6.512,6.919,50.0,-3.9,4.373,5.530,30
1455,CsinGOBP2,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,2.486,49.0,-3.8,3.263,5.034,50.0,-3.7,2.464,3.373,9.57
1456,CsinGOBP1,253,9.0,9.0,3.0,1.0,1.0,0.0,0.6558,249.5221,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,24.11


In [63]:
# Ruta donde se guardará el archivo Excel
ruta_excel = '/home/nfernandez/unit/dataset_docking.xlsx'

# Guardar el DataFrame en un archivo Excel
df_final_merged.to_excel(ruta_excel, index=False)

print(f"DataFrame guardado en: {ruta_excel}")

DataFrame guardado en: /home/nfernandez/unit/dataset_docking.xlsx


### Limpieza de descriptores ligandos

In [64]:
df_ligando= df_descriptores.copy()
df_ligando


# Reemplazar los valores NaN en el DataFrame df_ligando con 0
df_ligando = df_ligando.fillna(0)

# Eliminar columnas con todos los valores iguales a 0
df_ligando = df_ligando.loc[:, (df_ligando != 0).any(axis=0)]
if 'ABC' in df_ligando.columns:
    df_ligando = df_ligando.drop('ABC', axis=1)
if 'ABCGG' in df_ligando.columns:
    df_ligando = df_ligando.drop('ABCGG', axis=1)

# Rename 'Nombre' column to 'id' and remove '.pdb' from the 'id' column
df_ligandos = df_ligando.rename(columns={'Nombre': 'id'})
df_ligandos['id'] = df_ligandos['id'].str.replace('.pdb', '')

# Iterar sobre las columnas del dataframe
for col in df_ligandos.columns:
    # Excluir las columnas 'proteina' y 'Compound name' del proceso de conversión
    if col not in ['id'] and df_ligandos[col].dtype == 'object':
        try:
            # Intentar convertir a tipo numérico
            df_ligandos[col] = pd.to_numeric(df_ligandos[col], errors='coerce')
        except ValueError:
            # Si la conversión falla, deja la columna como está
            print(f"No se pudo convertir la columna {col} a numérico.")

# Reemplazar los valores booleanos con enteros en df_ligandos
df_ligandos = df_ligandos.applymap(lambda x: 1 if x == True else (0 if x == False else x))


# Specify the path where you want to save the dataset
file_path = '/home/nfernandez/unit/dataset_ligandos.xlsx'

# Save the DataFrame to an Excel file
df_ligandos.to_excel(file_path, index=False)

print(f"Dataset saved to: {file_path}")
df_ligandos.info()
df_ligandos

  df_ligandos = df_ligandos.applymap(lambda x: 1 if x == True else (0 if x == False else x))


Dataset saved to: /home/nfernandez/unit/dataset_ligandos.xlsx
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Columns: 1625 entries, id to mZagreb2
dtypes: float64(1493), int64(131), object(1)
memory usage: 3.1+ MB


Unnamed: 0,id,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,60,0,0,9.517541,1.879385,3.758770,9.517541,1.189693,2.876615,2.673468,...,7.126891,32.187603,116.120115,4.838338,84,5,26.0,24.0,3.500000,2.250000
1,251,0,0,8.472136,2.000000,4.000000,8.472136,1.059017,2.899228,2.683282,...,7.738488,33.811160,118.099380,5.368154,74,5,30.0,28.0,4.722222,1.916667
2,18,0,0,20.860105,1.969616,3.939231,20.860105,1.227065,3.644675,3.810017,...,8.164226,45.799808,238.229666,5.068716,816,14,62.0,60.0,5.750000,4.500000
3,71,0,0,24.136411,1.993169,3.986338,24.136411,1.206821,3.813575,4.084374,...,8.474077,50.019056,282.255880,5.226961,1313,17,76.0,74.0,7.111111,5.083333
4,25,0,0,9.517541,1.879385,3.758770,9.517541,1.189693,2.876615,2.673468,...,7.126891,32.187603,114.104465,5.186567,84,5,26.0,24.0,3.500000,2.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,136,0,0,6.987918,1.801938,3.603875,6.987918,1.164653,2.579830,2.341896,...,6.608001,28.105124,86.073165,5.379573,35,3,18.0,16.0,3.000000,1.750000
250,229,0,0,15.750491,1.949856,3.899712,15.750491,1.211576,3.372523,3.354527,...,7.826044,40.245095,182.167065,5.204773,364,10,46.0,44.0,4.750000,3.500000
251,171,0,0,8.054679,1.847759,3.695518,8.054679,1.150668,2.739193,2.513670,...,6.900731,30.257210,100.088815,5.267832,56,4,22.0,20.0,3.250000,2.000000
252,94,0,0,23.412409,1.975377,3.950753,23.412409,1.232232,3.757227,4.018055,...,8.298291,48.426412,266.260966,5.023792,1140,16,70.0,68.0,6.250000,5.000000


### Limpieza descriptores Proteinas

In [65]:
df_protein = df_descriptors1.copy()
df_protein

# Reemplazar los valores NaN en el DataFrame df_protein con 0
df_protein = df_protein.fillna(0)

# Eliminar columnas con todos los valores iguales a 0
df_protein = df_protein.loc[:, (df_protein != 0).any(axis=0)]

# Specify the path where you want to save the dataset
file_path = '/home/nfernandez/unit/dataset_proteinas.xlsx'

# Save the DataFrame to an Excel file
df_protein.to_excel(file_path, index=False)

print(f"Dataset saved to: {file_path}")

df_protein.info()
df_protein

Dataset saved to: /home/nfernandez/unit/dataset_proteinas.xlsx
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Columns: 1505 entries, proteina to QSOgrant50
dtypes: float64(1504), object(1)
memory usage: 1.3+ MB


Unnamed: 0,proteina,A,R,N,D,C,E,Q,G,H,...,QSOgrant41,QSOgrant42,QSOgrant43,QSOgrant44,QSOgrant45,QSOgrant46,QSOgrant47,QSOgrant48,QSOgrant49,QSOgrant50
0,CmedPBP4,6.122,2.721,1.361,5.442,4.082,11.565,4.762,9.524,2.721,...,0.029493,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341
1,CpunPBP2,8.451,2.113,2.113,7.746,4.225,11.972,2.817,4.225,4.225,...,0.029432,0.031559,0.031934,0.032102,0.026964,0.029060,0.033730,0.031201,0.029511,0.032615
2,CpunPBP5,8.392,4.196,2.797,6.294,4.196,11.888,2.797,4.196,4.196,...,0.031201,0.030126,0.031509,0.030460,0.027874,0.028269,0.031272,0.030974,0.032708,0.031396
3,CsinGOBP1,4.861,4.167,1.389,6.250,4.861,13.194,4.167,3.472,6.944,...,0.031101,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562
4,CsinGOBP2,8.511,4.255,2.128,7.801,4.255,10.638,2.128,4.255,7.092,...,0.031391,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,DkikPBP2,6.993,2.797,2.797,7.692,4.196,9.091,3.497,5.594,4.196,...,0.029841,0.033287,0.032367,0.031716,0.029321,0.028513,0.031184,0.029517,0.030908,0.032506
106,DhouPBP1,7.143,2.857,5.000,7.857,4.286,8.571,2.857,3.571,5.000,...,0.033032,0.030671,0.032426,0.028554,0.028250,0.031033,0.030718,0.031505,0.032773,0.028406
107,ScerGOBP1,6.207,6.207,2.069,4.138,4.828,13.793,2.759,4.138,8.276,...,0.032404,0.033225,0.033919,0.029509,0.024662,0.030385,0.032298,0.030258,0.028813,0.033485
108,ScerGOBP2,7.801,4.255,2.837,7.092,4.255,12.057,1.418,4.965,5.674,...,0.031197,0.033450,0.031377,0.032264,0.026384,0.031429,0.030621,0.029995,0.031577,0.031415


# Armado de Dataset

In [66]:
df_final_merged1 = df_final_merged.copy()
df_final_merged1

Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,Fila 48 - RMSD u.b.,Fila 49 - Mode,Fila 49 - Affinity (kcal/mol),Fila 49 - RMSD l.b.,Fila 49 - RMSD u.b.,Fila 50 - Mode,Fila 50 - Affinity (kcal/mol),Fila 50 - RMSD l.b.,Fila 50 - RMSD u.b.,Binding_Affinity
0,CmedPBP4,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,7.13
1,CpunPBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,10.06
2,CpunPBP5,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,9.85
3,CsinGOBP1,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,12.93
4,CsinGOBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,CmedPBP4,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,4.677,49.0,-3.7,3.743,4.686,50.0,-3.7,2.937,3.323,50
1454,CsinGOBP1,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,3.286,49.0,-3.9,6.512,6.919,50.0,-3.9,4.373,5.530,30
1455,CsinGOBP2,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,2.486,49.0,-3.8,3.263,5.034,50.0,-3.7,2.464,3.373,9.57
1456,CsinGOBP1,253,9.0,9.0,3.0,1.0,1.0,0.0,0.6558,249.5221,...,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.000,0.000,24.11


In [67]:
df_ligan2= df_ligandos.copy()
df_ligan2

Unnamed: 0,id,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,60,0,0,9.517541,1.879385,3.758770,9.517541,1.189693,2.876615,2.673468,...,7.126891,32.187603,116.120115,4.838338,84,5,26.0,24.0,3.500000,2.250000
1,251,0,0,8.472136,2.000000,4.000000,8.472136,1.059017,2.899228,2.683282,...,7.738488,33.811160,118.099380,5.368154,74,5,30.0,28.0,4.722222,1.916667
2,18,0,0,20.860105,1.969616,3.939231,20.860105,1.227065,3.644675,3.810017,...,8.164226,45.799808,238.229666,5.068716,816,14,62.0,60.0,5.750000,4.500000
3,71,0,0,24.136411,1.993169,3.986338,24.136411,1.206821,3.813575,4.084374,...,8.474077,50.019056,282.255880,5.226961,1313,17,76.0,74.0,7.111111,5.083333
4,25,0,0,9.517541,1.879385,3.758770,9.517541,1.189693,2.876615,2.673468,...,7.126891,32.187603,114.104465,5.186567,84,5,26.0,24.0,3.500000,2.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,136,0,0,6.987918,1.801938,3.603875,6.987918,1.164653,2.579830,2.341896,...,6.608001,28.105124,86.073165,5.379573,35,3,18.0,16.0,3.000000,1.750000
250,229,0,0,15.750491,1.949856,3.899712,15.750491,1.211576,3.372523,3.354527,...,7.826044,40.245095,182.167065,5.204773,364,10,46.0,44.0,4.750000,3.500000
251,171,0,0,8.054679,1.847759,3.695518,8.054679,1.150668,2.739193,2.513670,...,6.900731,30.257210,100.088815,5.267832,56,4,22.0,20.0,3.250000,2.000000
252,94,0,0,23.412409,1.975377,3.950753,23.412409,1.232232,3.757227,4.018055,...,8.298291,48.426412,266.260966,5.023792,1140,16,70.0,68.0,6.250000,5.000000


In [68]:
df_protein2 = df_protein.copy()
df_protein2

Unnamed: 0,proteina,A,R,N,D,C,E,Q,G,H,...,QSOgrant41,QSOgrant42,QSOgrant43,QSOgrant44,QSOgrant45,QSOgrant46,QSOgrant47,QSOgrant48,QSOgrant49,QSOgrant50
0,CmedPBP4,6.122,2.721,1.361,5.442,4.082,11.565,4.762,9.524,2.721,...,0.029493,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341
1,CpunPBP2,8.451,2.113,2.113,7.746,4.225,11.972,2.817,4.225,4.225,...,0.029432,0.031559,0.031934,0.032102,0.026964,0.029060,0.033730,0.031201,0.029511,0.032615
2,CpunPBP5,8.392,4.196,2.797,6.294,4.196,11.888,2.797,4.196,4.196,...,0.031201,0.030126,0.031509,0.030460,0.027874,0.028269,0.031272,0.030974,0.032708,0.031396
3,CsinGOBP1,4.861,4.167,1.389,6.250,4.861,13.194,4.167,3.472,6.944,...,0.031101,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562
4,CsinGOBP2,8.511,4.255,2.128,7.801,4.255,10.638,2.128,4.255,7.092,...,0.031391,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,DkikPBP2,6.993,2.797,2.797,7.692,4.196,9.091,3.497,5.594,4.196,...,0.029841,0.033287,0.032367,0.031716,0.029321,0.028513,0.031184,0.029517,0.030908,0.032506
106,DhouPBP1,7.143,2.857,5.000,7.857,4.286,8.571,2.857,3.571,5.000,...,0.033032,0.030671,0.032426,0.028554,0.028250,0.031033,0.030718,0.031505,0.032773,0.028406
107,ScerGOBP1,6.207,6.207,2.069,4.138,4.828,13.793,2.759,4.138,8.276,...,0.032404,0.033225,0.033919,0.029509,0.024662,0.030385,0.032298,0.030258,0.028813,0.033485
108,ScerGOBP2,7.801,4.255,2.837,7.092,4.255,12.057,1.418,4.965,5.674,...,0.031197,0.033450,0.031377,0.032264,0.026384,0.031429,0.030621,0.029995,0.031577,0.031415


In [69]:
df_merged12 = pd.merge(df_final_merged1, df_ligan2, on='id', how='inner')
# Move the 'Binding_Affinity' column to the end of the DataFrame
cols = list(df_merged12.columns)
cols.remove('Binding_Affinity')
df_merged12 = df_merged12[cols + ['Binding_Affinity']]
df_merged12

Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,TSRW10,MW_y,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Binding_Affinity
0,CmedPBP4,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,45.596315,192.151415,5.651512,301,19,68.0,76.0,6.645833,3.069444,7.13
1,CpunPBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,45.596315,192.151415,5.651512,301,19,68.0,76.0,6.645833,3.069444,10.06
2,CpunPBP5,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,45.596315,192.151415,5.651512,301,19,68.0,76.0,6.645833,3.069444,9.85
3,CsinGOBP1,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,45.596315,192.151415,5.651512,301,19,68.0,76.0,6.645833,3.069444,12.93
4,CsinGOBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,45.596315,192.151415,5.651512,301,19,68.0,76.0,6.645833,3.069444,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,CmedPBP4,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,32.688753,102.104465,4.862117,46,6,26.0,26.0,4.472222,1.777778,50
1454,CsinGOBP1,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,32.688753,102.104465,4.862117,46,6,26.0,26.0,4.472222,1.777778,30
1455,CsinGOBP2,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,32.688753,102.104465,4.862117,46,6,26.0,26.0,4.472222,1.777778,9.57
1456,CsinGOBP1,253,9.0,9.0,3.0,1.0,1.0,0.0,0.6558,249.5221,...,36.722228,122.073165,6.424903,94,8,38.0,40.0,2.861111,2.250000,24.11


In [70]:
df_merged3 = pd.merge(df_merged12, df_protein2, on='proteina', how='inner')
cols = list(df_merged3.columns)
cols.remove('Binding_Affinity')
df_merged3 = df_merged3[cols + ['Binding_Affinity']]

# Convert 'Binding_Affinity' column to float type
df_merged3['Binding_Affinity'] = df_merged3['Binding_Affinity'].astype(float)

df_merged3.info()
df_merged3

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Columns: 3346 entries, proteina to Binding_Affinity
dtypes: float64(3213), int64(131), object(2)
memory usage: 37.2+ MB


Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,QSOgrant42,QSOgrant43,QSOgrant44,QSOgrant45,QSOgrant46,QSOgrant47,QSOgrant48,QSOgrant49,QSOgrant50,Binding_Affinity
0,CmedPBP4,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341,7.13
1,CpunPBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.031559,0.031934,0.032102,0.026964,0.029060,0.033730,0.031201,0.029511,0.032615,10.06
2,CpunPBP5,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.030126,0.031509,0.030460,0.027874,0.028269,0.031272,0.030974,0.032708,0.031396,9.85
3,CsinGOBP1,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562,12.93
4,CsinGOBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829,30.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,CmedPBP4,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341,50.00
1454,CsinGOBP1,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562,30.00
1455,CsinGOBP2,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829,9.57
1456,CsinGOBP1,253,9.0,9.0,3.0,1.0,1.0,0.0,0.6558,249.5221,...,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562,24.11


In [71]:
# Reemplazar NaN con 0 en df_merged3
df_merged3 = df_merged3.fillna(0)
df_merged3 = df_merged3.loc[:, (df_merged3 != 0).any(axis=0)]

# Now 'df_merged3' will have only columns with at least one non-zero value.
df_merged3

Unnamed: 0,proteina,id,atoms,bonds,dbonds,HBA1,HBA2,HBD,logP,MP,...,QSOgrant42,QSOgrant43,QSOgrant44,QSOgrant45,QSOgrant46,QSOgrant47,QSOgrant48,QSOgrant49,QSOgrant50,Binding_Affinity
0,CmedPBP4,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341,7.13
1,CpunPBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.031559,0.031934,0.032102,0.026964,0.029060,0.033730,0.031201,0.029511,0.032615,10.06
2,CpunPBP5,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.030126,0.031509,0.030460,0.027874,0.028269,0.031272,0.030974,0.032708,0.031396,9.85
3,CsinGOBP1,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562,12.93
4,CsinGOBP2,1,14.0,14.0,3.0,1.0,1.0,0.0,0.6110,407.1765,...,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829,30.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,CmedPBP4,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,0.031452,0.030593,0.032004,0.026301,0.031368,0.032444,0.027974,0.029908,0.034341,50.00
1454,CsinGOBP1,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562,30.00
1455,CsinGOBP2,252,8.0,7.0,0.0,2.0,1.0,1.0,0.3110,205.3128,...,0.031523,0.031068,0.032233,0.026548,0.030795,0.031066,0.030422,0.029792,0.031829,9.57
1456,CsinGOBP1,253,9.0,9.0,3.0,1.0,1.0,0.0,0.6558,249.5221,...,0.033718,0.033598,0.028898,0.025392,0.030858,0.032347,0.029792,0.030958,0.033562,24.11


In [72]:
# Specify the path where you want to save the dataset
file_path = '/home/nfernandez/unit/dataset_unidos.xlsx'

# Save the DataFrame to an Excel file
df_merged3.to_excel(file_path, index=False)

print(f"Dataset saved to: {file_path}")

Dataset saved to: /home/nfernandez/unit/dataset_unidos.xlsx
