In [4]:
import concurrent.futures
import gzip
import logging
import os
import shlex
import shutil
import subprocess
from ftplib import FTP

import pandas as pd
import requests

# ------------------ CONFIG ------------------
# Root logger: INFO and above, formatted as "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Command prefixes of the external tools; they are interpolated verbatim into the
# command lines built by process_raw_file.
# NOTE(review): the two absolute paths below are specific to one machine — adjust
# them (or make them configurable) before running elsewhere.
THERMO_RAW_PARSER = "mono /home/alejandro/Descargas/ThermoRawFileParser/ThermoRawFileParser.exe"
MSGFPLUS = "java -jar /home/alejandro/Descargas/MSGFPlus_v20240326/MSGFPlus.jar"
# The remaining tools are invoked by bare name, so they must be resolvable on PATH.
ID_FILE_CONVERTER = "IDFileConverter"
FEATURE_FINDER = "FeatureFinderIdentification"
PROTEIN_QUANTIFIER = "ProteinQuantifier"
# NOTE(review): defined but not referenced by any function visible in this cell.
FALSE_DISCOVERY_RATE = "FalseDiscoveryRate"
# Relative path: resolved against the working directory of the notebook kernel.
FASTA_DATABASE = "human_proteome.fasta"

# FTP host to download from, and the local directory that receives the downloads
# and every pipeline output. The directory is created at import time.
FTP_SERVER = "massive-ftp.ucsd.edu"
BASE_DIR = "./Massive/"
os.makedirs(BASE_DIR, exist_ok=True)

# Place (or rename) your uploaded listing of remote paths here:
LISTA_FTP = os.path.join(BASE_DIR, "descarga_massive.txt")

# Accessions dropped from the protein table by remove_contaminants (exact match).
CONTAMINANT_PROTEINS = [
    "sp|P04264|K2C1_HUMAN",   # Keratin
    "sp|P35908|K22E_HUMAN",   # Another keratin
]

# ------------------ UTILIDADES ------------------
def normalize_massive_path(p: str) -> str:
    """
    Turn one line of the listing into an absolute FTP path.

    Surrounding whitespace (tabs, spaces, newlines) is discarded. An entry of
    the form 'f.MSV000091530/updates/...' becomes
    '/v05/MSV000091530/updates/...'; any other entry simply gets a leading '/'
    when it lacks one. A blank line yields the empty string.
    """
    cleaned = p.strip()
    if cleaned == "":
        return ""

    if cleaned.startswith("f.MSV"):
        # Drop the 'f.' marker and root the dataset under /v05.
        return "/v05/" + cleaned[len("f."):]

    # A leading '/' keeps ftp.cwd independent of the session's current directory.
    return cleaned if cleaned.startswith("/") else "/" + cleaned

def read_remote_paths(list_file: str):
    """Return the normalized remote paths listed in `list_file` that end in .raw (case-insensitive)."""
    with open(list_file, "r") as listing:
        candidates = [normalize_massive_path(raw_line) for raw_line in listing]
    # Blank lines normalize to "" and are dropped together with non-.raw entries.
    return [path for path in candidates if path and path.lower().endswith(".raw")]

def ftp_download_file(ftp: FTP, remote_path: str, local_path: str):
    """
    Download one file from an open FTP session to `local_path`.

    The transfer is written to `<local_path>.part` and only renamed to
    `local_path` once it has completed. An interrupted or failed download
    therefore never leaves a truncated file that a later run would mistake for
    a finished one (the existence check below is the only "already done" test).

    Args:
        ftp: a connected, logged-in FTP session. Its current directory is
            changed to the directory of `remote_path`.
        remote_path: path of the file on the server.
        local_path: destination on disk; its parent directory is created if
            missing. If the file already exists the download is skipped.

    Raises:
        Whatever `ftp.cwd` / `ftp.retrbinary` or the local file operations
        raise; the partial file is removed before the exception propagates.
    """
    if os.path.exists(local_path):
        logging.info(f"Ya existe localmente, se omite descarga: {local_path}")
        return

    remote_dir = os.path.dirname(remote_path)
    remote_name = os.path.basename(remote_path)

    local_dir = os.path.dirname(local_path)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    partial_path = local_path + ".part"
    completed = False

    logging.info(f"Descargando desde FTP: {remote_path}  →  {local_path}")
    try:
        # A bare file name has no directory component; stay where we are.
        if remote_dir:
            ftp.cwd(remote_dir)
        with open(partial_path, "wb") as fh:
            ftp.retrbinary(f"RETR {remote_name}", fh.write)
        os.replace(partial_path, local_path)
        completed = True
    finally:
        # Also runs on KeyboardInterrupt (kernel interrupt) so no partial survives.
        if not completed and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError as e:
                logging.warning(f"No se pudo borrar {partial_path}: {e}")
    logging.info(f"✅ Archivo descargado: {local_path}")

def remove_contaminants(tsv_or_csv_file, contaminants=None):
    """
    Drop contaminant proteins from a protein-quantification table, in place.

    The separator (tab or comma) is detected from the first non-comment line
    and used for both reading and writing. Lines starting with '#' are treated
    as comments when reading and are therefore not present in the rewritten
    file. If there is no 'protein' column, the first column named 'protein',
    'accession' or 'protein_accession' (case-insensitive) is renamed to
    'protein'; if none exists an error is logged and the file is left untouched.

    Args:
        tsv_or_csv_file: path of the table to filter; overwritten on success.
        contaminants: accessions to remove (exact match on the 'protein'
            column). Defaults to the module-level CONTAMINANT_PROTEINS.
    """
    if contaminants is None:
        contaminants = CONTAMINANT_PROTEINS

    # Sniff the separator once, closing the file before pandas reopens it.
    with open(tsv_or_csv_file, "r", encoding="utf-8", errors="ignore") as fh:
        head = fh.read(5000)
    data_lines = [
        line for line in head.splitlines()
        if line.strip() and not line.lstrip().startswith("#")
    ]
    probe = data_lines[0] if data_lines else head
    sep = "\t" if "\t" in probe else ","

    df = pd.read_csv(tsv_or_csv_file, sep=sep, comment="#")

    if "protein" not in df.columns:
        # Fall back to an equivalent accession column when one is present.
        cand = [c for c in df.columns if c.lower() in ("protein", "accession", "protein_accession")]
        if cand:
            df = df.rename(columns={cand[0]: "protein"})
        else:
            logging.error(f"No encuentro columna 'protein' (ni equivalente) en {tsv_or_csv_file}")
            return

    df_filtrado = df[~df["protein"].isin(list(contaminants))]
    # Write back with the same separator the file came with.
    df_filtrado.to_csv(tsv_or_csv_file, sep=sep, index=False)
    logging.info(f"Contaminantes eliminados en: {tsv_or_csv_file}")

def run_command(command):
    """
    Run an external command and raise if it fails.

    The command is executed directly, without a shell. File names taken from
    the remote listing end up inside these command lines, so characters such as
    `$( )`, backticks or `;` must never be interpreted. Consequently shell
    features (pipes, redirections, variable expansion) are not available.

    Args:
        command: either a single string, split into arguments with POSIX
            shell quoting rules (`shlex.split`), or a ready-made sequence of
            arguments.

    Raises:
        subprocess.CalledProcessError: the command exited with a non-zero status.
        OSError: the executable could not be launched (e.g. not found).
        ValueError: the string is empty or has unbalanced quotes.
    """
    logging.info(f"Ejecutando: {command}")
    try:
        argv = shlex.split(command) if isinstance(command, str) else list(command)
        if not argv:
            raise ValueError("empty command")
        subprocess.run(argv, check=True)
    except (subprocess.CalledProcessError, OSError, ValueError) as e:
        logging.error(f"Error ejecutando: {command}\n{e}")
        raise

def process_raw_file(local_raw_path: str):
    """
    Run the whole quantification pipeline on one local .raw file.

    Steps, all writing into BASE_DIR:
      1. RAW -> mzML with THERMO_RAW_PARSER (gunzipping a .mzML.gz if that is
         what was produced).
      2. Identification with MSGFPLUS against FASTA_DATABASE (mzML -> mzid).
      3. mzid -> idXML with ID_FILE_CONVERTER.
      4. Feature finding with FEATURE_FINDER (-> featureXML).
      5. Protein quantification with PROTEIN_QUANTIFIER.
      6. Contaminant removal via remove_contaminants.
      7. Deletion of the intermediate files.

    The protein table is produced under a '.partial.csv' name and only renamed
    to '<base>_protein_abundances.csv' after step 6. The existence of that
    final file is what makes later runs skip this input, so it must only exist
    once the result is complete. Intermediates (and a partial table) are left
    on disk when a step fails.

    Every path is passed through `shlex.quote` so file names containing
    spaces, quotes or shell metacharacters cannot alter the command line.

    Args:
        local_raw_path: path of the downloaded .raw file.

    Returns:
        None. Never raises `Exception`: failures are logged and swallowed so
        one bad file does not abort the rest of the batch.
    """
    try:
        file_name = os.path.basename(local_raw_path)
        base_name = os.path.splitext(file_name)[0]
        q = shlex.quote

        # Expected converter outputs (plain and gzip-compressed variants).
        mzml_file = os.path.join(BASE_DIR, f"{base_name}.mzML")
        mzml_gz = os.path.join(BASE_DIR, f"{base_name}.mzML.gz")

        output_quant = os.path.join(BASE_DIR, f"{base_name}_protein_abundances.csv")
        # Staging name keeps the .csv extension for the quantifier.
        quant_partial = os.path.join(BASE_DIR, f"{base_name}_protein_abundances.partial.csv")
        if os.path.exists(output_quant):
            logging.info(f"Salida existente; se omite: {output_quant}")
            return

        if not os.path.exists(local_raw_path):
            logging.error(f"No existe el .raw local: {local_raw_path}")
            return

        # 1) RAW -> mzML (note: -o takes a directory, not a file).
        run_command(
            f"{THERMO_RAW_PARSER} -i {q(local_raw_path)} -o {q(BASE_DIR)} -f 1"
        )

        # If only a compressed mzML was produced, expand it for the next tools.
        if os.path.exists(mzml_gz) and not os.path.exists(mzml_file):
            with gzip.open(mzml_gz, "rb") as fin, open(mzml_file, "wb") as fout:
                shutil.copyfileobj(fin, fout)
            os.remove(mzml_gz)

        if not os.path.exists(mzml_file):
            logging.error(f"No encuentro el mzML esperado: {mzml_file}")
            return

        # 2) MS-GF+ identification (mzML -> mzid)
        mzid_file = os.path.join(BASE_DIR, f"{base_name}.mzid")
        run_command(f"{MSGFPLUS} -s {q(mzml_file)} -d {q(FASTA_DATABASE)} -o {q(mzid_file)}")

        # 3) .mzid -> .idXML
        idxml_file = os.path.join(BASE_DIR, f"{base_name}.idXML")
        run_command(f"{ID_FILE_CONVERTER} -in {q(mzid_file)} -out {q(idxml_file)}")

        # 4) Peptide quantification (features)
        feature_file = os.path.join(BASE_DIR, f"{base_name}.featureXML")
        run_command(f"{FEATURE_FINDER} -in {q(mzml_file)} -id {q(idxml_file)} -out {q(feature_file)}")

        # 5) Protein quantification, into the staging file
        run_command(f"{PROTEIN_QUANTIFIER} -in {q(feature_file)} -out {q(quant_partial)} -top:N 0")

        # 6) Contaminant removal, then promote the staging file to the final name
        remove_contaminants(quant_partial)
        os.replace(quant_partial, output_quant)

        # 7) Delete intermediates (best effort)
        for p in [mzml_file, mzid_file, idxml_file, feature_file]:
            if os.path.exists(p):
                try:
                    os.remove(p)
                    logging.info(f"Borrado: {p}")
                except OSError as e:
                    logging.error(f"No se pudo borrar {p}: {e}")

        logging.info(f"✅ Procesado completo: {output_quant}")

    except Exception as e:
        logging.error(f"Error procesando {local_raw_path}: {e}")


def download_all_raws(timeout: float = 60.0):
    """
    Download every .raw file listed in LISTA_FTP into BASE_DIR.

    Files are stored flat under BASE_DIR by base name. When two remote paths
    share a base name only the first successfully handled one is kept, so the
    returned list never contains the same local path twice (which would make
    the parallel processing step work on the same file concurrently).

    Args:
        timeout: seconds to wait on a blocking FTP socket operation before
            giving up, so a stalled server cannot hang the run forever.

    Returns:
        List of local paths that are available after the call (freshly
        downloaded or already present). Empty if the listing is missing or
        contains no .raw paths.

    Raises:
        Errors from connecting or logging in to FTP_SERVER propagate;
        per-file download errors are logged and that file is skipped.
    """
    if not os.path.exists(LISTA_FTP):
        logging.error(f"No encuentro el listado: {LISTA_FTP}")
        return []

    remote_paths = read_remote_paths(LISTA_FTP)
    if not remote_paths:
        logging.error("El listado no tenía rutas .raw válidas.")
        return []

    local_paths = []
    seen = set()
    with FTP(FTP_SERVER, timeout=timeout) as ftp:
        ftp.login()  # anonymous
        for rp in remote_paths:
            fname = os.path.basename(rp)
            local = os.path.join(BASE_DIR, fname)
            if local in seen:
                logging.warning(f"Nombre local duplicado, se omite: {rp}")
                continue
            try:
                ftp_download_file(ftp, rp, local)
                local_paths.append(local)
                seen.add(local)
            except Exception as e:
                logging.error(f"Fallo descargando {rp}: {e}")
    return local_paths

def main():
    """Fetch every listed RAW file, then process them with at most three worker threads."""
    # Step 1: download everything in the listing (files already on disk are reused).
    raw_files = download_all_raws()
    if not raw_files:
        logging.error("No hay .raw locales para procesar.")
        return

    # Step 2: parallel processing; leaving the `with` block waits for all workers.
    worker_count = min(len(raw_files), 3)
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count)
    with pool:
        pool.map(process_raw_file, raw_files)


if __name__ == "__main__":
    main()


2025-09-19 11:43:46,923 [INFO] Descargando desde FTP: /v05/MSV000091530/updates/2023-03-23_glyanglife_fc206ce7/raw/global/Global_YXLC-G_cancer.raw  →  ./Massive/Global_YXLC-G_cancer.raw
2025-09-19 11:44:21,228 [INFO] ✅ Archivo descargado: ./Massive/Global_YXLC-G_cancer.raw
2025-09-19 11:44:21,432 [INFO] Ejecutando: mono /home/alejandro/Descargas/ThermoRawFileParser/ThermoRawFileParser.exe -i "./Massive/Global_YXLC-G_cancer.raw" -o "./Massive/" -f 1
2025-09-19 11:44:21,778 [ERROR] Error ejecutando: mono /home/alejandro/Descargas/ThermoRawFileParser/ThermoRawFileParser.exe -i "./Massive/Global_YXLC-G_cancer.raw" -o "./Massive/" -f 1
Command 'mono /home/alejandro/Descargas/ThermoRawFileParser/ThermoRawFileParser.exe -i "./Massive/Global_YXLC-G_cancer.raw" -o "./Massive/" -f 1' returned non-zero exit status 1.
2025-09-19 11:44:21,778 [ERROR] Error procesando ./Massive/Global_YXLC-G_cancer.raw: Command 'mono /home/alejandro/Descargas/ThermoRawFileParser/ThermoRawFileParser.exe -i "./Massive

2025-09-19 11:44:21 INFO Started parsing ./Massive/Global_YXLC-G_cancer.raw
2025-09-19 11:44:21 ERROR RAW file cannot be processed because of an error - ThermoFisher.CommonCore.RawFileReader.Facade.RawFileLoader


In [2]:
2+2

4