In [3]:
import pandas as pd
import requests
from tqdm import tqdm
from typing import Optional, List, Dict
from collections import Counter
import os
from ratelimit import limits, sleep_and_retry

# RNAcentral API rate limit, consumed by the `limits` decorator on
# `get_sequence`: at most CALLS requests per PERIOD (seconds, per the
# `ratelimit` package convention).
CALLS = 20
PERIOD = 1

@sleep_and_retry
@limits(calls=CALLS, period=PERIOD)
def get_sequence(urs):
    """Download the sequence of one RNAcentral entry as a plain string.

    The FASTA record for ``urs`` is requested from the RNAcentral REST API;
    its first line (the FASTA header) is discarded and the remaining lines
    are concatenated without separators.

    The call is wrapped by the ``limits`` / ``sleep_and_retry`` decorators
    configured with the module-level ``CALLS`` and ``PERIOD`` values.

    Args:
        urs: Identifier interpolated into the request URL.

    Returns:
        The sequence with all line breaks removed.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` or ``raise_for_status`` (connection problems,
            the 15 s timeout, or an HTTP error status).
    """
    fasta_url = f'https://rnacentral.org/api/v1/rna/{urs}.fasta'
    reply = requests.get(fasta_url, timeout=15)
    reply.raise_for_status()
    # Everything after the first (header) line is sequence data.
    return "".join(reply.text.splitlines()[1:])

# Metadata fields requested when the caller does not supply its own list.
_DEFAULT_FIELDS = (
    "description",
    "species",
    "rna_type",
    "so_rna_type_name",
    "expert_db",
    "length",
)


def _collapse_field(field: str, field_data):
    """Reduce one search-result field value to a single scalar for the table."""
    if not isinstance(field_data, list):
        return field_data if field_data else ""
    if not field_data:
        return ""
    # Only the last name is kept for so_rna_type_name.
    if field == "so_rna_type_name":
        return field_data[-1]
    if len(field_data) > 1:
        return "; ".join(str(x) for x in field_data)
    return field_data[0]


def fetch_rna_sequences(
    custom_query: str,
    max_results: Optional[int] = None,
    fields: Optional[List[str]] = None,
    filename: Optional[str] = None,
    batch_size: int = 100
) -> pd.DataFrame:
    """Search RNAcentral through EBI Search and download the matching sequences.

    A first request obtains the total hit count; the hits are then paged
    through ``batch_size`` at a time and, for every hit, the sequence is
    downloaded with ``get_sequence``. Hits whose sequence download fails are
    reported and skipped.

    Args:
        custom_query: Query string sent as the ``query`` parameter.
        max_results: Upper bound on the number of hits to process. ``None``
            (or any falsy value) means every hit is processed.
        fields: Metadata fields to request and copy into the result. When
            ``None`` the default set (description, species, rna_type,
            so_rna_type_name, expert_db, length) is used.
        filename: Optional CSV path. When given and at least one sequence was
            obtained, the table is written there; the parent directory is
            created if the path has one.
        batch_size: Number of hits requested per page.

    Returns:
        A DataFrame with the columns ``URS_ID``, ``Sequence``, ``Length`` plus
        one column per requested field. An empty DataFrame is returned when
        the initial query fails or no sequence could be obtained.
    """
    # Copy the defaults so no list object is shared between calls.
    if fields is None:
        fields = list(_DEFAULT_FIELDS)

    base_url = 'https://www.ebi.ac.uk/ebisearch/ws/rest/rnacentral'
    query_params = {
        'query': custom_query,
        'fields': ','.join(fields),
        'format': 'json',
        'size': batch_size,
        'start': 0
    }

    print(f"📋 Query: {custom_query}")

    # First request: only used to learn the total number of hits.
    try:
        response = requests.get(base_url, params=query_params, timeout=30)
        response.raise_for_status()
        data = response.json()
        total_found = int(data['hitCount'])
        print(f"⚡ Encontrados {total_found} resultados")
    except requests.exceptions.RequestException as e:
        print(f"❌ Error en la consulta: {e}")
        return pd.DataFrame()

    # Work out how many hits will actually be processed.
    if max_results and max_results < total_found:
        total_to_fetch = max_results
        print(f"⚡ Obteniendo {max_results} de {total_found} resultados")
    else:
        total_to_fetch = total_found
        print(f"⚡ Obteniendo todos los {total_found} resultados")

    # Page through the hits. The context manager closes the progress bar even
    # if an unexpected exception escapes the loop.
    results = []
    start_index = 0
    with tqdm(total=total_to_fetch, desc="Extrayendo secuencias") as pbar:
        while start_index < total_to_fetch and len(results) < total_to_fetch:
            query_params['start'] = start_index
            query_params['size'] = min(batch_size, total_to_fetch - len(results))

            try:
                response = requests.get(base_url, params=query_params, timeout=30)
                response.raise_for_status()
                entries = response.json()['entries']
            except requests.exceptions.RequestException as e:
                print(f"⚠️ Error en paginación: {e}")
                break

            # An empty page would leave start_index unchanged and make this
            # loop spin forever, so stop paging instead.
            if not entries:
                print("⚠️ Página sin resultados; se detiene la paginación")
                break

            for entry in entries:
                if len(results) >= total_to_fetch:
                    break

                full_id = entry['id']
                # The part before the first underscore is what get_sequence
                # is queried with.
                urs = full_id.split('_')[0]

                metadata = {
                    field: _collapse_field(field, entry['fields'].get(field, []))
                    for field in fields
                }

                try:
                    sequence = get_sequence(urs)
                except requests.exceptions.RequestException as e:
                    print(f"⚠️ Error obteniendo secuencia {urs}: {e}")
                    continue

                results.append({
                    "URS_ID": full_id,
                    "Sequence": sequence,
                    "Length": len(sequence),
                    **metadata
                })
                pbar.update(1)

            start_index += len(entries)

    df = pd.DataFrame(results)

    if df.empty:
        print("❌ No se encontraron secuencias válidas")
        return pd.DataFrame()

    if filename:
        # os.makedirs("") raises FileNotFoundError, so only create the parent
        # directory when the path actually has one.
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"💾 Datos guardados en: {filename}")
    print(f"✅ {len(df)} secuencias obtenidas de {total_found} encontradas")
    return df

def fetch_all_trna_sec(max_results: Optional[int] = None, filename: Optional[str] = None):
    """Fetch selenocysteine tRNA records via ``fetch_rna_sequences``.

    Runs a fixed query restricted to the ``Selenocysteinyl_tRNA`` SO type,
    requesting pages of 50 hits and a fixed set of metadata fields.

    Args:
        max_results: Forwarded unchanged as the limit on processed hits.
        filename: Forwarded unchanged as the optional CSV output path.

    Returns:
        Whatever ``fetch_rna_sequences`` returns for this query.
    """
    search_options = {
        "custom_query": 'tRNA AND so_rna_type_name:"Selenocysteinyl_tRNA"',
        "max_results": max_results,
        "fields": [
            "description",
            "species",
            "rna_type",
            "so_rna_type_name",
            "expert_db",
            "length",
        ],
        "filename": filename,
        "batch_size": 50,
    }
    return fetch_rna_sequences(**search_options)

def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
    if isinstance(value, str):
        return value
    # NaN is the only float that differs from itself; it is what pandas uses
    # for empty cells, e.g. after the table has been reloaded from CSV.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def markdown_summary(df):
    """Build a Markdown report for a table of tRNA-Sec records.

    The report contains overall statistics (row count, mean ``Length``,
    number of distinct ``species``), a detail section for the first five
    rows, and the ten most frequent names found in ``expert_db``.

    Args:
        df: DataFrame with the columns ``URS_ID``, ``Length``, ``species``,
            ``rna_type``, ``so_rna_type_name``, ``expert_db`` and
            ``description``. ``expert_db`` holds database names separated
            by ``;``.

    Returns:
        The report as a single newline-joined string.
    """
    md = [
        "# Estadísticas tRNA-Sec\n",
        f"- **Total secuencias:** {len(df)}",
        f"- **Longitud promedio:** {df['Length'].mean():.1f} nt",
        f"- **Especies únicas:** {df['species'].nunique()}\n",
        "## Primeras 5 secuencias\n",
    ]

    for _, row in df.head().iterrows():
        description = _as_text(row['description'])
        # Only mark the description as shortened when it really was cut.
        ellipsis = "..." if len(description) > 100 else ""
        md.extend([
            f"### {row['URS_ID']}",
            f"- **Especie:** {row['species']}",
            f"- **Longitud:** {row['Length']} nt",
            f"- **Tipo RNA:** {row['rna_type']}",
            f"- **Subtipo ARN:** {row['so_rna_type_name']}",
            f"- **Bases de datos:** {_as_text(row['expert_db'])}",
            f"- **Descripción:** {description[:100]}{ellipsis}\n",
        ])

    # Most common databases. Blank names (rows without any database) are
    # skipped so they do not show up as an unnamed entry in the ranking.
    db_counts = Counter(
        name
        for dbs in df['expert_db']
        for name in (part.strip() for part in _as_text(dbs).split(';'))
        if name
    )
    md.append("## Bases de datos más comunes\n")
    md.extend(
        f"- **{db}**: {count} secuencias"
        for db, count in db_counts.most_common(10)
    )
    return "\n".join(md)

# MAIN RUN
print("=== EXTRAYENDO tRNA-Sec CON DATOS COMPLETOS ===")
# A blank answer means "no limit"; any other answer must parse as an integer.
maxsize = input("¿Cuántas secuencias quieres obtener? (Enter para todas): ")
if maxsize.strip():
    maxsize = int(maxsize)
else:
    maxsize = None

# Output file names carry the requested sample size ("all" when unlimited).
base_name = str(maxsize) if maxsize is not None else "all"
csv_path = f"data/raw/{base_name}_trna_sec_detailed.csv"
md_path = f"data/raw/{base_name}_summary_trna_sec.md"

df_sample = fetch_all_trna_sec(max_results=maxsize, filename=csv_path)

if df_sample.empty:
    print("❌ No se encontraron secuencias válidas")
else:
    print("\n📊 ESTADÍSTICAS DESCRIPTIVAS")
    md = markdown_summary(df_sample)
    print(md)
    # The Markdown report is stored next to the CSV written by the fetch step.
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md)
    print(f"✅ CSV guardado en: {csv_path}")
    print(f"✅ Resumen guardado en: {md_path}")

=== EXTRAYENDO tRNA-Sec CON DATOS COMPLETOS ===
📋 Query: tRNA AND so_rna_type_name:"Selenocysteinyl_tRNA"
⚡ Encontrados 43902 resultados
⚡ Obteniendo 50 de 43902 resultados


Extrayendo secuencias: 100%|██████████| 50/50 [00:49<00:00,  1.01it/s]

💾 Datos guardados en: data/raw/50_trna_sec_detailed.csv
✅ 50 secuencias obtenidas de 43902 encontradas

📊 ESTADÍSTICAS DESCRIPTIVAS
# Estadísticas tRNA-Sec

- **Total secuencias:** 50
- **Longitud promedio:** 87.9 nt
- **Especies únicas:** 33

## Primeras 5 secuencias

### URS00001DA281_9606
- **Especie:** Homo sapiens
- **Longitud:** 87 nt
- **Tipo RNA:** tRNA
- **Subtipo ARN:** selenocysteinyl_tRNA
- **Bases de datos:** ENA; HGNC; GeneCards; MalaCards; GtRNAdb; PDBe
- **Descripción:** Homo sapiens (human) tRNA-SeC (anticodon TCA) 1-1 (TRU-TCA1-1)...

### URS0000C8E9EB_9606
- **Especie:** Homo sapiens
- **Longitud:** 84 nt
- **Tipo RNA:** tRNA
- **Subtipo ARN:** selenocysteinyl_tRNA
- **Bases de datos:** GtRNAdb; HGNC; GeneCards
- **Descripción:** Homo sapiens (human) tRNA-SeC (anticodon TCA) 2-1 (TRU-TCA2-1)...

### URS00008FED48_9606
- **Especie:** Homo sapiens
- **Longitud:** 90 nt
- **Tipo RNA:** tRNA
- **Subtipo ARN:** selenocysteinyl_tRNA
- **Bases de datos:** PDBe
- **Descripci


