# **First option**

In [None]:
import csv
import requests
import xml.etree.ElementTree as ET
import spacy
from datetime import datetime
import pandas as pd
nlp = spacy.load("en_core_web_sm")

In [None]:
def keyword_extraction(abstract, max_palabras=5):
    """Return up to ``max_palabras`` distinct noun/adjective tokens of a text.

    Parameters
    ----------
    abstract : str
        Text handed to the module-level spaCy pipeline ``nlp``.
    max_palabras : int, optional, default : 5
        Maximum number of keywords to return.

    Returns
    -------
    list of str
        Token texts tagged ``NOUN`` or ``ADJ``, in order of first
        appearance, without repetitions.
    """
    wanted_pos = ("NOUN", "ADJ")

    # A dict keeps insertion order, so it de-duplicates while preserving
    # the order in which the tokens first appear.
    first_seen = {}
    for tok in nlp(abstract):
        if tok.pos_ in wanted_pos:
            first_seen.setdefault(tok.text, None)

    return list(first_seen)[:max_palabras]

In [None]:
def df_to_ris(dataframe, ris_filename,
              start_date=datetime(2019, 1, 1),
              end_date=datetime(2024, 2, 12)):
    """Write the rows of ``dataframe`` published inside a date window as RIS.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``published``, ``title``, ``summary``,
        ``authors`` (names separated by ``;``) and ``doi``.
        ``str(row['published'])`` has to look like ``YYYY-MM-DD HH:MM:SS``.
    ris_filename : str
        Path of the RIS file to create (overwritten if it exists).
    start_date : datetime, optional, default : datetime(2019, 1, 1)
        First publication day (inclusive) to export.
    end_date : datetime, optional, default : datetime(2024, 2, 12)
        Last publication day (inclusive) to export.

    Raises
    ------
    ValueError
        If a ``published`` value does not match the expected format.

    Notes
    -----
    Several tags (``TY``, ``SP``, ``EP``, ``JO``, ...) are written without
    a value, exactly as before.
    NOTE(review): an empty ``TY`` may be rejected by strict RIS importers —
    confirm against the tool that consumes this file.
    """
    with open(ris_filename, mode='w', newline='', encoding='utf-8') as ris_file:

        for _, row in dataframe.iterrows():

            date_obj = datetime.strptime(str(row['published']), '%Y-%m-%d %H:%M:%S')

            # Compare at day granularity so that any time on the last day
            # of the window is still accepted.
            published_day = datetime(date_obj.year, date_obj.month, date_obj.day)
            if not (start_date <= published_day <= end_date):
                # Filter first: keyword extraction runs the NLP pipeline
                # and is wasted on rows that are not exported.
                continue

            year = date_obj.strftime('%Y')
            title = row['title'].replace('\n', ' ').replace('\r', ' ')
            abstract = row['summary'].replace('\n', ' ').replace('\r', ' ')

            # Keywords come from the title and the abstract together.
            key_words = keyword_extraction(f"{title}. {abstract}", max_palabras=6)

            record = [
                "TY  - \n",
                f"TI  - {title}\n",
                "SP  - \n",
                "EP  - \n",
            ]
            record.extend(f"AU  - {author.strip()}\n"
                          for author in row['authors'].split(';'))
            record.append(f"PY  - {year}\n")
            record.extend(f"KW  - {word}\n" for word in key_words)
            record.extend([
                f"DO  - {row['doi']}\n",
                "JO  -  \n",
                "IS  -  \n",
                "SN  -  \n",
                "VO  -  \n",
                "VL  -  \n",
                "JA  -  \n",
                f"Y1  - {row['published']}\n",
                f"AB  - {abstract}\n",
                "ER  -  \n\n\n",
            ])
            ris_file.writelines(record)


In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests


def get_arxiv_page(query: str,
                   baseURL: str = "http://export.arxiv.org/api/query?",
                   start: int = 0,
                   max_results: int = 7000,
                   sortBy: str = "relevance",
                   sortOrder: str = "descending",
                   columns: list = None,
                   timeout: float = 10.) -> "pd.DataFrame":
    """
    Function processes the query and returns the results as a data frame.

    Parameters
    ----------
    query : str
        Query to be requested; appended verbatim to ``baseURL``
    baseURL : str, optional, default : "http://export.arxiv.org/api/query?"
        Base URL of the arxiv API
    start : int, optional, default : 0
        Starting index for page
    max_results : int, optional, default : 7000
        Maximum number of page entries
        NOTE(review): the arXiv API documents a per-request cap on
        ``max_results`` — confirm that 7000 is honored in one call.
    sortBy : str, optional, default : "relevance"
        Sort entries by
    sortOrder : str, optional, default : "descending"
        Order of sorting
    columns : list, optional, default : None
        List of columns to be returned; ``None`` is treated as ``[]``
    timeout : float, optional, default : 10.
        Timeout in seconds for HTTP requests

    Returns
    -------
    df : pandas.DataFrame
        One row per ``<entry>`` of the response, restricted to ``columns``,
        with duplicated rows removed

    Raises
    ------
    ValueError
        If ``columns`` contains an unknown column name
    requests.exceptions.HTTPError
        If the final response carries an HTTP error status
    """

    # Avoid a shared mutable default argument
    columns = [] if columns is None else columns

    # Check if columns are valid (done before any network access)
    valid_columns = ["id", "title", "summary", "authors", "primary_category",
                     "categories", "comments", "updated", "published", "doi", "links"]
    for c in columns:
        if c not in valid_columns:
            raise ValueError(
                "{:s} is not a valid column name.\nValid column names are {}.".format(c, valid_columns))

    # Renaming map of columns for arxiv query
    col_map = {
        "id": "id",
        "title": "title",
        "summary": "summary",
        "authors": "author",
        "primary_category": "arxiv:primary_category",
        "categories": "category",
        "comments": "arxiv:comment",
        "updated": "updated",
        "published": "published",
        "doi": "arxiv:doi",
        "links": "link",
    }
    # Columns that may occur several times per entry and are joined by "; "
    multi_valued = ("authors", "categories", "primary_category", "links")

    # Build query URL
    url = '{:s}{:s}&start={:d}&max_results={:d}&sortBy={:s}&sortOrder={:s}'.format(
        baseURL,
        query,
        start,
        max_results,
        sortBy,
        sortOrder
    )

    # Retry on server errors or timeouts
    retries = requests.adapters.Retry(total=5, backoff_factor=0.5,
                                      status_forcelist=[429, 500, 502, 503, 504])
    adapter = requests.adapters.HTTPAdapter(max_retries=retries)

    # The context manager closes the session (and its connection pool)
    with requests.Session() as http:
        http.mount("http://", adapter)
        # Same retry policy when the caller passes an https base URL
        http.mount("https://", adapter)
        response = http.get(url, timeout=timeout)
        # Do not parse an error page as if it were a result feed
        response.raise_for_status()

    # Read data and get entries
    data = BeautifulSoup(response.text, "xml")
    entries = data.find_all("entry")

    # Loop over entries and build rows of data frame
    rows = []
    for entry in entries:

        d = {}

        # Parse the requested columns
        for c in columns:
            if c in multi_valued:
                tags = entry.find_all(col_map[c])
                if c == "authors":
                    v = [t.find("name").text.strip() for t in tags]
                elif c == "links":
                    v = [t["href"].strip() for t in tags]
                else:
                    # "categories" and "primary_category" carry a "term" attribute
                    v = [t["term"].strip() for t in tags]
                val = "; ".join(v)
            else:
                tmp = entry.find(col_map[c])
                tmp_stripped = tmp.text.strip() if tmp is not None else ""
                if c == "id":
                    # Drop the first 21 characters, the length of
                    # "http://arxiv.org/abs/"
                    val = tmp_stripped[21:]
                elif c == "summary":
                    val = tmp_stripped.replace("\n", " ")
                elif c in ["published", "updated"]:
                    val = datetime.strptime(tmp_stripped, "%Y-%m-%dT%H:%M:%SZ")
                else:
                    val = tmp_stripped
            d[c] = val

        rows.append(d)

    df = pd.DataFrame(rows, columns=columns)

    df.drop_duplicates(inplace=True, ignore_index=True)

    return df

In [None]:
# Boolean arXiv search string, already prefixed with "search_query=" because
# get_arxiv_page appends it verbatim to the base URL.
query = "search_query=((all:interpre* OR all:expla* OR all:'explainable artificial intelligence') AND (all:'agnostic' OR all:'black-box') AND (all:'method' OR all:'technique') AND ( all:'machine learning' OR all:'deep learning' OR all:'artificial intelligence'))"
# Request every column that get_arxiv_page accepts.
df = get_arxiv_page(query,columns = ["id", "title", "summary", "authors", "primary_category", "categories", "comments", "updated", "published", "doi", "links"])
# Export the result to a RIS file (df_to_ris applies its own date filter).
df_to_ris(df, 'ArXiv.ris')

In [None]:
df.shape

(1332, 11)

# **Second option**

In [None]:
import csv
import requests
import xml.etree.ElementTree as ET
import spacy
from datetime import datetime
nlp = spacy.load("en_core_web_sm")

def keyword_extraction(abstract, max_palabras=5):
    """Pick the first distinct nouns and adjectives of ``abstract``.

    Parameters
    ----------
    abstract : str
        Text processed with the module-level spaCy pipeline ``nlp``.
    max_palabras : int, optional, default : 5
        Upper bound on the number of keywords returned.

    Returns
    -------
    list of str
        Texts of the tokens whose part of speech is ``NOUN`` or ``ADJ``,
        de-duplicated and kept in order of first occurrence.
    """
    parsed = nlp(abstract)
    candidates = (tok.text for tok in parsed if tok.pos_ in ("NOUN", "ADJ"))

    # dict.fromkeys removes repetitions while keeping the first-seen order.
    distinct = list(dict.fromkeys(candidates))

    return distinct[:max_palabras]

def guardar_metadatos_arxiv_csv(search_query, max_results=100, filename='arxiv_metadatos.csv', timeout=30):
    """Query the arXiv API and store one CSV row of metadata per entry.

    Parameters
    ----------
    search_query : str
        Value sent as the ``search_query`` parameter of the arXiv API.
    max_results : int, optional, default : 100
        Value sent as the ``max_results`` parameter.
    filename : str, optional, default : 'arxiv_metadatos.csv'
        CSV file to create. Its header uses the RIS-like column names
        ``TY, TI, AU, AB, KW, Y1, UR, ID, PY, DO``.
    timeout : float, optional, default : 30
        Seconds to wait for the HTTP response, so a stalled connection
        cannot block forever.

    Returns
    -------
    None
        On a non-200 response a message is printed and nothing is written.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    # Standard Atom elements and arXiv extension elements live in
    # different XML namespaces; <doi> belongs to the arXiv one.
    atom = '{http://www.w3.org/2005/Atom}'
    arxiv = '{http://arxiv.org/schemas/atom}'

    url_base = 'http://export.arxiv.org/api/query'
    params = {
        'search_query': search_query,
        'start': 0,
        'max_results': max_results,
    }

    response = requests.get(url_base, params=params, timeout=timeout)
    if response.status_code != 200:
        print("Error al obtener los datos")
        return

    root = ET.fromstring(response.content)

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['TY', 'TI', 'AU', 'AB', 'KW', 'Y1', 'UR', 'ID','PY','DO'])

        for entry in root.findall(f'{atom}entry'):

            title = entry.find(f'{atom}title').text

            authors = ', '.join(author.find(f'{atom}name').text
                                for author in entry.findall(f'{atom}author'))

            summary = entry.find(f'{atom}summary').text
            # Keywords are extracted from title and summary together.
            abstract = f"{title}. {summary}"
            key_words = keyword_extraction(abstract, max_palabras=6)

            published = entry.find(f'{atom}published').text

            # Not every entry is guaranteed to carry a pdf link.
            pdf_link = entry.find(f'{atom}link[@title="pdf"]')
            link = pdf_link.attrib['href'] if pdf_link is not None else ''

            arxiv_id = entry.find(f'{atom}id').text

            date_obj = datetime.strptime(published, '%Y-%m-%dT%H:%M:%SZ')
            year = date_obj.strftime('%Y')

            # The DOI is optional; look it up once, in the arXiv namespace.
            doi_node = entry.find(f'{arxiv}doi')
            doi = (doi_node.text or '') if doi_node is not None else ''

            # Column order: 'TY', 'TI', 'AU', 'AB', 'KW', 'Y1', 'UR', 'ID', 'PY', 'DO'
            writer.writerow(['arXiv', title, authors, summary, ', '.join(key_words),
                             published, link, arxiv_id, year, doi])

def csv_to_ris(csv_filename, ris_filename):
    """Convert a CSV file with RIS-like column names into a RIS file.

    Parameters
    ----------
    csv_filename : str
        Input CSV with a header row; the columns ``TI``, ``AU``, ``PY``,
        ``KW``, ``DO``, ``Y1`` and ``AB`` are read. ``AU`` and ``KW``
        hold comma-separated lists.
    ris_filename : str
        Output RIS file (overwritten if it exists).

    Notes
    -----
    Line breaks inside the title and abstract are replaced by spaces so
    that each RIS tag stays on one line. The abstract is stripped of
    surrounding whitespace; the previous ``abstract[1:-1]`` slice removed
    the first and last characters even when they were real text.
    """
    with open(csv_filename, mode='r', newline='', encoding='utf-8') as csv_file, \
         open(ris_filename, mode='w', newline='', encoding='utf-8') as ris_file:

        for row in csv.DictReader(csv_file):
            title = row['TI'].replace('\n', ' ').replace('\r', ' ')
            abstract = row['AB'].replace('\n', ' ').replace('\r', ' ').strip()

            record = [
                "TY  - \n",
                f"TI  - {title}\n",
                "SP  - \n",
                "EP  - \n",
            ]
            record.extend(f"AU  - {author.strip()}\n"
                          for author in row['AU'].split(','))
            record.append(f"PY  - {row['PY']}\n")
            record.extend(f"KW  - {keyword.strip()}\n"
                          for keyword in row['KW'].split(','))
            record.extend([
                f"DO  - {row['DO']}\n",
                "JO  -  \n",
                "IS  -  \n",
                "SN  -  \n",
                "VO  -  \n",
                "VL  -  \n",
                "JA  -  \n",
                f"Y1  - {row['Y1']}\n",
                f"AB  - {abstract}\n",
                "ER  -  \n\n\n",
            ])
            ris_file.writelines(record)


# arXiv search expression, restricted to submissions between 2019-01-01 and 2024-02-01.
search_query = "((all:interpre* OR all:expla* OR all:'explainable artificial intelligence') AND (all:'agnostic' OR all:'black-box') AND (all:'method' OR all:'technique') AND ( all:'machine learning' OR all:'deep learning' OR all:'artificial intelligence') ) AND (submittedDate:[20190101 TO 20240201])"
# Intermediate CSV and final RIS file names.
file_name_csv = 'ArXiv_Citation_CSV.csv'
file_name_ris = 'ArXiv_Citation_RIS.ris'

# Download the metadata into the CSV, then convert that CSV to RIS.
guardar_metadatos_arxiv_csv(search_query, max_results=200, filename= file_name_csv)
csv_to_ris(file_name_csv, file_name_ris)



KeyboardInterrupt: 

# **MERGE RIS FILES OF IEEE**


In [None]:
import os

def unir_archivos_ris(directorio, archivo_salida):
    """Concatenate every ``.ris`` file of a directory into one output file.

    Parameters
    ----------
    directorio : str
        Directory whose ``.ris`` files (non-recursive) are merged.
    archivo_salida : str
        Path of the merged file (overwritten if it exists).

    Notes
    -----
    Files are merged in sorted name order so the result is reproducible
    (``os.listdir`` returns names in arbitrary order). The output file
    itself is skipped when it lives inside ``directorio``; otherwise a
    second run would merge the previous result into the new one.
    """
    salida_abs = os.path.abspath(archivo_salida)

    # Contents of all .ris files, each followed by a separator line
    contenido_total = []

    for archivo in sorted(os.listdir(directorio)):
        if not archivo.endswith(".ris"):
            continue

        ruta = os.path.join(directorio, archivo)
        if os.path.abspath(ruta) == salida_abs:
            continue

        with open(ruta, 'r', encoding='utf-8') as f:
            contenido_total.append(f.read())
        # Blank line between files to keep their entries apart
        contenido_total.append('\n')

    # Write the combined content to the output file
    with open(archivo_salida, 'w', encoding='utf-8') as f:
        f.write('\n'.join(contenido_total))

# Use the function: merge the .ris files found in /content/MIX into MERGE.ris
directorio = '/content/MIX'
archivo_salida = 'MERGE.ris'
unir_archivos_ris(directorio, archivo_salida)


In [None]:
def contar_ocurrencias_palabra(archivo, palabra):
    """Count how many times ``palabra`` occurs in a UTF-8 text file.

    Parameters
    ----------
    archivo : str
        Path of the file to scan.
    palabra : str
        Substring to look for.

    Returns
    -------
    int
        Sum over all lines of the non-overlapping occurrences of
        ``palabra`` in that line (``str.count`` semantics).
    """
    with open(archivo, 'r', encoding='utf-8') as handle:
        # The file is consumed line by line, so a match can never span
        # two lines.
        return sum(line.count(palabra) for line in handle)

# Path to the .ris file
archivo_ris = '/content/MERGE.ris'
# Text to search for; each RIS record starts with a "TY  -" line, so this
# count is used as the number of records in the file
palabra_a_buscar = 'TY  -'

# Count the occurrences and show the result

ocurrencias = contar_ocurrencias_palabra(archivo_ris, palabra_a_buscar)
print(f'La palabra "{palabra_a_buscar}" se repite {ocurrencias} veces en el archivo.')


La palabra "TY  -" se repite 3347 veces en el archivo.


SPRINGER NATURE

In [None]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET

def obtener_datos_springer_formato_pam(api_key, consulta, timeout=30):
    """Query the Springer Nature metadata API (PAM/XML) and tabulate the articles.

    Parameters
    ----------
    api_key : str
        Springer Nature API key, sent as the ``api_key`` parameter.
    consulta : str
        Search expression, sent as the ``q`` parameter.
    timeout : float, optional, default : 30
        Seconds to wait for the HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per ``article`` element found in the response; the columns
        are the namespace-free tag names of its descendants. Empty when
        the request fails or no article is found.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.

    Notes
    -----
    When a tag occurs more than once inside an article, only the last
    value is kept, because the record is a plain tag -> text dictionary.
    """
    def _local_name(tag):
        # "{namespace}name" -> "name"
        return tag.split('}')[-1]

    # https keeps the API key out of clear-text traffic
    url_base = 'https://api.springernature.com/meta/v2/pam'
    parametros = {
        'q': consulta,
        'api_key': api_key,
    }

    respuesta = requests.get(url_base, params=parametros, timeout=timeout)

    if respuesta.status_code != 200:
        print(f"Error en la solicitud: {respuesta.status_code}, Respuesta: {respuesta.text}")
        return pd.DataFrame()

    # Parse the XML response
    root = ET.fromstring(respuesta.content)

    # xml.etree elements have no ``nsmap`` attribute (that is an lxml
    # feature), so articles are located by local tag name instead of
    # through a prefix -> namespace mapping.
    registros = []
    for article in root.iter():
        if _local_name(article.tag) != 'article':
            continue

        datos_registro = {}
        for child in article.iter():
            if child is article:
                # iter() yields the element itself first
                continue
            datos_registro[_local_name(child.tag)] = child.text
        registros.append(datos_registro)

    # Convert the list of records to a pandas DataFrame
    return pd.DataFrame(registros)

# NOTE(review): hard-coded API credential committed with the notebook —
# load it from an environment variable or secret store and rotate this key.
api_key = '426834b47cd5bd8e73a7eb2794eb6fcd'
# NOTE(review): this expression uses the arXiv "all:" field syntax; the cell
# output below is an empty DataFrame, so confirm the query syntax that the
# Springer API expects.
consulta = "((all:interpre* OR all:expla* OR all:'explainable artificial intelligence') AND (all:'agnostic' OR all:'black-box') AND (all:'method' OR all:'technique') AND (all:'machine learning' OR all:'deep learning' OR all:'artificial intelligence'))"

# Run the query and display the resulting DataFrame.
df_resultado = obtener_datos_springer_formato_pam(api_key, consulta)

print(df_resultado)




Empty DataFrame
Columns: []
Index: []


ACM

In [None]:
# Actualizaremos el código para manejar adecuadamente múltiples autores y palabras clave,
# y asegurarnos de que el formato de salida cumpla con el orden y estructura solicitados.

def process_entry_comprehensive(entry):
    """Rebuild one RIS entry with a fixed tag order and document type.

    Parameters
    ----------
    entry : str
        Text of a single RIS entry; only lines of the form
        ``XX  - value`` terminated by a newline are read.

    Returns
    -------
    str
        One line per tag of the module-level ``desired_order`` (repeated
        ``AU``/``KW`` values each get their own line, missing tags get an
        empty value), followed by ``ER  -`` and a blank line. ``TY`` is
        always ``JOUR``; tags outside ``desired_order`` are not emitted.
    """
    repeatable = ('AU', 'KW')  # tags that may appear several times

    parsed = {}
    for tag, text in re.findall(r'(\w{2})  - (.*?)\n', entry):
        if tag in repeatable:
            parsed.setdefault(tag, []).append(text)
        else:
            # For single-valued tags the last occurrence wins
            parsed[tag] = text

    # Force the document type and make sure the journal-title tag exists
    parsed['TY'] = 'JOUR'
    parsed.setdefault('T2', '')

    chunks = []
    for tag in desired_order:
        content = parsed.get(tag, '')  # absent tags are written empty
        values = content if isinstance(content, list) else [content]
        chunks.extend(f"{tag}  - {item}\n" for item in values)
    chunks.append("ER  -\n\n")

    return "".join(chunks)

# Read the content of the updated RIS file
# NOTE(review): file_path_updated is not defined in the visible cells —
# presumably set in an earlier cell; confirm.
with open(file_path_updated, 'r', encoding='utf-8') as file:
    ris_content_updated = file.read()

# Find all the entries in the content
# NOTE(review): entry_pattern is not defined in the visible cells either;
# it is used here as a compiled regular expression.
entries_updated = entry_pattern.findall(ris_content_updated)

# Process each entry with the updated logic
processed_entries_comprehensive = [process_entry_comprehensive(entry) for entry in entries_updated]

# Combine all the processed entries
new_ris_content_comprehensive = "".join(processed_entries_comprehensive)

# Save the processed content to a new RIS file
new_file_path_comprehensive = '/content/ACM_MENDE.ris'
with open(new_file_path_comprehensive, 'w', encoding='utf-8') as file:
    file.write(new_ris_content_comprehensive)

# Bare expression: displays the output path as the notebook cell result
new_file_path_comprehensive


In [None]:
# Process each entry with the improved approach so that every citation is included
# NOTE(review): entries_improved is not defined in the visible cells —
# presumably produced by an earlier cell; confirm.
processed_entries_improved = [process_entry_comprehensive(entry) for entry in entries_improved]

# Combine all the processed entries
new_ris_content_improved = "".join(processed_entries_improved)

# Save the processed content to a new RIS file, making sure all entries are included
new_file_path_improved = '/mnt/data/reorganizado_improved.ris'
with open(new_file_path_improved, 'w', encoding='utf-8') as file:
    file.write(new_ris_content_improved)

# Bare expression: displays the output path as the notebook cell result
new_file_path_improved
