# ------------------------------------------------------------------------------------------------
# RC Publications data download
# ------------------------------------------------------------------------------------------------

In [1]:
from datetime import datetime
import os
import pandas as pd
import logging
from typing import List, Tuple

from utils import extract_metadata, reduce_metadata
from service import RC_Api

from constants import INPUT_DATA_PATH

# Get current date in yyyy_mm_dd format (used as the log file name prefix)
current_date = datetime.now().strftime('%Y_%m_%d')

# Configure logging: messages go to logs/<current_date>_metadata_processing.log
log_dir = 'logs'
os.makedirs(log_dir, exist_ok=True)
log_file_path = os.path.join(log_dir, f'{current_date}_metadata_processing.log')

logger = logging.getLogger('metadata_logger')
# Inspect only this logger's own handlers. `hasHandlers()` also looks at
# ancestor loggers, so a handler installed on the root logger (e.g. by an
# imported module) would cause the file handler to never be attached.
# The guard still prevents duplicate handlers when the cell is re-run.
if not logger.handlers:
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_file_path)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)


In [2]:
# Page size for API searches: sent as the "limit" parameter and used to
# compute each page's "offset" in query_api.
LIMIT = 250

def query_api(year: int) -> Tuple[List[dict], int]:
    """Retrieve the metadata of all records whose issue date matches ``year``.

    The search is paged through ``RC_Api.search`` using ``LIMIT`` records per
    page. Each returned item is passed through ``extract_metadata``.

    Args:
        year: Year used in the ``dc.date.issued`` search query.

    Returns:
        A tuple ``(metadata, total_records)`` where ``metadata`` holds the
        extracted metadata of every retrieved item and ``total_records`` is
        the number of items retrieved.
    """
    query = f'dc.date.issued:"{year}"'
    page = 0
    total_records = 0
    metadata = []
    logger.info(f'START - Retrieving {page=} {year=} ...')
    # Another page is requested only while every previous page came back
    # full; a short page makes page * LIMIT exceed total_records.
    while page * LIMIT <= total_records:
        if page > 0:
            logger.info(f'Retrieving {page=} {year=} ...')
        offset = page * LIMIT
        parameters_search = {
            "limit": LIMIT,
            "offset": offset,
            "expand": "metadata",
            "query": query
        }
        res_obj = RC_Api.search(parameters=parameters_search)
        if not res_obj:
            # A falsy response on the first page means the year has no
            # records; on a later page it only means the previous page was
            # the last one (the total is an exact multiple of LIMIT).
            if total_records == 0:
                logger.info(f'DONE - No records for {year=}')
            else:
                logger.info(f'No further records for {year=} at {page=}')
            break
        metadata.extend(map(extract_metadata, res_obj))
        total_records += len(res_obj)
        page += 1
        logger.info(f'Retrieved {total_records} records for {year}')
    # `page` now equals the number of non-empty pages retrieved; reporting it
    # as a count avoids the misleading "page=-1" when nothing was found.
    logger.info(f'DONE - Retrieved pages={page} {total_records=} {year=}')
    return metadata, total_records

def download_metadata(years: List[int]) -> None:
    """Download the metadata of each given year and store it as a CSV file.

    For every year, the records are fetched with ``query_api``, condensed
    with ``reduce_metadata`` and written to
    ``<INPUT_DATA_PATH>/<year>_metadata.csv``. Years for which no records
    were retrieved produce no file.

    Args:
        years: Years to download, processed in iteration order.
    """
    for year in years:
        records, record_count = query_api(year)
        if record_count <= 0:
            # Nothing retrieved for this year: skip writing a file.
            continue

        # All columns are stored as strings.
        frame = pd.DataFrame(reduce_metadata(records), dtype=str)
        target_path = os.path.join(INPUT_DATA_PATH, f'{year}_metadata.csv')
        # NOTE(review): escapechar='\n' is unusual; confirm that the readers
        # of these files rely on it before changing it.
        frame.to_csv(target_path, index=False, escapechar='\n', encoding='utf-8')
        logger.info(f'Saved metadata for {year} to {target_path}')

In [5]:
# Full historical download (disabled): years 1900 through 2024.
#download_metadata(range(1900, 2025))
# Current run: the range end is exclusive, so only the year 2026 is processed.
download_metadata(range(2026, 2027))


In [None]:
""" 
def query_api(year: int):
    query = f'dc.date.issued:"{year}"'
    page = 0
    total_records = 0
    metadata = []
    print(f'== INFO - START - Retriving {page=} {year=} ...')
    while (page*LIMIT <= total_records):
        if page > 0:
            print(f'== INFO - Retriving {page=} {year=} ...')
        offset = page * LIMIT
        parameters_search = {
            "limit": LIMIT,
            "offset": offset,
            "expand": "metadata",
            "query": query
        }
        res_obj = RC_Api.search(parameters = parameters_search)
        if not res_obj:
            print(f'== INFO - DONE - No records for {year=}')
            break
        metadata.extend(map(extract_metadata, res_obj))
        total_records += len(res_obj)
        page += 1
        print(f'== INFO - Retrieved {total_records} records for {year}')
    print(f'== INFO - DONE - Retrived page={page-1} {total_records=} {year=}')
    return metadata, total_records

def download_metadata(years: list):
    for year in years:
        metadata, total_records = query_api(year)
        if total_records > 0:
            reduced_metadata = reduce_metadata(metadata)
            df = pd.DataFrame(reduced_metadata, dtype=str)
            path_to_save = os.path.join(INPUT_DATA_PATH, f'{year}_metadata.csv')
            df.to_csv(path_to_save, index=False, escapechar='\n', encoding='utf-8')
            print(f'== INFO - Saved metadata for {year} to {path_to_save}') """

' \ndef query_api(year: int):\n    query = f\'dc.date.issued:"{year}"\'\n    page = 0\n    total_records = 0\n    metadata = []\n    print(f\'== INFO - START - Retriving {page=} {year=} ...\')\n    while (page*LIMIT <= total_records):\n        if page > 0:\n            print(f\'== INFO - Retriving {page=} {year=} ...\')\n        offset = page * LIMIT\n        parameters_search = {\n            "limit": LIMIT,\n            "offset": offset,\n            "expand": "metadata",\n            "query": query\n        }\n        res_obj = RC_Api.search(parameters = parameters_search)\n        if not res_obj:\n            print(f\'== INFO - DONE - No records for {year=}\')\n            break\n        metadata.extend(map(extract_metadata, res_obj))\n        total_records += len(res_obj)\n        page += 1\n        print(f\'== INFO - Retrieved {total_records} records for {year}\')\n    print(f\'== INFO - DONE - Retrived page={page-1} {total_records=} {year=}\')\n    return metadata, total_records

In [None]:
#download_metadata(range(2020, 2022))

# ------------------------------------------------------------------------------------------------
# CKONSORG DOWNLOAD
# ------------------------------------------------------------------------------------------------

In [1]:
import oracledb
import pandas as pd
import os
from dotenv import load_dotenv
from utils import get_latest_ckonsorg_filename
from constants import PATH_TO_CKONSORG_DATA

# Load environment variables via python-dotenv; the CKONSORG_* settings read
# below are expected to be provided that way.
load_dotenv()

# Define your database connection parameters (port 1521 is hard-coded)
dsn_tns = oracledb.makedsn(os.environ.get("CKONSORG_HOST"), '1521', service_name=os.environ.get("CKONSORG_DB"))

# Define the output CSV file path
csv_file_path = PATH_TO_CKONSORG_DATA + get_latest_ckonsorg_filename()

# Pre-bind both handles so the cleanup in `finally` is safe even when
# connect() or cursor() raises; otherwise a NameError would replace the
# original database error.
connection = None
cursor = None

try:
    # Establish the database connection
    connection = oracledb.connect(
        user=os.environ.get("CKONSORG_USERNAME"),
        password=os.environ.get("CKONSORG_PASSWORD"),
        dsn=dsn_tns,
    )

    # Create a cursor object using the cursor() method
    cursor = connection.cursor()

    # Define the SQL query
    sql_query = 'SELECT * FROM CO.VW_CKONSORG421A_Q_AKTUELL_KOM'

    # Execute the SQL query
    cursor.execute(sql_query)

    # Fetch all the rows from the executed query
    rows = cursor.fetchall()

    # Fetch column names (first element of each cursor.description entry)
    col_names = [i[0] for i in cursor.description]

    # Create a DataFrame from the fetched data
    df = pd.DataFrame(rows, columns=col_names)

    # Export the DataFrame to a CSV file
    df.to_csv(csv_file_path, index=False)

    print(f"Data successfully exported to {csv_file_path}")

except oracledb.DatabaseError as e:
    # Database failures are reported and not re-raised.
    print(f"Database error occurred: {e}")

finally:
    # Close the cursor and connection, if they were actually opened
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()

INFO - Connected to DB: <bound method PSQL_DB.__repr__ of Postgres('biblioowner', <password hidden>, 'id-hdb-psgr-cp46.ethz.ch', '5432', 'bibliometrics')>
Data successfully exported to /home/bibliometric/data/research_collection/ckonsorg/ckonsorg_202408.csv
