diff --git a/src/rda_python_common/pg_dbi.py b/src/rda_python_common/pg_dbi.py index 89bb495..193f7e7 100644 --- a/src/rda_python_common/pg_dbi.py +++ b/src/rda_python_common/pg_dbi.py @@ -20,8 +20,45 @@ from .pg_log import PgLOG class PgDBI(PgLOG): + """PostgreSQL Database Interface layer extending PgLOG. + + Provides a high-level API for connecting to and querying PostgreSQL databases + using psycopg2. Supports single and batch INSERT, SELECT, UPDATE, and DELETE + operations, transaction management, schema introspection, user lookups, usage + tracking, and credential retrieval from .pgpass or OpenBao. + + Inherits all logging and utility helpers from PgLOG. + + Instance Attributes: + pgdb (connection | None): Active psycopg2 connection, or None when disconnected. + curtran (int): Transaction counter: 0 = idle, >0 = inside a transaction. + NMISSES (list): Cached list of scientist IDs (userno) not found in the DB. + LMISSES (list): Cached list of login names not found in the DB. + TABLES (dict): Cache of table-field default info keyed by table name. + SEQUENCES (dict): Cache of sequence field names keyed by table name. + SPECIALIST (dict): Cache of specialist records keyed by dataset ID. + SYSDOWN (dict): Cache of system-down status records keyed by hostname. + PGDBI (dict): Active connection and configuration parameters. + PGSIGNS (list): Special comparison sign tokens recognised by get_field_condition(). + CHCODE (int): psycopg2 type code for CHAR columns (used to strip trailing spaces). + DBPORTS (dict): Mapping of database names to non-default TCP port numbers. + DBPASS (dict): Credentials loaded from .pgpass, keyed by (host, port, db, user). + DBBAOS (dict): Credentials loaded from OpenBao, keyed by database name. + DBNAMES (dict): Mapping of schema names to their parent database names. + DBSOCKS (dict): Mapping of database names to Unix socket paths. + VIEWHOMES (dict): Mapping of hostnames to home directories for the view host. 
+ PGRES (list): Reserved PostgreSQL keywords that must be double-quoted as identifiers. + ADDTBLS (list): Names of tables already created in this session (avoids duplicates). + """ def __init__(self): + """Initialise PgDBI with default connection parameters and format helpers. + + Calls the parent PgLOG.__init__(), then sets up SQL timestamp format lambdas, + connection configuration defaults (host, port, socket, schema, credentials), + and operational limits (page size, max transaction size, max record count). + Values are overridden by environment variables when present. + """ super().__init__() # initialize parent class # PostgreSQL specified query timestamp format @@ -98,13 +135,27 @@ def __init__(self): self.PGDBI['MTRANS'] = 5000 # max number of changes in one transactions self.PGDBI['MAXICNT'] = 6000000 # maximum number of records in each table - # set environments and defaults def SETPGDBI(self, name, value): + """Set a PGDBI configuration key, preferring the matching environment variable. + + Args: + name (str): Configuration key to set in self.PGDBI. + value: Default value to use when no matching environment variable exists. + """ self.PGDBI[name] = self.get_environment(name, value) - # create a pgddl command string with - # table name (tname), prefix (pre) and suffix (suf) def get_pgddl_command(self, tname, pre = None, suf = None, scname = None): + """Build a pgddl shell command string for a given table. + + Args: + tname (str): Table name, optionally prefixed with schema (e.g. 'schema.table'). + pre (str | None): Optional prefix appended with '-y' flag. + suf (str | None): Optional suffix appended with '-x' flag. + scname (str | None): Schema name override; parsed from tname when omitted. + + Returns: + str: A pgddl command string ready for use with pgsystem(). 
+ """ ms = re.match(r'^(.+)\.(.+)$', tname) if not scname: if ms: @@ -117,32 +168,56 @@ def get_pgddl_command(self, tname, pre = None, suf = None, scname = None): if pre: xy += ' -y ' + pre return "pgddl {} -aa -h {} -d {} -c {} -u {}{}".format(tname, self.PGDBI['DBHOST'], self.PGDBI['DBNAME'], scname, self.PGDBI['LNNAME'], xy) - # set default connection for dssdb PostgreSQL Server def dssdb_dbname(self): + """Switch the active connection to the default dssdb/dssdb schema.""" self.default_scinfo(self.PGDBI['DEFDB'], self.PGDBI['DEFSC'], self.PGLOG['PSQLHOST']) dssdb_scname = dssdb_dbname - # set default connection for obsua PostgreSQL Server def obsua_dbname(self): + """Switch the active connection to the upadb/obsua schema on the misc host.""" self.default_scinfo('upadb', 'obsua', self.PGLOG['PMISCHOST']) obsua_scname = obsua_dbname - # set default connection for ivaddb PostgreSQL Server def ivaddb_dbname(self): + """Switch the active connection to the ivaddb/ivaddb schema on the misc host.""" self.default_scinfo('ivaddb', 'ivaddb', self.PGLOG['PMISCHOST']) ivaddb_scname = ivaddb_dbname - # set default connection for ispddb PostgreSQL Server def ispddb_dbname(self): + """Switch the active connection to the ispddb/ispddb schema on the misc host.""" self.default_scinfo('ispddb', 'ispddb', self.PGLOG['PMISCHOST']) ispddb_scname = ispddb_dbname - # set a default schema info with hard coded info def default_dbinfo(self, scname = None, dbhost = None, lnname = None, pwname = None, dbport = None, socket = None): + """Set default connection info derived from a schema name. + + Looks up the parent database name for scname and delegates to default_scinfo(). + + Args: + scname (str | None): Schema name; uses current DEFSC when None. + dbhost (str | None): Host override. + lnname (str | None): Login name override. + pwname (str | None): Password override. + dbport (int | None): Port override. + socket (str | None): Unix socket path override. 
+ """ return self.default_scinfo(self.get_dbname(scname), scname, dbhost, lnname, pwname, dbport, socket) - # set default database/schema info with hard coded info def default_scinfo(self, dbname = None, scname = None, dbhost = None, lnname = None, pwname = None, dbport = None, socket = None): + """Set the active connection to hard-coded default values. + + Any argument left as None falls back to the corresponding PGDBI default + (DEFDB, DEFSC, DEFHOST, DEFPORT, DEFSOCK). Disconnects if parameters changed. + + Args: + dbname (str | None): Database name override. + scname (str | None): Schema name override. + dbhost (str | None): Host override. + lnname (str | None): Login name override. + pwname (str | None): Password override. + dbport (int | None): Port override. + socket (str | None): Unix socket path override. + """ if not dbname: dbname = self.PGDBI['DEFDB'] if not scname: scname = self.PGDBI['DEFSC'] if not dbhost: dbhost = self.PGDBI['DEFHOST'] @@ -150,38 +225,99 @@ def default_scinfo(self, dbname = None, scname = None, dbhost = None, lnname = N if socket is None: socket = self.PGDBI['DEFSOCK'] self.set_scname(dbname, scname, lnname, pwname, dbhost, dbport, socket) - # get the datbase sock file name of a given dbname for local connection def get_dbsock(self, dbname): + """Return the Unix socket path for a database, falling back to the default. + + Args: + dbname (str): Database name to look up in DBSOCKS. + + Returns: + str: Socket path, or the 'default' entry if dbname is not found. + """ return (self.DBSOCKS[dbname] if dbname in self.DBSOCKS else self.DBSOCKS['default']) - # get the datbase port number of a given dbname for remote connection def get_dbport(self, dbname): + """Return the TCP port for a database, falling back to the default. + + Args: + dbname (str): Database name to look up in DBPORTS. + + Returns: + int: Port number, or the 'default' entry if dbname is not found. 
+ """ return (self.DBPORTS[dbname] if dbname in self.DBPORTS else self.DBPORTS['default']) - # get the datbase name of a given schema name for remote connection def get_dbname(self, scname): + """Return the parent database name for a given schema name. + + Args: + scname (str | None): Schema name to look up in DBNAMES. + + Returns: + str | None: Resolved database name, the 'default' entry as fallback, + or None when scname is falsy. + """ if scname: if scname in self.DBNAMES: return self.DBNAMES[scname] return self.DBNAMES['default'] return None - # set connection for viewing database information def view_dbinfo(self, scname = None, lnname = None, pwname = None): + """Set the active connection to the view host for read-only queries. + + Args: + scname (str | None): Schema name; uses DEFSC when None. + lnname (str | None): Login name override. + pwname (str | None): Password override. + """ self.view_scinfo(self.get_dbname(scname), scname, lnname, pwname) - # set connection for viewing database/schema information def view_scinfo(self, dbname = None, scname = None, lnname = None, pwname = None): + """Set the active connection to the view host with explicit database/schema names. + + Args: + dbname (str | None): Database name; uses DEFDB when None. + scname (str | None): Schema name; uses DEFSC when None. + lnname (str | None): Login name override. + pwname (str | None): Password override. + """ if not dbname: dbname = self.PGDBI['DEFDB'] if not scname: scname = self.PGDBI['DEFSC'] self.set_scname(dbname, scname, lnname, pwname, self.PGLOG['PVIEWHOST'], self.PGDBI['VWPORT']) - # set connection for given scname def set_dbname(self, scname = None, lnname = None, pwname = None, dbhost = None, dbport = None, socket = None): + """Set the active connection parameters derived from a schema name. + + Resolves the parent database from scname and calls set_scname(). + + Args: + scname (str | None): Schema name; uses DEFSC when None. + lnname (str | None): Login name override. 
+ pwname (str | None): Password override. + dbhost (str | None): Host override. + dbport (int | None): Port override. + socket (str | None): Unix socket path override. + """ if not scname: scname = self.PGDBI['DEFSC'] self.set_scname(self.get_dbname(scname), scname, lnname, pwname, dbhost, dbport, socket) - # set connection for given database & schema names def set_scname(self, dbname = None, scname = None, lnname = None, pwname = None, dbhost = None, dbport = None, socket = None): + """Update active connection parameters and disconnect if anything changed. + + Compares each supplied argument against the current PGDBI value and updates + it when different. Automatically selects socket vs. port depending on whether + the target host matches the local hostname. Calls pgdisconnect() when any + parameter changes so the next operation reconnects with the new settings. + + Args: + dbname (str | None): Database name override. + scname (str | None): Schema name override (also resets LNNAME). + lnname (str | None): Login name override. + pwname (str | None): Password override (None is a meaningful value). + dbhost (str | None): Host override. + dbport (int | None): Port override. + socket (str | None): Unix socket path override. + """ changed = 0 if dbname and dbname != self.PGDBI['DBNAME']: self.PGDBI['DBNAME'] = dbname @@ -211,8 +347,12 @@ def set_scname(self, dbname = None, scname = None, lnname = None, pwname = None, changed = 1 if changed and self.pgdb is not None: self.pgdisconnect(1) - # start a database transaction and exit if fails def starttran(self): + """Begin a new database transaction. + + Ends any in-progress transaction first, then connects if not already connected, + and disables autocommit so subsequent DML is grouped into a single transaction. 
+ """ if self.curtran == 1: self.endtran() # try to end previous transaction if not self.pgdb: self.pgconnect(0, 0, False) @@ -227,22 +367,44 @@ def starttran(self): self.pgdb.autocommit = False self.curtran = 1 - # end a transaction with changes committed and exit if fails def endtran(self, autocommit = True): + """Commit the current transaction and optionally restore autocommit mode. + + Args: + autocommit (bool): When True (default) re-enables autocommit after commit + and resets curtran to 0; when False keeps curtran active. + """ if self.curtran and self.pgdb: if not self.pgdb.closed: self.pgdb.commit() self.pgdb.autocommit = autocommit self.curtran = 0 if autocommit else 1 - # end a transaction without changes committed and exit inside if fails def aborttran(self, autocommit = True): + """Roll back the current transaction without committing changes. + + Args: + autocommit (bool): When True (default) re-enables autocommit after rollback + and resets curtran to 0; when False keeps curtran active. + """ if self.curtran and self.pgdb: if not self.pgdb.closed: self.pgdb.rollback() self.pgdb.autocommit = autocommit self.curtran = 0 if autocommit else 1 - # record error message to dscheck record and clean the lock def record_dscheck_error(self, errmsg, logact = None): + """Write an error message to the dscheck record and release its process lock. + + Only updates the record when the current process still holds the lock + (matched by host and PID). Sets status to 'E' and clears the PID when + logact includes EXITLG. + + Args: + errmsg (str): Error message to store in dscheck.errmsg. + logact (int | None): Logging action flags; defaults to PGDBI['EXITLG']. + + Returns: + int: Number of rows updated (SUCCESS/FAILURE). 
+ """ if logact is None: logact = self.PGDBI['EXITLG'] check = self.PGLOG['DSCHECK'] chkcnd = check['chkcnd'] if 'chkcnd' in check else "cindex = {}".format(check['cindex']) @@ -270,8 +432,24 @@ def record_dscheck_error(self, errmsg, logact = None): record['errmsg'] = errmsg return self.pgupdt("dscheck", record, chkcnd, logact) - # local function to log query error def qelog(self, dberror, sleep, sqlstr, vals, pgcnt, logact = None): + """Log a database query error and optionally sleep before a retry. + + Formats a human-readable message combining the DB error, retry context, + SQL string, and bound values, then passes it to pglog(). When a dscheck + record is active and logact includes EXITLG, also records the error there. + + Args: + dberror (str): Raw database error string (pgcode + pgerror). + sleep (int): Seconds to sleep after logging; 0 means no sleep. + sqlstr (str): SQL statement or short retry description. + vals: Bound parameter values shown in the log message. + pgcnt (int): Retry attempt counter (0-based). + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: Always self.FAILURE so callers can use ``return self.qelog(...)``. + """ if logact is None: logact = self.PGDBI['ERRLOG'] retry = " Sleep {}(sec) & ".format(sleep) if sleep else " " if sqlstr: @@ -293,19 +471,42 @@ def qelog(self, dberror, sleep, sqlstr, vals, pgcnt, logact = None): if sleep: time.sleep(sleep) return self.FAILURE # if not exit in self.pglog() - # try to add a new table according the table not exist error def try_add_table(self, dberror, logact): + """Create a missing table when the DB error indicates it does not exist. + + Parses a '42P01 relation not found' error string to extract the table name, + then calls add_new_table() to create it via pgddl. + + Args: + dberror (str): Full database error string to inspect. + logact (int): Logging action flags forwarded to add_new_table(). 
+ """ ms = re.match(r'^42P01 ERROR: relation "(.+)" does not exist', dberror) if ms: tname = ms.group(1) self.add_new_table(tname, logact = logact) - # add a table for given table name def add_a_table(self, tname, logact): + """Create a new table by name (thin wrapper around add_new_table). + + Args: + tname (str): Table name to create. + logact (int): Logging action flags forwarded to add_new_table(). + """ self.add_new_table(tname, logact = logact) - # add a new table for given table name def add_new_table(self, tname, pre = None, suf = None, logact = 0): + """Create a table via pgddl, skipping if already created this session. + + Builds the final table name from tname combined with any prefix or suffix, + checks ADDTBLS to avoid duplicate creation, then runs the pgddl command. + + Args: + tname (str): Base table name (used as the pgddl target). + pre (str | None): Prefix joined with '_' to form the final table name. + suf (str | None): Suffix joined with '_' to form the final table name. + logact (int): Logging action flags forwarded to pgsystem(); default 0. + """ if pre: tbname = '{}_{}'.format(pre, tname) elif suf: @@ -316,9 +517,21 @@ def add_new_table(self, tname, pre = None, suf = None, logact = 0): self.pgsystem(self.get_pgddl_command(tname, pre, suf), logact) self.ADDTBLS.append(tbname) - # validate a table for given table name (tname), prefix (pre) and suffix (suf), - # and add it if not existing def valid_table(self, tname, pre = None, suf = None, logact = 0): + """Ensure a table exists, creating it via pgddl if necessary. + + Skips the existence check when the table was already created this session + (tracked in ADDTBLS). Otherwise calls pgcheck() and runs pgddl when absent. + + Args: + tname (str): Base table name. + pre (str | None): Prefix joined with '_' to form the final table name. + suf (str | None): Suffix joined with '_' to form the final table name. + logact (int): Logging action flags; default 0. 
+ + Returns: + str: The resolved (possibly prefixed/suffixed) table name. + """ if pre: tbname = '{}_{}'.format(pre, tname) elif suf: @@ -330,8 +543,23 @@ def valid_table(self, tname, pre = None, suf = None, logact = 0): self.ADDTBLS.append(tbname) return tbname - # local function to log query error def check_dberror(self, pgerr, pgcnt, sqlstr, ary, logact = None): + """Classify a psycopg2 error and decide whether to retry or abort. + + Handles connection errors (08xxx, 57xxx), lock errors (55xxx), aborted + transactions (25P02), and missing-table errors (42P01 with ADDTBL flag). + Retries up to PGLOG['DBRETRY'] times; exits after that threshold. + + Args: + pgerr (psycopg2.Error): The caught database exception. + pgcnt (int): Current retry count (0-based). + sqlstr (str): SQL statement that caused the error, for logging. + ary: Bound values that were passed to the statement, for logging. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: self.SUCCESS to signal the caller should retry, self.FAILURE to abort. + """ if logact is None: logact = self.PGDBI['ERRLOG'] ret = self.FAILURE pgcode = pgerr.pgcode @@ -366,28 +594,54 @@ def check_dberror(self, pgerr, pgcnt, sqlstr, ary, logact = None): logact |= self.EXITLG # exit for error count exceeds limit return self.qelog(dberror, 0, sqlstr, ary, pgcnt, logact) - # return hash reference to postgresql batch mode command and output file name - def pgbatch(self, sqlfile, foreground = 0): + def pgbatch(self, sqlfile, foreground=0): + """Build a psql batch command dict or foreground pipeline string. + + Sets the PGPASSWORD environment variable before constructing the command so + psql can authenticate non-interactively. + + Args: + sqlfile (str | None): Path to a SQL file to execute. When None or empty, + returns only the psql option string. + foreground (int): When non-zero, returns a foreground pipeline string + (``psql ... < file |``). 
When 0, returns a dict with + keys 'cmd' (full shell command) and 'out' (output file path). + + Returns: + str | dict: Option string when sqlfile is falsy; pipeline string when + foreground is set; otherwise a dict with 'cmd' and 'out' keys. + """ dbhost = 'localhost' if self.PGDBI['DBSHOST'] == self.PGLOG['HOSTNAME'] else self.PGDBI['DBHOST'] options = "-h {} -p {}".format(dbhost, self.PGDBI['DBPORT']) - pwname = self.get_pgpass_password() - os.environ['PGPASSWORD'] = pwname - options += " -U {} {}".format(self.PGDBI['LNNAME'], self.PGDBI['DBNAME']) + os.environ['PGPASSWORD'] = self.get_pgpass_password() + options += " -U {} {}".format(self.PGDBI['LNNAME'], self.PGDBI['DBNAME']) if not sqlfile: return options if foreground: - batch = "psql {} < {} |".format(options, sqlfile) + return "psql {} < {} |".format(options, sqlfile) + batch = {} + batch['out'] = sqlfile + if re.search(r'\.sql$', batch['out']): + batch['out'] = re.sub(r'\.sql$', '.out', batch['out']) else: - batch['out'] = sqlfile - if re.search(r'\.sql$', batch['out']): - batch['out'] = re.sub(r'\.sql$', '.out', batch['out']) - else: - batch['out'] += ".out" - batch['cmd'] = "psql {} < {} > {} 2>&1".format(options, sqlfile , batch['out']) + batch['out'] += ".out" + batch['cmd'] = "psql {} < {} > {} 2>&1".format(options, sqlfile, batch['out']) return batch - # start a connection to dssdb database and return a DBI object; None if error - # force connect if connect > 0 def pgconnect(self, reconnect = 0, pgcnt = 0, autocommit = True): + """Connect to PostgreSQL and return the connection object. + + Reuses the existing connection when reconnect is non-zero and it is still open. + Retries on transient errors up to the configured DBRETRY limit, using + pgpass or OpenBao credentials for authentication. + + Args: + reconnect (int): 0 = connect fresh; non-zero = reconnect only if closed. + pgcnt (int): Internal retry counter (start at 0 for external callers). 
+ autocommit (bool): Whether to enable autocommit on the new connection. + + Returns: + connection | int: psycopg2 connection on success, self.FAILURE on error. + """ if self.pgdb: if reconnect and not self.pgdb.closed: return self.pgdb # no need reconnect elif reconnect: @@ -414,8 +668,15 @@ def pgconnect(self, reconnect = 0, pgcnt = 0, autocommit = True): if not self.check_dberror(pgerr, pgcnt, sqlstr, None, self.PGDBI['EXITLG']): return self.FAILURE pgcnt += 1 - # return a PostgreSQL cursor upon success def pgcursor(self): + """Return a cursor with the active schema search path already set. + + Connects automatically if not yet connected. Retries on closed-connection + errors. The search path includes PGDBI['SCPATH'] when it differs from SCNAME. + + Returns: + cursor | int: psycopg2 cursor on success, self.FAILURE on error. + """ pgcur = None if not self.pgdb: self.pgconnect() @@ -438,16 +699,32 @@ def pgcursor(self): pgcnt += 1 return pgcur - # disconnect to dssdb database def pgdisconnect(self, stopit = 1): + """Close the active database connection and clear the connection reference. + + Args: + stopit (int): When non-zero (default), actually closes the connection. + Pass 0 to clear the reference without closing (e.g. after fork). + """ if self.pgdb: if stopit: self.pgdb.close() self.PGLOG['PGDBBUF'] = self.pgdb = None - # gather table field default information as hash array with field names as keys - # and default values as values - # the whole table information is cached to a hash array with table names as keys def pgtable(self, tablename, logact = None): + """Return a dict of column default values for a table, with caching. + + Queries information_schema.columns for the table's column metadata and + maps each column to its effective default value (0 for integers, '' for + strings, None for nullable columns, 0 for sequence/serial columns). + Results are cached in self.TABLES. + + Args: + tablename (str): Fully-qualified (schema.table) or bare table name. 
+ logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + dict | None: Mapping of column_name → default_value, or None on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if tablename in self.TABLES: return self.TABLES[tablename].copy() # cached already intms = r'^(smallint||bigint|integer)$' @@ -483,8 +760,19 @@ def pgtable(self, tablename, logact = None): self.TABLES[tablename] = pgdefs.copy() return pgdefs - # get sequence field name for given table name def pgsequence(self, tablename, logact = None): + """Return the name of the auto-increment (sequence/serial) column for a table. + + Queries information_schema.columns for a column whose default starts with + 'nextval('. Results are cached in self.SEQUENCES. + + Args: + tablename (str): Fully-qualified or bare table name. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + str | None: Column name of the sequence field, or None if not found. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if tablename in self.SEQUENCES: return self.SEQUENCES[tablename] # cached already condition = self.table_condition(tablename) + " AND column_default LIKE 'nextval(%'" @@ -493,9 +781,20 @@ def pgsequence(self, tablename, logact = None): self.SEQUENCES[tablename] = seqname return seqname - # check default value for integer & string @staticmethod def check_default_value(dflt, isint): + """Normalise a raw column_default string from information_schema. + + Converts integer defaults to int, strips PostgreSQL type-cast notation from + string defaults, and leaves other expressions unchanged. + + Args: + dflt (str): Raw default expression from information_schema.columns. + isint: Truthy when the column is an integer type. + + Returns: + int | str: The normalised default value. 
+ """ if isint: ms = re.match(r"^'{0,1}(\d+)", dflt) if ms: dflt = int(ms.group(1)) @@ -506,9 +805,19 @@ def check_default_value(dflt, isint): dflt = "'{}'".format(dflt) return dflt - # local fucntion: insert prepare pgadd()/pgmadd() for given table and field names - # according to options of multiple place holds and returning sequence id - def prepare_insert(self, tablename, fields, multi = True, getid = None): + def prepare_insert(self, tablename, fields, multi = True, getid = None): + """Build a parameterised INSERT SQL statement. + + Args: + tablename (str): Target table name. + fields (list[str]): Ordered list of column names to insert. + multi (bool): When True uses a multi-value placeholder tuple; when False + uses a single %s (for execute_values). + getid (str | None): Column name to return via RETURNING clause. + + Returns: + str: Complete INSERT SQL string with %s placeholders. + """ strfld = self.pgnames(fields, '.', ',') if multi: strplc = "(" + ','.join(['%s']*len(fields)) + ")" @@ -519,8 +828,17 @@ def prepare_insert(self, tablename, fields, multi = True, getid = None): if self.PGLOG['DBGLEVEL']: self.pgdbg(1000, sqlstr) return sqlstr - # local fucntion: prepare default value for single record - def prepare_default(self, tablename, record, logact = 0): + def prepare_default(self, tablename, record, logact = 0): + """Fill missing (None/empty) values in a single record dict with table defaults. + + Modifies record in-place: for each field whose value is None or an empty + string, replaces it with the column default from pgtable(). + + Args: + tablename (str): Table name used to look up column defaults. + record (dict): Mapping of column_name → value to be updated in-place. + logact (int): Logging action flags forwarded to pgtable(); default 0. 
+ """ table = self.pgtable(tablename, logact) for fld in record: val = record[fld] @@ -532,8 +850,17 @@ def prepare_default(self, tablename, record, logact = 0): vlen = 1 if vlen == 0: record[fld] = table[fld] - # local fucntion: prepare default value for multiple records - def prepare_defaults(self, tablename, records, logact = 0): + def prepare_defaults(self, tablename, records, logact = 0): + """Fill missing (None/empty) values in a multi-record dict with table defaults. + + Modifies records in-place: for each field and each position whose value is + None or an empty string, replaces it with the column default from pgtable(). + + Args: + tablename (str): Table name used to look up column defaults. + records (dict): Mapping of column_name → list-of-values, updated in-place. + logact (int): Logging action flags forwarded to pgtable(); default 0. + """ table = self.pgtable(tablename, logact) for fld in records: vals = records[fld] @@ -547,11 +874,20 @@ def prepare_defaults(self, tablename, records, logact = 0): vlen = 1 if vlen == 0: records[fld][i] = table[fld] - # insert one record into tablename - # tablename: add record for one table name each call - # record: hash reference with keys as field names and hash values as field values - # return self.SUCCESS or self.FAILURE def pgadd(self, tablename, record, logact = None, getid = None): + """Insert a single record into a database table. + + Args: + tablename (str): Target table name. + record (dict): Mapping of column_name → value for the row to insert. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + getid (str | None): When set, returns the value of this RETURNING column + (typically the sequence/serial primary key). + + Returns: + int | any: The RETURNING column value when getid is set; self.SUCCESS (1) + on a plain insert; self.FAILURE on error. 
+ """ if logact is None: logact = self.PGDBI['ERRLOG'] if not record: return self.pglog("Nothing adds to " + tablename, logact) if logact&self.DODFLT: self.prepare_default(tablename, record, logact) @@ -584,11 +920,23 @@ def pgadd(self, tablename, record, logact = None, getid = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return ret - # insert multiple records into tablename - # tablename: add records for one table name each call - # records: dict with field names as keys and each value is a list of field values - # return self.SUCCESS or self.FAILURE def pgmadd(self, tablename, records, logact = None, getid = None): + """Insert multiple records into a database table efficiently. + + When getid is set, executes individual inserts to capture each returned ID. + Otherwise uses psycopg2 execute_values() for a single bulk INSERT. + + Args: + tablename (str): Target table name. + records (dict): Mapping of column_name → list-of-values; all lists must + have the same length. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + getid (str | None): Column name to collect via RETURNING for each row. + + Returns: + list | int: List of returned IDs when getid is set; count of rows inserted + otherwise; self.FAILURE on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if not records: return self.pglog("Nothing to insert to table " + tablename, logact) if logact&self.DODFLT: self.prepare_defaults(tablename, records, logact) @@ -632,8 +980,24 @@ def pgmadd(self, tablename, records, logact = None, getid = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return (ids if ids else count) - # local function: select prepare for pgget() and pgmget() def prepare_select(self, tablenames, fields = None, condition = None, cndflds = None, logact = 0): + """Build a SELECT (or raw) SQL statement from components. + + When tablenames is provided, constructs a full SELECT…FROM…WHERE statement. 
+ When only fields is provided (no tablenames), returns ``SELECT ``. + When only condition is provided, returns condition verbatim (raw SQL). + Appends ``FOR UPDATE`` and starts a transaction when DOLOCK is set. + + Args: + tablenames (str | None): Comma-separated table names for the FROM clause. + fields (str | None): Comma-separated column expressions; None → COUNT(*). + condition (str | None): WHERE clause string or ORDER/GROUP/LIMIT suffix. + cndflds (list | None): Column names for parameterised WHERE clauses (%s). + logact (int): Logging action flags; default 0. + + Returns: + str: Complete SQL statement string. + """ sqlstr = '' if tablenames: if fields: @@ -662,11 +1026,22 @@ def prepare_select(self, tablenames, fields = None, condition = None, cndflds = if self.PGLOG['DBGLEVEL']: self.pgdbg(1000, sqlstr) return sqlstr - # tablenames: comma deliminated string of one or more tables and more than one table for joining, - # fields: comma deliminated string of one or more field names, - # condition: querry conditions for where clause - # return a dict reference with keys as field names upon success def pgget(self, tablenames, fields, condition = None, logact = 0): + """Fetch a single row from one or more tables. + + Appends LIMIT 1 automatically. Returns a count integer when fields is None, + otherwise returns a dict of column_name → value (empty dict when no row found). + CHAR columns are right-stripped. Column names are upper-cased when UCNAME is set. + + Args: + tablenames (str): Comma-separated table names (supports JOINs via WHERE). + fields (str | None): Comma-separated column expressions; None → row count. + condition (str | None): WHERE / ORDER / LIMIT clause. + logact (int): Logging action flags; default PGDBI['ERRLOG']. + + Returns: + dict | int: Row dict, count integer, or self.FAILURE on error. 
+ """ if not logact: logact = self.PGDBI['ERRLOG'] if fields and condition and not re.search(r'limit 1$', condition, re.I): condition += " LIMIT 1" sqlstr = self.prepare_select(tablenames, fields, condition, None, logact) @@ -703,12 +1078,22 @@ def pgget(self, tablenames, fields, condition = None, logact = 0): self.pgdbg(1000, "pgget: {} record retrieved from {}".format(cnt, tablenames)) return record - # tablenames: comma deliminated string of one or more tables and more than one table for joining, - # fields: comma deliminated string of one or more field names, - # condition: querry conditions for where clause - # return a dict reference with keys as field names upon success, values for each field name - # are in a list. All lists are the same length with missing values set to None def pgmget(self, tablenames, fields, condition = None, logact = None): + """Fetch multiple rows from one or more tables. + + Returns results as a column-oriented dict: each key is a column name and its + value is a list of that column's values across all returned rows. CHAR columns + are right-stripped. Column names are upper-cased when UCNAME is set. + + Args: + tablenames (str): Comma-separated table names. + fields (str | None): Comma-separated column expressions. + condition (str | None): WHERE / ORDER / LIMIT clause. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + dict | int: Column-oriented result dict (may be empty), or self.FAILURE. 
+ """ if logact is None: logact = self.PGDBI['ERRLOG'] sqlstr = self.prepare_select(tablenames, fields, condition, None, logact) ucname = True if logact&self.UCNAME else False @@ -742,12 +1127,21 @@ def pgmget(self, tablenames, fields, condition = None, logact = None): self.pgdbg(1000, "pgmget: {} record(s) retrieved from {}".format(count, tablenames)) return records - # tablenames: comma deliminated string of one or more tables - # fields: comma deliminated string of one or more field names, - # cnddict: condition dict with field names: values - # return a dict(field names: values) upon success - # retrieve one records from tablenames condition dict def pghget(self, tablenames, fields, cnddict, logact = None): + """Fetch a single row using a condition dict (parameterised query). + + Builds a WHERE clause from the keys of cnddict and binds its values via %s + placeholders, avoiding SQL injection. Appends LIMIT 1 automatically. + + Args: + tablenames (str): Comma-separated table names. + fields (str): Comma-separated column expressions. + cnddict (dict): Mapping of column_name → value used for the WHERE clause. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + dict | int: Row dict, count integer, or self.FAILURE on error. 
+ """ if logact is None: logact = self.PGDBI['ERRLOG'] if not tablenames: return self.pglog("Miss Table name to query", logact) if not fields: return self.pglog("Nothing to query " + tablenames, logact) @@ -788,12 +1182,22 @@ def pghget(self, tablenames, fields, cnddict, logact = None): self.pgdbg(1000, "pghget: {} record retrieved from {}".format(cnt, tablenames)) return record - # tablenames: comma deliminated string of one or more tables - # fields: comma deliminated string of one or more field names, - # cnddicts: condition dict with field names: value lists - # return a dict(field names: value lists) upon success - # retrieve multiple records from tablenames for condition dict def pgmhget(self, tablenames, fields, cnddicts, logact = None): + """Fetch multiple rows using a multi-value condition dict (parameterised). + + Executes one query per row of condition values and accumulates results into a + single column-oriented dict. Useful for bulk lookups with varying WHERE values. + + Args: + tablenames (str): Comma-separated table names. + fields (str): Comma-separated column expressions. + cnddicts (dict): Mapping of column_name → list-of-values; each position + forms one WHERE clause evaluation. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + dict | int: Accumulated column-oriented result dict, or self.FAILURE. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if not tablenames: return self.pglog("Miss Table name to query", logact) if not fields: return self.pglog("Nothing to query " + tablenames, logact) @@ -847,8 +1251,21 @@ def pgmhget(self, tablenames, fields, cnddicts, logact = None): self.pgdbg(1000, "pgmhget: {} record(s) retrieved from {}".format(count, tablenames)) return records - # local fucntion: update prepare for pgupdt, pghupdt and pgmupdt def prepare_update(self, tablename, fields, condition = None, cndflds = None): + """Build a parameterised UPDATE SQL statement. 
+ + Accepts either a raw condition string or a list of condition field names. + Field names containing a dot separator are double-quoted appropriately via pgname(). + + Args: + tablename (str): Table to update. + fields (list[str]): Column names to set (SET col=%s …). + condition (str | None): Raw WHERE clause string. + cndflds (list | None): Column names for parameterised WHERE clause. + + Returns: + str: Complete UPDATE SQL string with %s placeholders. + """ strset = [] # build set string for fld in fields: @@ -864,12 +1281,18 @@ def prepare_update(self, tablename, fields, condition = None, cndflds = None): if self.PGLOG['DBGLEVEL']: self.pgdbg(1000, sqlstr) return sqlstr - # update one or multiple rows in tablename - # tablename: update for one table name each call - # record: dict with field names: values - # condition: update conditions for where clause) - # return number of rows undated upon success def pgupdt(self, tablename, record, condition, logact = None): + """Update rows in a table using a raw WHERE condition string. + + Args: + tablename (str): Target table name. + record (dict): Mapping of column_name → new value. + condition (str): WHERE clause string (must not be empty or numeric). + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: Number of rows updated, or self.FAILURE on error. 
+ """ if logact is None: logact = self.PGDBI['ERRLOG'] if not record: self.pglog("Nothing updates to " + tablename, logact) if not condition or isinstance(condition, int): self.pglog("Miss condition to update " + tablename, logact) @@ -898,12 +1321,18 @@ def pgupdt(self, tablename, record, condition, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return ucnt - # update one or multiple records in tablename - # tablename: update for one table name each call - # record: update values, dict with field names: values - # cnddict: condition dict with field names: values - # return number of records updated upon success def pghupdt(self, tablename, record, cnddict, logact = None): + """Update rows in a table using a condition dict (parameterised). + + Args: + tablename (str): Target table name. + record (dict): Mapping of column_name → new value. + cnddict (dict): Mapping of column_name → value for the WHERE clause. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: Number of rows updated, or self.FAILURE on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if not record: self.pglog("Nothing updates to " + tablename, logact) if not cnddict or isinstance(cnddict, int): self.pglog("Miss condition to update to " + tablename, logact) @@ -933,12 +1362,21 @@ def pghupdt(self, tablename, record, cnddict, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return ucnt - # update multiple records in tablename - # tablename: update for one table name each call - # records: update values, dict with field names: value lists - # cnddicts: condition dict with field names: value lists - # return number of records updated upon success def pgmupdt(self, tablename, records, cnddicts, logact = None): + """Update multiple rows using parallel value and condition dicts. + + Uses psycopg2 execute_batch() for efficient bulk updates. The number of + values in records and cnddicts must match. 
+ + Args: + tablename (str): Target table name. + records (dict): Mapping of column_name → list-of-new-values. + cnddicts (dict): Mapping of column_name → list-of-condition-values. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: Number of rows updated, or self.FAILURE on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if not records: self.pglog("Nothing updates to " + tablename, logact) if not cnddicts or isinstance(cnddicts, int): self.pglog("Miss condition to update to " + tablename, logact) @@ -974,8 +1412,17 @@ def pgmupdt(self, tablename, records, cnddicts, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return ucnt - # local fucntion: delete prepare for pgdel, pghdel and del def prepare_delete(self, tablename, condition = None, cndflds = None): + """Build a parameterised DELETE SQL statement. + + Args: + tablename (str): Table to delete from. + condition (str | None): Raw WHERE clause string. + cndflds (list | None): Column names for parameterised WHERE clause. + + Returns: + str: Complete DELETE SQL string with %s placeholders. + """ # build condition string if not condition: cndset = [] @@ -986,11 +1433,17 @@ def prepare_delete(self, tablename, condition = None, cndflds = None): if self.PGLOG['DBGLEVEL']: self.pgdbg(1000, sqlstr) return sqlstr - # delete one or mutiple records in tablename according condition - # tablename: delete for one table name each call - # condition: delete conditions for where clause - # return number of records deleted upon success def pgdel(self, tablename, condition, logact = None): + """Delete rows from a table using a raw WHERE condition string. + + Args: + tablename (str): Target table name. + condition (str): WHERE clause (must not be empty or numeric). + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: Number of rows deleted, or self.FAILURE on error. 
+ """ if logact is None: logact = self.PGDBI['ERRLOG'] if not condition or isinstance(condition, int): self.pglog("Miss condition to delete from " + tablename, logact) sqlstr = self.prepare_delete(tablename, condition) @@ -1015,11 +1468,17 @@ def pgdel(self, tablename, condition, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return dcnt - # delete one or mutiple records in tablename according condition - # tablename: delete for one table name each call - # cndict: delete condition dict for names: values - # return number of records deleted upon success def pghdel(self, tablename, cnddict, logact = None): + """Delete rows from a table using a condition dict (parameterised). + + Args: + tablename (str): Target table name. + cnddict (dict): Mapping of column_name → value for the WHERE clause. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: Number of rows deleted, or self.FAILURE on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if not cnddict or isinstance(cnddict, int): self.pglog("Miss condition dict to delete from " + tablename, logact) sqlstr = self.prepare_delete(tablename, None, list(cnddict)) @@ -1046,11 +1505,19 @@ def pghdel(self, tablename, cnddict, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return dcnt - # delete mutiple records in tablename according condition - # tablename: delete for one table name each call - # cndicts: delete condition dict for names: value lists - # return number of records deleted upon success def pgmdel(self, tablename, cnddicts, logact = None): + """Delete multiple rows using a multi-value condition dict. + + Uses psycopg2 execute_batch() for efficient bulk deletes. + + Args: + tablename (str): Target table name. + cnddicts (dict): Mapping of column_name → list-of-condition-values. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. 
+ + Returns: + int: Number of rows deleted, or self.FAILURE on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if not cnddicts or isinstance(cnddicts, int): self.pglog("Miss condition dict to delete from " + tablename, logact) sqlstr = self.prepare_delete(tablename, None, list(cnddicts)) @@ -1080,9 +1547,19 @@ def pgmdel(self, tablename, cnddicts, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return dcnt - # sqlstr: a complete sql string - # return number of record affected upon success def pgexec(self, sqlstr, logact = None): + """Execute a raw SQL statement and return the affected row count. + + Use for DDL or DML that does not fit the structured helpers (e.g. TRUNCATE, + custom UPDATE with subqueries). Not suitable for SELECT queries. + + Args: + sqlstr (str): Complete SQL statement to execute. + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + int: rowcount from the cursor, or self.FAILURE on error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] if self.PGLOG['DBGLEVEL']: self.pgdbg(100, sqlstr) ret = pgcnt = 0 @@ -1106,18 +1583,35 @@ def pgexec(self, sqlstr, logact = None): if self.curtran > self.PGDBI['MTRANS']: self.starttran() return ret - # tablename: one table name to a temporary table - # fromtable: table name data gathing from - # fields: table name data gathing from - # condition: querry conditions for where clause - # return number of records created upon success def pgtemp(self, tablename, fromtable, fields, condition = None, logact = 0): + """Create a temporary table populated from a SELECT query. + + Args: + tablename (str): Name for the new temporary table. + fromtable (str): Source table name. + fields (str): Column expressions for the SELECT. + condition (str | None): Optional WHERE clause. + logact (int): Logging action flags; default 0. + + Returns: + int: Number of rows created, or self.FAILURE on error. 
+ """ sqlstr = "CREATE TEMPORARY TABLE {} SELECT {} FROM {}".format(tablename, fields, fromtable) if condition: sqlstr += " WHERE " + condition return self.pgexec(sqlstr, logact) - # get condition for given table name for accessing information_schema def table_condition(self, tablename): + """Build an information_schema WHERE condition for a given table name. + + Splits schema-qualified names (schema.table) or uses the current SCNAME. + + Args: + tablename (str): Fully-qualified or bare table name. + + Returns: + str: Condition string suitable for querying information_schema.tables + or information_schema.columns. + """ ms = re.match(r'(.+)\.(.+)', tablename) if ms: scname = ms.group(1) @@ -1127,16 +1621,35 @@ def table_condition(self, tablename): tbname = tablename return "table_schema = '{}' AND table_name = '{}'".format(scname, tbname) - # check if a given table name exists or not - # tablename: one table name to check def pgcheck(self, tablename, logact = 0): + """Check whether a table exists in the current schema. + + Args: + tablename (str): Fully-qualified or bare table name. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS if the table exists, self.FAILURE otherwise. + """ condition = self.table_condition(tablename) ret = self.pgget('information_schema.tables', None, condition, logact) return (self.SUCCESS if ret else self.FAILURE) - # group of functions to check parent records and add an empty one if missed - # return user.uid upon success, 0 otherwise def check_user_uid(self, userno, date = None): + """Return the user.uid for a scientist ID, adding a record if missing. + + Looks up the active user record for userno on the given date. If not found, + logs a warning, attempts a date-range-independent lookup, and finally fetches + UCAR person info to insert a new user record. + + Args: + userno (int | str): UCAR scientist number. 
+ date (str | None): Reference date (YYYY-MM-DD); None means today + (uses until_date IS NULL condition). + + Returns: + int: user.uid on success, 0 if userno is falsy or insert fails. + """ if not userno: return 0 if type(userno) is str: userno = int(userno) if date is None: @@ -1158,8 +1671,18 @@ def check_user_uid(self, userno, date = None): if uid: self.pglog("{}: Scientist ID Added as user.uid = {}".format(userno, uid), self.LGWNEM) return uid - # return user.uid upon success, 0 otherwise def get_user_uid(self, logname, date = None): + """Return the user.uid for a UCAR login name, adding a record if missing. + + Similar to check_user_uid() but looks up by logname instead of userno. + + Args: + logname (str): UCAR login name. + date (str | None): Reference date (YYYY-MM-DD); None means today. + + Returns: + int: user.uid on success, 0 if logname is falsy or insert fails. + """ if not logname: return 0 if not date: date = 'today' @@ -1180,8 +1703,21 @@ def get_user_uid(self, logname, date = None): if uid: self.pglog("{}: UCAR Login Name Added as user.uid = {}".format(logname, uid), self.LGWNEM) return uid - # get ucar user info for given userno (scientist number) or logname (Ucar login) def ucar_user_info(self, userno, logname = None): + """Fetch UCAR person info for a scientist ID or login name via pgperson/pgusername. + + Runs the pgperson command-line tool and parses its key<=>value output. + Maps UCAR API fields to database column names, normalises country code, + organisation type, and employment dates. + + Args: + userno (int): Scientist number; pass 0 to look up by logname instead. + logname (str | None): UCAR login name; used when userno is 0. + + Returns: + dict | None: Mapping of column_name → value suitable for pgadd('dssdb.user'), + or None when pgperson returns no output. 
+ """ matches = { 'upid': "upid", 'uid': "userno", @@ -1243,8 +1779,20 @@ def ucar_user_info(self, userno, logname = None): pgrec['until_date'] = val return pgrec - # set country code for given coutry name or email address def set_country_code(self, email, country = None): + """Normalise a country name or derive it from an email domain. + + Applies a correction table for common aliases (e.g. 'ENGLAND' → 'UNITED.KINGDOM'), + joins two-word country names with a dot, and falls back to email_to_country() + when country is None. + + Args: + email (str): User email address used as fallback for country detection. + country (str | None): Raw country string to normalise. + + Returns: + str: Normalised country name (e.g. 'UNITED.STATES', 'FRANCE'). + """ codes = { 'CHINA': "P.R.CHINA", 'ENGLAND': "UNITED.KINGDOM", @@ -1265,8 +1813,19 @@ def set_country_code(self, email, country = None): country = self.email_to_country(email) return country - # return wuser.wuid upon success, 0 otherwise def check_wuser_wuid(self, email, date = None): + """Return the wuser.wuid for an email address, inserting a record if missing. + + Searches wuser by email and active date range, then falls back to a + cross-table lookup in ruser and dssdb.user to populate the new record. + + Args: + email (str): User email address. + date (str | None): Reference date (YYYY-MM-DD); None means today. + + Returns: + int: wuser.wuid on success, 0 if email is falsy or insert fails. + """ if not email: return 0 emcond = "email = '{}'".format(email) if not date: @@ -1312,8 +1871,18 @@ def check_wuser_wuid(self, email, date = None): return wuid return 0 - # return wuser.wuid upon success, 0 otherwise def check_cdp_wuser(self, username): + """Return or create the wuser.wuid for a CDP username. + + Looks up by cdpname first, then by email. Updates cdpid/cdpname on an + existing record or inserts a new one. + + Args: + username (str): CDP (Collaborative Data Portal) username. 
+ + Returns: + int: wuser.wuid on success, 0 on failure. + """ pgrec = self.pgget("wuser", "wuid", "cdpname = '{}'".format(username), self.PGDBI['EXITLG']) if pgrec: return pgrec['wuid'] idrec = self.pgget("wuser", "wuid", "email = '{}'".format(pgrec['email']), self.PGDBI['EXITLG']) @@ -1332,8 +1901,18 @@ def check_cdp_wuser(self, username): self.pglog("CDP User {} added as wuid = {} in RDADB".format(username, wuid), self.LGWNEM) return wuid - # for given email to get long country name def email_to_country(self, email): + """Infer a country name from an email address domain. + + Checks for a two-letter country-code TLD and looks it up in the countries + table. Recognises common US TLDs (.gov, .edu, .mil, .org, .com, .net). + + Args: + email (str): Email address to inspect. + + Returns: + str: Country name (e.g. 'UNITED.STATES', 'GERMANY'), or 'UNKNOWN'. + """ ms = re.search(r'\.(\w\w)$', email) if ms: pgrec = self.pgget("countries", "token", "domain_id = '{}'".format(ms.group(1)), self.PGDBI['EXITLG']) @@ -1343,12 +1922,27 @@ def email_to_country(self, email): else: return "UNKNOWN" - # if filelists is published for given dataset, reset it to 'P' def reset_rdadb_version(self, dsid): + """Increment the version counter for a dataset record in RDADB. + + Args: + dsid (str): Dataset ID (e.g. 'd123000'). + """ self.pgexec("UPDATE dataset SET version = version + 1 WHERE dsid = '{}'".format(dsid), self.PGDBI['ERRLOG']) - # check the use rdadb flag in table dataset for a given dataset and given values def use_rdadb(self, dsid, logact = 0, vals = None): + """Return the use_rdadb flag for a dataset if it matches an allowed set. + + Args: + dsid (str | None): Dataset ID to query. + logact (int): Logging action flags for missing-dataset warnings; default 0. + vals (str | None): Accepted flag characters; defaults to 'IPYMW' when None. 
+ + Returns: + str: The use_rdadb flag character when found and in vals; 'N' when the + dataset exists but the flag is not in vals; '' when dsid is falsy + or the dataset is not in RDADB. + """ ret = '' # default to empty in case dataset not in RDADB if dsid: pgrec = self.pgget("dataset", "use_rdadb", "dsid = '{}'".format(dsid), self.PGDBI['EXITLG']) @@ -1362,12 +1956,22 @@ def use_rdadb(self, dsid, logact = 0, vals = None): self.pglog("Dataset '{}' is not in RDADB!".format(dsid), logact) return ret - # fld: field name for querry condition - # vals: reference to aaray of values - # isstr: 1 for string values requires quotes and support wildcard - # noand: 1 for skiping the leading ' AND ' for condition - # return a condition string for a given field def get_field_condition(self, fld, vals, isstr = 0, noand = 0): + """Build a SQL condition fragment for a field given a list of values. + + Supports equality, range (IN), comparison signs (<, >, <>), LIKE, SIMILAR TO, + and negation (leading '!' in vals). Multiple values are combined with IN or + the appropriate comparison. Prepends ' AND ' unless noand is set. + + Args: + fld (str): Column name for the condition. + vals (list): Values to match; may include sign tokens from PGSIGNS. + isstr (int): 1 to treat values as strings (adds quotes, handles wildcards). + noand (int): 1 to omit the leading ' AND ' prefix. + + Returns: + str: SQL condition fragment, empty string when vals is empty. + """ cnd = wcnd = negative = '' sign = "=" logic = " OR " @@ -1442,8 +2046,21 @@ def get_field_condition(self, fld, vals, isstr = 0, noand = 0): if cnd and not noand: cnd = " AND " + cnd return cnd - # build up fieldname string for given or default condition def fieldname_string(self, fnames, dnames = None, anames = None, wflds = None): + """Resolve a field-name string and insert any required with-fields. + + Returns dnames when fnames is falsy, anames when fnames is 'ALL' (case-insensitive), + otherwise uses fnames as-is. 
Then inserts wflds entries at appropriate positions. + + Args: + fnames (str | None): Requested field names string. + dnames (str | None): Default field names string. + anames (str | None): All-fields string used when fnames == 'all'. + wflds (list | None): Additional fields to insert at specific positions. + + Returns: + str | None: Resolved field-names string. + """ if not fnames: fnames = dnames # include default fields names elif re.match(r'^all$', fnames, re.I): @@ -1465,11 +2082,20 @@ def fieldname_string(self, fnames, dnames = None, anames = None, wflds = None): fnames = fnames[0:pos] + wfld + fnames[pos:] # insert with-field return fnames - # Function get_group_field_path(gindex: group index - # dsid: dataset id - # field: path field name: webpath or savedpath) - # go through group tree upward to find a none-empty path, return it or null def get_group_field_path(self, gindex, dsid, field): + """Walk the group tree upward to find the first non-empty path field. + + Recursively follows pindex links from the given group up to the dataset + level until a non-empty value for field is found. + + Args: + gindex (int | None): Group index to start from; None or 0 queries dataset. + dsid (str): Dataset ID. + field (str): Column name to retrieve (e.g. 'webpath' or 'savedpath'). + + Returns: + str | None: First non-empty path value found, or None. + """ if gindex: pgrec = self.pgget("dsgroup", f"pindex, {field}", f"dsid = '{dsid}' AND gindex = {gindex}", self.PGDBI['EXITLG']) @@ -1483,10 +2109,23 @@ def get_group_field_path(self, gindex, dsid, field): else: return None - # get the specialist info for a given dataset - def get_specialist(self, dsid, logact = None): + def get_specialist(self, dsid, logact=None): + """Return specialist contact info for a dataset, with caching. + + Queries dsowner and dssgrp to find the primary specialist for dsid. + Falls back to 'datahelp' / 'Data Help' when none is found. Results + are cached in self.SPECIALIST keyed by dsid. 
+ + Args: + dsid (str): Dataset ID (e.g. 'd123000'). + logact (int | None): Logging action flags; defaults to PGDBI['ERRLOG']. + + Returns: + dict | None: Record with keys 'specialist', 'lstname', 'fstname', + or None on query error. + """ if logact is None: logact = self.PGDBI['ERRLOG'] - if dsid in self.SPECIALIST: return self.SPECIALIST['dsid'] + if dsid in self.SPECIALIST: return self.SPECIALIST[dsid] pgrec = self.pgget("dsowner, dssgrp", "specialist, lstname, fstname", "specialist = logname AND dsid = '{}' AND priority = 1".format(dsid), logact) @@ -1498,11 +2137,26 @@ def get_specialist(self, dsid, logact = None): pgrec['specialist'] = "datahelp" pgrec['lstname'] = "Help" pgrec['fstname'] = "Data" - self.SPECIALIST['dsid'] = pgrec # cache specialist info for dsowner of dsid + self.SPECIALIST[dsid] = pgrec # cache specialist info for dsowner of dsid return pgrec - # build customized email from get_email() def build_customized_email(self, table, field, condition, subject, logact = 0): + """Send a buffered email, falling back to DB caching on failure. + + Retrieves the accumulated email body from get_email(), addresses it, + and sends via send_python_email(). On failure, tries send_customized_email() + and finally cache_customized_email() to store the message for later delivery. + + Args: + table (str): Table name used for cache fallback storage. + field (str): Column name used for cache fallback storage. + condition (str): WHERE condition identifying the cache row. + subject (str | None): Email subject; auto-generated when None. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on successful send or cache, self.FAILURE otherwise. 
+ """ estat = self.FAILURE msg = self.get_email() if not msg: return estat @@ -1524,14 +2178,21 @@ def build_customized_email(self, table, field, condition, subject, logact = 0): self.pglog("Email {} cached to '{}.{}' for {}, Subject: {}".format(receiver, table, field, condition, subject), logact) return estat - # email: full user email address - # get user real name from table ruser for a given email address - # opts == 1: include email - # opts == 2: include org_type - # opts == 4: include country - # opts == 8: include valid_email - # opts == 16: include org def get_ruser_names(self, email, opts = 0, date = None): + """Retrieve user name fields from ruser (or dssdb.user) for an email address. + + Selects the active record for email on date, adding a derived 'name' key + (First Last). Falls back to dssdb.user when ruser has no match. + + Args: + email (str): User email address. + opts (int): Bitmask to include extra fields: + 1=email, 2=org_type, 4=country, 8=valid_email, 16=org. + date (str | None): Reference date (YYYY-MM-DD); None means today. + + Returns: + dict: Record with at least 'name'; may include extra fields per opts. + """ fields = "lname lstname, fname fstname" if opts&1: fields += ", email" if opts&2: fields += ", org_type" @@ -1566,8 +2227,22 @@ def get_ruser_names(self, email, opts = 0, date = None): if opts&1: pgrec['email'] = email return pgrec - # cache a customized email for sending it later def cache_customized_email(self, table, field, condition, emlmsg, logact = 0): + """Store an email message in a database column for later delivery. + + Attempts a pgupdt() to write emlmsg into table.field for condition. Falls + back to send_customized_email() when the update fails. + + Args: + table (str): Target table name. + field (str): Column to store the email body in. + condition (str): WHERE condition identifying the target row. + emlmsg (str): Full email message text (headers + body). + logact (int): Logging action flags; default 0. 
+ + Returns: + int: self.SUCCESS on update, result of send_customized_email() on failure. + """ pgrec = {field: emlmsg} if self.pgupdt(table, pgrec, condition, logact|self.ERRLOG): if logact: self.pglog("Email cached to '{}.{}' for {}".format(table, field, condition), logact&(~self.EXITLG)) @@ -1577,10 +2252,19 @@ def cache_customized_email(self, table, field, condition, emlmsg, logact = 0): self.pglog(f"Error {msg}, try to send directly now", logact|self.ERRLOG) return self.send_customized_email(msg, emlmsg, logact) - # otype: user organization type - # email: user email address) - # return: orgonizaion type like DSS, NCAR, UNIV... def get_org_type(self, otype, email): + """Derive an organisation type from a type hint and email domain. + + Recognises UCAR/NCAR addresses and refines NCAR to DSS for DECS group members. + Maps common TLDs to org types (EDU→UNIV, GOV, MIL, ORG, COM, NET). + + Args: + otype (str | None): Initial organisation type hint; defaults to 'OTHER'. + email (str | None): Email address used for TLD/domain inference. + + Returns: + str: Normalised organisation type string (e.g. 'DSS', 'NCAR', 'UNIV'). + """ if not otype: otype = "OTHER" if email: ms = re.search(r'(@|\.)ucar\.edu$', email) @@ -1597,17 +2281,40 @@ def get_org_type(self, otype, email): if otype == 'EDU': otype = "UNIV" return otype - # join values and handle the null values @staticmethod def join_values(vstr, vals): + """Append a formatted value list to an existing string. + + Builds a line like 'Value(a, b, c)' or 'Values(a, b)' and appends it to + vstr (newline-separated). Treats None vstr as an empty string. + + Args: + vstr (str | None): Existing accumulated string, or None to start fresh. + vals (list): Values to include in the appended line. + + Returns: + str: Updated string with the new value line appended. 
+ """ if vstr: vstr += "\n" elif vstr is None: vstr = '' return "{}Value{}({})".format(vstr, ('s' if len(vals) > 1 else ''), ', '.join(map(str, vals))) - # check table hostname to find the system down times. Cache the result for 10 minutes def get_system_downs(self, hostname, logact = 0): + """Return the cached system-down status for a hostname, refreshing every 10 minutes. + + Queries the hostname table for service status, planned downtime start/end, and + optional path restrictions. Results are stored in self.SYSDOWN[hostname]. + + Args: + hostname (str): Short hostname to query. + logact (int): Logging action flags; default 0. + + Returns: + dict: Status dict with keys: 'start', 'end', 'active', 'path', + 'chktime', 'curtime'. + """ curtime = int(time.time()) newhost = 0 if hostname not in self.SYSDOWN: @@ -1636,8 +2343,20 @@ def get_system_downs(self, hostname, logact = 0): self.SYSDOWN[hostname]['curtime'] = curtime return self.SYSDOWN[hostname] - # return seconds for how long the system will continue to be down def system_down_time(self, hostname, offset, logact = 0): + """Return the number of seconds a system will continue to be down. + + Uses cached data from get_system_downs(). Returns PBS job time when the + system is down indefinitely and the caller is a PBS batch job. + + Args: + hostname (str): Host to check. + offset (int): Seconds before the scheduled start to consider the system down. + logact (int): Logging action flags; default 0. + + Returns: + int: Remaining down seconds, or 0 when the system is up. 
+ """ down = self.get_system_downs(hostname, logact) if down['start'] and down['curtime'] >= (down['start'] - offset): if not down['end']: @@ -1647,8 +2366,19 @@ def system_down_time(self, hostname, offset, logact = 0): return (down['end'] - down['curtime']) return 0 # the system is not down - # return string message if the system is down def system_down_message(self, hostname, path, offset, logact = 0): + """Return a human-readable downtime message, or None when the system is up. + + Args: + hostname (str): Host to check. + path (str | None): Service path to match against scheduled-down paths. + offset (int): Seconds before scheduled start to report as down. + logact (int): Logging action flags; default 0. + + Returns: + str | None: Downtime description string, or None when the system is up + or the path does not match. + """ down = self.get_system_downs(hostname, logact) msg = None if down['start'] and down['curtime'] >= (down['start'] - offset): @@ -1665,45 +2395,57 @@ def system_down_message(self, hostname, path, offset, logact = 0): msg = " And will end by " + self.current_datetime(down['end']) return msg - # return 1 if given path match daemon paths, 0 if not; -1 if cannot compare @staticmethod def match_down_path(path, dpaths): + """Check whether a service path matches any colon-separated down paths. + + Args: + path (str | None): Service path to test. + dpaths (str | None): Colon-separated list of path prefixes to match. + + Returns: + int: 1 if path matches a prefix in dpaths, 0 if no match, + -1 if either argument is falsy. + """ if not (path and dpaths): return -1 paths = re.split(':', dpaths) for p in paths: if re.match(r'^{}'.format(p), path): return 1 return 0 - # validate is login user is in DECS group - # check all node if skpdsg is false, otherwise check non-DSG nodes def validate_decs_group(self, cmdname, logname, skpdsg): + """Verify that a login name belongs to the DECS group, exiting if not. 
+ + When skpdsg is True and the current host is in PGLOG['DSGHOSTS'], the check + is skipped (DSG nodes are exempt). Falls back to the current user (CURUID) + when logname is not supplied. + + Args: + cmdname (str): Name of the command being run, used in the error message. + logname (str | None): Login name to validate; uses CURUID when falsy. + skpdsg (bool | int): When True, skip the check on DSG-designated hosts. + """ if skpdsg and self.PGLOG['DSGHOSTS'] and re.search(r'(^|:){}'.format(self.PGLOG['HOSTNAME']), self.PGLOG['DSGHOSTS']): return - if not logname: lgname = self.PGLOG['CURUID'] + if not logname: logname = self.PGLOG['CURUID'] if not self.pgget("dssgrp", '', "logname = '{}'".format(logname), self.LGEREX): self.pglog("{}: Must be in DECS Group to run '{}' on {}".format(logname, cmdname, self.PGLOG['HOSTNAME']), self.LGEREX) - # add an allusage record into yearly table; create a new yearly table if it does not exist - # year -- year to identify the yearly table, evaluated if missing - # records -- hash to hold one or multiple records. 
- # Dict keys: email -- user email address, - # org_type -- organization type - # country -- country code - # dsid -- dataset ID - # date -- date data accessed - # time -- time data accessed - # quarter -- quarter of the year data accessed - # size -- bytes of data accessed - # method -- delivery methods: MSS,Web,Ftp,Tape,Cd,Disk,Paper,cArt,Micro - # source -- usage source flag: W - wusage, O - ordusage - # midx -- refer to mbr2loc.midx if not 0 - # ip -- user IP address - # region -- user region name; for example, Colorado - # isarray -- if true, mutiple records provided via arrays for each hash key - # docheck -- if 1, check and add only if record is not on file - # docheck -- if 2, check and add if record is not on file, and update if exists - # docheck -- if 4, check and add if record is not on file, and update if exists, - # and also checking NULL email value too def add_yearly_allusage(self, year, records, isarray = 0, docheck = 0): + """Insert or upsert usage records into the per-year allusage_YYYY table. + + Creates the table via pgddl (ADDTBL flag) when it does not yet exist. + Auto-computes the quarter from the date field when not supplied. + + Args: + year (int | str | None): Four-digit year; derived from records['date'] when 0/None. + records (dict): Usage data. Keys match allusage columns: email, org_type, country, + dsid, date, time, quarter, size, method, source, midx, ip, region. + isarray (int): 0 for a single record dict, 1 for parallel value lists. + docheck (int): 0=insert always; 1=skip if exists; 2=upsert; 4=upsert+NULL email. + + Returns: + int: Number of rows inserted or updated. 
+ """ acnt = 0 if not year: ms = re.match(r'^(\d\d\d\d)', str(records['date'][0] if isarray else records['date'])) @@ -1753,21 +2495,21 @@ def add_yearly_allusage(self, year, records, isarray = 0, docheck = 0): acnt = self.pgadd(tname, record, self.LGEREX|self.ADDTBL) return acnt - # add a wusage record into yearly table; create a new yearly table if it does not exist - # year -- year to identify the yearly table, evaluated if missing - # records -- hash to hold one or multiple records. - # Dict keys: wid - reference to wfile.wid - # wuid_read - reference to wuser.wuid, 0 if missing email - # dsid - reference to dataset.dsid at the time of read - # date_read - date file read - # time_read - time file read - # quarter - quarter of the year data accessed - # size_read - bytes of data read - # method - download methods: WEB, CURL, MGET, FTP and MGET - # locflag - location flag: Glade or Object - # ip - IP address - # isarray -- if true, mutiple records provided via arrays for each hash key def add_yearly_wusage(self, year, records, isarray = 0): + """Insert web-usage records into the per-year wusage_YYYY table. + + Creates the table via pgddl (ADDTBL flag) when it does not yet exist. + Auto-computes the quarter from date_read when not supplied. + + Args: + year (int | str | None): Four-digit year; derived from records['date_read'] when 0/None. + records (dict): Web-usage data. Keys match wusage columns: wid, wuid_read, dsid, + date_read, time_read, quarter, size_read, method, locflag, ip. + isarray (int): 0 for a single record dict, 1 for parallel value lists. + + Returns: + int: Number of rows inserted. 
+ """ acnt = 0 if not year: ms = re.match(r'^(\d\d\d\d)', str(records['date_read'][0] if isarray else records['date_read'])) @@ -1789,18 +2531,45 @@ def add_yearly_wusage(self, year, records, isarray = 0): acnt = self.pgadd(tname, record, self.LGEREX|self.ADDTBL) return acnt - # double quote a array of single or sign delimited strings - def pgnames(self, ary, sign = None, joinstr = None): + def pgnames(self, ary, sign=None, joinstr=None): + """Double-quote a list of identifiers, optionally joining them. + + Delegates each name to pgname() for quoting, then either returns the list + or joins it with joinstr. + + Args: + ary (list[str]): Identifier strings to process. + sign (str | None): Separator character(s) forwarded to pgname() for + schema.table or table.column splitting. + joinstr (str | None): String to join the results with; None returns a list. + + Returns: + list[str] | str: List of quoted names when joinstr is None, otherwise a + single joined string. + """ pgary = [] for a in ary: pgary.append(self.pgname(a, sign)) - if joinstr == None: + if joinstr is None: return pgary else: return joinstr.join(pgary) - # double quote a single or sign delimited string def pgname(self, str, sign = None): + """Double-quote a single identifier or a sign-delimited compound identifier. + + Recursively splits on the first character of sign (e.g. '.' for schema.table), + quotes each component, then reassembles. A component is quoted only when it + contains characters outside [a-z0-9_], starts with a digit, or is a reserved + PostgreSQL word listed in PGRES. + + Args: + str (str): Identifier string to process. + sign (str | None): Separator string; None treats str as a simple identifier. + + Returns: + str: Properly double-quoted identifier string. 
+ """ if sign: nstr = '' names = str.split(sign[0]) @@ -1814,29 +2583,53 @@ def pgname(self, str, sign = None): nstr = '"{}"'.format(nstr) return nstr - # get a postgres password for given host, port, dbname, usname def get_pgpass_password(self): + """Return the database password for the current connection settings. + + Checks PGDBI['PWNAME'] first, then tries OpenBao (get_baopassword()), and + finally falls back to the .pgpass file (get_pgpassword()). + + Returns: + str | None: Password string, or None when no credential is found. + """ if self.PGDBI['PWNAME']: return self.PGDBI['PWNAME'] pwname = self.get_baopassword() if not pwname: pwname = self.get_pgpassword() return pwname - # get the pg passwords from file .pgpass def get_pgpassword(self): + """Look up the password in the cached .pgpass credentials. + + Tries matching on short hostname first, then full hostname, using the + current DBPORT (defaulting to 5432), DBNAME, and LNNAME. + + Returns: + str | None: Password string, or None when no matching entry is found. + """ if not self.DBPASS: self.read_pgpass() dbport = str(self.PGDBI['DBPORT']) if self.PGDBI['DBPORT'] else '5432' pwname = self.DBPASS.get((self.PGDBI['DBSHOST'], dbport, self.PGDBI['DBNAME'], self.PGDBI['LNNAME'])) if not pwname: pwname = self.DBPASS.get((self.PGDBI['DBHOST'], dbport, self.PGDBI['DBNAME'], self.PGDBI['LNNAME'])) return pwname - # get the pg passwords from OpenBao def get_baopassword(self): + """Look up the password from OpenBao for the current database and login name. + + Loads OpenBao secrets for PGDBI['DBNAME'] on first call (or when not cached). + + Returns: + str | None: Password string, or None when not found in OpenBao. + """ dbname = self.PGDBI['DBNAME'] if dbname not in self.DBBAOS: self.read_openbao() return self.DBBAOS[dbname].get(self.PGDBI['LNNAME']) - # Reads the .pgpass file and returns a dictionary of credentials. def read_pgpass(self): + """Read the .pgpass file and populate DBPASS with credentials. 
+ + Searches for .pgpass in DSSHOME first, then GDEXHOME. Each entry is parsed + into a (host, port, dbname, username) tuple key mapping to a password value. + """ pgpass = self.PGLOG['DSSHOME'] + '/.pgpass' if not op.isfile(pgpass): pgpass = self.PGLOG['GDEXHOME'] + '/.pgpass' try: @@ -1849,8 +2642,12 @@ def read_pgpass(self): except Exception as e: self.pglog(str(e), self.PGDBI['ERRLOG']) - # Reads OpenBao secrets and returns a dictionary of credentials. def read_openbao(self): + """Read OpenBao secrets and populate DBBAOS with credentials for DBNAME. + + Uses the hvac client to fetch key-value secrets from the configured BAOURL. + Parses keys matching 'pass' patterns to extract database usernames and passwords. + """ dbname = self.PGDBI['DBNAME'] self.DBBAOS[dbname] = {} url = 'https://bao.k8s.ucar.edu/' diff --git a/src/rda_python_common/pg_file.py b/src/rda_python_common/pg_file.py index 323580f..dcfb8b9 100644 --- a/src/rda_python_common/pg_file.py +++ b/src/rda_python_common/pg_file.py @@ -23,6 +23,47 @@ from .pg_sig import PgSIG class PgFile(PgUtil, PgSIG): + """File operations across local, remote, object-store, and Globus endpoints. + + Provides a unified API to copy, move, delete, check, list, and manage files on: + - Local filesystem (LHOST) + - Remote hosts via rsync/ssh (remote_*) + - S3-compatible object store via isd_s3_cli (object_*) + - Quasar/Globus tape backup via dsglobus (backup_*) + + Inherits date/time utilities from PgUtil and signal handling from PgSIG. + + Class Constants: + CMDBTH (int): pgsystem flag — capture both stdout and stderr. + RETBTH (int): pgsystem flag — return both stdout and stderr. + CMDRET (int): pgsystem flag — return stdout and save stderr. + CMDERR (int): pgsystem flag — display command and save stderr. + CMDGLB (int): pgsystem flag — return stdout and save stderr for Globus calls. + + Instance Attributes: + PGCMPS (dict): Compression tool mapping: ext → [compress_cmd, decompress_cmd, fmt_label]. 
+ CMPSTR (str): Regex alternation of all compression extensions. + PGTARS (dict): Archive tool mapping: ext → [pack_cmd, unpack_cmd, fmt_label]. + TARSTR (str): Regex alternation of all archive extensions. + DELDIRS (dict): Directory → host map for deferred empty-directory cleanup. + TASKIDS (dict): Pending Globus task IDs keyed by 'endpoint-file'. + LHOST (str): Local host sentinel ('localhost'). + OHOST (str): Object-store hostname. + BHOST (str): Backup (Quasar) hostname. + DHOST (str): Disaster-recovery hostname. + OBJCTCMD (str): Object-store CLI executable name. + BACKCMD (str): Globus CLI executable name. + DIRLVLS (int): Levels of parent directories to check for empty-dir cleanup (0=off). + BFILES (dict): Cached bfile records keyed by bid. + ECNTS (dict): Per-storage-type consecutive error counters. + ELMTS (dict): Per-storage-type maximum consecutive error limits. + DHOSTS (dict): Storage flag → hostname mapping. + DPATHS (dict): Storage flag → default path mapping. + QSTATS (dict): Globus status letter → human-readable label. + QPOINTS (dict): Storage flag → Globus endpoint name. + QHOSTS (dict): Globus endpoint name → hostname. + ENDPOINTS (dict): Globus endpoint name → display label. + """ CMDBTH = (0x0033) # return both stdout and stderr, 16 + 32 + 2 + 1 RETBTH = (0x0030) # return both stdout and stderr, 16 + 32 @@ -31,6 +72,14 @@ class PgFile(PgUtil, PgSIG): CMDGLB = (0x0313) # return stdout and save error for globus, 1+2+16+256+512 def __init__(self): + """Initialise PgFile with compression/archive tables and storage host settings. + + Calls PgUtil.__init__() and PgSIG.__init__() via super(), then populates + compression (PGCMPS/CMPSTR) and archive (PGTARS/TARSTR) lookup tables, storage + host names and paths (LHOST, OHOST, BHOST, DHOST, DHOSTS, DPATHS), Globus + endpoint mappings (QPOINTS, QHOSTS, ENDPOINTS), and error-tracking counters + (ECNTS, ELMTS). 
+ """ super().__init__() # initialize parent class self.PGCMPS = { # extension Compress Uncompress ArchiveFormat @@ -106,10 +155,31 @@ def __init__(self): # reset the up limit for a specified error type def reset_error_limit(self, etype, lmt): + """Set the maximum consecutive-error limit for a storage error type. + + Args: + etype (str): Error type key — one of 'D', 'H', 'L', 'R', 'O', 'B'. + lmt (int): New limit; 0 disables exit-on-error for this type. + """ self.ELMTS[etype] = lmt - # wrapping self.pglog() to show error and no fatal exit at the first call for retry + # wrapping self.pglog() to show error and no fatal exit at the first call for retry def errlog(self, msg, etype, retry = 0, logact = 0): + """Log an error and optionally sleep before a retry. + + On the first attempt (retry=0) appends a retry notice to msg and suppresses + fatal exit. On subsequent attempts increments the error counter for etype and + triggers exit when the limit is reached. + + Args: + msg (str): Error message to log. + etype (str): Storage error type key ('L', 'R', 'O', 'B', …). + retry (int): 0 for first attempt (sleep + suppress exit); non-zero for retries. + logact (int): Additional logging action flags; default 0. + + Returns: + int: Always self.FAILURE. + """ bckgrnd = self.PGLOG['BCKGRND'] logact |= self.ERRLOG if not retry: @@ -134,8 +204,23 @@ def errlog(self, msg, etype, retry = 0, logact = 0): # fromfile - source file name # tohost - target host name, default to self.LHOST # fromhost - original host name, default to self.LHOST - # Return 1 if successful 0 if failed with error message generated in self.pgsystem() cached in self.PGLOG['SYSERR'] + # Return 1 if successful 0 if failed with error message generated in self.pgsystem() cached in self.PGLOG['SYSERR'] def copy_gdex_file(self, tofile, fromfile, tohost = None, fromhost = None, logact = 0): + """Copy a file between any combination of local, remote, object, and backup hosts. 
+ + Dispatches to the appropriate low-level copy helper based on the source and + target host names. Background copying is not supported for remote→remote transfers. + + Args: + tofile (str): Destination file path. + fromfile (str): Source file path. + tohost (str | None): Destination host; defaults to LHOST (local). + fromhost (str | None): Source host; defaults to LHOST (local). + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if tohost is None: tohost = self.LHOST if fromhost is None: fromhost = self.LHOST thost = self.strip_host_name(tohost) @@ -168,6 +253,21 @@ def copy_gdex_file(self, tofile, fromfile, tohost = None, fromhost = None, logac # tofile - target file name # fromfile - source file name def local_copy_local(self, tofile, fromfile, logact = 0): + """Copy a file or directory within the local filesystem. + + Verifies the source exists, creates the target directory if needed, then + runs ``cp -f`` (file) or ``cp -rf`` (directory). Retries once after resetting + permissions if the first attempt fails. Validates size match for regular files. + + Args: + tofile (str): Destination path; trailing '/' causes the source basename + to be appended. + fromfile (str): Source path. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ finfo = self.check_local_file(fromfile, 0, logact) if not finfo: if finfo != None: return self.FAILURE @@ -213,6 +313,21 @@ def local_copy_local(self, tofile, fromfile, logact = 0): # fromfile - source file name # host - remote host name def local_copy_remote(self, tofile, fromfile, host, logact = 0): + """Copy a local file to a remote host using the configured sync command. + + Creates the remote target directory if needed, then runs the sync command. + Retries once on failure, validating the size of the transferred file. 
+ + Args: + tofile (str): Destination path on the remote host; trailing '/' appends + the source basename. + fromfile (str): Source local file path. + host (str): Remote hostname. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ finfo = self.check_local_file(fromfile, 0, logact) if not finfo: if finfo != None: return self.FAILURE @@ -248,6 +363,21 @@ def local_copy_remote(self, tofile, fromfile, host, logact = 0): # bucket - bucket name on Object store # meta - reference to metadata hash def local_copy_object(self, tofile, fromfile, bucket = None, meta = None, logact = 0): + """Upload a local file to the object store. + + Skips upload when the target already exists (unless OVRIDE is set). + Attaches user and group metadata. Retries once on failure. + + Args: + tofile (str): Object key (destination path in the bucket). + fromfile (str): Source local file path. + bucket (str | None): Target bucket; defaults to PGLOG['OBJCTBKT']. + meta (dict | None): Extra metadata key/value pairs to attach to the object. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. 
+ """ if not bucket: bucket = self.PGLOG['OBJCTBKT'] if meta is None: meta = {} if 'user' not in meta: meta['user'] = self.PGLOG['CURUID'] @@ -275,12 +405,27 @@ def local_copy_object(self, tofile, fromfile, bucket = None, meta = None, logact return self.FAILURE # Copy multiple files from a Globus endpoint to another - # tofiles - target file name list, echo name leading with /dsnnn.n/ on Quasar and + # tofiles - target file name list, echo name leading with /dsnnn.n/ on Quasar and # leading with /data/ or /decsdata/ on local glade disk # fromfiles - source file name list, the same format as the tofiles - # topoint - target endpoint name, 'gdex-glade', 'gdex-quasar' or 'gdex-quasar-dgdexta' + # topoint - target endpoint name, 'gdex-glade', 'gdex-quasar' or 'gdex-quasar-dgdexta' # frompoint - source endpoint name, the same choices as the topoint def quasar_multiple_trasnfer(self, tofiles, fromfiles, topoint, frompoint, logact = 0): + """Transfer multiple files between two Globus endpoints in a single batch task. + + Builds a JSON batch-transfer spec from parallel source/destination lists + and submits it to dsglobus. Sets self.TASKIDS when the task is still active. + + Args: + tofiles (list[str]): Destination file paths on topoint. + fromfiles (list[str]): Source file paths on frompoint. + topoint (str): Destination Globus endpoint name. + frompoint (str): Source Globus endpoint name. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS, self.FAILURE, or self.FINISH (task still active). 
+ """ ret = self.FAILURE fcnt = len(fromfiles) transfer_files = {"files": []} @@ -309,12 +454,27 @@ def quasar_multiple_trasnfer(self, tofiles, fromfiles, topoint, frompoint, logac return ret # Copy a file from a Globus endpoint to another - # tofile - target file name, leading with /dsnnn.n/ on Quasar and + # tofile - target file name, leading with /dsnnn.n/ on Quasar and # leading with /data/ or /decsdata/ on local glade disk # fromfile - source file, the same format as the tofile - # topoint - target endpoint name, 'gdex-glade', 'gdex-quasar' or 'gdex-quasar-dgdexta' + # topoint - target endpoint name, 'gdex-glade', 'gdex-quasar' or 'gdex-quasar-dgdexta' # frompoint - source endpoint name, the same choices as the topoint def endpoint_copy_endpoint(self, tofile, fromfile, topoint, frompoint, logact = 0): + """Copy a single file between two Globus endpoints with checksum verification. + + Skips copy when target already exists (unless OVRIDE is set). + Sets self.TASKIDS when the task is still active. + + Args: + tofile (str): Destination file path on topoint. + fromfile (str): Source file path on frompoint. + topoint (str): Destination Globus endpoint name. + frompoint (str): Source Globus endpoint name. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS, self.FAILURE, or self.FINISH (task still active). + """ ret = self.FAILURE finfo = self.check_globus_file(fromfile, frompoint, 0, logact) if not finfo: @@ -337,6 +497,21 @@ def endpoint_copy_endpoint(self, tofile, fromfile, topoint, frompoint, logact = # submit a globus task and return a task id def submit_globus_task(self, cmd, endpoint, logact = 0, qstr = None): + """Submit a dsglobus command as a Globus task and wait for it to complete. + + Retries once on error. Resets ECNTS['B'] on success or active-task return. + Checks host down-status when syserr is present. + + Args: + cmd (str): Complete dsglobus command string to execute. 
+ endpoint (str): Target Globus endpoint name (used for host-status checks). + logact (int): Logging action flags; default 0. + qstr (str | None): Optional JSON string piped to stdin (for batch transfers). + + Returns: + dict: Task dict with keys 'id' (task UUID or None) and 'stat' + ('S'=succeeded, 'A'=active, 'F'=failed, 'U'=unknown). + """ task = {'id': None, 'stat': 'U'} loop = reset = 0 while (loop-reset) < 2: @@ -369,6 +544,20 @@ def submit_globus_task(self, cmd, endpoint, logact = 0, qstr = None): # check Globus transfer status for given taskid. Cancel the task # if self.NOWAIT presents and Details is neither OK nor Queued def check_globus_status(self, taskid, endpoint = None, logact = 0): + """Poll a Globus task for its current status. + + When NOWAIT is set and the detail is not OK/Queued, cancels the task. + Resets ECNTS['B'] on success or active return. + + Args: + taskid (str): Globus task UUID to query. + endpoint (str | None): Endpoint name for host-status checks; defaults to + PGLOG['BACKUPEP']. + logact (int): Logging action flags; default 0. + + Returns: + str: Single-letter status — 'S' succeeded, 'A' active, 'F' failed, 'U' unknown. + """ ret = 'U' if not taskid: return ret if not endpoint: endpoint = self.PGLOG['BACKUPEP'] @@ -407,6 +596,20 @@ def check_globus_status(self, taskid, endpoint = None, logact = 0): # return SUCCESS if Globus transfer is done; FAILURE otherwise def check_globus_finished(self, tofile, topoint, logact = 0): + """Block until a previously submitted Globus task completes. + + Looks up the task ID in self.TASKIDS using 'endpoint-file' as the key. + When NOWAIT is set, polls up to 2 extra times before switching to blocking mode. + Removes the task from TASKIDS on success. + + Args: + tofile (str): Destination file path used to look up the task key. + topoint (str): Destination Globus endpoint name. + logact (int): Logging action flags; default 0. 
+ + Returns: + int: self.SUCCESS on completion, self.FAILURE on error or non-success status. + """ ret = self.SUCCESS ckey = "{}-{}".format(topoint, tofile) if ckey in self.TASKIDS: @@ -443,6 +646,17 @@ def check_globus_finished(self, tofile, topoint, logact = 0): # fromfile - source file name, leading with /data/ or /decsdata/ # endpoint - endpoint name on Quasar Backup Server def local_copy_backup(self, tofile, fromfile, endpoint = None, logact = 0): + """Copy a local GLADE file to the Quasar backup endpoint via Globus. + + Args: + tofile (str): Destination path on the backup endpoint (leading '/dsNNN.N/'). + fromfile (str): Source local path (leading '/data/' or '/decsdata/'). + endpoint (str | None): Globus endpoint name; defaults to PGLOG['BACKUPEP']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS, self.FAILURE, or self.FINISH. + """ if not endpoint: endpoint = self.PGLOG['BACKUPEP'] return self.endpoint_copy_endpoint(tofile, fromfile, endpoint, 'gdex-glade', logact) @@ -451,6 +665,17 @@ def local_copy_backup(self, tofile, fromfile, endpoint = None, logact = 0): # fromfile - source file name, leading with /dsnnn.n/ # endpoint - endpoint name on Quasar Backup Server def backup_copy_local(self, tofile, fromfile, endpoint = None, logact = 0): + """Copy a file from the Quasar backup endpoint to a local GLADE path via Globus. + + Args: + tofile (str): Destination local path (leading '/data/' or '/decsdata/'). + fromfile (str): Source backup path (leading '/dsNNN.N/'). + endpoint (str | None): Globus endpoint name; defaults to PGLOG['BACKUPEP']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS, self.FAILURE, or self.FINISH. 
+ """ if not endpoint: endpoint = self.PGLOG['BACKUPEP'] return self.endpoint_copy_endpoint(tofile, fromfile, 'gdex-glade', endpoint, logact) @@ -459,12 +684,26 @@ def backup_copy_local(self, tofile, fromfile, endpoint = None, logact = 0): # fromfile - source file name # host - remote host name def remote_copy_local(self, tofile, fromfile, host, logact = 0): + """Copy a file from a remote host to the local filesystem. + + Creates the local target directory if needed. Retries once after resetting + permissions if the first attempt fails. Validates size match for regular files. + + Args: + tofile (str): Destination local path; trailing '/' appends source basename. + fromfile (str): Source file path on the remote host. + host (str): Remote hostname. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ cmd = self.get_sync_command(host) finfo = self.check_remote_file(fromfile, host, 0, logact) + target = tofile if not finfo: if finfo != None: return self.FAILURE return self.errlog("{}-{}: {} to copy to {}".format(host, fromfile, self.PGLOG['MISSFILE'], tofile), 'R', 1, logact) - target = tofile ms = re.match(r'^(.+)/$', tofile) if ms: dir = ms.group(1) @@ -496,6 +735,20 @@ def remote_copy_local(self, tofile, fromfile, host, logact = 0): # fromfile - source file name # bucket - bucket name on Object store def object_copy_local(self, tofile, fromfile, bucket = None, logact = 0): + """Download a file from the object store to the local filesystem. + + Changes to the target directory, downloads using isd_s3_cli, verifies size, + sets permissions, and renames if needed. Retries once on failure. + + Args: + tofile (str): Destination local file path. + fromfile (str): Object key (source path in the bucket). + bucket (str | None): Source bucket; defaults to PGLOG['OBJCTBKT']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. 
+ """ ret = self.FAILURE if not bucket: bucket = self.PGLOG['OBJCTBKT'] finfo = self.check_object_file(fromfile, bucket, 0, logact) @@ -537,6 +790,23 @@ def object_copy_local(self, tofile, fromfile, bucket = None, logact = 0): # bucket - bucket name on Object store # meta - reference to metadata hash def remote_copy_object(self, tofile, fromfile, host, bucket = None, meta = None, logact = 0): + """Copy a file from a remote host to the object store. + + If host is local, delegates directly to local_copy_object(). Otherwise copies + the file locally first (to TMPPATH), uploads it to the object store, then + removes the temporary local copy. + + Args: + tofile (str): Object key (destination path in bucket). + fromfile (str): Source file path on the remote host. + host (str): Remote hostname. + bucket (str | None): Target bucket; defaults to PGLOG['OBJCTBKT']. + meta (dict | None): Metadata to attach to the object. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if self.is_local_host(host): return self.local_copy_object(tofile, fromfile, bucket, meta, logact) locfile = "{}/{}".format(self.PGLOG['TMPPATH'], op.basename(tofile)) ret = self.remote_copy_local(locfile, fromfile, host, logact) @@ -552,6 +822,21 @@ def remote_copy_object(self, tofile, fromfile, host, bucket = None, meta = None, # bucket - bucket name on Object store # meta - reference to metadata hash def object_copy_remote(self, tofile, fromfile, host, bucket = None, logact = 0): + """Copy a file from the object store to a remote host. + + If host is local, delegates to object_copy_local(). Otherwise downloads to + TMPPATH first, uploads to the remote, then removes the temporary copy. + + Args: + tofile (str): Destination file path on the remote host. + fromfile (str): Object key (source path in bucket). + host (str): Remote hostname. + bucket (str | None): Source bucket; defaults to PGLOG['OBJCTBKT']. 
+ logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if self.is_local_host(host): return self.object_copy_local(tofile, fromfile, bucket, logact) locfile = "{}/{}".format(self.PGLOG['TMPPATH'], op.basename(tofile)) ret = self.object_copy_local(locfile, fromfile, bucket, logact) @@ -563,8 +848,21 @@ def object_copy_remote(self, tofile, fromfile, host, bucket = None, logact = 0): # Delete a file/directory on a given host name (including local host) no background process for deleting # file - file name to be deleted # host - host name the file on, default to self.LHOST - # Return 1 if successful 0 if failed with error message generated in self.pgsystem() cached in self.PGLOG['SYSERR'] + # Return 1 if successful 0 if failed with error message generated in self.pgsystem() cached in self.PGLOG['SYSERR'] def delete_gdex_file(self, file, host, logact = 0): + """Delete a file or directory on any supported storage host. + + Dispatches to delete_local_file(), delete_object_file(), or + delete_remote_file() based on the host name. + + Args: + file (str): File or directory path to delete. + host (str): Storage host (local, object, or remote hostname). + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: return self.delete_local_file(file, logact) @@ -576,6 +874,18 @@ def delete_gdex_file(self, file, host, logact = 0): # Delete a local file/irectory def delete_local_file(self, file, logact = 0): + """Delete a local file or directory with retry on failure. + + Uses ``rm -rf``. After deletion, records the parent directory for deferred + empty-directory cleanup when DIRLVLS is set. + + Args: + file (str): Local file or directory path to delete. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. 
+ """ info = self.check_local_file(file, 0, logact) if not info: return self.FAILURE cmd = "rm -rf " @@ -596,6 +906,19 @@ def delete_local_file(self, file, logact = 0): # Delete file/directory on a remote host def delete_remote_file(self, file, host, logact = 0): + """Delete a file or directory on a remote host. + + Verifies existence first. Retries once on failure. Records the parent + directory for deferred cleanup when DIRLVLS is set. + + Args: + file (str): Remote file or directory path to delete. + host (str): Remote hostname. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if not self.check_remote_file(file, host, logact): return self.FAILURE cmd = self.get_sync_command(host) for loop in range(2): @@ -605,8 +928,21 @@ def delete_remote_file(self, file, host, logact = 0): self.errlog(self.PGLOG['SYSERR'], 'R', loop, logact) return self.FAILURE - # Delete a file on object store + # Delete a file on object store def delete_object_file(self, file, bucket = None, logact = 0): + """Delete one or more object-store files matching a key pattern. + + Lists matching keys, deletes each, then re-lists to confirm deletion. + Retries once on failure. + + Args: + file (str): Object key or key prefix to match for deletion. + bucket (str | None): Target bucket; defaults to PGLOG['OBJCTBKT']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if not bucket: bucket = self.PGLOG['OBJCTBKT'] ocmd = self.OBJCTCMD for loop in range(2): @@ -623,8 +959,20 @@ def delete_object_file(self, file, bucket = None, logact = 0): if errmsg: self.errlog(errmsg, 'O', loop, logact) return self.FAILURE - # Delete a backup file on Quasar Server + # Delete a backup file on Quasar Server def delete_backup_file(self, file, endpoint = None, logact = 0): + """Delete a file on the Quasar backup endpoint via a Globus delete task. 
+ + Sets self.TASKIDS when the task is still active. + + Args: + file (str): File path on the backup endpoint. + endpoint (str | None): Globus endpoint name; defaults to PGLOG['BACKUPEP']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS, self.FAILURE, or self.FINISH. + """ if not endpoint: endpoint = self.PGLOG['BACKUPEP'] info = self.check_backup_file(file, endpoint, 0, logact) if not info: return self.FAILURE @@ -642,6 +990,19 @@ def delete_backup_file(self, file, endpoint = None, logact = 0): # file - file name (mandatory) # info - gathered file info with option 14, None means file not exists def reset_local_info(self, file, info = None, logact = 0): + """Attempt to make a local file or its parent directory writable. + + Called before retrying a failed copy or delete. Resets file mode to 0o664 + and directory mode to 0o775, and changes the group to GDEXGRP. + + Args: + file (str): File path whose permissions need resetting. + info (dict | None): Existing file-info dict (opt=14); re-fetched when None. + logact (int): Logging action flags; default 0. + + Returns: + int: 1 if any change was made, 0 otherwise. + """ ret = 0 if info: if info['isfile']: @@ -658,6 +1019,16 @@ def reset_local_info(self, file, info = None, logact = 0): # reset local directory group/mode def reset_local_directory(self, dir, info = None, logact = 0): + """Reset a local directory's mode to 0o775 and group to GDEXGRP. + + Args: + dir (str): Local directory path. + info (dict | None): File-info dict (opt=14); re-fetched when None or incomplete. + logact (int): Logging action flags; default 0. + + Returns: + int: 1 if any change was made, 0 otherwise. 
+ """ ret = 0 if not (info and 'mode' in info and 'group' in info and 'logname' in info): info = self.check_local_file(dir, 14, logact) @@ -670,6 +1041,16 @@ def reset_local_directory(self, dir, info = None, logact = 0): # reset local file group/mode def reset_local_file(self, file, info = None, logact = 0): + """Reset a local file's mode to 0o664 and group to GDEXGRP. + + Args: + file (str): Local file path. + info (dict | None): File-info dict (opt=14); re-fetched when None or incomplete. + logact (int): Logging action flags; default 0. + + Returns: + int: Number of successful changes made (0 if none). + """ ret = 0 if not (info and 'mode' in info and 'group' in info and 'logname' in info): info = self.check_local_file(file, 14, logact) @@ -686,6 +1067,19 @@ def reset_local_file(self, file, info = None, logact = 0): # host - host name the file is moved on, default to self.LHOST # Return self.SUCCESS if successful self.FAILURE otherwise def move_gdex_file(self, tofile, fromfile, host, logact = 0): + """Move a file on any supported storage host (same host only). + + Dispatches to move_local_file(), move_object_file(), or move_remote_file(). + + Args: + tofile (str): Destination file path. + fromfile (str): Source file path. + host (str): Storage host name. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: return self.move_local_file(tofile, fromfile, logact) @@ -699,6 +1093,20 @@ def move_gdex_file(self, tofile, fromfile, host, logact = 0): # tofile - target file name # fromfile - source file name def move_local_file(self, tofile, fromfile, logact = 0): + """Move a file or directory within the local filesystem using ``mv``. + + Skips move when tofile already exists and has the right content (unless OVRIDE + is set). Creates the target directory if needed. 
Records the source parent + directory for deferred empty-directory cleanup when DIRLVLS is set. + + Args: + tofile (str): Destination local path. + fromfile (str): Source local path. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ dir = self.get_local_dirname(tofile) info = self.check_local_file(fromfile, 0, logact) tinfo = self.check_local_file(tofile, 0, logact) @@ -732,6 +1140,21 @@ def move_local_file(self, tofile, fromfile, logact = 0): # host - remote host name # locfile - local copy of tofile def move_remote_file(self, tofile, fromfile, host, logact = 0): + """Move a file on a remote host by copy-then-delete. + + If host is local, delegates to move_local_file(). Otherwise copies the file + locally (to TMPPATH), uploads to the remote destination, removes the local + temp copy, then deletes the remote source. + + Args: + tofile (str): Destination path on the remote host. + fromfile (str): Source path on the remote host. + host (str): Remote hostname. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if self.is_local_host(host): return self.move_local_file(tofile, fromfile, logact) ret = self.FAILURE dir = op.dirname(tofile) @@ -765,6 +1188,21 @@ def move_remote_file(self, tofile, fromfile, host, logact = 0): # tobucket - target bucket name # frombucket - original bucket name def move_object_file(self, tofile, fromfile, tobucket, frombucket, logact = 0): + """Move an object-store file to a new key (same or different bucket). + + Retrieves existing metadata before moving. Skips move when target already + exists with the same size (unless OVRIDE is set). + + Args: + tofile (str): Destination object key. + fromfile (str): Source object key. + tobucket (str | None): Destination bucket; defaults to PGLOG['OBJCTBKT']. + frombucket (str | None): Source bucket; defaults to tobucket. 
+ logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ ret = self.FAILURE if not tobucket: tobucket = self.PGLOG['OBJCTBKT'] if not frombucket: frombucket = tobucket @@ -804,6 +1242,18 @@ def move_object_file(self, tofile, fromfile, tobucket, frombucket, logact = 0): # tobucket - target bucket name # frombucket - original bucket name def move_object_path(self, topath, frompath, tobucket, frombucket, logact = 0): + """Move all object-store keys under a path prefix to a new prefix. + + Args: + topath (str): Destination path prefix. + frompath (str): Source path prefix. + tobucket (str | None): Destination bucket; defaults to PGLOG['OBJCTBKT']. + frombucket (str | None): Source bucket; defaults to tobucket. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ ret = self.FAILURE if not tobucket: tobucket = self.PGLOG['OBJCTBKT'] if not frombucket: frombucket = tobucket @@ -830,6 +1280,20 @@ def move_object_path(self, topath, frompath, tobucket, frombucket, logact = 0): # fromfile - source file name # endpoint - Globus endpoint def move_backup_file(self, tofile, fromfile, endpoint = None, logact = 0): + """Rename a file on the Quasar backup endpoint via dsglobus. + + Creates the target parent directory if the rename fails with 'No such file + or directory'. Resets ECNTS['B'] on success. + + Args: + tofile (str): New path on the backup endpoint. + fromfile (str): Current path on the backup endpoint. + endpoint (str | None): Globus endpoint; defaults to PGLOG['BACKUPEP']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. 
+ """ ret = self.FAILURE if not endpoint: endpoint = self.PGLOG['BACKUPEP'] finfo = self.check_backup_file(fromfile, endpoint, 0, logact) @@ -872,6 +1336,18 @@ def move_backup_file(self, tofile, fromfile, endpoint = None, logact = 0): # host - host name the directory on, default to self.LHOST # Return self.SUCCESS(1) if successful or self.FAILURE(0) if failed def make_gdex_directory(self, dir, host, logact = 0): + """Create a directory on any supported host. + + Dispatches to make_local_directory() or make_remote_directory(). + + Args: + dir (str): Directory path to create. + host (str): Target host name. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if not dir: return self.SUCCESS shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: @@ -883,10 +1359,32 @@ def make_gdex_directory(self, dir, host, logact = 0): # Make a local directory # dir - directory path to be made def make_local_directory(self, dir, logact = 0): + """Create a local directory, including all parent directories. + + Args: + dir (str): Local directory path to create. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ return self.make_one_local_directory(dir, None, logact) # Make a local directory recursively def make_one_local_directory(self, dir, odir = None, logact = 0): + """Recursively create a single local directory. + + Returns immediately when dir already exists or is '/'. Refuses to create + within restricted root paths. Resets permissions and retries once on failure. + + Args: + dir (str): Directory to create. + odir (str | None): Original requested directory (for error messages). + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. 
+ """ if not dir or op.isdir(dir): return self.SUCCESS if op.isfile(dir): return self.errlog(dir + ": is file, cannot make directory", 'L', 1, logact) if not odir: odir = dir @@ -910,9 +1408,33 @@ def make_one_local_directory(self, dir, odir = None, logact = 0): # dir - directory path to be made # host - host name the directory on def make_remote_directory(self, dir, host, logact = 0): + """Create a directory on a remote host. + + Args: + dir (str): Remote directory path to create. + host (str): Remote hostname. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ return self.make_one_remote_directory(dir, None, host, logact) def make_one_remote_directory(self, dir, odir, host, logact = 0): + """Recursively create a single directory on a remote host via the sync command. + + Returns immediately when the directory already exists. Refuses to create within + restricted root paths. + + Args: + dir (str): Remote directory to create. + odir (str | None): Original requested directory (for error messages). + host (str): Remote hostname. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ info = self.check_remote_file(dir, host, 0, logact) if info: if info['isfile']: return self.errlog("{}-{}: is file, cannot make directory".format(host, dir), 'R', 1, logact) @@ -931,10 +1453,35 @@ def make_one_remote_directory(self, dir, odir, host, logact = 0): # Make a quasar directory # dir - directory path to be made def make_backup_directory(self, dir, endpoint, logact = 0): + """Create a directory on the Quasar backup endpoint via dsglobus. + + Args: + dir (str): Directory path on the backup endpoint. + endpoint (str): Globus endpoint name. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. 
+ """ return self.make_one_backup_directory(dir, None, endpoint, logact) # Make a quasar directory recursively def make_one_backup_directory(self, dir, odir, endpoint = None, logact = 0): + """Recursively create a single directory on a Quasar backup endpoint. + + Returns immediately for '/' or when the directory already exists. Retries + recursively when 'No such file or directory' is reported. Resets ECNTS['B'] + on success. + + Args: + dir (str): Directory path to create. + odir (str | None): Original requested directory (for error messages). + endpoint (str | None): Globus endpoint; defaults to PGLOG['BACKUPEP']. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if not dir or dir == '/': return self.SUCCESS if not endpoint: endpoint = self.PGLOG['BACKUPEP'] info = self.check_backup_file(dir, endpoint, 0, logact) @@ -947,6 +1494,7 @@ def make_one_backup_directory(self, dir, odir, endpoint = None, logact = 0): if not self.make_one_backup_directory(op.dirname(dir), odir, endpoint, logact): return self.FAILURE bcmd = self.BACKCMD cmd = f"{bcmd} mkdir -ep {endpoint} -p {dir}" + ret = self.FAILURE for loop in range(2): buf = self.pgsystem(cmd, logact, self.CMDRET) syserr = self.PGLOG['SYSERR'] @@ -970,6 +1518,22 @@ def make_one_backup_directory(self, dir, odir, endpoint = None, logact = 0): # check and return 1 if a root directory def is_root_directory(self, dir, etype, host = None, action = None, logact = 0): + """Return 1 if dir is a root/protected directory that must not be deleted. + + Checks against GPFSROOTS, HOMEROOTS, and a depth limit based on how many + leading components dir contains. Logs an error with host-status context when + action is provided. + + Args: + dir (str): Directory path to check. + etype (str): Error type key for errlog(). + host (str | None): Associated host for host_down_status(). + action (str | None): Action description for the error message. 
+ logact (int): Logging action flags; default 0. + + Returns: + int: 1 if dir is a root/protected path, 0 otherwise. + """ ret = cnt = 0 if re.match(r'^{}'.format(self.PGLOG['DSSDATA']), dir): ms = re.match(r'^({})(.*)$'.format(self.PGLOG['GPFSROOTS']), dir) @@ -997,6 +1561,22 @@ def is_root_directory(self, dir, etype, host = None, action = None, logact = 0): # set mode for a given direcory/file on a given host (include local host) def set_gdex_mode(self, file, isfile, host, nmode = None, omode = None, logname = None, logact = 0): + """Set the permission mode of a file/directory on any supported host. + + Dispatches to set_local_mode() or set_remote_mode(). + + Args: + file (str): File or directory path. + isfile (int): 1 for regular file, 0 for directory. + host (str): Target host. + nmode (int | None): New octal mode; defaults to FILEMODE or EXECMODE. + omode (int | None): Current mode (skip re-fetch when provided). + logname (str | None): Current owner login (used for local mode change). + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: return self.set_local_mode(file, isfile, nmode, omode, logname, logact) @@ -1006,6 +1586,23 @@ def set_gdex_mode(self, file, isfile, host, nmode = None, omode = None, logname # set mode for given local directory or file def set_local_mode(self, file, isfile = 1, nmode = 0, omode = 0, logname = None, logact = 0): + """Set the permission mode of a local file or directory. + + No-op when nmode already equals omode. Fetches the current mode from + check_local_file() when omode/logname are not provided. + + Args: + file (str): Local file or directory path. + isfile (int): 1 for regular file, 0 for directory. + nmode (int): New octal mode; 0 → FILEMODE or EXECMODE. + omode (int): Current mode; 0 triggers a fresh stat call. 
+ logname (str | None): Current owner login (informational, used to detect + whether a stat call is needed). + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS on success, self.FAILURE on error. + """ if not nmode: nmode = (self.PGLOG['FILEMODE'] if isfile else self.PGLOG['EXECMODE']) if not (omode and logname): info = self.check_local_file(file, 6) @@ -1023,6 +1620,21 @@ def set_local_mode(self, file, isfile = 1, nmode = 0, omode = 0, logname = None, # set mode for given directory or file on remote host def set_remote_mode(self, file, isfile, host, nmode = 0, omode = 0, logact = 0): + """Set the permission mode of a file or directory on a remote host. + + No-op when nmode already equals omode. + + Args: + file (str): Remote file or directory path. + isfile (int): 1 for regular file, 0 for directory. + host (str): Remote hostname. + nmode (int): New octal mode; 0 → FILEMODE or EXECMODE. + omode (int): Current mode; 0 triggers a remote stat call. + logact (int): Logging action flags; default 0. + + Returns: + int: Result of pgsystem() call (truthy on success). + """ if not nmode: nmode = (self.PGLOG['FILEMODE'] if isfile else self.PGLOG['EXECMODE']) if not omode: info = self.check_remote_file(file, host, 6) @@ -1035,6 +1647,21 @@ def set_remote_mode(self, file, isfile, host, nmode = 0, omode = 0, logact = 0): # change group for given local directory or file def change_local_group(self, file, ngrp = None, ogrp = None, logname = None, logact = 0): + """Change the group ownership of a local file or directory. + + No-op when the file already belongs to the target group. Fetches current + ownership from check_local_file() when ogrp/logname are not provided. + + Args: + file (str): Local file or directory path. + ngrp (str | None): New group name; None uses PGLOG['GDEXGID'] directly. + ogrp (str | None): Current group name (skip re-fetch when provided with logname). + logname (str | None): Current owner login name. 
+ logact (int): Logging action flags; default 0. + + Returns: + int | None: self.SUCCESS on success, self.FAILURE on error, None if already correct. + """ if not ngrp: ngid = self.PGLOG['GDEXGID'] else: @@ -1059,15 +1686,30 @@ def change_local_group(self, file, ngrp = None, ogrp = None, logname = None, log # Check if given path on a specified host or the host itself are down # path: path name to be checked # host: host name the file on, default to self.LHOST - # chkopt: 1 - do a file/path check, 0 - do not + # chkopt: 1 - do a file/path check, 0 - do not # Return array of 2 (hstat, msg) # hstat: 0 if system is up and accessible, # 1 - host is down, # 2 - if path not accessible # negative values if planned system down # msg: None - stat == 0 - # an unempty string for system down message - stat != 0 + # an unempty string for system down message - stat != 0 def host_down_status(self, path, host, chkopt = 0, logact = 0): + """Diagnose whether a storage host or path is currently inaccessible. + + Checks the local filesystem, GPFS, object store, backup endpoints, and remote + hosts. Calls system_down_message() to detect planned outages. + + Args: + path (str): Path to check; empty string skips path-level checks. + host (str): Host to check. + chkopt (int): 1 to perform an actual file/path existence check. + logact (int): Logging action flags; default 0. + + Returns: + tuple: (hstat, msg) where hstat is 0 (up), 1 (host down), 2 (path inaccessible), + or a negative value for a planned outage; msg is None when hstat == 0. + """ shost = self.strip_host_name(host) hstat = 0 rets = [0, None] @@ -1156,6 +1798,16 @@ def host_down_status(self, path, host, chkopt = 0, logact = 0): # host: host name the file on, default to self.LHOST # Return errmsg if not accessible and None otherwise def check_host_down(self, path, host, logact = 0): + """Return an error message if a path on a host is inaccessible, else None. + + Args: + path (str): Path to check. + host (str): Host name. 
+ logact (int): Logging action flags; default 0. + + Returns: + str | None: Error message string if down, None if accessible. + """ (hstat, msg) = self.host_down_status(path, host, 1, logact) return msg if hstat else None @@ -1165,6 +1817,19 @@ def check_host_down(self, path, host, logact = 0): # reset the service flag to A or I accordingly # Return 0 if accessible, dsservice.sindex if not, and -1 if can not be checked def check_service_accessibilty(self, sname, fhost = None, logact = 0): + """Check whether a named service is accessible from a specified host. + + Looks up the service in dsservice table and calls host_down_status() for + the associated path/flag. + + Args: + sname (str): Service name to check. + fhost (str | None): Host from which to check; defaults to PGLOG['HOSTNAME']. + logact (int): Logging action flags; default 0. + + Returns: + int | str | None: 0 if accessible, error message if not, -1 if undefined. + """ if not fhost: fhost = self.PGLOG['HOSTNAME'] pgrec = self.pgget("dsservice", "*", "service = '{}' AND hostname = '{}'".format(sname, fhost), logact) if not pgrec: @@ -1176,12 +1841,36 @@ def check_service_accessibilty(self, sname, fhost = None, logact = 0): # check if this host is a local host for given host name def is_local_host(self, host): + """Return 1 if host resolves to the local host, 0 otherwise. + + Considers batch nodes as local via valid_batch_host(). + + Args: + host (str): Host name to test. + + Returns: + int: 1 if local, 0 if remote. + """ host = self.strip_host_name(host) if host == self.LHOST or self.valid_batch_host(host): return 1 return 0 # check and return action string on a node other than local one def local_host_action(self, host, action, info, logact = 0): + """Log a 'cannot perform action on non-local host' message and return a status. + + Returns 1 silently when host is local. Returns 0 when logact is 0 (no log). + Otherwise logs a message directing the user to the correct node/interface. 
+ + Args: + host (str): Target host name. + action (str): Action description for the error message. + info (str): Subject of the action (file, dataset, etc.). + logact (int): Logging action flags; default 0. + + Returns: + int | None: 1 if local, 0 if no logact, else result of pglog(). + """ if self.is_local_host(host): return 1 if not logact: return 0 if host == "partition": @@ -1197,6 +1886,17 @@ def local_host_action(self, host, action, info, logact = 0): # ping a given remote host name # return None if system is up error messge if not def ping_remote_host(self, host): + """Ping a remote host and return None if reachable, an error string if not. + + Appends '.ucar.edu' and retries when 'unknown host' is reported. + Sends 3 ICMP packets and considers the host up if at least 1 is received. + + Args: + host (str): Hostname or IP to ping. + + Returns: + str | None: None if reachable, error message string if unreachable. + """ while True: buf = self.pgsystem("ping -c 3 " + host, self.LOGWRN, self.CMDRET) if buf: @@ -1215,13 +1915,35 @@ def ping_remote_host(self, host): return "Cannot ping " + host # compare given two host names, return 1 if same and 0 otherwise - def same_hosts(self, host1, host2): + def same_hosts(self, host1, host2): + """Return 1 if two host names resolve to the same host, 0 otherwise. + + Comparison is case-insensitive after stripping domain components. + + Args: + host1 (str): First hostname. + host2 (str): Second hostname. + + Returns: + int: 1 if the same host, 0 otherwise. + """ host1 = self.strip_host_name(host1) host2 = self.strip_host_name(host2) return (1 if self.pgcmp(host1, host2, 1) == 0 else 0) # strip and identify the proper host name def strip_host_name(self, host): + """Return the short hostname component, mapped to LHOST when it matches self. + + Strips any domain suffix (everything after the first dot). Maps the current + machine's hostname to 'localhost' and returns LHOST for empty/None input. 
+ + Args: + host (str | None): Hostname to normalise. + + Returns: + str: Short hostname, or self.LHOST for local/empty input. + """ if not host: return self.LHOST ms = re.match(r'^([^\.]+)\.', host) if ms: host = ms.group(1) @@ -1242,6 +1964,20 @@ def strip_host_name(self, host): # 32 - get checksum (checksum), work for local file only # Return a dict of file info, or None if file not exists def check_gdex_file(self, file, host = None, opt = 0, logact = 0): + """Return file status info for a file on any supported storage host. + + Dispatches to check_local_file(), check_object_file(), check_backup_file(), + or check_remote_file() based on the host name. + + Args: + file (str): File path. + host (str | None): Storage host; defaults to LHOST. + opt (int): Bitmask of info to retrieve (see check_local_file for values). + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. + """ if host is None: host = self.LHOST shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: @@ -1259,6 +1995,21 @@ def check_gdex_file(self, file, host = None, opt = 0, logact = 0): # wrapper to self.check_local_file() and self.check_globus_file() to check info for a file # on local or remote Globus endpoints def check_globus_file(self, file, endpoint = None, opt = 0, logact = 0): + """Return file info for a file on a local or remote Globus endpoint. + + Converts GLADE-relative paths (starting with '/data/' or '/decsdata/') to + absolute paths before calling check_local_file() for 'gdex-glade'. Delegates + to check_backup_file() for all other endpoints. + + Args: + file (str): File path relative to the endpoint. + endpoint (str | None): Globus endpoint name; defaults to PGLOG['BACKUPEP']. + opt (int): Info bitmask. + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. 
+ """ if not endpoint: endpoint = self.PGLOG['BACKUPEP'] if endpoint == 'gdex-glade': if re.match(r'^/(data|decsdata)/', file): file = self.PGLOG['DSSDATA'] + file @@ -1279,6 +2030,22 @@ def check_globus_file(self, file, endpoint = None, opt = 0, logact = 0): # 128 - check twice for missing file # Return: a dict of file info, or None if not exists def check_local_file(self, file, opt = 0, logact = 0): + """Return status info for a local file or directory. + + Retries after a short sleep when opt includes bit 128 (double-check). + Resets ECNTS['L'] on success. + + Args: + file (str): Local file or directory path. + opt (int): Bitmask of info to retrieve: + 0=size only, 1=mtime, 2=owner, 4=mode, 8=group, + 16=weekday, 32=checksum, 64=delete if too small, + 128=retry once for missing file. + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. + """ ret = None if not file: return ret loop = 0 @@ -1304,6 +2071,20 @@ def check_local_file(self, file, opt = 0, logact = 0): # local function to get local file stat def local_file_stat(self, file, fstat, opt, logact): + """Build a file-info dict from an os.stat result for a local file. + + Handles regular files and directories. Optionally deletes files that are too + small (opt bit 64). Populates info fields according to the opt bitmask. + + Args: + file (str): File path (used for deletion and MD5 calculation). + fstat (os.stat_result): Result of os.stat(file). + opt (int): Info bitmask (same as check_local_file). + logact (int): Logging action flags. + + Returns: + dict | None: File info dict, or None if the file is too small/invalid. 
+ """ if not fstat: self.errlog(file + ": Error check file stat", 'L', 1, logact) return None @@ -1341,6 +2122,14 @@ def local_file_stat(self, file, fstat, opt, logact): # get total size of files under a given path @staticmethod def local_path_size(pname): + """Return the total byte size of all files under a directory path. + + Args: + pname (str | None): Directory path; defaults to '.' when falsy. + + Returns: + int: Total size in bytes of all regular files found recursively. + """ if not pname: pname = '.' # To get size of current directory size = 0 for path, dirs, files in os.walk(pname): @@ -1356,8 +2145,22 @@ def local_path_size(pname): # 4 - get permission mode in 3 octal digits (mode) # 8 - get group name (group), assumed 'dss' # 16 - get week day 0-Sunday, 1-Monday (week_day) - # Return: a dict of file info, or None if not exists + # Return: a dict of file info, or None if not exists def check_remote_file(self, file, host, opt = 0, logact = 0): + """Return file status info for a file on a remote host via the sync command. + + Strips a trailing '/' from the file path. Retries once on transient errors. + Resets ECNTS['R'] on success. + + Args: + file (str): Remote file path. + host (str): Remote hostname. + opt (int): Info bitmask (0=size, 1=mtime, 2=owner, 4=mode, 8=group, 16=weekday). + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. + """ if not file: return None ms = re.match(r'^(.+)/$', file) if ms: file = ms.group(1) # remove ending '/' in case @@ -1381,6 +2184,15 @@ def check_remote_file(self, file, host, opt = 0, logact = 0): # local function to get remote file stat def remote_file_stat(self, line, opt): + """Parse one line of sync-command directory output into a file-info dict. + + Args: + line (str): One output line from the remote sync/ls command. + opt (int): Info bitmask. + + Returns: + dict | None: File info dict, or None if the line cannot be parsed. 
+ """ info = {} items = re.split(r'\s+', line) if len(items) < 5 or items[4] == '.': return None @@ -1416,6 +2228,21 @@ def remote_file_stat(self, line, opt): # 64 - check once, no rechecking # Return a dict of file info, or None if file not exists def check_object_file(self, file, bucket = None, opt = 0, logact = 0): + """Return status info for an object-store file key. + + Strips trailing '/'. Uses opt bit 64 to skip the retry. Fetches metadata + (uhash) when opt includes bits 2, 4, or 8. Resets ECNTS['O'] on success. + + Args: + file (str): Object key to check. + bucket (str | None): Bucket name; defaults to PGLOG['OBJCTBKT']. + opt (int): Bitmask — 0=size, 1=mtime, 2=owner, 4=meta, 8=group, + 16=weekday, 32=checksum, 64=no-retry. + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. + """ if not bucket: bucket = self.PGLOG['OBJCTBKT'] ret = None if not file: return ret @@ -1463,6 +2290,16 @@ def check_object_file(self, file, bucket = None, opt = 0, logact = 0): # path: object store path name # Return count of object key names, 0 if not file exists; None if error checking def check_object_path(self, path, bucket = None, logact = 0): + """Return the count of object keys matching a path prefix. + + Args: + path (str): Object key prefix to query. + bucket (str | None): Bucket name; defaults to PGLOG['OBJCTBKT']. + logact (int): Logging action flags; default 0. + + Returns: + int | None: Count of matching keys (0 if none), or None on error. + """ if not bucket: bucket = self.PGLOG['OBJCTBKT'] ret = None if not path: return ret @@ -1484,6 +2321,16 @@ def check_object_path(self, path, bucket = None, logact = 0): # object store function to get file stat def object_file_stat(self, hash, uhash, opt): + """Build a file-info dict from object-store list and metadata JSON. + + Args: + hash (dict): One entry from the isd_s3_cli 'lo' JSON output. 
+ uhash (dict | None): Metadata from isd_s3_cli 'gm' JSON output. + opt (int): Info bitmask (same as check_object_file). + + Returns: + dict | None: File info dict, or None if hash is invalid. + """ info = {'isfile': 1, 'data_size': int(hash['Size']), 'fname': op.basename(hash['Key'])} if not opt: return info if opt&17: @@ -1515,6 +2362,21 @@ def object_file_stat(self, hash, uhash, opt): # 64 - rechecking # Return a dict of file info, or None if file not exists def check_backup_file(self, file, endpoint = None, opt = 0, logact = 0): + """Return status info for a file on a Quasar backup endpoint. + + Uses opt bit 64 to enable re-checking after a short sleep. Resets ECNTS['B'] + on success. + + Args: + file (str): File path on the backup endpoint. + endpoint (str | None): Globus endpoint name; defaults to PGLOG['BACKUPEP']. + opt (int): Bitmask — 0=size, 1=mtime, 2=owner, 4=mode, 8=group, + 16=weekday, 64=recheck. + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. + """ ret = None if not file: return ret if not endpoint: endpoint = self.PGLOG['BACKUPEP'] @@ -1557,6 +2419,15 @@ def check_backup_file(self, file, endpoint = None, opt = 0, logact = 0): # backup store function to get file stat def backup_file_stat(self, line, opt): + """Parse one line of dsglobus ls output into a file-info dict. + + Args: + line (str): One output line from dsglobus ls. + opt (int): Info bitmask. + + Returns: + dict | None: File info dict, or None if the line cannot be parsed. + """ info = {} items = re.split(r'[\s\|]+', line) if len(items) < 8: return None @@ -1590,6 +2461,17 @@ def backup_file_stat(self, line, opt): # 16 - get week day 0-Sunday, 1-Monday (week_day) # Return a dict of file info, or None if file not exists def check_tar_file(self, file, tfile, opt = 0, logact = 0): + """Return status info for a member file inside a tar archive. + + Args: + file (str): Member file name to search for. 
+ tfile (str): Path to the tar archive. + opt (int): Info bitmask (0=size, 1=mtime, 2=owner, 4=mode, 8=group, 16=weekday). + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not in archive, self.FAILURE on error. + """ ret = None if not (file and tfile): return ret for loop in range(2): @@ -1608,6 +2490,15 @@ def check_tar_file(self, file, tfile, opt = 0, logact = 0): # local function to get file stat in a tar file def tar_file_stat(self, line, opt): + """Parse one line of ``tar -tvf`` output into a file-info dict. + + Args: + line (str): One output line from tar listing. + opt (int): Info bitmask. + + Returns: + dict | None: File info dict, or None if the line cannot be parsed. + """ items = re.split(r'\s+', line) if len(items) < 6: return None ms = re.match(r'^([d\-])([\w\-]{9})$', items[0]) @@ -1645,6 +2536,20 @@ def tar_file_stat(self, line, opt): # 16 - get week day 0-Sunday, 1-Monday (week_day) # Return a dict of file info, or None if file not exists def check_ftp_file(self, file, opt = 0, name = None, pswd = None, logact = 0): + """Return status info for a file on an FTP server via ncftpls. + + Retries with the parent directory listing if the direct file listing fails. + + Args: + file (str): FTP file path to check. + opt (int): Info bitmask (0=size, 1=mtime, 2=owner, 4=mode, 8=group, 16=weekday). + name (str | None): FTP login name. + pswd (str | None): FTP password. + logact (int): Logging action flags; default 0. + + Returns: + dict | None | int: File info dict, None if not found, self.FAILURE on error. + """ if not file: return None ms = re.match(r'^(.+)/$', file) if ms: file = ms.group(1) # remove ending '/' in case @@ -1667,6 +2572,17 @@ def check_ftp_file(self, file, opt = 0, name = None, pswd = None, logact = 0): # local function to get stat of a file on ftp server def ftp_file_stat(self, line, opt): + """Parse one line of ncftpls -l output into a file-info dict. 
+ + Handles both year-based and time-based ls date formats. + + Args: + line (str): One output line from ncftpls -l. + opt (int): Info bitmask. + + Returns: + dict | None: File info dict, or None if the line cannot be parsed. + """ items = re.split(r'\s+', line) if len(items) < 9: return None ms = re.match(r'^([d\-])([\w\-]{9})$', items[0]) @@ -1711,6 +2627,19 @@ def ftp_file_stat(self, line, opt): # 32 - get checksum (checksum), work for local file only # Return: a dict with filenames as keys None if empty directory def gdex_glob(self, dir, host, opt = 0, logact = 0): + """List files/directories on any supported storage host. + + Dispatches to local_glob(), object_glob(), backup_glob(), or remote_glob(). + + Args: + dir (str): Directory or path prefix to list. + host (str): Storage host name. + opt (int): Info bitmask for each entry's file-info dict. + logact (int): Logging action flags; default 0. + + Returns: + dict: Mapping of file path → file-info dict (empty when directory is empty). + """ shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: return self.local_glob(dir, opt, logact) @@ -1734,6 +2663,20 @@ def gdex_glob(self, dir, host, opt = 0, logact = 0): # 256 - get files only and ignore directories # Return: dict with filenames as keys or None if empty directory def local_glob(self, dir, opt = 0, logact = 0): + """List files/directories under a local directory path using glob patterns. + + Appends '/*' when dir has no glob characters and exists as a directory. + Appends '*' when dir does not exist (allows prefix-style patterns). + Bit 256 in opt filters out directories. + + Args: + dir (str): Local directory path or glob pattern. + opt (int): Info bitmask; bit 256 = files only. + logact (int): Logging action flags; default 0. + + Returns: + dict: Mapping of file path → file-info dict. 
+ """ flist = {} if not re.search(r'[*?]', dir): if op.exists(dir): @@ -1756,6 +2699,17 @@ def local_glob(self, dir, opt = 0, logact = 0): # 16 - get week day 0-Sunday, 1-Monday (week_day) # Return: dict with filenames as keys or None if empty directory def remote_glob(self, dir, host, opt = 0, logact = 0): + """List files/directories in a remote directory via the sync command. + + Args: + dir (str): Remote directory path. + host (str): Remote hostname. + opt (int): Info bitmask. + logact (int): Logging action flags; default 0. + + Returns: + dict: Mapping of file path → file-info dict (empty on error). + """ flist = {} if not re.search(r'/$', dir): dir += '/' buf = self.pgsystem(self.get_sync_command(host) + " dir", self.LOGWRN, self.CMDRET) @@ -1777,6 +2731,17 @@ def remote_glob(self, dir, host, opt = 0, logact = 0): # 16 - get week day 0-Sunday, 1-Monday (week_day) # Return: a dict with filenames as keys, or None if not exists def object_glob(self, dir, bucket = None, opt = 0, logact = 0): + """List object-store files matching a key prefix. + + Args: + dir (str): Object key prefix to list (trailing '/' stripped). + bucket (str | None): Bucket name; defaults to PGLOG['OBJCTBKT']. + opt (int): Info bitmask (bits 2 and 8 trigger metadata fetch). + logact (int): Logging action flags; default 0. + + Returns: + dict | int: Mapping of key → file-info dict, or self.FAILURE on error. + """ flist = {} if not bucket: bucket = self.PGLOG['OBJCTBKT'] ms = re.match(r'^(.+)/$', dir) @@ -1818,6 +2783,21 @@ def object_glob(self, dir, bucket = None, opt = 0, logact = 0): # 64 - rechecking # Return: a dict with filenames as keys, or None if not exists def backup_glob(self, dir, endpoint = None, opt = 0, logact = 0): + """List files/directories on a Quasar backup endpoint. + + Bit 64 in opt enables re-checking after a sleep when the directory is not + found or empty. + + Args: + dir (str): Directory path on the backup endpoint. 
+ endpoint (str | None): Globus endpoint; defaults to PGLOG['BACKUPEP']. + opt (int): Info bitmask; bit 64 = recheck. + logact (int): Logging action flags; default 0. + + Returns: + dict | int | None: Mapping of fname → file-info dict, self.FAILURE on error, + or None when directory does not exist. + """ if not dir: return None if not endpoint: endpoint = self.PGLOG['BACKUPEP'] bcmd = self.BACKCMD @@ -1856,6 +2836,14 @@ def backup_glob(self, dir, endpoint = None, opt = 0, logact = 0): # local function to get file/directory mode for given permission string, for example, rw-rw-r-- @staticmethod def get_file_mode(perm): + """Convert a 9 or 10-character permission string to an octal mode integer. + + Args: + perm (str): Permission string like 'rwxr-xr--' (9 chars) or 'drwxr-xr--' (10 chars). + + Returns: + int: Octal mode value (e.g. 0o755). + """ mbits = [4, 2, 1] mults = [64, 8, 1] plen = len(perm) @@ -1871,9 +2859,21 @@ def get_file_mode(perm): # Evaluate md5 checksum # file: file name for MD5 checksum - # count: defined if filename is a array + # count: defined if filename is a array # Return: one or a array of 128-bits md5 'fingerprint' None if failed def get_md5sum(self, file, count = 0, logact = 0): + """Compute MD5 checksum(s) for one or more local files. + + Args: + file (str | list): A single file path, or a list of file paths when count > 0. + count (int): Number of files in the list; 0 = single file mode. + logact (int): Logging action flags; default 0. + + Returns: + str | list | None: Hex MD5 string for a single file, a list of hex strings + (with None for missing files) for multiple files, or None + on failure. + """ cmd = 'md5sum ' if count > 0: checksum = [None]*count @@ -1896,6 +2896,18 @@ def get_md5sum(self, file, count = 0, logact = 0): # file1, file2: file names # Return: 0 if same and 1 if not def compare_md5sum(self, file1, file2, logact = 0): + """Compare MD5 checksums of two files or directories. 
+ + For directories, lists files in each and compares the concatenated checksums. + + Args: + file1 (str): First file or directory path. + file2 (str): Second file or directory path. + logact (int): Logging action flags; default 0. + + Returns: + int: 0 if checksums match, 1 if they differ. + """ if op.isdir(file1) or op.isdir(file2): files1 = self.get_directory_files(file1) fcnt1 = len(files1) if files1 else 0 @@ -1904,7 +2916,7 @@ def compare_md5sum(self, file1, file2, logact = 0): if fcnt1 != fcnt2: return 1 chksm1 = self.get_md5sum(files1, fcnt1, logact) chksm1 = ''.join(chksm1) - chksm2 = self.get_md5sum(files1, fcnt2, logact) + chksm2 = self.get_md5sum(files2, fcnt2, logact) chksm2 = ''.join(chksm2) else: chksm1 = self.get_md5sum(file1, 0, logact) @@ -1913,6 +2925,17 @@ def compare_md5sum(self, file1, file2, logact = 0): # change local directory to todir, and return odir upon success def change_local_directory(self, todir, logact = 0): + """Change the current working directory, creating it if necessary. + + Updates PGLOG['CURDIR'] on success. Returns the previous directory. + + Args: + todir (str): Target directory to change to. + logact (int): Logging action flags; defaults to LOGWRN when 0. + + Returns: + str | int: Previous working directory on success, self.FAILURE on error. + """ if logact: lact = logact&~(self.EXITLG|self.ERRLOG) else: @@ -1937,16 +2960,33 @@ def change_local_directory(self, todir, logact = 0): # record the directory for the deleted file # pass in empty dir to turn the recording delete directory on def record_delete_directory(self, dir, val): + """Record a directory for deferred empty-directory cleanup, or set the level count. + + When dir is None and val is an integer (or numeric string), sets DIRLVLS. + Otherwise records dir → val (host) in DELDIRS for later cleanup. + + Args: + dir (str | None): Directory path to record, or None to configure DIRLVLS. + val (int | str): Host name when dir is set; level count when dir is None. 
+ """ if dir is None: if isinstance(val, int): self.DIRLVLS = val - elif re.match(r'^\d+$'): + elif re.match(r'^\d+$', val): self.DIRLVLS = int(val) elif dir and not re.match(r'^(\.|\./|/)$', dir) and dir not in self.DELDIRS: self.DELDIRS[dir] = val # remove the recorded delete directory if it is empty def clean_delete_directory(self, logact = 0): + """Remove recorded empty directories up to DIRLVLS parent levels. + + Iterates from leaf to parent, deleting directories that are confirmed empty + via gdex_empty_directory(). Clears DELDIRS after completion. + + Args: + logact (int): Logging action flags; defaults to LOGWRN when 0. + """ if not self.DIRLVLS: return if logact: lact = logact&~(self.EXITLG) @@ -1975,6 +3015,16 @@ def clean_delete_directory(self, logact = 0): # remove the empty given directory and its all subdirectories # return 1 if empty dirctory removed 0 otherwise def clean_empty_directory(self, dir, host, logact = 0): + """Recursively remove empty subdirectories under a given directory. + + Args: + dir (str): Root directory to clean. + host (str): Storage host name. + logact (int): Logging action flags; defaults to LOGWRN when 0. + + Returns: + int: 1 if dir itself was removed (was empty), 0 otherwise. + """ if not dir: return 0 dirs = self.gdex_glob(dir, host) cnt = 0 @@ -2000,6 +3050,15 @@ def clean_empty_directory(self, dir, host, logact = 0): # check if given directory is empty # Return: 0 if empty directory, 1 if not empty and -1 if invalid directory def gdex_empty_directory(self, dir, host): + """Check whether a directory on any supported host is empty. + + Args: + dir (str): Directory path. + host (str): Storage host name. + + Returns: + int: 0 if empty, 1 if not empty, 2 if a root/protected directory, -1 if invalid. 
+ """ shost = self.strip_host_name(host) if self.pgcmp(shost, self.LHOST, 1) == 0: return self.local_empty_directory(dir) @@ -2009,6 +3068,14 @@ def gdex_empty_directory(self, dir, host): # return 0 if empty local directory, 1 if not; -1 if cannot remove def local_empty_directory(self, dir): + """Check whether a local directory is empty. + + Args: + dir (str): Local directory path. + + Returns: + int: 0 if empty, 1 if not empty, 2 if a root directory, -1 if not a directory. + """ if not op.isdir(dir): return -1 if self.is_root_directory(dir, 'L'): return 2 if not re.search(r'/$', dir): dir += '/' @@ -2017,6 +3084,15 @@ def local_empty_directory(self, dir): # return 0 if empty remote directory, 1 if not; -1 if cannot remove def remote_empty_directory(self, dir, host): + """Check whether a remote directory is empty via the sync command. + + Args: + dir (str): Remote directory path. + host (str): Remote hostname. + + Returns: + int: 0 if empty, 1 if not empty, 2 if a root directory, -1 on error. + """ if self.is_root_directory(dir, 'R', host): return 2 if not re.search(r'/$', dir): dir += '/' buf = self.pgsystem("{} {}".format(self.get_sync_command(host), dir), self.LOGWRN, self.CMDRET) @@ -2030,6 +3106,16 @@ def remote_empty_directory(self, dir, host): # host: host name the file on, default to self.LHOST # return: array of file sizes size is -1 if file does not exist def gdex_file_sizes(self, files, host, logact = 0): + """Return the sizes of multiple files on any supported storage host. + + Args: + files (list[str]): File paths to measure. + host (str): Storage host name. + logact (int): Logging action flags; default 0. + + Returns: + list[int]: Size in bytes per file; -1 if not found, -2 on error. 
+ """ sizes = [] for file in files: sizes.append(self.gdex_file_size(file, host, 2, logact)) return sizes @@ -2039,6 +3125,15 @@ def gdex_file_sizes(self, files, host, logact = 0): # files: file names to get sizes # return: array of file sizes size is -1 if file does not exist def local_file_sizes(self, files, logact = 0): + """Return the sizes of multiple local files. + + Args: + files (list[str]): Local file paths to measure. + logact (int): Logging action flags; default 0. + + Returns: + list[int]: Size in bytes per file; -1 if not found, -2 on error. + """ sizes = [] for file in files: sizes.append(self.local_file_size(file, 6, logact)) return sizes @@ -2054,6 +3149,18 @@ def local_file_sizes(self, files, logact = 0): # -1 - file not exists # -2 - error check file def gdex_file_size(self, file, host, opt = 0, logact = 0): + """Return the size of a single file on any supported storage host. + + Args: + file (str): File path. + host (str): Storage host name. + opt (int): Bitmask — 1=delete if too small, 2=log if too small, + 4=log if missing. + logact (int): Logging action flags; default 0. + + Returns: + int: Size in bytes; 0 if too small/empty; -1 if not found; -2 on error. + """ info = self.check_gdex_file(file, host, 0, logact) if info: if info['isfile'] and info['data_size'] < self.PGLOG['MINSIZE']: @@ -2081,6 +3188,17 @@ def gdex_file_size(self, file, host, opt = 0, logact = 0): # -1 - file not exists # -2 - error check file def local_file_size(self, file, opt = 0, logact = 0): + """Return the size of a single local file. + + Args: + file (str): Local file path. + opt (int): Bitmask — 1=delete if too small, 2=log if too small, + 4=log if missing. + logact (int): Logging action flags; default 0. + + Returns: + int: Size in bytes; 0 if too small/empty; -1 if not found; -2 on error. 
+ """ if not op.exists(file): if opt&4: self.lmsg(file, self.PGLOG['MISSFILE'], logact) return -1 # file not eixsts @@ -2105,6 +3223,18 @@ def local_file_size(self, file, opt = 0, logact = 0): # 3 - get compress file name # return: array of new file name and archive format if changed otherwise original one def compress_local_file(self, ifile, fmt = None, act = 0, logact = 0): + """Compress or uncompress a local file, or compute the resulting file name. + + Args: + ifile (str): Input file name (may already have a compression extension). + fmt (str | None): Archive format hint (e.g. 'GZ', 'BZ2'). + act (int): 0=uncompress, 1=compress, 2=get uncompressed name, + 3=get compressed name. + logact (int): Logging action flags; default 0. + + Returns: + tuple: (output_filename, updated_fmt) after the operation. + """ ms = re.match(r'^(.+)\.({})'.format(self.CMPSTR), ifile) if ms: ofile = ms.group(1) @@ -2124,6 +3254,16 @@ def compress_local_file(self, ifile, fmt = None, act = 0, logact = 0): # get file archive format from a givn file name; None if not found def get_file_format(self, fname): + """Return the archive format label for a file based on its extension. + + Checks tar formats first, then compression-only formats. + + Args: + fname (str): File name to inspect. + + Returns: + str | None: Format label (e.g. 'TAR.GZ', 'GZ'), or None if unrecognised. + """ ms = re.search(r'\.({})$'.format(self.TARSTR), fname, re.I) if ms: return self.PGTARS[ms.group(1)][2] ms = re.search(r'\.({})$'.format(self.CMPSTR), fname, re.I) @@ -2138,6 +3278,18 @@ def get_file_format(self, fname): # 1 - tar # return: self.SUCCESS upon successful self.FAILURE otherwise def tar_local_file(self, tfile, files, fmt, act, logact = 0): + """Create or extract a tar/tar.gz/tgz/zip archive. + + Args: + tfile (str): Archive file path. + files (list[str] | None): Member files (required for act=1; optional for act=0). + fmt (str | None): Archive format key; auto-detected from tfile extension when None. 
+ act (int): 0=extract, 1=create. + logact (int): Logging action flags; default 0. + + Returns: + int: Result of pgsystem() call (truthy on success), or self.FAILURE on bad args. + """ if not fmt: ms = re.search(r'\.({})$'.format(self.TARSTR), tfile, re.I) if ms: fmt = ms.group(1) @@ -2155,7 +3307,15 @@ def tar_local_file(self, tfile, files, fmt, act, logact = 0): # get local file archive format by checking extension of given local file name # file: local file name - def local_archive_format(self,file): + def local_archive_format(self, file): + """Return the archive format string for a local file based on its extension. + + Args: + file (str): Local file name. + + Returns: + str: Format string like 'TAR.GZ', 'GZ', 'TAR', or '' if unrecognised. + """ ms = re.search(r'\.({})$'.format(self.CMPSTR), file) if ms: fmt = ms.group(1) @@ -2169,13 +3329,35 @@ def local_archive_format(self,file): # local function to show message with full local file path def lmsg(self, file, msg, logact = 0): + """Log an error with the full absolute path of a local file. + + Converts relative paths to absolute using the current working directory. + + Args: + file (str): Local file path (relative or absolute). + msg (str): Error message to append. + logact (int): Logging action flags; default 0. + + Returns: + int: Always self.FAILURE. + """ if not op.isabs(file): file = self.join_paths(os.getcwd(), file) return self.errlog("{}: {}".format(file, msg), 'L', 1, logact) # check if given path is executable locally # return self.SUCCESS if yes self.FAILURE if not def check_local_executable(self, path, actstr = '', logact = 0): - if os.access(path, os.W_OK): return self.SUCCESS + """Check whether a local path is executable by the current process. + + Args: + path (str): Local path to test. + actstr (str): Optional action prefix for the error message. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS if executable, self.FAILURE otherwise. 
+ """ + if os.access(path, os.X_OK): return self.SUCCESS if self.check_local_accessible(path, actstr, logact): if actstr: actstr += '-' self.errlog("{}{}: Accessible, but Unexecutable on'{}'".format(actstr, path, self.PGLOG['HOSTNAME']), 'L', 1, logact) @@ -2184,6 +3366,16 @@ def check_local_executable(self, path, actstr = '', logact = 0): # check if given path is writable locally # return self.SUCCESS if yes self.FAILURE if not def check_local_writable(self, path, actstr = '', logact = 0): + """Check whether a local path is writable by the current process. + + Args: + path (str): Local path to test. + actstr (str): Optional action prefix for the error message. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS if writable, self.FAILURE otherwise. + """ if os.access(path, os.W_OK): return self.SUCCESS if self.check_local_accessible(path, actstr, logact): if actstr: actstr += '-' @@ -2193,6 +3385,16 @@ def check_local_writable(self, path, actstr = '', logact = 0): # check if given path is accessible locally # return self.SUCCESS if yes, self.FAILURE if not def check_local_accessible(self, path, actstr = '', logact = 0): + """Check whether a local path exists and is accessible by the current process. + + Args: + path (str): Local path to test. + actstr (str): Optional action prefix for the error message. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS if accessible, self.FAILURE otherwise. + """ if os.access(path, os.F_OK): return self.SUCCESS if actstr: actstr += '-' self.errlog("{}{}: Unaccessible on '{}'".format(actstr, path, self.PGLOG['HOSTNAME']), 'L', 1, logact) @@ -2201,6 +3403,18 @@ def check_local_accessible(self, path, actstr = '', logact = 0): # check if given webfile under self.PGLOG['DSSDATA'] is writable # return self.SUCCESS if yes self.FAILURE if not def check_webfile_writable(self, action, wfile, logact = 0): + """Check whether a web file path under DSSDATA is locally writable. 
+ + Returns SUCCESS immediately for paths outside DSSDATA (no check needed). + + Args: + action (str): Action description for the error message. + wfile (str): Web file absolute path. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS if writable or outside DSSDATA, self.FAILURE otherwise. + """ ms = re.match(r'^({}/\w+)'.format(self.PGLOG['DSSDATA']), wfile) if ms: return self.check_local_writable(ms.group(1), "{} {}".format(action, wfile), logact) @@ -2209,6 +3423,21 @@ def check_webfile_writable(self, action, wfile, logact = 0): # convert the one file to another via uncompress, move/copy, and/or compress def convert_files(self, ofile, ifile, keep = 0, logact = 0): + """Convert a file between compression formats via decompress → move → compress. + + Handles the full pipeline: decompresses ifile (if compressed), moves/copies + to the target name, and re-compresses (if ofile has a compression extension). + The keep flag preserves a backup of the input file. + + Args: + ofile (str): Output (destination) file name. + ifile (str): Input (source) file name. + keep (int): 1 to preserve a '.keep' copy of the input. + logact (int): Logging action flags; default 0. + + Returns: + int: self.SUCCESS if ofile exists after conversion, self.FAILURE otherwise. + """ if ofile == ifile: return self.SUCCESS oname = ofile iname = ifile @@ -2267,7 +3496,7 @@ def convert_files(self, ofile, ifile, keep = 0, logact = 0): else: self.pgsystem("{} {}".format(self.PGCMPS[oext][0], oname), logact, 5) if keep and op.exists(kfile) and kfile != ifile: - if op.exist(ifile): + if op.exists(ifile): self.delete_local_file(kfile, logact) else: self.move_local_file(ifile, kfile, logact) @@ -2280,6 +3509,15 @@ def convert_files(self, ofile, ifile, keep = 0, logact = 0): # return 0 if same, 1 different, -1 if can not compare @staticmethod def compare_file_info(ainfo, binfo): + """Compare two file-info dicts by size, modified date, and modified time. 
+ + Args: + ainfo (dict | None): First file-info dict. + binfo (dict | None): Second file-info dict. + + Returns: + int: 0 if identical, 1 if different, -1 if either dict is missing. + """ if not (ainfo and binfo): return -1 # at least one is missing return (0 if (ainfo['data_size'] == binfo['data_size'] and ainfo['date_modified'] == binfo['date_modified'] and @@ -2288,12 +3526,32 @@ def compare_file_info(ainfo, binfo): # get local_dirname @staticmethod def get_local_dirname(file): + """Return the directory part of a local file path. + + Returns the current working directory when dirname() would return '.'. + + Args: + file (str): Local file path. + + Returns: + str: Absolute directory path. + """ dir = op.dirname(file) if dir == '.': dir = os.getcwd() return dir # collect valid file names under a given directory, current directory if empty def get_directory_files(self, dir = None, limit = 0, level = 0): + """Collect all regular file paths under a directory, recursively. + + Args: + dir (str | None): Root directory to search; searches '*' in cwd when None. + limit (int): Maximum recursion depth; 0 = unlimited. + level (int): Current recursion depth (internal; start at 0). + + Returns: + list[str] | None: Sorted list of file paths, or None when no files found. + """ files = [] if dir: if level == 0 and op.isfile(dir): @@ -2313,6 +3571,15 @@ def get_directory_files(self, dir = None, limit = 0, level = 0): # reads a local file into a string and returns it def read_local_file(self, file, logact = 0): + """Read and return the entire contents of a local text file. + + Args: + file (str): Local file path to read. + logact (int): Logging action flags; default 0. + + Returns: + str | int: File contents string on success, self.FAILURE on error. 
+ """ try: fd = open(file, 'r') except Exception as e: @@ -2324,6 +3591,16 @@ def read_local_file(self, file, logact = 0): # open a local file and return the file handler def open_local_file(self, file, mode = 'r', logact = None): + """Open a local file and return the file object. + + Args: + file (str): Local file path. + mode (str): Open mode string (e.g. 'r', 'w', 'a', 'rb'); default 'r'. + logact (int | None): Logging action flags; defaults to LOGERR. + + Returns: + file | int: Opened file object on success, self.FAILURE on error. + """ if logact is None: logact = self.LOGERR try: fd = open(file, mode) @@ -2333,6 +3610,16 @@ def open_local_file(self, file, mode = 'r', logact = None): # change absolute paths to relative paths def get_relative_paths(self, files, cdir, logact = 0): + """Convert absolute file paths to paths relative to a working directory. + + Args: + files (list[str]): List of file paths to convert. + cdir (str | None): Base directory; defaults to os.getcwd() when None. + logact (int): Logging action flags for paths outside cdir; default 0. + + Returns: + list[str]: File paths made relative to cdir. + """ cnt = len(files) if cnt == 0: return files if not cdir: cdir = os.getcwd() @@ -2346,6 +3633,19 @@ def get_relative_paths(self, files, cdir, logact = 0): # check if the action to path is blocked def check_block_path(self, path, act = '', logact = 0): + """Return 1 if path is not blocked, or log an error and return 0 if it is. + + Blocks operations targeting PGLOG['USRHOME'] to prevent accidental writes + to user home directories. + + Args: + path (str): Target path to check. + act (str): Action name for the error message; defaults to 'Copy'. + logact (int): Logging action flags; default 0. + + Returns: + int: 1 if allowed, result of pglog() (falsy) if blocked. 
+ """ blockpath = self.PGLOG['USRHOME'] if not act: act = 'Copy' if re.match(r'^{}'.format(blockpath), path): @@ -2356,6 +3656,21 @@ def check_block_path(self, path, act = '', logact = 0): # join two filenames by uing the common prefix/suffix and keeping the different main bodies, # the bodies are seprated by sep replace fext with text if provided def join_filenames(self, name1, name2, sep = '-', fext = None, text = None): + """Merge two filenames into one by keeping their common prefix/suffix. + + The differing middle bodies are joined with sep. Optionally removes a + compression extension and appends a text suffix. + + Args: + name1 (str): First file name. + name2 (str): Second file name. + sep (str): Separator between the two differing bodies; default '-'. + fext (str | None): File extension to strip from both names before merging. + text (str | None): Extension to append to the merged name. + + Returns: + str: Merged file name. + """ if fext: name1 = self.remove_file_extention(name1, fext) name2 = self.remove_file_extention(name2, fext) @@ -2390,9 +3705,19 @@ def join_filenames(self, name1, name2, sep = '-', fext = None, text = None): if text: fname += "." + text return fname - # remove given file extention if provided + # remove given file extention if provided # otherwise try to remove predfined compression extention in self.PGCMPS def remove_file_extention(self, fname, fext): + """Remove a specific or the first matching compression extension from a filename. + + Args: + fname (str): File name to process. + fext (str | None): Extension to remove (without dot); when None, tries all + compression extensions in PGCMPS. + + Returns: + str: File name with the extension removed, or '' when fname is falsy. 
+ """ if not fname: return '' if fext: fname = re.sub(r'\.{}$'.format(fext), '', fname, 1, re.I) @@ -2407,6 +3732,20 @@ def remove_file_extention(self, fname, fext): # check if a previous down storage system is up now for given dflag # return error message if failed checking, and None otherwise def check_storage_down(self, dflag, dpath, dscheck, logact = 0): + """Check whether a previously-down storage system is now accessible. + + Updates dscheck['dflags'] to reflect the current storage status. Retries + up to 2 times, stopping early for planned outages. + + Args: + dflag (str): Storage flag key (e.g. 'G', 'O', 'B', 'D'). + dpath (str | None): Path to test; uses DPATHS[dflag] when None. + dscheck (dict | None): dscheck record to update; uses PGLOG['DSCHECK'] when None. + logact (int): Logging action flags; default 0. + + Returns: + str | None: Error message string if still down, None if accessible. + """ if dflag not in self.DHOSTS: if logact: self.pglog(dflag + ": Unknown Down Flag for Storage Systems", logact) return None @@ -2430,6 +3769,19 @@ def check_storage_down(self, dflag, dpath, dscheck, logact = 0): # return an array of strings for storage systems that are still down, # and empty array if all up def check_storage_dflags(self, dflags, dscheck = None, logact = 0): + """Check all storage systems recorded as down in a dflags string or dict. + + Clears dscheck.dflags in the database when all systems are back up. + + Args: + dflags (str | dict | None): Storage flags to check; str for a set of flag + characters, dict for flag → path mapping. + dscheck (dict | None): dscheck record; uses PGLOG['DSCHECK'] when None. + logact (int): Logging action flags; default 0. + + Returns: + list[str]: Error messages for each storage system still down; empty if all up. 
+ """ if not dflags: return 0 isdict = isinstance(dflags, dict) msgary = [] @@ -2447,6 +3799,21 @@ def check_storage_dflags(self, dflags, dscheck = None, logact = 0): # clear the cached bfile records if frec is None. # return 0 if not yet, 1 if backed up, or -1 if backed up but modified def file_backup_status(self, frec, chgdays = 1, logact = 0): + """Return the backup status of a data file record. + + Caches bfile records by bid. When frec is None, clears the cache. + + Args: + frec (dict | None): File record dict with keys 'bid', 'date_modified', + 'type', 'checksum'/'data_size', and 'sfile'/'wfile'. + Pass None to clear the BFILES cache. + chgdays (int): Number of days of modification allowed before marking as + changed (−1 = accept any age); default 1. + logact (int): Logging action flags; default 0. + + Returns: + int: 1 if backed up, -1 if backed up but file changed since backup, 0 if not. + """ if frec is None: self.BFILES.clear() return 0 diff --git a/src/rda_python_common/pg_log.py b/src/rda_python_common/pg_log.py index 4d22355..5c9180b 100644 --- a/src/rda_python_common/pg_log.py +++ b/src/rda_python_common/pg_log.py @@ -27,6 +27,28 @@ from unidecode import unidecode class PgLOG: + """Logging and process management class for RDA Python tools. + + PgLOG provides a unified interface for logging messages to files and + STDERR/STDOUT, sending email notifications, running system commands, + and managing process metadata. + + Logging behavior is controlled by bitfield ``logact`` flags (e.g. MSGLOG, + WARNLG, ERRLOG, EXITLG). 
Combine flags with ``|`` to compose actions:: + + pglog.pglog("something went wrong", PgLOG.LOGERR | PgLOG.EXITLG) + + Key flag groups: + + * Output destination: ``MSGLOG`` (log file), ``WARNLG`` / ``ERRLOG`` (STDERR), + ``EMLLOG`` / ``SNDEML`` (email buffer / send immediately) + * Control flow: ``EXITLG`` (sys.exit after logging), ``RETMSG`` (return msg) + * Formatting: ``SEPLIN`` (separator line), ``BRKLIN`` (blank line) + * Return constants: ``SUCCESS`` (1), ``FAILURE`` (0), ``FINISH`` (2) + + Instance state is held in ``self.PGLOG`` (dict), ``self.CPID`` (process info + dict), ``self.COMMANDS`` (command-path cache), and ``self.HOSTTYPES``. + """ # define some constants for logging actions MSGLOG = (0x00001) # logging message @@ -72,6 +94,13 @@ class PgLOG: FAILURE = 0 # Unsuccessful function call def __init__(self): + """Initialize PgLOG with default configuration values. + + Populates ``self.PGLOG`` with site defaults (paths, email settings, + user IDs, batch system info, etc.) then calls :meth:`set_common_pglog` + to override those defaults from environment variables and detect the + runtime environment (hostname, PBS job, PATH construction, etc.). + """ self.PGLOG = { # more defined in untaint_suid() with environment variables 'EMLADDR': '', @@ -156,23 +185,50 @@ def __init__(self): # set additional common PGLOG values self.set_common_pglog() - # get time string in format YYMMDDHHNNSS for given ctime; or current time if ctime is 0 - def current_datetime(self, ctime = 0): - if self.PGLOG['GMTZ']: - dt = time.gmtime(ctime) if ctime else time.gmtime() - else: - dt = time.localtime(ctime) if ctime else time.localtime() + def current_datetime(self, ctime=0): + """Return a datetime string in YYYYMMDDHHMMSS format. + + Args: + ctime: Unix timestamp (seconds). Uses current time when 0. + + Returns: + 14-character string ``YYYYMMDDHHMMSS`` in local or GMT time + depending on ``self.PGLOG['GMTZ']``. 
+ """ + get_time = time.gmtime if self.PGLOG['GMTZ'] else time.localtime + dt = get_time(ctime) if ctime else get_time() return "{:02}{:02}{:02}{:02}{:02}{:02}".format(dt[0], dt[1], dt[2], dt[3], dt[4], dt[5]) - # get an environment variable and untaint it - def get_environment(self, name, default = None, logact = 0): + def get_environment(self, name, default=None, logact=0): + """Return an environment variable value, optionally logging if missing. + + Args: + name: Environment variable name. + default: Value returned when the variable is unset (default None). + logact: Logging action flags; if non-zero and variable is missing, + calls :meth:`pglog` with that action. + + Returns: + The variable's string value, or *default* if unset. + """ env = os.getenv(name, default) if env is None and logact: self.pglog(name + ": Environment variable is not defined", logact) return env - # cache the msg string to global email entries for later call of send_email() - def set_email(self, msg, logact = 0): + def set_email(self, msg, logact=0): + """Append or prepend *msg* to the internal email buffers. + + Buffers are flushed and composed by :meth:`send_email` / + :meth:`send_python_email`. Pass ``msg=None`` to clear ``EMLMSG``. + + Args: + msg: Message text to buffer. ``None`` clears ``EMLMSG``. + logact: Combination of ``EMLTOP`` (prepend / finalise as top-level + email), ``ERRLOG`` (record as numbered error), ``EMLSUM`` + (record in summary section), ``EMLLOG`` (record in detail + section), ``BRKLIN`` / ``SEPLIN`` (formatting). 
+ """ if logact and msg: if logact&self.EMLTOP: if self.PGLOG['PRGMSG']: @@ -215,12 +271,25 @@ def set_email(self, msg, logact = 0): elif msg is None: self.PGLOG['EMLMSG'] = "" - # retrieve the cached email message def get_email(self): + """Return the currently buffered email message string.""" return self.PGLOG['EMLMSG'] - # send a customized email with all entries included - def send_customized_email(self, logmsg, emlmsg, logact = None): + def send_customized_email(self, logmsg, emlmsg, logact=None): + """Send an email whose headers are embedded inside *emlmsg*. + + The message body must contain ``From:``, ``To:``, and ``Subject:`` + header lines. ``Cc:`` is optional. Headers are stripped from the + body before sending. + + Args: + logmsg: Prefix string for error/status log messages. + emlmsg: Full email text with embedded ``From/To/Cc/Subject`` lines. + logact: Logging action flags (default ``LOGWRN``). + + Returns: + ``SUCCESS`` on success, ``FAILURE`` on error. + """ if logact is None: logact = self.LOGWRN entries = { 'fr': ["From", 1, None], @@ -257,14 +326,43 @@ def send_customized_email(self, logmsg, emlmsg, logact = None): self.pglog(errmsg, (logact|self.ERRLOG)&~self.EXITLG) return ret - # send an email; if empty msg send email message saved in self.PGLOG['EMLMSG'] instead - def send_email(self, subject = None, receiver = None, msg = None, sender = None, logact = None): + def send_email(self, subject=None, receiver=None, msg=None, sender=None, logact=None): + """Send an email via :meth:`send_python_email`. + + If *msg* is empty, the buffered ``EMLMSG`` is used and cleared. + + Args: + subject: Email subject line. Defaults to a hostname/command string. + receiver: Recipient address. Defaults to ``EMLADDR`` or ``CURUID``. + msg: Message body. Uses buffered ``EMLMSG`` when omitted. + sender: Sender address. Defaults to ``CURUID``. + logact: Logging action flags (default ``LOGWRN``). 
+ + Returns: + ``SUCCESS`` on success, ``FAILURE`` on error or no message to send. + """ if logact is None: logact = self.LOGWRN return self.send_python_email(subject, receiver, msg, sender, None, logact) - # send an email via python module smtplib; if empty msg send email message saved - # in self.PGLOG['EMLMSG'] instead. pass cc = '' for skipping 'Cc: ' - def send_python_email(self, subject = None, receiver = None, msg = None, sender = None, cc = None, logact = None): + def send_python_email(self, subject=None, receiver=None, msg=None, sender=None, cc=None, logact=None): + """Send an email using Python's ``smtplib``. + + If *msg* is empty, uses and clears the buffered ``EMLMSG``. + Pass ``cc=''`` explicitly to suppress the Cc header entirely. + + Args: + subject: Email subject. Auto-generated from hostname/command if omitted. + receiver: Recipient address. Defaults to ``EMLADDR`` or ``CURUID``. + msg: Message body. Uses buffered ``EMLMSG`` when omitted. + sender: Sender address. Defaults to ``CURUID``. + cc: Carbon-copy address(es). Uses ``CCDADDR`` when ``None``; + pass ``''`` to skip Cc entirely. + logact: Logging action flags (default ``LOGWRN``). + + Returns: + ``SUCCESS`` on success, empty string when there is nothing to send, + or ``FAILURE`` on SMTP error. 
+ """ if logact is None: logact = self.LOGWRN if not msg: if self.PGLOG['EMLMSG']: @@ -297,6 +395,7 @@ def send_python_email(self, subject = None, receiver = None, msg = None, sender emlmsg['Subject'] = subject if self.CPID['CPID']: logmsg += " in " + self.CPID['CPID'] logmsg += ", Subject: {}\n".format(subject) + eml = None try: eml = smtplib.SMTP(self.PGLOG['EMLSRVR'], self.PGLOG['EMLPORT']) eml.send_message(emlmsg) @@ -304,27 +403,41 @@ def send_python_email(self, subject = None, receiver = None, msg = None, sender errmsg = f"Error sending email:\n{err}\n{logmsg}" return self.pglog(errmsg, (logact|self.ERRLOG)&~self.EXITLG) finally: - eml.quit() - self.log_email(str(emlmsg)) - self.pglog(logmsg, logact&~self.EXITLG) - return self.SUCCESS + if eml is not None: + eml.quit() + self.log_email(str(emlmsg)) + self.pglog(logmsg, logact&~self.EXITLG) + return self.SUCCESS - # log email sent def log_email(self, emlmsg): - if not self.CPID['PID']: self.CPID['PID'] = "{}-{}-{}".format(self.PGLOG['HOSTNAME'], self.get_command(), self.PGLOG['CURUID']) + """Append a sent-email record to the email log file. + + Args: + emlmsg: Full email message string (as returned by ``str(EmailMessage)``). 
+ """ + if not self.CPID['PID']: + self.CPID['PID'] = "{}-{}-{}".format(self.PGLOG['HOSTNAME'], self.get_command(), self.PGLOG['CURUID']) cmdstr = "{} {} at {}\n".format(self.CPID['PID'], self.break_long_string(self.CPID['CMD'], 40, "...", 1), self.current_datetime()) fn = "{}/{}".format(self.PGLOG['LOGPATH'], self.PGLOG['EMLFILE']) try: - f = open(fn, 'a') - f.write(cmdstr + emlmsg) - f.close() + with open(fn, 'a') as f: + f.write(cmdstr + emlmsg) except FileNotFoundError as e: - print(e) + print(e) - # Function: cmdlog(cmdline) - # cmdline - program name and all arguments - # ctime - time (in seconds) when the command starts - def cmdlog(self, cmdline = None, ctime = 0, logact = None): + def cmdlog(self, cmdline=None, ctime=0, logact=None): + """Log a command start or end event and update process timing info. + + When *cmdline* is ``None`` or matches ``end|quit|exit|abort``, logs the + elapsed execution time and clears process state. Otherwise records the + command in ``self.CPID`` and logs its start. + + Args: + cmdline: Command string to log. ``None`` or an end-keyword logs + the elapsed time since the matching start. + ctime: Unix timestamp (seconds) for the event. Defaults to now. + logact: Logging action flags (default ``MSGLOG|FRCLOG``). + """ if logact is None: logact = self.MSGLOG|self.FRCLOG if not ctime: ctime = int(time.time()) if not cmdline or re.match('(end|quit|exit|abort)', cmdline, re.I): @@ -347,11 +460,32 @@ def cmdlog(self, cmdline = None, ctime = 0, logact = None): self.CPID['CMD'] = cmdline self.CPID['CTM'] = ctime - # Function: self.pglog(msg, logact) return self.FAILURE or log message if not exit - # msg -- message to log - # locact -- logging actions: MSGLOG, WARNLG, ERRLOG, EXITLG, EMLLOG, & SNDEML - # log and display message/error and exit program according logact value - def pglog(self, msg, logact = None): + def pglog(self, msg, logact=None): + """Log *msg* and take action based on *logact* bitfield flags. 
+ + This is the central logging method. It writes to the log file and/or + STDERR/STDOUT, buffers for email, sends email immediately, or exits the + process — all controlled by the *logact* bitmask. + + Args: + msg: Message text to log. Leading whitespace is stripped. + logact: Combination of action flags (default ``MSGLOG``): + + * ``MSGLOG`` — write to log file + * ``WARNLG`` — write to STDERR as a warning + * ``ERRLOG`` — write to error log file and STDERR + * ``EXITLG`` — call ``sys.exit(1)`` after logging + * ``EMLLOG`` — append *msg* to email buffer + * ``SNDEML`` — send buffered email now + * ``RETMSG`` — return *msg* instead of ``FAILURE`` + * ``FRCLOG`` — force write even if MSGLOG not set + * ``SEPLIN`` — prepend a separator line + * ``BRKLIN`` — prepend a blank line + + Returns: + The *msg* string if ``RETMSG`` is set; otherwise ``FAILURE`` (0). + Does not return when ``EXITLG`` is set. + """ if logact is None: logact = self.MSGLOG retmsg = None logact &= self.PGLOG['LOGMASK'] # filtering the log actions @@ -366,7 +500,7 @@ def pglog(self, msg, logact = None): if msg: msg = msg.rstrip() + "; " msg += ext else: - if msg and not re.search(r'(\n|\r)$', msg): msg += "\n" + if msg and not msg.endswith(('\n', '\r')): msg += "\n" if logact&self.RETMSG: retmsg = msg if logact&self.EMLALL: if logact&self.SNDEML or not msg: @@ -403,8 +537,18 @@ def pglog(self, msg, logact = None): else: return (retmsg if retmsg else self.FAILURE) - # write a log message - def write_message(self, msg, file, logact): + def write_message(self, msg, file, logact): + """Write *msg* to *file* (or STDOUT/STDERR when *file* is ``None``). + + When *file* is given but cannot be opened, falls back to STDOUT/STDERR + with an error notice. Appends a call-trace for error-log writes. + + Args: + msg: Text to write. + file: Absolute path to log file, or ``None`` for console output. 
+ logact: Logging action flags used to select output stream and + formatting (``ERRLOG``, ``EXITLG``, ``BRKLIN``, ``SEPLIN``). + """ doclose = False errlog = logact&self.ERRLOG if file: @@ -422,13 +566,26 @@ def write_message(self, msg, file, logact): if errlog and file and not logact&(self.EMLALL|self.SKPTRC): OUT.write(self.get_call_trace()) if doclose: OUT.close() - # check and disconnet database before exit - def pgexit(self, stat = 0): + def pgexit(self, stat=0): + """Close the database connection (if open) and exit the process. + + Args: + stat: Exit status code passed to ``sys.exit`` (default 0). + """ if self.PGLOG['PGDBBUF']: self.PGLOG['PGDBBUF'].close() sys.exit(stat) - # get a command string for error log dump def get_error_command(self, ctime, logact): + """Build a one-line error/abort header string for log entries. + + Args: + ctime: Unix timestamp (seconds) at the time of the error. + logact: Logging action flags used to determine the prefix word + (``ABORTS``, ``QUITS``, or ``ERROR``). + + Returns: + Formatted string ending with ``\\n``. + """ if not self.CPID['PID']: self.CPID['PID'] = "{}-{}-{}".format(self.PGLOG['HOSTNAME'], self.get_command(), self.PGLOG['CURUID']) cmdstr = "{} {}".format((("ABORTS" if logact&self.ERRLOG else "QUITS") if logact&self.EXITLG else "ERROR"), self.CPID['PID']) cmdstr = self.cmd_execute_time(cmdstr, (ctime - self.CPID['CTM'])) @@ -436,26 +593,53 @@ def get_error_command(self, ctime, logact): cmdstr += " {} at {}\n".format(self.break_long_string(self.CPID['CMD'], 40, "...", 1), self.current_datetime(ctime)) return cmdstr - # get call trace track @staticmethod - def get_call_trace(cut = 1): + def get_call_trace(cut=1): + """Return a formatted call-stack trace string. + + Args: + cut: Number of innermost frames to omit (default 1 to exclude + this method itself). + + Returns: + A ``Trace: file(line){func}=>...`` string ending with ``\\n``, + or an empty string when the stack is empty. 
+ """ t = traceback.extract_stack() n = len(t) - cut - str = '' + trace = '' sep = 'Trace: ' for i in range(n): - tc = t[i] - str += "{}{}({}){}".format(sep, tc[0], tc[1], ("" if tc[2] == '' else "{%s()}" % tc[2])) - if i == 0: sep = '=>' - return str + "\n" if str else "" + tc = t[i] + trace += "{}{}({}){}".format(sep, tc[0], tc[1], ("" if tc[2] == '' else "{%s()}" % tc[2])) + if i == 0: sep = '=>' + return trace + "\n" if trace else "" - # get caller file name @staticmethod - def get_caller_file(cidx = 0): + def get_caller_file(cidx=0): + """Return the source-file path of a caller frame. + + Args: + cidx: Index into ``traceback.extract_stack()`` (default 0 = oldest frame). + + Returns: + Absolute path string of the caller's source file. + """ return traceback.extract_stack()[cidx][0] - # log message, msg, for degugging processes according to the debug level - def pgdbg(self, level, msg = None, do_trace = True): + def pgdbg(self, level, msg=None, do_trace=True): + """Append a debug message to the debug log file if *level* is in range. + + No action is taken when ``PGLOG['DBGLEVEL']`` is falsy. The level + range is specified as an integer (``0``–*N*) or a ``"min-max"`` string. + + Args: + level: Integer debug level for this message, or a string whose + leading digits are parsed as the level. + msg: Message text. Omit to log a header-only entry that also + warns on STDERR. + do_trace: When ``True`` (default), appends a call-stack trace. 
+ """ if not self.PGLOG['DBGLEVEL']: return # no further action if not isinstance(level, int): ms = re.match(r'^(\d+)', level) @@ -483,14 +667,26 @@ def pgdbg(self, level, msg = None, do_trace = True): if self.CPID['CPID']: msg += self.CPID['CPID'] + " <= " msg += self.break_long_string(self.CPID['CMD'], 40, "...", 1) # logging debug info - DBG = open(dfile, 'a') - DBG.write("{}:{}\n".format(level, msg)) - if do_trace: DBG.write(self.get_call_trace()) - DBG.close() + with open(dfile, 'a') as DBG: + DBG.write("{}:{}\n".format(level, msg)) + if do_trace: DBG.write(self.get_call_trace()) - # return trimed string (strip leading and trailling spaces); remove comments led by '#' if rmcmt > 0 @staticmethod - def pgtrim(line, rmcmt = 1): + def pgtrim(line, rmcmt=1): + """Strip leading/trailing whitespace and optionally remove comments. + + Args: + line: Input string to trim. + rmcmt: Comment removal mode: + + * ``0`` — no comment removal + * ``1`` — remove lines starting with ``#`` and inline comments + preceded by two-or-more spaces (`` #``) + * ``2`` — remove inline comments preceded by one-or-more spaces + + Returns: + Trimmed string, or ``''`` for comment-only lines. + """ if line: if rmcmt: if re.match(r'^\s*#', line): # comment line @@ -504,14 +700,28 @@ def pgtrim(line, rmcmt = 1): line = line.strip() # remove leading and trailing whitespaces return line - # set self.PGLOG['PUSGDIR'] from the program file with full path def set_help_path(self, progfile): + """Set ``PGLOG['PUSGDIR']`` to the directory containing *progfile*. + + Args: + progfile: Path to the calling program (typically ``__file__``). + """ self.PGLOG['PUSGDIR'] = op.dirname(op.abspath(progfile)) - # Function: show_usage(progname: Perl program name to get file "progname.usg") - # show program usage in file "self.PGLOG['PUSGDIR']/progname.usg" on screen with unix - # system function 'pg', exit program when done. 
- def show_usage(self, progname, opts = None): + def show_usage(self, progname, opts=None): + """Display usage information from ``<progname>.usg`` then exit. + + When *opts* is provided, prints only the description of each listed + option key extracted from the usage file. Otherwise displays the full + file via ``more``. + + Args: + progname: Base program name (without ``.py``). The file + ``<PUSGDIR>/<progname>.usg`` is read. + opts: Dict mapping option letter to ``[type, ...]`` where + ``type`` is 0=Mode, 1=Single-Value, 2=Multi-Value, else Action. + Pass ``None`` to display the full usage file. + """ if self.PGLOG['PUSGDIR'] is None: self.set_help_path(self.get_caller_file(1)) usgname = self.join_paths(self.PGLOG['PUSGDIR'], progname + '.usg') if opts: # show usage for individual option of dsarch @@ -525,59 +735,69 @@ def show_usage(self, progname, opts = None): else: msg = "Action" sys.stdout.write("\nDescription of {} Option -{}:\n".format(msg, opt)) - IN = open(usgname, 'r') nilcnt = begin = 0 - for line in IN: - if begin == 0: - rx = " -{} or -".format(opt) - if re.match(rx, line): begin = 1 - elif re.match(r'^\s*$', line): - if nilcnt: break - nilcnt = 1 - else: - if re.match(r'\d[\.\s\d]', line): break # section title - if nilcnt and re.match(r' -\w\w or -', line): break - nilcnt = 0 - if begin: sys.stdout.write(line) - IN.close() + with open(usgname, 'r') as IN: + for line in IN: + if begin == 0: + rx = " -{} or -".format(opt) + if re.match(rx, line): begin = 1 + elif re.match(r'^\s*$', line): + if nilcnt: break + nilcnt = 1 + else: + if re.match(r'\d[\.\s\d]', line): break # section title + if nilcnt and re.match(r' -\w\w or -', line): break + nilcnt = 0 + if begin: sys.stdout.write(line) else: os.system("more " + usgname) self.pgexit(0) - # compare error message to patterns saved in self.PGLOG['ERR2STD'] - # return 1 if matched; 0 otherwise def err2std(self, line): + """Return 1 if *line* matches any pattern in ``PGLOG['ERR2STD']``, else 0.
+ + Used to redirect stderr lines to stdout when they match known patterns. + """ for err in self.PGLOG['ERR2STD']: if line.find(err) > -1: return 1 return 0 - # compare message to patterns saved in self.PGLOG['STD2ERR'] - # return 1 if matched; 0 otherwise def std2err(self, line): + """Return 1 if *line* matches any pattern in ``PGLOG['STD2ERR']``, else 0. + + Used to redirect stdout lines to stderr when they match known patterns. + """ for out in self.PGLOG['STD2ERR']: if line.find(out) > -1: return 1 return 0 - # Function: pgsystem(pgcmd, logact, cmdopt, instr) - # pgcmd - Linux system command, can be a string, "ls -l", or a list, ['ls', '-l'] - # logact - logging action option, defaults to self.LOGWRN - # cmdopt - command control option, default to 5 (1+4) - # 0 - no command control, - # 1 - log pgcmd (include the sub command calls), - # 2 - log standard output, - # 4 - log error output - # 7 - log all (pgcmd, and standard/error outputs), - # 8 - log command with time, - # 16 - return standard output message upon success - # 32 - log error as standard output - # 64 - force returning self.FAILURE if called process aborts - # 128 - tries 2 times for failed command before quits - # 256 - cache standard error message - # 512 - log instr & seconds with pgcmd if cmdopt&1 - # 1024 - turn on shell - # instr - input string passing to the command via stdin if not None - # seconds - number of seconds to wait for a timeout process if > 0 - def pgsystem(self, pgcmd, logact = None, cmdopt = 5, instr = None, seconds = 0): + def pgsystem(self, pgcmd, logact=None, cmdopt=5, instr=None, seconds=0): + """Run a system command and log/return its output. + + Args: + pgcmd: Command to execute — either a string (``"ls -l"``) or a + list (``['ls', '-l']``). + logact: Logging action flags (default ``LOGWRN``). 
+ cmdopt: Bitfield controlling logging and execution behaviour: + + * ``1`` — log the command line + * ``2`` — log stdout + * ``4`` — log stderr as errors + * ``8`` — log command with timing (via :meth:`cmdlog`) + * ``16`` — return stdout string on success instead of ``SUCCESS`` + * ``32`` — merge stderr into stdout + * ``64`` — return ``FAILURE`` if subprocess prints ``ABORTS`` + * ``128`` — retry once on failure + * ``256`` — cache stderr in ``PGLOG['SYSERR']`` + * ``512`` — log *instr* / *seconds* alongside command + * ``1024`` — force shell execution + + instr: String fed to the command via stdin (default ``None``). + seconds: Timeout in seconds; 0 means no timeout. + + Returns: + Stdout string when ``cmdopt & 16``; otherwise ``SUCCESS`` or ``FAILURE``. + """ if logact is None: logact = self.LOGWRN ret = self.SUCCESS if not pgcmd: return ret # empty command @@ -692,17 +912,39 @@ def pgsystem(self, pgcmd, logact = None, cmdopt = 5, instr = None, seconds = 0): retbuf = '' return (retbuf if cmdopt&16 else ret) - # strip carriage return '\r', but keep ending newline '\n' @staticmethod def strip_output_line(line): + """Strip carriage returns from a terminal output line. + + Also filters intermediate progress-bar lines (lines with a ``%`` + counter that is not 100). + + Args: + line: A single output line (already stripped of surrounding whitespace). + + Returns: + Cleaned string, ``None`` for suppressed progress lines, or the + original *line* if no special characters are present. + """ ms = re.search(r'\r([^\r]+)\r*$', line) if ms: return ms.group(1) ms = re.search(r'\s\.+\s+(\d+)%\s+', line) if ms and int(ms.group(1)) != 100: return None return line - # show command running time string formated by seconds_to_string_time() - def cmd_execute_time(self, cmdstr, last, logact = None): + def cmd_execute_time(self, cmdstr, last, logact=None): + """Append execution time to *cmdstr* when *last* meets the threshold. + + Args: + cmdstr: Base command/label string. 
+ last: Elapsed time in seconds. + logact: When non-zero, passes the result to :meth:`pglog` and + returns its return value. When zero/``None``, returns + the formatted string directly. + + Returns: + Log return value when *logact* is set; formatted string otherwise. + """ msg = cmdstr if last >= self.PGLOG['CMDTIME']: # show running for at least one minute msg += " ({})".format(self.seconds_to_string_time(last)) @@ -711,39 +953,77 @@ def cmd_execute_time(self, cmdstr, last, logact = None): else: return msg - # convert given seconds to string time with units of S-Second,M-Minute,H-Hour,D-Day @staticmethod - def seconds_to_string_time(seconds, showzero = 0): + def seconds_to_string_time(seconds, showzero=0): + """Convert a duration in seconds to a compact human-readable string. + + Examples: ``90`` → ``"1M30S"``, ``3661`` → ``"1H1M1S"``. + + Args: + seconds: Duration in seconds (int or float). Negative or zero + values produce an empty string unless *showzero* is set. + showzero: When non-zero, returns ``"0S"`` for a zero-second duration. + + Returns: + String composed of ``D`` / ``H`` / ``M`` / ``S`` components, + with fractional seconds shown to 3 decimal places when *seconds* + is a float. Returns ``""`` for non-positive *seconds* unless + *showzero* is set. 
+ """ msg = '' - s = m = h = 0 if seconds > 0: - s = seconds%60 # seconds (0-59) - minutes = int(seconds/60) # total minutes - m = minutes%60 # minutes (0-59) - if minutes >= 60: - hours = int(minutes/60) # total hours - h = hours%24 # hours (0-23) - if hours >= 24: - msg += "{}D".format(int(hours/24)) # days - if h: msg += "{}H".format(h) - if m: msg += "{}M".format(m) + minutes, s = divmod(seconds, 60) + hours, m = divmod(int(minutes), 60) + days, h = divmod(hours, 24) + if days: msg += "{}D".format(days) + if h: msg += "{}H".format(h) + if m: msg += "{}M".format(int(m)) if s: - msg += "%dS"%(s) if isinstance(s, int) else "{:.3f}S".format(s) + msg += "%dS" % s if isinstance(s, int) else "{:.3f}S".format(s) elif showzero: msg = "0S" return msg - # wrap function to call pgsystem() with a timeout control - # return self.FAILURE if error eval or time out - def tosystem(self, cmd, timeout = 0, logact = 0, cmdopt = 5, instr = None): + def tosystem(self, cmd, timeout=0, logact=0, cmdopt=5, instr=None): + """Run a system command with a timeout via :meth:`pgsystem`. + + Args: + cmd: Command string or list (passed to :meth:`pgsystem`). + timeout: Seconds before the command is killed. Uses + ``PGLOG['TIMEOUT']`` when 0. + logact: Logging action flags. + cmdopt: Command option bitfield (see :meth:`pgsystem`). + instr: String passed to the command via stdin. + + Returns: + ``SUCCESS``, ``FAILURE``, or captured stdout (when ``cmdopt & 16``). 
+ """ if logact is None: logact = self.LOGWRN if not timeout: timeout = self.PGLOG['TIMEOUT'] # set default timeout if missed return self.pgsystem(cmd, logact, cmdopt, instr, timeout) - # insert breaks, default to '\n', for every length, default to 1024, - # for long string; return specified number lines if mline given @staticmethod - def break_long_string(lstr, limit = 1024, bsign = "\n", mline = 200, bchars = ' &;', minlmt = 20, eline = 0): + def break_long_string(lstr, limit=1024, bsign="\n", mline=200, bchars=' &;', minlmt=20, eline=0): + """Insert line-break markers into *lstr* and optionally truncate it. + + Lines longer than *limit* are broken at a character in *bchars* when + possible, or hard-broken at *limit*. The result is capped at *mline* + lines; an optional tail of *eline* lines is preserved after the cap. + + Args: + lstr: Input string to wrap. + limit: Maximum line length before a break is inserted (default 1024). + bsign: Break marker inserted between segments (default ``"\\n"``). + mline: Maximum number of output lines/segments (default 200). + bchars: Characters at which a soft break is preferred (default ``' &;'``). + minlmt: Minimum position for a soft break; hard-breaks below this + (default 20). + eline: Number of trailing lines to preserve after *mline* is reached + (default 0). + + Returns: + Wrapped (and possibly truncated) string. + """ length = len(lstr) if lstr else 0 if length <= limit: return lstr if bsign is None: bsign = "\n" @@ -801,11 +1081,20 @@ def break_long_string(lstr, limit = 1024, bsign = "\n", mline = 200, bchars = ' mcnt += 1 return retstr - # join two paths by remove overlapping directories - # diff = 0: join given pathes - # 1: remove path1 from path2 @staticmethod - def join_paths(path1, path2, diff = 0): + def join_paths(path1, path2, diff=0): + """Join or diff two POSIX paths, removing overlapping directory components. + + Args: + path1: Left-hand path (base). + path2: Right-hand path (to append or subtract). 
+ diff: ``0`` — join *path1* and *path2* de-duplicating overlapping + tail/head components; ``1`` — remove *path1* prefix from + *path2* and return the remainder. + + Returns: + Joined or relative path string. + """ if not path2: return path1 if not path1 or not diff and re.match('/', path2): return path2 if diff: @@ -841,15 +1130,30 @@ def join_paths(path1, path2, diff = 0): else: return '/'.join(adir1 + adir2) - # validate if a command for a given BATCH host is accessable and executable - # Return self.SUCCESS if valid; self.FAILURE if not - def valid_batch_host(self, host, logact = 0): + def valid_batch_host(self, host, logact=0): + """Return ``SUCCESS`` if *host* is a known batch host with an accessible submit command. + + Args: + host: Batch host name (case-insensitive). + logact: Logging action flags passed to :meth:`valid_command` on failure. + """ HOST = host.upper() return self.SUCCESS if HOST in self.BCHCMDS and self.valid_command(self.BCHCMDS[HOST], logact) else self.FAILURE - # validate if a given command is accessable and executable - # Return the full command path if valid; '' if not - def valid_command(self, cmd, logact = 0): + def valid_command(self, cmd, logact=0): + """Return the full path of *cmd* if it is accessible and executable. + + Results are cached in ``self.COMMANDS``. + + Args: + cmd: Command name (with optional arguments, e.g. ``"rsync -a"``). + logact: Logging action flags; when non-zero, logs an error if the + command is not found. + + Returns: + Full path string (with arguments appended) on success; ``''`` if + the command is not found. + """ ms = re.match(r'^(\S+)( .*)$', cmd) if ms: option = ms.group(2) @@ -866,8 +1170,17 @@ def valid_command(self, cmd, logact = 0): self.COMMANDS[cmd] = buf return self.COMMANDS[cmd] - # get full command path if possible def command_path(self, cmdstr): + """Return *cmdstr* with the command name replaced by its full path when available. + + Args: + cmdstr: Command string (``"cmd arg1 arg2"``). 
+ + Returns: + String with the leading command resolved to its full path, or the + original *cmdstr* if the command already contains a path separator + or cannot be found via ``shutil.which``. + """ if not cmdstr: return '' ary = cmdstr.split(' ', 1) cmd = ary[0] @@ -876,8 +1189,19 @@ def command_path(self, cmdstr): pcmd = shutil.which(cmd) return (pcmd+optstr) if pcmd else cmdstr - # add carbon copies to self.PGLOG['CCDADDR'] - def add_carbon_copy(self, cc = None, isstr = None, exclude = 0, specialist = None): + def add_carbon_copy(self, cc=None, isstr=None, exclude=0, specialist=None): + """Update the Cc address list in ``PGLOG['CCDADDR']``. + + Passing both *cc* and *isstr* as ``None`` clears the Cc list. + + Args: + cc: Address string (if *isstr*) or list of addresses. + ``None`` or empty clears the list. + isstr: When truthy, *cc* is treated as a comma/space-separated string. + exclude: String of addresses to skip (substring match). + specialist: Address substituted when the sentinel value ``"S"`` + appears in the address list. + """ if not cc: if cc is None and isstr is None: self.PGLOG['CCDADDR'] = '' else: @@ -894,8 +1218,16 @@ def add_carbon_copy(self, cc = None, isstr = None, exclude = 0, specialist = Non self.PGLOG['CCDADDR'] += ", " self.PGLOG['CCDADDR'] += email - # get the current host name; or batch sever name if getbatch is 1 - def get_host(self, getbatch = 0): + def get_host(self, getbatch=0): + """Return the short hostname of the current or batch server. + + Args: + getbatch: When non-zero and a batch job ID is active, returns the + batch server name instead of the local hostname. + + Returns: + Short hostname string (domain stripped). 
+ """ if getbatch and self.PGLOG['CURBID'] != 0: host = self.PGLOG['PGBATCH'] elif self.PGLOG['HOSTNAME']: @@ -904,10 +1236,16 @@ def get_host(self, getbatch = 0): host = socket.gethostname() return self.get_short_host(host) - # - # strip domain names and retrun the server name itself - # def get_short_host(self, host): + """Strip the domain suffix from *host* and return the short hostname. + + Args: + host: Fully-qualified or short hostname string. + + Returns: + Short hostname, or an uppercase batch-host token when the host + matches a known batch system. + """ if not host: return '' ms = re.match(r'^([^\.]+)\.', host) if ms: host = ms.group(1) @@ -916,8 +1254,8 @@ def get_short_host(self, host): if HOST in self.BCHCMDS: return HOST return host - # get a live PBS host name def get_pbs_host(self): + """Return the first live PBS host from ``self.PBSHOSTS``, or ``None``.""" if not self.PBSSTATS and self.PGLOG['PBSHOSTS']: self.PBSHOSTS = self.PGLOG['PBSHOSTS'].split(':') for host in self.PBSHOSTS: @@ -926,8 +1264,13 @@ def get_pbs_host(self): if host in self.PBSSTATS and self.PBSSTATS[host]: return host return None - # set host status, 0 dead & 1 live, for one or all avalaible pbs hosts - def set_pbs_host(self, host = None, stat = 0): + def set_pbs_host(self, host=None, stat=0): + """Set the live/dead status for one or all PBS hosts. + + Args: + host: Host name to update. When ``None``, updates all known hosts. + stat: ``1`` for live, ``0`` for dead (default 0). + """ if host: self.PBSSTATS[host] = stat else: @@ -936,8 +1279,13 @@ def set_pbs_host(self, host = None, stat = 0): for host in self.PBSHOSTS: self.PBSSTATS[host] = stat - # reset the batch host name in case was not set properly - def reset_batch_host(self, bhost, logact = None): + def reset_batch_host(self, bhost, logact=None): + """Change the active batch host to *bhost* if no job is currently running. + + Args: + bhost: New batch host name (case-insensitive). 
+ logact: Logging action flags (default ``LOGWRN``). + """ if logact is None: logact = self.LOGWRN bchhost = bhost.upper() if bchhost != self.PGLOG['PGBATCH']: @@ -952,9 +1300,16 @@ def reset_batch_host(self, bhost, logact = None): self.PGLOG['PGBATCH'] = '' self.PGLOG['CURBID'] = 0 - # return the base command name of the current process @staticmethod - def get_command(cmdstr = None): + def get_command(cmdstr=None): + """Return the base command name, stripping directory and ``.py``/``.pl`` extension. + + Args: + cmdstr: Path string. Defaults to ``sys.argv[0]``. + + Returns: + Base name without extension. + """ if not cmdstr: cmdstr = sys.argv[0] cmdstr = op.basename(cmdstr) ms = re.match(r'^(.+)\.(py|pl)$', cmdstr) @@ -963,9 +1318,20 @@ def get_command(cmdstr = None): else: return cmdstr - # wrap a given command cmd for either sudo or setuid wrapper pgstart_['username'] - # to run as user asuser - def get_local_command(self, cmd, asuser = None): + def get_local_command(self, cmd, asuser=None): + """Wrap *cmd* so it runs as *asuser* on the local host. + + Uses a ``pgstart_<username>`` setuid wrapper when available, or + ``sudo -u <GDEXUSER>`` when ``SUDOGDEX`` is enabled. + + Args: + cmd: Command string to wrap. + asuser: Target username. Returns *cmd* unchanged when ``None`` or + equal to the current user. + + Returns: + Wrapped command string, or the original *cmd* if no wrapping is needed.
+ """ cuser = self.PGLOG['SETUID'] if self.PGLOG['SETUID'] else self.PGLOG['CURUID'] if not asuser or cuser == asuser: return cmd if cuser == self.PGLOG['GDEXUSER']: @@ -975,21 +1341,44 @@ def get_local_command(self, cmd, asuser = None): return "sudo -u {} {}".format(self.PGLOG['GDEXUSER'], cmd) # sudo as user gdexdata return cmd - # wrap a given command cmd for either sudo or setuid wrapper pgstart_['username'] - # to run as user asuser on a given remote host - def get_remote_command(self, cmd, host, asuser = None): + def get_remote_command(self, cmd, host, asuser=None): + """Wrap *cmd* for execution as *asuser* on *host* (delegates to :meth:`get_local_command`). + + Args: + cmd: Command string to wrap. + host: Target hostname (currently unused; reserved for future SSH wrapping). + asuser: Target username. + + Returns: + Wrapped command string. + """ return self.get_local_command(cmd, asuser) - # wrap a given sync command for given host name with/without sudo - def get_sync_command(self, host, asuser = None): + def get_sync_command(self, host, asuser=None): + """Return the sync command name for *host* with appropriate user context. + + Args: + host: Target hostname. + asuser: User to run as; affects which sync command variant is chosen. + + Returns: + Sync command string (e.g. ``"synccasper"`` or ``"casper-sync"``). + """ host = self.get_short_host(host) if (not (self.PGLOG['SETUID'] and self.PGLOG['SETUID'] == self.PGLOG['GDEXUSER']) and (not asuser or asuser == self.PGLOG['GDEXUSER'])): return "sync" + host return host + "-sync" - # set self.PGLOG['SETUID'] as needed - def set_suid(self, cuid = 0): + def set_suid(self, cuid=0): + """Set the real and effective UID to *cuid* and update ``SETUID``. + + Calls :meth:`set_specialist_environments` when switching to a + non-gdex specialist user. + + Args: + cuid: Target numeric UID. Defaults to the current effective UID. 
+ """ if not cuid: cuid = self.PGLOG['EUID'] if cuid != self.PGLOG['EUID'] or cuid != self.PGLOG['RUID']: os.setreuid(cuid, cuid) @@ -998,8 +1387,16 @@ def set_suid(self, cuid = 0): self.set_specialist_environments(self.PGLOG['SETUID']) self.PGLOG['CURUID'] == self.PGLOG['SETUID'] # set CURUID to a specific specialist - # set comman pglog def set_common_pglog(self): + """Initialise runtime ``PGLOG`` values from the environment. + + Detects the current user, hostname, PBS job state, and constructs the + ``PATH`` environment variable. Also sets all path-related ``PGLOG`` + keys (``LOGPATH``, ``DSSDATA``, ``TMPDIR``, etc.) by reading environment + variables of the same name via :meth:`SETPGLOG`. + + Called automatically by :meth:`__init__`. + """ self.PGLOG['CURDIR'] = os.getcwd() # set current user id self.PGLOG['RUID'] = os.getuid() @@ -1115,12 +1512,21 @@ def set_common_pglog(self): self.PGLOG['TMPSYNC'] = self.PGLOG['DSSDBHM'] + "/tmp/.syncdir" os.umask(2) - # check and return TMPSYNC path, and add it if not exists def get_tmpsync_path(self): + """Return the path to the temporary sync directory (``PGLOG['TMPSYNC']``).""" return self.PGLOG['TMPSYNC'] - # append or prepend locpath to pgpath - def add_local_path(self, locpath, pgpath, append = 0): + def add_local_path(self, locpath, pgpath, append=0): + """Add colon-separated paths from *locpath* to *pgpath* without duplicates. + + Args: + locpath: Colon-separated path string to merge in. + pgpath: Existing path string to update. + append: ``1`` to append, ``0`` to prepend (default 0). + + Returns: + Updated colon-separated path string. 
+ """ if not locpath: return pgpath elif not pgpath: @@ -1137,14 +1543,35 @@ def add_local_path(self, locpath, pgpath, append = 0): pgpath = path + ":" + pgpath return pgpath - # set self.PGLOG value; return a string or an array reference if sep is not emty - def SETPGLOG(self, name, value = ''): + def SETPGLOG(self, name, value=''): + """Set ``PGLOG[name]`` from the environment variable *name* or fall back to *value*. + + If the environment variable is set and non-empty it takes precedence. + If not, the existing ``PGLOG[name]`` value is kept (if present), then + *value* is used as the final default. Values starting with ``PG`` + are treated as unresolved placeholders and replaced with ``''``. + + Args: + name: ``PGLOG`` key and environment variable name. + value: Default value when neither the environment nor an existing + ``PGLOG`` entry is available. + """ oval = self.PGLOG[name] if name in self.PGLOG else '' nval = self.get_environment(name, ('' if re.match('PG', value) else value)) self.PGLOG[name] = nval if nval else oval - # set specialist home and return the default shell def set_specialist_home(self, specialist): + """Set ``HOME`` for *specialist* and return their default shell. + + Reads ``/etc/passwd`` to determine the home directory and shell. + Updates ``HOME`` in the environment when the path exists. + + Args: + specialist: Login name of the specialist user. + + Returns: + Shell basename string (e.g. ``"tcsh"``). 
+ """ if specialist == self.PGLOG['CURUID']: return # no need reset if 'MAIL' in os.environ and re.search(self.PGLOG['CURUID'], os.environ['MAIL']): os.environ['MAIL'] = re.sub(self.PGLOG['CURUID'], specialist, os.environ['MAIL']) @@ -1163,8 +1590,16 @@ def set_specialist_home(self, specialist): os.environ['HOME'] = home return shell - # set environments for a specified specialist - def set_specialist_environments(self, specialist): + def set_specialist_environments(self, specialist): + """Parse *specialist*'s ``~/.tcshrc`` and apply ``setenv`` directives. + + Respects host-conditional ``if``/``else``/``endif`` blocks so that + only the directives matching the current hostname are applied. Skips + ``PATH``, ``SHELL``, ``IFS``, and ``CDPATH`` for security. + + Args: + specialist: Login name of the specialist user. + """ shell = self.set_specialist_home(specialist) resource = os.environ['HOME'] + "/.tcshrc" checkif = 0 # 0 outside of if; 1 start if, 2 check envs, -1 checked already @@ -1214,8 +1649,15 @@ def set_specialist_environments(self, specialist): self.SETPGLOG("HOMEBIN", self.PGLOG['PGBINDIR']) os.environ['PATH'] = self.add_local_path(self.PGLOG['HOMEBIN'], os.environ['PATH'], 0) - # set one environment for specialist def one_specialist_environment(self, line): + """Parse and apply a single ``setenv VAR VALUE`` statement. + + Expands ``$VAR`` references in the value. Skips protected variables + (``PATH``, ``SHELL``, ``IFS``, ``CDPATH``). + + Args: + line: String after the ``setenv`` keyword (``"VAR VALUE"``). 
+ """ ms = re.match(r'^(\w+)[=\s]+(.+)$', line) if not ms: return (var, val) = ms.groups() @@ -1225,8 +1667,21 @@ def one_specialist_environment(self, line): if ms: val = ms.group(2) # remove quotes os.environ[var] = val - # get and repalce environment variables in ginve string; defaults to the values in self.PGLOG - def replace_environments(self, envstr, default = '', logact = 0): + def replace_environments(self, envstr, default='', logact=0): + """Expand the first ``$VAR`` or ``${VAR}`` reference in *envstr*. + + Looks up the variable in the environment first, then in ``PGLOG``, + then falls back to *default*. + + Args: + envstr: String containing a ``$VAR`` reference. + default: Fallback value (string or dict). When a dict, the variable + name is used as the key. + logact: Logging action flags passed to :meth:`get_environment`. + + Returns: + String with the first variable reference substituted. + """ ishash = isinstance(default, dict) ms = re.search(r'(^|.)\$({*)(\w+)(}*)', envstr) if ms: @@ -1238,8 +1693,19 @@ def replace_environments(self, envstr, default = '', logact = 0): envstr = re.sub(r'{}\${}'.format(lead, rep), (pre+env), envstr) return envstr - # validate if the current host is a valid host to process - def check_process_host(self, hosts, chost = None, mflag = None, pinfo = None, logact = None): + def check_process_host(self, hosts, chost=None, mflag=None, pinfo=None, logact=None): + """Check whether the current host is permitted to process *pinfo*. + + Args: + hosts: Host string or pattern. Prefix ``!`` to exclude listed hosts. + chost: Host to check against (default: current/batch host). + mflag: Match mode — ``'G'`` general, ``'M'`` exact, ``'I'`` inclusive. + pinfo: Process description logged on failure. + logact: Logging action flags (default ``LOGERR``). + + Returns: + ``1`` if processing is permitted; ``0`` otherwise. 
+ """ ret = 1 error = '' if not mflag: mflag = 'G' @@ -1265,9 +1731,20 @@ def check_process_host(self, hosts, chost = None, mflag = None, pinfo = None, lo self.pglog("{}: CANNOT be processed on {} for hosthame {}".format(pinfo, chost, error), logact) return ret - # convert special foreign characters into ascii characters @staticmethod - def convert_chars(name, default = 'X'): + def convert_chars(name, default='X'): + """Transliterate *name* to ASCII-safe characters. + + Uses ``unidecode`` to transliterate Unicode characters, then strips any + remaining non-alphanumeric / non-underscore characters. + + Args: + name: Input string to convert. + default: Return value when *name* is empty or fully non-convertible. + + Returns: + ASCII-safe alphanumeric/underscore string, or *default*. + """ if not name: return default if re.match(r'^[a-zA-Z0-9]+$', name): return name # conversion not needed decoded_name = unidecode(name).strip() @@ -1278,15 +1755,37 @@ def convert_chars(name, default = 'X'): else: return default - # Retrieve host and process id - def current_process_info(self, realpid = 0): + def current_process_info(self, realpid=0): + """Return ``[hostname, pid]`` for the current or batch process. + + Args: + realpid: When non-zero, always returns the real OS PID. + When zero and a batch job is active, returns the batch + server name and job ID instead. + + Returns: + List of ``[hostname_or_batch, pid_or_jobid]``. + """ if realpid or self.PGLOG['CURBID'] < 1: return [self.PGLOG['HOSTNAME'], os.getpid()] else: return [self.PGLOG['PGBATCH'], self.PGLOG['CURBID']] - # convert given @ARGV to string. quote the entries with spaces - def argv_to_string(self, argv = None, quote = 1, action = None): + def argv_to_string(self, argv=None, quote=1, action=None): + """Convert an argument list to a shell-safe string. + + Arguments containing shell special characters (``< > | whitespace``) + are single-quoted (or double-quoted when they contain single quotes). 
+ + Args: + argv: List of argument strings. Defaults to ``sys.argv[1:]``. + quote: When non-zero, quotes arguments with special characters. + action: When set, calls :meth:`pglog` with ``LGEREX`` if any + argument contains a special character (safety guard). + + Returns: + Space-joined argument string. + """ argstr = '' if argv is None: argv = sys.argv[1:] for arg in argv: @@ -1303,9 +1802,18 @@ def argv_to_string(self, argv = None, quote = 1, action = None): argstr += arg return argstr - # convert an integer to non-10 based string @staticmethod def int2base(x, base): + """Convert integer *x* to a string in the given *base*. + + Args: + x: Integer to convert. + base: Target base from 2 to 10 (e.g. 8 for octal); each digit is written as a decimal numeral. + + Returns: + String representation of *x* in *base*, with a leading ``'-'`` + for negative values. + """ if x == 0: return '0' negative = 0 if x < 0: @@ -1313,15 +1821,26 @@ def int2base(x, base): x = -x dgts = [] while x: - dgts.append(str(int(x%base))) - x = int(x/base) + dgts.append(str(x % base)) + x //= base if negative: dgts.append('-') dgts.reverse() return ''.join(dgts) - # convert a non-10 based string to an integer @staticmethod def base2int(x, base): + """Convert a decimal-encoded *base*-number string back to a plain integer. + + The input *x* is a decimal integer whose digits represent a number + written in *base* (e.g. ``x=1010, base=2`` → ``10``). + + Args: + x: Integer or string whose digits represent a base-*base* number. + base: Source numeric base. + + Returns: + Decoded integer value. + """ if not isinstance(x, int): x = int(x) if x == 0: return 0 negative = 0 @@ -1331,15 +1850,22 @@ def base2int(x, base): num = 0 fact = 1 while x: - num += (x%10)*fact + num += (x % 10) * fact fact *= base - x = int(x/10) + x //= 10 if negative: num = -num return num - # convert integer to ordinal string @staticmethod def int2order(num): + """Return the ordinal string for *num* (e.g. ``1`` → ``"1st"``). + + Args: + num: Non-negative integer.
+ + Returns: + String with appropriate suffix: ``st``, ``nd``, ``rd``, or ``th``. + """ ordstr = ['th', 'st', 'nd', 'rd'] snum = str(num) num %= 100 diff --git a/src/rda_python_common/pg_sig.py b/src/rda_python_common/pg_sig.py index fdad7c0..f27901b 100644 --- a/src/rda_python_common/pg_sig.py +++ b/src/rda_python_common/pg_sig.py @@ -18,8 +18,24 @@ from .pg_dbi import PgDBI class PgSIG(PgDBI): + """Daemon process control, signal handling, child process management, and PBS job tracking. + + Extends PgDBI to provide facilities for starting and stopping daemon + processes, forking child processes, handling POSIX signals, managing + background tasks, and querying PBS/Torque batch-job status. + + Instance Attributes: + VUSERS (list): Usernames permitted to start this daemon. + CPIDS (dict): Mapping of child process IDs to their process names. + CBIDS (dict): Mapping of background process IDs to their command strings. + SDUMP (dict): Paths for default, stderr, and stdout dump files with + keys ``'DEF'``, ``'ERR'``, and ``'OUT'``. + PGSIG (dict): Daemon control parameters including quit flag, process + counts, wait times, PID tracking, daemon name, and start time. + """ def __init__(self): + """Initialize PgSIG with default daemon control state.""" super().__init__() # initialize parent class self.VUSERS = [] # allow users to start this daemon self.CPIDS = {} # allow upto 'mproc' processes at one time for daemon @@ -48,7 +64,14 @@ def __init__(self): } # add users for starting this daemon - def add_vusers(self, user = None, mores = None): + def add_vusers(self, user=None, mores=None): + """Add permitted users for starting this daemon. + + Args: + user (str, optional): Username to add. If None, clears all permitted + users. + mores (list, optional): Additional usernames to add. 
+ """ if not user: self.VUSERS = [] # clean all vusers else: @@ -56,7 +79,15 @@ def add_vusers(self, user = None, mores = None): if mores: self.VUSERS.extend(mores) # valid user for starting this daemon - def check_vuser(self, user, aname = None): + def check_vuser(self, user, aname=None): + """Validate that the given user is permitted to start this daemon. + + Exits the process with an error if the user is not in ``VUSERS``. + + Args: + user (str): Username to validate. + aname (str, optional): Application name used in the error message. + """ if user and self.VUSERS: valid = 0; for vuser in self.VUSERS: @@ -65,7 +96,7 @@ def check_vuser(self, user, aname = None): break if valid == 0: vuser = ', '.join(self.VUSERS) - self.pglog("{}: must be '{}' to run '{}' in Daemon mode".format(user, vuser, aname), self.LGEREX) + self.pglog("{}: must be '{}' to run '{}' in Daemon mode".format(user, vuser, aname), self.LGEREX) # turn this process into a daemon # aname - application name, or daemon name @@ -74,8 +105,26 @@ def check_vuser(self, user, aname = None): # wtime - waiting time (in seconds) for next process for the daemon # logon - turn on the logging if true # bproc - multiple background processes if > 1 - # mtime - maximum running time for the daemon if provided - def start_daemon(self, aname, uname, mproc = 1, wtime = 120, logon = 0, bproc = 1, mtime = 0): + # mtime - maximum running time for the daemon if provided + def start_daemon(self, aname, uname, mproc=1, wtime=120, logon=0, bproc=1, mtime=0): + """Fork the current process into a background daemon. + + Checks that no other instance is already running, forks into the + background, sets up signal handlers, and redirects stdio streams. + + Args: + aname (str): Application/daemon name. + uname (str): Username that is starting the daemon. + mproc (int, optional): Maximum number of concurrent child processes. + Defaults to 1. + wtime (int or str, optional): Polling wait time in seconds (or with + unit suffix). 
Defaults to 120. + logon (int, optional): Enable logging if non-zero. Defaults to 0. + bproc (int, optional): Maximum concurrent background processes. + Defaults to 1. + mtime (int or str, optional): Maximum daemon run time in seconds. + 0 means unlimited. Defaults to 0. + """ dstr = "Daemon '{}'{} on {}".format(aname, (" By {}".format(uname) if uname else ''), self.PGLOG['HOSTNAME']) pid = self.check_daemon(aname, uname) if pid: @@ -123,7 +172,13 @@ def start_daemon(self, aname, uname, mproc = 1, wtime = 120, logon = 0, bproc = self.pgdisconnect(1) # disconnect database in daemon # set dump output file - def set_dump(self, default = None): + def set_dump(self, default=None): + """Redirect stderr and stdout to log/error dump files. + + Args: + default (str, optional): Default path used when no environment + variable override is set. Defaults to None. + """ errdump = self.get_environment("ERRDUMP", default) outdump = self.get_environment("OUTDUMP", default) if not errdump: @@ -140,6 +195,11 @@ def set_dump(self, default = None): # stop daemon and log the ending info def stop_daemon(self, msg): + """Log a graceful daemon stop message. + + Args: + msg (str): Optional reason or context appended to the log entry. + """ msg = " with " + msg if msg else '' self.PGLOG['LOGMASK'] |= self.MSGLOG # turn on logging before daemon stops self.pglog("{} Started at {}, Stopped gracefully{} by {}".format(self.PGSIG['DSTR'], self.PGSIG['STRTM'], msg, self.current_datetime()), self.LOGWRN) @@ -148,7 +208,16 @@ def stop_daemon(self, msg): # aname - application name for the daemon # uname - user login name who started the daemon # return the process id if yes and 0 if not - def check_daemon(self, aname, uname = None): + def check_daemon(self, aname, uname=None): + """Check whether a daemon is already running. + + Args: + aname (str): Application name for the daemon. + uname (str, optional): Username who started the daemon. 
+ + Returns: + int: The process ID of the running daemon, or 0 if not running. + """ if uname: self.check_vuser(uname, aname) pcmd = "ps -u {} -f | grep {} | grep ' 1 '".format(uname, aname) @@ -163,7 +232,7 @@ def check_daemon(self, aname, uname = None): for line in lines: ms = re.match(mp, line) pid = int(ms.group(1)) if ms else 0 - if pid > 0 and pid != cpid: return pid + if pid > 0 and pid != cpid: return pid return 0 # check if an application is running already; other than the current processs @@ -171,7 +240,17 @@ def check_daemon(self, aname, uname = None): # uname - user login name who started the application # argv - argument string # return the process id if yes and 0 if not - def check_application(self, aname, uname = None, sargv = None): + def check_application(self, aname, uname=None, sargv=None): + """Check whether another instance of the application is running. + + Args: + aname (str): Application name. + uname (str, optional): Username who started the application. + sargv (str, optional): Argument string to match against. + + Returns: + int: The process ID of the running instance, or 0 if not found. + """ if uname: self.check_vuser(uname, aname) pcmd = "ps -u {} -f | grep {} | grep -v ' grep '".format(uname, aname) @@ -215,7 +294,15 @@ def check_application(self, aname, uname = None, sargv = None): return 0 # validate if the current process is a single one. Quit if not - def validate_single_process(self, aname, uname = None, sargv = None, logact = None): + def validate_single_process(self, aname, uname=None, sargv=None, logact=None): + """Ensure only one instance of the application is running; exit otherwise. + + Args: + aname (str): Application name. + uname (str, optional): Username who started the application. + sargv (str, optional): Argument string to match against. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. 
+ """ if logact is None: logact = self.LOGWRN pid = self.check_application(aname, uname, sargv) if pid: @@ -231,7 +318,17 @@ def validate_single_process(self, aname, uname = None, sargv = None, logact = No # uname - user login name who started the application # argv - argument string # return the the number of processes (exclude the child one) - def check_multiple_application(self, aname, uname = None, sargv = None): + def check_multiple_application(self, aname, uname=None, sargv=None): + """Count how many instances of an application are running. + + Args: + aname (str): Application name. + uname (str, optional): Username who started the application. + sargv (str, optional): Argument string to match against. + + Returns: + int: Number of running instances (excluding the current process). + """ if uname: self.check_vuser(uname, aname) pcmd = "ps -u {} -f | grep {} | grep -v ' grep '".format(uname, aname) @@ -282,7 +379,16 @@ def check_multiple_application(self, aname, uname = None, sargv = None): return ccnt # validate if the running processes reach the limit for the given app; Quit if yes - def validate_multiple_process(self, aname, plimit, uname = None, sargv = None, logact = None): + def validate_multiple_process(self, aname, plimit, uname=None, sargv=None, logact=None): + """Exit if the number of running application instances meets or exceeds a limit. + + Args: + aname (str): Application name. + plimit (int): Maximum allowed number of concurrent instances. + uname (str, optional): Username who started the application. + sargv (str, optional): Argument string to match against. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. 
+ """ if logact is None: logact = self.LOGWRN pcnt = self.check_multiple_application(aname, uname, sargv) if pcnt >= plimit: @@ -296,13 +402,22 @@ def validate_multiple_process(self, aname, plimit, uname = None, sargv = None, l # fork process # return the defined result from call of fork def process_fork(self, dstr): + """Fork the current process, retrying up to 10 times on EAGAIN. + + Args: + dstr (str): Descriptive string used in error log messages. + + Returns: + int: The PID returned by ``os.fork()`` (0 in the child, child PID + in the parent). + """ for i in range(10): # try 10 times try: pid = os.fork() return pid except OSError as e: - if e.errno == errno.EAGAIN: - os.sleep(5) + if e.errno == errno.EAGAIN: + time.sleep(5) # Bug fix: os.sleep() does not exist; use time.sleep() else: self.pglog("{}: {}".format(dstr, str(e)), self.LGEREX) break @@ -310,6 +425,15 @@ def process_fork(self, dstr): # process the predefined signals def signal_catch(self, signum, frame): + """Handle SIGQUIT, SIGUSR1, and SIGUSR2 signals for the daemon. + + Adjusts logging state and forwards the signal to all child processes + when called in the server context. + + Args: + signum (int): The signal number received. + frame: The current stack frame (unused). + """ if self.PGSIG['PPID'] == 1: tmp = 'Server' elif self.PGSIG['PPID'] > 1: @@ -350,7 +474,18 @@ def signal_catch(self, signum, frame): # wrapper function to call os.kill() logging caught error based on logact # return self.SUCCESS is success; PgLog.FAILURE if not - def kill_process(self, pid, signum, logact = 0): + def kill_process(self, pid, signum, logact=0): + """Send a signal to a process, optionally logging errors. + + Args: + pid (int): Target process ID. + signum (int or signal.Signals): Signal to send. + logact (int, optional): Log action flags for error reporting. + Defaults to 0 (no logging). + + Returns: + int: ``self.SUCCESS`` on success, ``self.FAILURE`` on error. 
+ """ try: os.kill(pid, signum) except Exception as e: @@ -367,6 +502,12 @@ def kill_process(self, pid, signum, logact = 0): # wait child process to finish def clean_dead_child(self, signum, frame): + """Reap zombie child processes in response to SIGCHLD. + + Args: + signum (int): The signal number received (expected to be SIGCHLD). + frame: The current stack frame (unused). + """ live = 0 while True: try: @@ -385,6 +526,14 @@ def clean_dead_child(self, signum, frame): # send signal to daemon and exit def signal_daemon(self, sname, aname, uname): + """Send a named signal to a running daemon and exit. + + Args: + sname (str): Signal name: ``'quit'``/``'stop'``, ``'logon'``/``'on'``, + or ``'logoff'``/``'off'`` (case-insensitive). + aname (str): Application/daemon name. + uname (str): Username who started the daemon. + """ dstr = "Daemon '{}'{} on {}".format(aname, ((" By " + uname) if uname else ""), self.PGLOG['HOSTNAME']) pid = self.check_daemon(aname, uname) if pid > 0: @@ -401,7 +550,7 @@ def signal_daemon(self, sname, aname, uname): self.PGLOG['DBGLEVEL'] = 0 else: self.pglog("{}: invalid Signal for {}".format(sname, dstr), self.LGEREX) - + if self.kill_process(pid, signum, self.LOGERR) == self.SUCCESS: self.pglog("{}: signal sent to {}".format(msg, dstr), self.LOGWRN|self.FRCLOG) else: @@ -409,7 +558,15 @@ def signal_daemon(self, sname, aname, uname): sys.exit(0) # start a time child to run the command in case hanging - def timeout_command(self, cmd, logact = None, cmdopt = 4): + def timeout_command(self, cmd, logact=None, cmdopt=4): + """Run a command under a timeout child process to prevent hangs. + + Args: + cmd (str): Shell command to execute. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + cmdopt (int, optional): Command options passed to ``pgsystem``. + Defaults to 4. 
+ """ if logact is None: logact = self.LOGWRN if logact&self.EXITLG: logact &= ~self.EXITLG self.pglog("> " + cmd, logact) @@ -417,9 +574,22 @@ def timeout_command(self, cmd, logact = None, cmdopt = 4): self.pgsystem(cmd, logact, cmdopt) sys.exit(0) - # start a timeout child process + # start a timeout child process # return: 1 - in child, 0 - in parent - def start_timeout_child(self, msg, logact = None): + def start_timeout_child(self, msg, logact=None): + """Fork a timeout-monitoring child process. + + The child returns 1 immediately so the caller can run the actual + command. The parent waits up to ``TIMEOUT`` * 2 seconds, then kills + the child if it has not finished. + + Args: + msg (str): Descriptive message / command string used in logging. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + int: 1 if in the child process, 0 if in the parent. + """ if logact is None: logact = self.LOGWRN pid = self.process_fork(msg) if pid == 0: # in child @@ -432,18 +602,30 @@ def start_timeout_child(self, msg, logact = None): # in parent for i in range(self.PGLOG['TIMEOUT']): if not self.check_process(pid): break - sys.sleep(2) - if self.check_process(self, pid): + time.sleep(2) # Bug fix: sys.sleep() does not exist; use time.sleep() + if self.check_process(pid): # Bug fix: removed extra 'self' argument msg += ": timeout({} secs) in CPID {}".format(2*self.PGLOG['TIMEOUT'], pid) pids = self.kill_children(pid, 0) - sys.sleep(6) + time.sleep(6) # Bug fix: sys.sleep() does not exist; use time.sleep() if self.kill_process(pid, signal.SIGKILL, self.LOGERR): pids.insert(0, pid) if pids: msg += "\nProcess({}) Killed".format(','.join(map(str, pids))) self.pglog(msg, logact) return 0 # kill children recursively start from the deepest and return the pids got killed - def kill_children(self, pid, logact = None): + def kill_children(self, pid, logact=None): + """Recursively kill all child processes of a given PID. 
+ + Traverses the process tree depth-first (deepest first) and sends + SIGKILL to each process. + + Args: + pid (int): Parent process ID whose children should be killed. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + list: PIDs of processes that were successfully killed. + """ if logact is None: logact = self.LOGWRN buf = self.pgsystem("ps --ppid {} -o pid".format(pid), logact, 20) pids = [] @@ -461,8 +643,23 @@ def kill_children(self, pid, logact = None): return pids # start a child process - # pname - unique process name - def start_child(self, pname, logact = None, dowait = 0): + # pname - unique process name + def start_child(self, pname, logact=None, dowait=0): + """Fork a named child process for the daemon to manage. + + Records the child PID in ``CPIDS`` and resets daemon state inside + the child. + + Args: + pname (str): Unique process name for this child. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + dowait (int, optional): If non-zero, wait for a slot to open when + at the process limit. Defaults to 0. + + Returns: + int: 1 if the child was started (or no child needed), -1 if + already running, or the result of ``pglog`` on failure. + """ if logact is None: logact = self.LOGWRN if self.PGSIG['MPROC'] < 2: return 1 # no need child process if logact&self.EXITLG: logact &= ~self.EXITLG @@ -493,11 +690,19 @@ def start_child(self, pname, logact = None, dowait = 0): self.CBIDS = {} # empty backgroud proces info in case not self.PGSIG['DSTR'] += ": CPID {} for {}".format(pid, pname) self.cmdlog("CPID {} for {}".format(pid, pname)) - self.pgdisconnect(0) # disconnect database in child + self.pgdisconnect(0) # disconnect database in child return 1 # child started successfully - # get child process id for given pname + # get child process id for given pname def pname2cpid(self, pname): + """Look up the child process ID for a given process name. + + Args: + pname (str): Unique process name to look up. 
+ + Returns: + int: The child PID, or 0 if not found. + """ for cpid in self.CPIDS: if self.CPIDS[cpid] == pname: return cpid return 0 @@ -508,7 +713,21 @@ def pname2cpid(self, pname): # dowait - 0 no wait, 1 wait all done, -1 wait only when all children are running # return the number of running processes if dowait == 0 or 1 # return the number of none-running processes if dowait == -1 - def check_child(self, pname, pid = 0, logact = None, dowait = 0): + def check_child(self, pname, pid=0, logact=None, dowait=0): + """Check whether one or all managed child processes are still running. + + Args: + pname (str or None): Process name to check; None checks all. + pid (int, optional): Specific PID to check. Defaults to 0. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + dowait (int, optional): 0 returns immediately; positive waits for + all to finish; negative waits only while all slots are occupied. + Defaults to 0. + + Returns: + int: Number of running processes (``dowait >= 0``) or number of + idle slots (``dowait < 0``). + """ if logact is None: logact = self.LOGWRN if self.PGSIG['MPROC'] < 2: return 0 # no child process if logact&self.EXITLG: logact &= ~self.EXITLG @@ -544,7 +763,21 @@ def check_child(self, pname, pid = 0, logact = None, dowait = 0): # uname - user login name to started the application # mproc - upper limit of muiltiple child processes # wtime - waiting time (in seconds) for next process - def start_none_daemon(self, aname, cact = None, uname = None, mproc = 1, wtime = 120, logon = 1, bproc = 1): + def start_none_daemon(self, aname, cact=None, uname=None, mproc=1, wtime=120, logon=1, bproc=1): + """Initialize signal handling and process limits for non-daemon mode. + + Args: + aname (str): Application/daemon name. + cact (str, optional): Short action name appended to the description. + uname (str, optional): Username who started the application. + mproc (int, optional): Maximum concurrent child processes. Defaults + to 1. 
+ wtime (int or str, optional): Polling wait time. Defaults to 120. + logon (int, optional): Enable message logging if non-zero. Defaults + to 1. + bproc (int, optional): Maximum concurrent background processes. + Defaults to 1. + """ dstr = aname if cact: dstr += " for Action " + cact if uname: @@ -567,6 +800,14 @@ def start_none_daemon(self, aname, cact = None, uname = None, mproc = 1, wtime = # pid - specified process id # pmsg - process message if given def check_process(self, pid): + """Check whether a process is still running. + + Args: + pid (int): Process ID to check. + + Returns: + int: 1 if the process is running, 0 otherwise. + """ buf = self.pgsystem("ps -p {} -o pid".format(pid), self.LGWNEX, 20) if buf: mp = r'^\s*{}$'.format(pid) @@ -576,7 +817,18 @@ def check_process(self, pid): return 0 # check a process id on give host - def check_host_pid(self, host, pid, pmsg = None, logact = None): + def check_host_pid(self, host, pid, pmsg=None, logact=None): + """Check whether a PID is running on a specific host. + + Args: + host (str): Hostname to check (passed to ``rdaps -h``). + pid (int): Process ID to look up. + pmsg (str, optional): Message to log if the process is found. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + int: 1 if running, 0 if not, -1 on system error. + """ if logact is None: logact = self.LOGWRN cmd = 'rdaps' if host: cmd += " -h " + host @@ -594,7 +846,21 @@ def check_host_pid(self, host, pid, pmsg = None, logact = None): # aname - application name # pmsg - process message if given # return 1 if process is steal live, 0 died already, -1 error checking - def check_host_process(self, host, pid, ppid = 0, uname = None, aname = None, pmsg = None, logact = None): + def check_host_process(self, host, pid, ppid=0, uname=None, aname=None, pmsg=None, logact=None): + """Check whether a process is running on a host using rdaps. + + Args: + host (str): Hostname to check. + pid (int): Process ID. 
+ ppid (int, optional): Parent process ID. Defaults to 0. + uname (str, optional): Username filter. + aname (str, optional): Application name filter. + pmsg (str, optional): Message to log if the process is found. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + int: 1 if process is still alive, 0 if dead, -1 on error. + """ if logact is None: logact = self.LOGWRN cmd = "rdaps" if host: cmd += " -h " + host @@ -608,7 +874,21 @@ def check_host_process(self, host, pid, ppid = 0, uname = None, aname = None, pm return 1 # get a single pbs status record via qstat - def get_pbs_info(self, qopts, multiple = 0, logact = 0, chkcnt = 1): + def get_pbs_info(self, qopts, multiple=0, logact=0, chkcnt=1): + """Retrieve PBS job status information via ``qstat``. + + Args: + qopts (str): Options or job ID passed to ``qstat -n -w``. + multiple (int, optional): If non-zero, collect data for multiple + jobs into lists. Defaults to 0. + logact (int, optional): Log action flags. Defaults to 0. + chkcnt (int, optional): Number of retry attempts if ``qstat`` + returns no output. Defaults to 1. + + Returns: + dict: Mapping of column names to values (or lists of values when + ``multiple`` is non-zero). Empty dict on failure. + """ stat = {} loop = 0 buf = None @@ -628,7 +908,7 @@ def get_pbs_info(self, qopts, multiple = 0, logact = 0, chkcnt = 1): ckeys[1] = 'UserName' ckeys[3] = 'JobName' ckeys[7] = 'Reqd' + ckeys[7] - ckeys[8] = 'Reqd' + ckeys[7] + ckeys[8] = 'Reqd' + ckeys[8] # Bug fix: was ckeys[7] (wrong index) ckeys[9] = 'State' ckeys[10] = 'Elap' + ckeys[7] ckeys.append('Node') @@ -662,7 +942,17 @@ def get_pbs_info(self, qopts, multiple = 0, logact = 0, chkcnt = 1): # check status of a pbs batch id # bid - specified batch id # return hash of batch status, 0 if cannot check any more - def check_pbs_status(self, bid, logact = None): + def check_pbs_status(self, bid, logact=None): + """Retrieve historical status of a PBS batch job via ``qhist``. 
+ + Args: + bid (str or int): PBS batch job ID. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + dict: Mapping of column names to values, or an empty dict on + failure. + """ if logact is None: logact = self.LOGWRN stat = {} buf = self.pgsystem("qhist -w -j {}".format(bid), logact, 20) @@ -687,13 +977,24 @@ def check_pbs_status(self, bid, logact = None): vals = re.split(r'\s+', self.pgtrim(line)) for i in range(kcnt): stat[ckeys[i]] = vals[i] - break + break return stat # check if a pbs batch id is live # bid - specified batch id # return 1 if process is steal live, 0 died already or error checking - def check_pbs_process(self, bid, pmsg = None, logact = None): + def check_pbs_process(self, bid, pmsg=None, logact=None): + """Check whether a PBS batch job is currently queued or running. + + Args: + bid (str or int): PBS batch job ID. + pmsg (str, optional): Message prefix logged with the result state. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + int: 1 if the job is active (B/R/Q/S/H/W/X), 0 if finished, + -1 if the job record was not found. + """ if logact is None: logact = self.LOGWRN stat = self.get_pbs_info(bid, 0, logact) ret = -1 @@ -712,6 +1013,20 @@ def check_pbs_process(self, bid, pmsg = None, logact = None): # get wait time def get_wait_time(self, wtime, default, tmsg): + """Parse a wait-time value and convert it to seconds. + + Accepts an integer, a plain numeric string, or a string with a unit + suffix (``D`` days, ``H`` hours, ``M`` minutes, ``S`` seconds). + + Args: + wtime (int or str): Wait time value to parse. If falsy the + ``default`` is used. + default (int): Fallback value when ``wtime`` is falsy. + tmsg (str): Descriptive label used in error log messages. + + Returns: + int: Wait time in seconds. 
+ """ if not wtime: wtime = default # use default time if type(wtime) is int: return wtime if re.match(r'^(\d*)$', wtime): return int(wtime) @@ -721,6 +1036,7 @@ def get_wait_time(self, wtime, default, tmsg): unit = ms.group(2) else: self.pglog("{}: '{}' NOT in (D,H,M,S)".format(wtime, tmsg), self.LGEREX) + return default # Bug fix: LGEREX exits, but add fallback so 'unit' is always defined if unit != 'S': ret *= 60 # seconds in a minute if unit != 'M': @@ -731,7 +1047,24 @@ def get_wait_time(self, wtime, default, tmsg): # start a background process and record its id; check self.pgsystem() in self.pm for # valid cmdopt values - def start_background(self, cmd, logact = None, cmdopt = 5, dowait = 0): + def start_background(self, cmd, logact=None, cmdopt=5, dowait=0): + """Start a shell command as a background process and record its ID. + + If ``BPROC`` is less than 2, the command is run synchronously via + ``pgsystem``. + + Args: + cmd (str): Shell command to run in the background. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + cmdopt (int, optional): Bitfield controlling logging and redirection + (see ``pgsystem`` docs). Defaults to 5. + dowait (int, optional): If non-zero, wait for a background slot to + open before starting. Defaults to 0. + + Returns: + int or str: Result from ``pgsystem`` (synchronous mode) or + ``record_background``. 
+ """ if logact is None: logact = self.LOGWRN if self.PGSIG['BPROC'] < 2: return self.pgsystem(cmd, logact, cmdopt) # no background act = logact&(~self.EXITLG) @@ -745,7 +1078,7 @@ def start_background(self, cmd, logact = None, cmdopt = 5, dowait = 0): self.show_wait_message(i, "{}-{}: wait any {} background calls".format(self.PGSIG['DSTR'], cmd, bcnt), act, dowait) i += 1 else: - return self.pglog("{}-{}: {} background calls already at {}".format(self.PGSIG['DSTR'], cmd, bcnt, self.current_datetime()), act) + return self.pglog("{}-{}: {} background calls already at {}".format(self.PGSIG['DSTR'], cmd, bcnt, self.current_datetime()), act) cmdlog = (act if cmdopt&1 else self.WARNLG) if cmdopt&8: self.cmdlog("starts '{}'".format(cmd), None, cmdlog) @@ -762,8 +1095,16 @@ def start_background(self, cmd, logact = None, cmdopt = 5, dowait = 0): os.system(bckcmd) return self.record_background(cmd, logact) - # get background process id for given bcmd + # get background process id for given bcmd def bcmd2cbid(self, bcmd): + """Look up the background process ID for a given command string. + + Args: + bcmd (str): Background command string to look up. + + Returns: + int: The background process ID, or 0 if not found. + """ for cbid in self.CBIDS: if self.CBIDS[cbid] == bcmd: return cbid return 0 @@ -771,7 +1112,19 @@ def bcmd2cbid(self, bcmd): # check one or all child processes if they are still running # bid - check this specified background process id if given # return the number of processes are still running - def check_background(self, bcmd, bid = 0, logact = None, dowait = 0): + def check_background(self, bcmd, bid=0, logact=None, dowait=0): + """Check whether one or all background processes are still running. + + Args: + bcmd (str or None): Background command to check; None checks all. + bid (int, optional): Specific background process ID. Defaults to 0. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. 
+ dowait (int, optional): If non-zero, keep checking until all + background processes have finished. Defaults to 0. + + Returns: + int: Number of background processes still running. + """ if logact is None: logact = self.LOGWRN if self.PGSIG['BPROC'] < 2: return 0 # no background process if logact&self.EXITLG: logact &= ~self.EXITLG @@ -788,7 +1141,10 @@ def check_background(self, bcmd, bid = 0, logact = None, dowait = 0): elif bid in self.CBIDS: del self.CBIDS[bid] # clean the saved info for the process elif not bcmd: - for bid in self.CBIDS: + # Bug fix: cannot delete from a dict while iterating over it; + # iterate over a copy of the keys instead. + cbids = list(self.CBIDS) + for bid in cbids: if self.check_process(bid): # process is not done yet bcnt += 1 else: @@ -801,7 +1157,17 @@ def check_background(self, bcmd, bid = 0, logact = None, dowait = 0): # check and record process id for background command; return 1 if success full; # 0 otherwise; -1 if done already - def record_background(self, bcmd, logact = None): + def record_background(self, bcmd, logact=None): + """Locate and record the process ID for a recently started background command. + + Args: + bcmd (str): The background command whose PID should be recorded. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + + Returns: + int: 1 if the PID was found and recorded, 0 if not found after + retries, -1 if already recorded. + """ if logact is None: logact = self.LOGWRN ms = re.match(r'^(\S+)', bcmd) if ms: @@ -829,27 +1195,61 @@ def record_background(self, bcmd, logact = None): return 0 # sleep for given period for the daemon, stops if maximum running time reached - def sleep_daemon(self, wtime = 0, mtime = None): + def sleep_daemon(self, wtime=0, mtime=None): + """Sleep for the daemon's configured wait interval. + + Checks the maximum running time and sets the QUIT flag if it has been + exceeded. + + Args: + wtime (int, optional): Override sleep duration in seconds. 
Uses + ``PGSIG['WTIME']`` when 0. Defaults to 0. + mtime (int, optional): Maximum running time in seconds. Uses + ``PGSIG['MTIME']`` when None. Defaults to None. + + Returns: + int: Actual sleep time in seconds (0 if QUIT was triggered). + """ if not wtime: wtime = self.PGSIG['WTIME'] if mtime is None: mtime = self.PGSIG['MTIME'] if mtime > 0: rtime = int(time.time()) - self.PGSIG['STIME'] if rtime >= mtime: self.PGSIG['QUIT'] = 1 - wtime = 0 + wtime = 0 if wtime: time.sleep(wtime) return wtime # show wait message every dintv and then sleep for PGSIG['WTIME'] - def show_wait_message(self, loop, msg, logact = None, dowait = 0): + def show_wait_message(self, loop, msg, logact=None, dowait=0): + """Log a wait message every 30 loops and optionally sleep. + + Args: + loop (int): Current loop iteration count. + msg (str): Message to log. + logact (int, optional): Log action flags. Defaults to ``LOGWRN``. + dowait (int, optional): If non-zero, sleep for ``PGSIG['WTIME']`` + seconds. Defaults to 0. + """ if logact is None: logact = self.LOGWRN if loop > 0 and (loop%30) == 0: - self.pglog("{} at {}".format(msg, self.current_datetime()), logact) + self.pglog("{} at {}".format(msg, self.current_datetime()), logact) if dowait: time.sleep(self.PGSIG['WTIME']) # register a time out function to raise a time out error @contextmanager - def pgtimeout(self, seconds = 0, logact = 0): + def pgtimeout(self, seconds=0, logact=0): + """Context manager that raises ``TimeoutError`` after a given interval. + + Args: + seconds (int, optional): Timeout in seconds. Uses + ``PGLOG['TIMEOUT']`` when 0. Defaults to 0. + logact (int, optional): Reserved for future log action use. + Defaults to 0. + + Yields: + None: Control is yielded to the ``with`` block body. 
+ """ if not seconds: seconds = self.PGLOG['TIMEOUT'] signal.signal(signal.SIGALRM, self.raise_pgtimeout) signal.alarm(seconds) @@ -863,11 +1263,21 @@ def pgtimeout(self, seconds = 0, logact = 0): # raise a timeout Error @staticmethod def raise_pgtimeout(signum, frame): - raise TimeoutError + """SIGALRM handler that raises ``TimeoutError``. + + Args: + signum (int): Signal number (expected SIGALRM). + frame: Current stack frame (unused). + + Raises: + TimeoutError: Always raised to interrupt the timed block. + """ + raise TimeoutError # Add a timeout block. def timeout_func(self): - with self.pgtimeout(1): - print('entering block') - time.sleep(10) - print('This should never get printed because the line before timed out') + """Demonstrate the pgtimeout context manager with a 1-second limit.""" + with self.pgtimeout(1): + print('entering block') + time.sleep(10) + print('This should never get printed because the line before timed out') diff --git a/src/rda_python_common/pg_util.py b/src/rda_python_common/pg_util.py index 4c4d52c..e0ff151 100644 --- a/src/rda_python_common/pg_util.py +++ b/src/rda_python_common/pg_util.py @@ -19,7 +19,28 @@ class PgUtil(PgLOG): + """Miscellaneous date/time, dataset ID, and record-manipulation utilities. + + Extends PgLOG with helpers for date arithmetic, formatting, temporal pattern + parsing, column-oriented record manipulation, sorting, searching, and file + classification. Inherits all logging utilities from PgLOG. + + Instance Attributes: + DATEFMTS (dict): Regex fragments for each temporal unit (C, Y, Q, M, W, D, H, N, S). + MONTHS (list[str]): Full lowercase month names, index 0 = January. + MNS (list[str]): Three-letter lowercase month abbreviations, index 0 = 'jan'. + WDAYS (list[str]): Full lowercase weekday names, index 0 = 'sunday'. + WDS (list[str]): Three-letter lowercase weekday abbreviations, index 0 = 'sun'. 
+ MDAYS (list[int]): Days per month; index 0 = days in year (365/366), + indices 1-12 = days in each month (Feb updated for leap years). + """ + def __init__(self): + """Initialise PgUtil with date/time lookup tables. + + Calls PgLOG.__init__(), then populates DATEFMTS (temporal format regex fragments), + MONTHS, MNS, WDAYS, WDS (month/weekday name lists), and MDAYS (days-per-month array). + """ super().__init__() # initialize parent class self.DATEFMTS = { 'C': '(CC|C)', # century @@ -44,6 +65,14 @@ def __init__(self): # dt: optional given date in format of "YYYY-MM-DD" # return weekday: 0 - Sunday, 1 - Monday, ..., 6 - Saturday def get_weekday(self, date = None): + """Return the weekday number for a given date, using Sunday=0 convention. + + Args: + date (str | None): Date string in 'YYYY-MM-DD' format; uses today when None. + + Returns: + int: Weekday number where 0 = Sunday, 1 = Monday, …, 6 = Saturday. + """ if date is None: ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() else: @@ -53,6 +82,16 @@ def get_weekday(self, date = None): # mn: given month string like "Jan" or "January", or numeric number 1 to 12 # Return: numeric Month if not fmt (default); three-charater or full month names for given fmt def get_month(self, mn, fmt = None): + """Convert a month value to a numeric index or a formatted name string. + + Args: + mn (int | str): Month as an integer (1-12), numeric string, or name/abbreviation. + fmt (str | None): Output format token (e.g. 'MM', 'Mon', 'Month'); returns the + numeric month when None. + + Returns: + int | str: Numeric month (1-12) when fmt is None; formatted string otherwise. 
+ """ if not isinstance(mn, int): if re.match(r'^\d+$', mn): mn = int(mn) @@ -80,6 +119,17 @@ def get_month(self, mn, fmt = None): # wday: given weekday string like "Sun" or "Sunday", or numeric number 0 to 6 # Return: numeric Weekday if !fmt (default); three-charater or full week name for given fmt def get_wday(self, wday, fmt = None): + """Convert a weekday value to a numeric index or a formatted name string. + + Args: + wday (int | str): Weekday as 0-6 integer, numeric string, or name/abbreviation. + fmt (str | None): Output format token (e.g. 'W', 'Www', 'Week'); returns the + numeric weekday when None. + + Returns: + int | str: Numeric weekday (0=Sunday … 6=Saturday) when fmt is None; + formatted string otherwise. + """ if not isinstance(wday, int): if re.match(r'^\d+$', wday): wday = int(wday) @@ -91,7 +141,7 @@ def get_wday(self, wday, fmt = None): if fmt and wday >= 0 and wday <= 6: slen = len(fmt) if slen == 4: - swday = self.WDAYS[w] + swday = self.WDAYS[wday] if re.match(r'^We', fmt): swday = swday.capitalize() elif re.match(r'^WE', fmt): @@ -112,6 +162,20 @@ def get_wday(self, wday, fmt = None): # Return: type if given file name is a valid online file; '' otherwise @staticmethod def valid_online_file(file, type = None, exists = None): + """Determine whether a file path is a valid, publicly-servable data file. + + Rejects files that do not exist (unless exists is False), hidden files (basename + starts with a comma), index HTML files, and files with special extensions + (.doc, .php, .html, .shtml). + + Args: + file (str): File path to check. + type (str | None): Caller-supplied file type; returned unchanged when not 'D'. + exists (bool | None): When False, skips the filesystem existence check. + + Returns: + str: The file type (defaulting to 'D') on success, or '' when rejected. 
+ """ if exists is None or exists: if not op.exists(file): return '' # file does not exist bname = op.basename(file) @@ -123,17 +187,38 @@ def valid_online_file(file, type = None, exists = None): # Return: current time string in format of HH:MM:SS def curtime(self, getdate = False): + """Return the current time (or datetime) as a formatted string. + + Args: + getdate (bool): When True returns 'YYYY-MM-DD HH:MM:SS'; otherwise 'HH:MM:SS'. + + Returns: + str: Formatted current time string. + """ ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() fmt = "%Y-%m-%d %H:%M:%S" if getdate else "%H:%M:%S" return time.strftime(fmt, ct) # wrapper function of curtime(True) to get datetime in form of YYYY-MM-DD HH:NN:SS def curdatetime(self): + """Return the current date and time as 'YYYY-MM-DD HH:MM:SS'. + + Returns: + str: Current datetime string. + """ return self.curtime(True) # fmt: optional date format, defaults to YYYY-MM-DD # Return: current (date, hour) def curdatehour(self, fmt = None): + """Return the current date and hour as a two-element list. + + Args: + fmt (str | None): Date format string passed to fmtdate(); defaults to 'YYYY-MM-DD'. + + Returns: + list: [date_str, hour_int] where hour is 0-23. + """ ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() dt = self.fmtdate(ct[0], ct[1], ct[2], fmt) if fmt else time.strftime("%Y-%m-%d", ct) return [dt, ct[3]] @@ -141,8 +226,19 @@ def curdatehour(self, fmt = None): # tm: optional time in seconds since the Epoch # Return: current date and time strings def get_date_time(self, tm = None): + """Split a time value into [date_str, time_str] components. + + Accepts multiple input types and normalises them to a two-element list. + + Args: + tm: Input value — None (→ now), str ('YYYY-MM-DD HH:MM:SS'), int/float + (Unix epoch), datetime.datetime, datetime.date, or datetime.time. + + Returns: + list | None: [date_str, time_str] on success, or None when tm is unrecognised. 
+ """ act = ct = None - if tm == None: + if tm is None: ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() elif isinstance(tm, str): act = tm.split(' ') @@ -154,7 +250,7 @@ def get_date_time(self, tm = None): act = [str(tm), '00:00:00'] elif isinstance(tm, datetime.time): act = [None, str(tm)] - if ct == None: + if ct is None: return act if act else None else: return [time.strftime("%Y-%m-%d", ct), time.strftime("%H:%M:%S", ct)] @@ -162,7 +258,16 @@ def get_date_time(self, tm = None): # tm: optional time in seconds since the Epoch # Return: current datetime strings def get_datetime(self, tm = None): - if tm == None: + """Return a datetime value normalised to a 'YYYY-MM-DD HH:MM:SS' string. + + Args: + tm: Input — None (→ now), str (returned as-is), int/float (Unix epoch), + datetime.datetime, or datetime.date. + + Returns: + str: Datetime string, or the original value when the type is unrecognised. + """ + if tm is None: ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() return time.strftime("%Y-%m-%d %H:%M:%S", ct) elif isinstance(tm, str): @@ -179,6 +284,14 @@ def get_datetime(self, tm = None): # file: file name, get curent timestamp if missed # Return: timestsmp string in format of 'YYYYMMDDHHMMSS def timestamp(self, file = None): + """Return a compact timestamp string in 'YYYYMMDDHHMMSS' format. + + Args: + file (str | None): Path to a file whose mtime is used; uses current time when None. + + Returns: + str: 14-character timestamp string. + """ if file is None: ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() else: @@ -190,6 +303,15 @@ def timestamp(self, file = None): # check date/time and set to default one if empty date @staticmethod def check_datetime(date, default): + """Return date if non-empty and non-zero, otherwise return the default. + + Args: + date (str | any): Date value to check; coerced to str when not already. + default (str): Fallback value returned when date is falsy or starts with '0000'. 
+ + Returns: + str: Validated date string or default. + """ if not date: return default if not isinstance(date, str): date = str(date) if re.match(r'^0000', date): return default @@ -198,12 +320,34 @@ def check_datetime(date, default): # fmt: date format, default to "YYYY-MM-DD" # Return: new formated current date string def curdate(self, fmt = None): + """Return the current date as a formatted string. + + Args: + fmt (str | None): Date format string for fmtdate(); defaults to 'YYYY-MM-DD'. + + Returns: + str: Formatted current date string. + """ ct = time.gmtime() if self.PGLOG['GMTZ'] else time.localtime() return self.fmtdate(ct[0], ct[1], ct[2], fmt) if fmt else time.strftime("%Y-%m-%d", ct) # check given string to identify temporal pattern and their units # defined in (keys self.DATEFMTS) def temporal_pattern_units(self, string, seps): + """Parse a string for temporal format tokens and return their unit mappings. + + Extracts patterns enclosed by seps delimiters, ignores generic ('P…') and + current-time ('C…C') patterns, and maps each found DATEFMTS key to its unit + multiplier (quarter→3, century→100, others→1). + + Args: + string (str): Input string containing delimited temporal patterns. + seps (str): Two-character string where seps[0] is the opening delimiter + and seps[1] the closing delimiter. + + Returns: + dict: Mapping of DATEFMTS key (e.g. 'Y', 'M', 'D') to unit multiplier. + """ mkeys = ['D', 'Q', 'M', 'C', 'Y', 'H', 'N', 'S'] units = {} match = seps[0] + "([^" + seps[1] + "]+)" + seps[1] @@ -226,6 +370,20 @@ def temporal_pattern_units(self, string, seps): # format output for given date and hour def format_datehour(self, date, hour, tofmt = None, fromfmt = None): + """Format a date and hour value into a string using an optional format template. + + When tofmt is given, substitutes the hour token in the formatted date string. + When tofmt is absent, appends the zero-padded hour with a space separator. 
+ + Args: + date (str | any): Date value; formatted via format_date() when truthy. + hour (int | None): Hour value (0-23); appended/substituted when not None. + tofmt (str | None): Output date+hour format string. + fromfmt (str | None): Input date format string passed to format_date(). + + Returns: + str: Formatted date-hour string. + """ if date: datehour = self.format_date(str(date), tofmt, fromfmt) elif tofmt: @@ -249,6 +407,17 @@ def format_datehour(self, date, hour, tofmt = None, fromfmt = None): # the sep value; str to int for digital values @staticmethod def split_datetime(sdt, sep = r'\D'): + """Split a date, time, or datetime string into a list of integer/string parts. + + Splits on the regex sep pattern and converts purely numeric parts to int. + + Args: + sdt (str | any): Datetime value; coerced to str when not already. + sep (str): Regex separator pattern; defaults to any non-digit character. + + Returns: + list: Mixed int/str parts of the split datetime. + """ if not isinstance(sdt, str): sdt = str(sdt) adt = re.split(sep, sdt) acnt = len(adt) @@ -261,6 +430,21 @@ def split_datetime(sdt, sep = r'\D'): # fromfmt: date formats, default to YYYY-MM-DD # Return: new formated date string according to tofmt def format_date(self, cdate, tofmt = None, fromfmt = None): + """Reformat a date string from one format to another. + + Parses cdate according to fromfmt (auto-detected when omitted) and renders it + using tofmt. Supports year, century, quarter, month (numeric and name), and + day tokens defined in DATEFMTS. + + Args: + cdate (str | any): Input date value; coerced to str when needed. + tofmt (str | None): Output format string (e.g. 'Month D, YYYY'); when None + returns 'YYYY-MM-DD'. + fromfmt (str | None): Input format string; auto-detected from cdate when None. + + Returns: + str | None: Reformatted date string, or the original value when cdate is falsy. 
+ """ if not cdate: return cdate if not isinstance(cdate, str): cdate = str(cdate) dates = [None, None, None] @@ -338,6 +522,23 @@ def format_date(self, cdate, tofmt = None, fromfmt = None): # tofmt: date format, ex. "Month D, YYYY", default to "YYYY-MM-DD HH:NN:SS" # Return: new formated datehour string def fmtdatetime(self, yr, mn, dy, hr = None, nn = None, ss = None, tofmt = None): + """Format year/month/day/hour/minute/second components into a datetime string. + + Carries over-range values (e.g. seconds ≥ 60) into the next unit automatically + before formatting. Delegates date formatting to fmtdate(). + + Args: + yr (int): Year. + mn (int): Month (1-12). + dy (int): Day of month. + hr (int | None): Hour (0-23). + nn (int | None): Minute (0-59). + ss (int | None): Second (0-59). + tofmt (str | None): Output format; defaults to 'YYYY-MM-DD HH:NN:SS'. + + Returns: + str: Formatted datetime string. + """ if not tofmt: tofmt = "YYYY-MM-DD HH:NN:SS" tms = [ss, nn, hr, dy] fks = ['S', 'N', 'H'] @@ -361,10 +562,10 @@ def fmtdatetime(self, yr, mn, dy, hr = None, nn = None, ss = None, tofmt = None) if ms: fmt = ms.group(1) if len(fmt) == 2: - str = "{:02}".format(tms[i]) + sval = "{:02}".format(tms[i]) else: - str = str(tms[i]) - sdt = re.sub(fmt, str, sdt, 1) + sval = str(tms[i]) + sdt = re.sub(fmt, sval, sdt, 1) return sdt # yr: year value @@ -374,6 +575,20 @@ def fmtdatetime(self, yr, mn, dy, hr = None, nn = None, ss = None, tofmt = None) # tofmt: date format, ex. "Month D, YYYY", default to "YYYY-MM-DD:HH" # Return: new formated datehour string def fmtdatehour(self, yr, mn, dy, hr, tofmt = None): + """Format year/month/day/hour components into a date-hour string. + + Normalises out-of-range hour values (negative or ≥ 24) by adjusting the day. + + Args: + yr (int): Year. + mn (int): Month (1-12). + dy (int): Day of month. + hr (int | None): Hour (0-23); may be negative or ≥ 24 (adjusted automatically). + tofmt (str | None): Output format; defaults to 'YYYY-MM-DD:HH'. 
+ + Returns: + str: Formatted date-hour string. + """ if not tofmt: tofmt = "YYYY-MM-DD:HH" if hr != None and dy != None: # adjust hour value out of range if hr < 0: @@ -402,6 +617,20 @@ def fmtdatehour(self, yr, mn, dy, hr, tofmt = None): # tofmt: date format, ex. "Month D, YYYY", default to "YYYY-MM-DD" # Return: new formated date string def fmtdate(self, yr, mn, dy, tofmt = None): + """Format year, month, and day components into a date string. + + Applies adjust_ymd() to normalise out-of-range values, then substitutes day, + month (numeric or name), quarter, year, and century tokens from tofmt. + + Args: + yr (int | None): Year component. + mn (int | None): Month component (1-12). + dy (int | None): Day of month. + tofmt (str | None): Output format string; defaults to 'YYYY-MM-DD'. + + Returns: + str: Formatted date string. + """ (y, m, d) = self.adjust_ymd(yr, mn, dy) if not tofmt or tofmt == 'YYYY-MM-DD': return "{}-{:02}-{:02}".format(y, m, d) if dy != None: @@ -470,6 +699,17 @@ def fmtdate(self, yr, mn, dy, tofmt = None): # format given date and time into standard timestamp @staticmethod def join_datetime(sdate, stime): + """Combine separate date and time strings into a single datetime string. + + Args: + sdate (str | any): Date portion; coerced to str when not already. + Returns None when falsy. + stime (str | any): Time portion; defaults to '00:00:00' when falsy. + A leading single digit is zero-padded. + + Returns: + str | None: Combined 'YYYY-MM-DD HH:MM:SS' string, or None when sdate is falsy. + """ if not sdate: return None if not stime: stime = "00:00:00" if not isinstance(sdate, str): sdate = str(sdate) @@ -481,6 +721,15 @@ def join_datetime(sdate, stime): # split a date or datetime into an array of [date, time] @staticmethod def date_and_time(sdt): + """Split a datetime string into [date, time] parts. + + Args: + sdt (str | any): Datetime value; coerced to str when not already. 
+ + Returns: + list: [date_str, time_str]; time_str defaults to '00:00:00' when absent; + [None, None] when sdt is falsy. + """ if not sdt: return [None, None] if not isinstance(sdt, str): sdt = str(sdt) adt = re.split(' ', sdt) @@ -491,6 +740,17 @@ def date_and_time(sdt): # convert given date/time to unix epoch time; -1 if cannot @staticmethod def unixtime(stime): + """Convert a date/time string to a Unix epoch timestamp. + + Parses the date portion (YYYY-MM-DD) and optional time portion (HH:MM:SS) + from stime and returns the corresponding local epoch seconds. + + Args: + stime (str | any): Date or datetime string; coerced to str when needed. + + Returns: + float: Unix epoch timestamp via time.mktime(). + """ pt = [0]*9 if not isinstance(stime, str): stime = str(stime) ms = re.match(r'^(\d+)-(\d+)-(\d+)', stime) @@ -508,6 +768,19 @@ def unixtime(stime): # edate: end date in form of 'YYYY' or 'YYYY-MM' or 'YYYY-MM-DD' # Return: list of start and end dates in format of YYYY-MM-DD def daterange(self, sdate, edate): + """Expand partial dates in a [sdate, edate] pair to full 'YYYY-MM-DD' strings. + + Partial dates: + - 'YYYY' → sdate becomes 'YYYY-01-01', edate becomes 'YYYY-12-31'. + - 'YYYY-MM' → sdate becomes 'YYYY-MM-01', edate becomes last day of that month. + + Args: + sdate (str | any | None): Start date (partial or full). + edate (str | any | None): End date (partial or full). + + Returns: + list: [sdate_str, edate_str] both in 'YYYY-MM-DD' format. + """ if sdate: if not isinstance(sdate, str): sdate = str(sdate) if not re.search(r'\d+-\d+-\d+', sdate): @@ -534,6 +807,16 @@ def daterange(self, sdate, edate): # date to datetime range @staticmethod def dtrange(dates): + """Extend a [date, date] pair to a [datetime, datetime] pair covering the full days. + + Appends ' 00:00:00' to dates[0] and ' 23:59:59' to dates[1]. + + Args: + dates (list): Two-element list [start_date, end_date]; modified in-place. + + Returns: + list: The same list with datetime strings. 
+ """ date = dates[0] if date: if not isinstance(date, str): date = str(date) @@ -549,6 +832,20 @@ def dtrange(dates): # fmt: period format, ex. "YYYYMon-YYYMon", default to "YYYYMM-YYYYMM" # Return: a string of formated period def format_period(self, sdate, edate, fmt = None): + """Format a date range as a period string using start and end format tokens. + + When fmt is None, produces 'YYYYMM-YYYYMM'. The format string may contain a + hyphen separator dividing the start and end sub-formats. + + Args: + sdate (str | any | None): Start date in 'YYYY-MM-DD' or similar format. + edate (str | any | None): End date in 'YYYY-MM-DD' or similar format. + fmt (str | None): Period format like 'YYYYMon-YYYYMon'; the literal word + 'current' in the end sub-format is kept verbatim. + + Returns: + str: Formatted period string. + """ period = '' if not fmt: sfmt = efmt = "YYYYMM" @@ -583,6 +880,17 @@ def format_period(self, sdate, edate, fmt = None): # newid: True to format a new dsid; defaults to False for now # returns a new or old dsid according to the newid option def format_dataset_id(self, dsid, newid = None, logact = None): + """Normalise a dataset ID to old-style ('ds###.#') or new-style ('[a-z]######') format. + + Args: + dsid (str | any): Input dataset ID in any recognised format. + newid (bool | None): True → return new-style ID; False → old-style; + None → uses PGLOG['NEWDSID']. + logact (int | None): Logging action for invalid IDs; defaults to LGEREX. + + Returns: + str: Normalised dataset ID string. + """ if newid is None: newid = self.PGLOG['NEWDSID'] if logact is None: logact = self.LGEREX dsid = str(dsid) @@ -611,6 +919,19 @@ def format_dataset_id(self, dsid, newid = None, logact = None): # newid: True to format a new dsid; defaults to False for now # returns a new or old metadata dsid according to the newid option def metadata_dataset_id(self, dsid, newid = None, logact = None): + """Normalise a dataset ID to metadata format (no 'ds' prefix for old style). 
+ + Like format_dataset_id() but old-style output is '###.#' instead of 'ds###.#'. + + Args: + dsid (str | any): Input dataset ID. + newid (bool | None): True → new-style; False → metadata old-style; + None → uses PGLOG['NEWDSID']. + logact (int | None): Logging action for invalid IDs; defaults to LGEREX. + + Returns: + str: Normalised metadata dataset ID string. + """ if newid is None: newid = self.PGLOG['NEWDSID'] if logact is None: logact = self.LGEREX ms = re.match(r'^([a-z])(\d\d\d)(\d\d\d)$', dsid) @@ -638,6 +959,16 @@ def metadata_dataset_id(self, dsid, newid = None, logact = None): # and find it according to the flag value O (Old), N (New) or B (Both) formats # returns dsid if found in given id string; None otherwise def find_dataset_id(self, idstr, flag = 'B', logact = 0): + """Search a string for a dataset ID in old, new, or both formats. + + Args: + idstr (str): String to search. + flag (str): 'N' = new-style only, 'O' = old-style only, 'B' = both (default). + logact (int): Logging action when no ID is found; 0 = silent. + + Returns: + str | None: The first matching dataset ID string, or None when not found. + """ if flag in 'NB': ms = re.search(r'(^|\W)(([a-z])\d{6})($|\D)', idstr) if ms and ms.group(3) in self.PGLOG['DSIDCHRS']: return ms.group(2) @@ -651,6 +982,17 @@ def find_dataset_id(self, idstr, flag = 'B', logact = 0): # find and convert all found dsids according to old/new dsids # for newid = False/True def convert_dataset_ids(self, idstr, newid = None, logact = 0): + """Find and convert all dataset IDs in a string between old and new formats. + + Args: + idstr (str | None): Input string possibly containing dataset IDs. + newid (bool | None): True → convert old→new; False → new→old; + None → uses PGLOG['NEWDSID']. + logact (int): Logging action flags; default 0. + + Returns: + tuple: (converted_str, count) where count is the number of IDs converted. 
+ """ if newid is None: newid = self.PGLOG['NEWDSID'] flag = 'O' if newid else 'N' cnt = 0 @@ -668,6 +1010,15 @@ def convert_dataset_ids(self, idstr, newid = None, logact = 0): # Return: a dict to the idx record out of records @staticmethod def onerecord(records, idx): + """Extract a single row from a column-oriented multi-record dict. + + Args: + records (dict): Column-oriented dict (field → list-of-values) from pgmget(). + idx (int): Row index to extract. + + Returns: + dict: Row dict with field → scalar value for the given index. + """ record = {} for fld in records: record[fld] = records[fld][idx] @@ -679,6 +1030,19 @@ def onerecord(records, idx): # Return: add a record to a dict of lists @staticmethod def addrecord(records, record, idx): + """Insert or replace a row in a column-oriented multi-record dict. + + Appends None padding when idx exceeds the current length of a column list. + Initialises records to an empty dict when None is passed. + + Args: + records (dict | None): Column-oriented dict to update, or None to create one. + record (dict): Single-row dict to insert. + idx (int): Target row index. + + Returns: + dict: The updated (or newly created) column-oriented dict. + """ if records is None: records = {} # initialize dist of lists structure if not records: for key in record: @@ -697,6 +1061,15 @@ def addrecord(records, record, idx): # convert a hash with multiple rows from pgmget() to an array of hashes @staticmethod def hash2array(hrecs, hkeys = None): + """Convert a column-oriented dict (from pgmget) to a list of row dicts. + + Args: + hrecs (dict): Column-oriented dict mapping field_name → list_of_values. + hkeys (list | None): Keys to include; uses all keys when None. + + Returns: + list[dict]: List of row dicts, one per row. 
+ """ if not hkeys: hkeys = list(hrecs) acnt = len(hrecs[hkeys[0]]) if hrecs and hkeys[0] in hrecs else 0 arecs = [None]*acnt @@ -709,6 +1082,15 @@ def hash2array(hrecs, hkeys = None): # convert an array of hashes to a hash with multiple rows for pgmget() @staticmethod def array2hash(arecs, hkeys = None): + """Convert a list of row dicts to a column-oriented dict (for pgmget-style use). + + Args: + arecs (list[dict]): List of row dicts. + hkeys (list | None): Keys to include; uses all keys from arecs[0] when None. + + Returns: + dict: Column-oriented dict mapping field_name → list_of_values. + """ hrecs = {} acnt = len(arecs) if arecs else 0 if acnt > 0: @@ -725,6 +1107,15 @@ def array2hash(arecs, hkeys = None): # Return: a single number or list of two dependend on given opt @staticmethod def hashcount(records, opt = 0): + """Return the column count, row count, or both for a column-oriented dict. + + Args: + records (dict): Column-oriented dict from pgmget(). + opt (int): 0 = column count (default), 1 = row count, 2 = [col_count, row_count]. + + Returns: + int | list: Single count for opt 0/1, or [col_count, row_count] for opt 2. + """ ret = [0, 0] if records: clen = len(records) @@ -742,6 +1133,20 @@ def hashcount(records, opt = 0): # For unique join, a record in bdict must not be contained in adict already @staticmethod def joinhash(adict, bdict, default = None, unique = None): + """Concatenate two column-oriented dicts, filling missing keys with a default. + + For a unique join, a row from bdict is only appended when no row in adict + has identical values for all common keys. + + Args: + adict (dict | None): Base column-oriented dict; returned unchanged when bdict is falsy. + bdict (dict | None): Dict to append; returned when adict is falsy. + default: Fill value for keys absent in one of the dicts; default None. + unique: When truthy, skip bdict rows already present in adict. + + Returns: + dict: Merged column-oriented dict. 
+ """ if not bdict: return adict if not adict: return bdict akeys = list(adict.keys()) @@ -787,13 +1192,23 @@ def joinhash(adict, bdict, default = None, unique = None): # Return: the joined list @staticmethod def joinarray(lst1, lst2, unique = None): + """Concatenate two lists, optionally skipping duplicates. + + Args: + lst1 (list | None): Base list; returned unchanged when lst2 is falsy. + lst2 (list | None): List to append; returned when lst1 is falsy. + unique: When truthy, only appends elements from lst2 not already in lst1. + + Returns: + list: Merged list. + """ if not lst2: return lst1 if not lst1: return lst2 cnt1 = len(lst1) cnt2 = len(lst2) if unique: - for i in (cnt2): - for j in (cnt1): + for i in range(cnt2): + for j in range(cnt1): if PgUtil.pgcmp(lst1[j], lst2[i]) != 0: break if j >= cnt1: lst1.append(lst2[i]) @@ -805,6 +1220,18 @@ def joinarray(lst1, lst2, unique = None): # Return: a reference to the cross-joined hash records @staticmethod def crosshash(ahash, bhash): + """Produce a Cartesian product of two column-oriented dicts. + + Every row in ahash is combined with every row in bhash, producing + acnt × bcnt rows in the result. + + Args: + ahash (dict | None): First column-oriented dict. + bhash (dict | None): Second column-oriented dict. + + Returns: + dict: Cross-joined column-oriented dict with all keys from both inputs. + """ if not bhash: return ahash if not ahash: return bhash akeys = list(ahash.keys()) @@ -823,6 +1250,14 @@ def crosshash(ahash, bhash): # strip database and table names for a field name @staticmethod def strip_field(field): + """Strip schema and table prefixes from a dot-qualified field name. + + Args: + field (str): Possibly qualified field name like 'schema.table.column'. + + Returns: + str: The bare column name after the last dot, or field unchanged when no dot. 
+ """ ms = re.search(r'\.([^\.]+)$', field) if ms: field = ms.group(1) return field @@ -833,6 +1268,20 @@ def strip_field(field): # patterns: optional list of temporal patterns for order fields # Return: a sorted dict list def sorthash(self, pgrecs, flds, hash, patterns = None): + """Sort a column-oriented dict on one or more fields using quicksort. + + Lowercase field letters in flds indicate descending order; uppercase = ascending. + Temporal patterns in patterns are used to extract a comparable key for each value. + + Args: + pgrecs (dict): Column-oriented dict to sort. + flds (str | list): Sort-field letter codes; each must be a key in hash. + hash (dict): Field-code → (label, full_field_name, …) mapping. + patterns (list | None): Optional temporal format patterns, one per sort field. + + Returns: + dict: New column-oriented dict with rows in sorted order. + """ fcnt = len(flds) # count of fields to be sorted on # set sorting order, descenting (-1) or ascenting (1) # get the full field names to be sorted on @@ -876,6 +1325,15 @@ def sorthash(self, pgrecs, flds, hash, patterns = None): # Return: the number of days bewteen date1 and date2 @staticmethod def diffdate(date1, date2): + """Return the signed number of days between two date strings (date1 − date2). + + Args: + date1 (str | None): Later date in 'YYYY-MM-DD' format. + date2 (str | None): Earlier date in 'YYYY-MM-DD' format. + + Returns: + int: Positive when date1 > date2, negative when date1 < date2. + """ ut1 = ut2 = 0 if date1: ut1 = PgUtil.unixtime(date1) if date2: ut2 = PgUtil.unixtime(date2) @@ -884,6 +1342,15 @@ def diffdate(date1, date2): # Return: the number of seconds bewteen time1 and time2 @staticmethod def difftime(time1, time2): + """Return the signed number of seconds between two datetime strings (time1 − time2). + + Args: + time1 (str | None): Later datetime string. + time2 (str | None): Earlier datetime string. + + Returns: + int: Signed second difference. 
+ """ ut1 = ut2 = 0 if time1: ut1 = PgUtil.unixtime(time1) if time2: ut2 = PgUtil.unixtime(time2) @@ -893,12 +1360,28 @@ def difftime(time1, time2): # Return: the number of days between date and '1970-01-01 00:00:00' @staticmethod def get_days(cdate): + """Return the number of days elapsed since the Unix epoch (1970-01-01). + + Args: + cdate (str | any): Date value; coerced to str when not already. + + Returns: + int: Day count since 1970-01-01. + """ return PgUtil.diffdate(str(cdate), '1970-01-01') # Function: get_month_days(date) # Return: the number of days in given month @staticmethod def get_month_days(cdate): + """Return the number of days in the month of a given date. + + Args: + cdate (str | any): Date value in 'YYYY-MM-…' format. + + Returns: + int: Days in the month (28-31), or 0 when the date cannot be parsed. + """ ms = re.match(r'^(\d+)-(\d+)', str(cdate)) if ms: yr = int(ms.group(1)) @@ -911,6 +1394,18 @@ def get_month_days(cdate): # Return: a date in format of YYYY-MM-DD thar all year/month/day are validated @staticmethod def validate_date(cdate): + """Clamp year, month, and day components of a date to valid ranges. + + Years below 1000 are assumed to be in the 2000s; years above 9999 are taken + mod 10000. Month and day are clamped to [1, 12] and [1, last_day_of_month]. + + Args: + cdate (str | any): Date string in 'YYYY-MM-DD' format. + + Returns: + str: Validated date string in 'YYYY-MM-DD' format, or cdate unchanged + when it cannot be parsed. + """ ms = re.match(r'^(\d+)-(\d+)-(\d+)', str(cdate)) if ms: (yr, mn, dy) = (int(m) for m in ms.groups()) @@ -934,17 +1429,43 @@ def validate_date(cdate): # Return: the date in format of "YYYY-MM-DD" for given number of days # from '1970-01-01 00:00:00' def get_date(self, days): + """Return the date that is a given number of days after the Unix epoch. + + Args: + days (int | str): Number of days since 1970-01-01. + + Returns: + str: Date string in 'YYYY-MM-DD' format. 
+ """ return self.adddate('1970-01-01', 0, 0, int(days)) # compare date/hour and return the different hours @staticmethod def diffdatehour(date1, hour1, date2, hour2): + """Return the signed hour difference between two date+hour pairs. + + Missing hour values default to 23 (end-of-day). + + Args: + date1 (str): Date string for the first point. + hour1 (int | None): Hour (0-23) for the first point; defaults to 23. + date2 (str): Date string for the second point. + hour2 (int | None): Hour for the second point; defaults to 23. + + Returns: + int: (hour1 − hour2) + 24 × (date1 − date2 in days). + """ if hour1 is None: hour1 = 23 if hour2 is None: hour2 = 23 return (hour1 - hour2) + 24*PgUtil.diffdate(date1, date2) # hour difference between GMT and local time def diffgmthour(self): + """Return the hour difference between GMT and local time. + + Returns: + int: Local_hour − GMT_hour (positive east of UTC, negative west). + """ tg = time.gmtime() tl = time.localtime() dg = self.fmtdate(tg[0], tg[1], tg[2]) @@ -956,6 +1477,17 @@ def diffgmthour(self): # compare date and time (if given) and return 1, 0 and -1 @staticmethod def cmptime(date1, time1, date2, time2): + """Compare two date+time pairs and return a three-way comparison result. + + Args: + date1 (str): First date string. + time1 (str | None): First time string; defaults to '00:00:00' when None. + date2 (str): Second date string. + time2 (str | None): Second time string. + + Returns: + int: 1 if first > second, -1 if first < second, 0 if equal. + """ stime1 = PgUtil.join_datetime(date1, time1) stime2 = PgUtil.join_datetime(date2, time2) return PgUtil.pgcmp(stime1, stime2) @@ -965,6 +1497,19 @@ def cmptime(date1, time1, date2, time2): # nf: number of fractions of a month # Return: new date def addmonth(self, cdate, mf, nf = 1): + """Add a fractional number of months to a date. + + When nf < 2, delegates to adddate(). Otherwise uses 30-day month fractions + to compute the new date. 
+ + Args: + cdate (str): Starting date in 'YYYY-MM-DD' format. + mf (int): Number of month fractions to add (negative to subtract). + nf (int): Number of fractions per month (1 = whole months, 2 = half, etc.). + + Returns: + str: Resulting date in 'YYYY-MM-DD' format. + """ if not mf: return cdate if not nf or nf < 2: return self.adddate(cdate, 0, mf, 0) ms = re.match(r'^(\d+)-(\d+)-(\d+)$', cdate) @@ -994,26 +1539,47 @@ def addmonth(self, cdate, mf, nf = 1): # add yr years & mn months to yearmonth ym in format YYYYMM @staticmethod def addyearmonth(ym, yr, mn): - if yr == None: yr = 0 - if mn == None: mn = 0 + """Add years and months to a compact year-month string (YYYYMM). + + Args: + ym (str): Base year-month in 'YYYYMM' format. + yr (int | None): Years to add; treated as 0 when None. + mn (int | None): Months to add; treated as 0 when None. + + Returns: + str: Resulting year-month in 'YYYYMM' format, or ym unchanged when it + cannot be parsed. + """ + if yr is None: yr = 0 + if mn is None: mn = 0 ms =re.match(r'^(\d\d\d\d)(\d\d)$', ym) if ms: (syr, smn) = ms.groups() - yr = int(syr) - mn = int(smn) - if mn < 0: - while mn < 0: - yr -= 1 - mn += 12 + nyr = int(syr) + yr + nmn = int(smn) + mn + if nmn < 0: + while nmn < 0: + nyr -= 1 + nmn += 12 else: - while mn > 12: - yr += 1 - mn -= 12 - ym = "{:04}{:02}".format(yr, mn) + while nmn > 12: + nyr += 1 + nmn -= 12 + ym = "{:04}{:02}".format(nyr, nmn) return ym # set number of days in Beburary for Leap year according PGLOG['NOLEAP'] def set_leap_mdays(self, year): + """Update MDAYS[0] (year length) and MDAYS[2] (February) for a given year. + + Honours PGLOG['NOLEAP']: when set, February always has 28 days. + + Args: + year (int): Year to evaluate for leap-year status. + + Returns: + int: 1 if year is a leap year (and NOLEAP is not set), 0 otherwise. 
+ """ if not self.PGLOG['NOLEAP'] and calendar.isleap(year): self.MDAYS[0] = 366 self.MDAYS[2] = 29 @@ -1029,11 +1595,34 @@ def set_leap_mdays(self, year): # reutn 1 if is end of month def is_end_month(self, yr, mn, dy): + """Return 1 if the given day is the last day of its month, 0 otherwise. + + Args: + yr (int): Year. + mn (int): Month (1-12). + dy (int): Day of month. + + Returns: + int: 1 when dy equals the last day of month mn in year yr, else 0. + """ self.set_leap_mdays(yr) return 1 if dy == self.MDAYS[mn] else 0 # adust the year, month and day values that are out of ranges def adjust_ymd(self, yr, mn, dy): + """Normalise year, month, and day values that are out of calendar range. + + Carries months into/from years and days into/from months iteratively until + all three components are within valid ranges. Updates MDAYS for leap years. + + Args: + yr (int | None): Year component; defaults to 1970 when None. + mn (int | None): Month component (1-12); defaults to 1 when None. + dy (int | None): Day component; defaults to 1 when None. + + Returns: + list: [yr, mn, dy] all within valid calendar ranges. + """ if yr is None: yr = 1970 if mn is None: mn = 1 if dy is None: dy = 1 @@ -1075,6 +1664,22 @@ def adjust_ymd(self, yr, mn, dy): # dy: the number of days to add/subtract from the odate for positive/negative value) # Return: new date def adddate(self, cdate, yr, mn = 0, dy = 0, tofmt = None): + """Add years, months, and/or days to a date string. + + Handles month-end preservation: when the starting day is the last day of its + month and months are being added, the result lands on the last day of the + target month. + + Args: + cdate (str | any): Starting date in 'YYYY-MM-DD' format. + yr (int | str | None): Years to add (negative to subtract). + mn (int | str | None): Months to add; default 0. + dy (int | str | None): Days to add; default 0. + tofmt (str | None): Output date format; defaults to 'YYYY-MM-DD'. 
+ + Returns: + str: Resulting date string, or cdate unchanged when it cannot be parsed. + """ if not cdate: return cdate if not isinstance(cdate, str): cdate = str(cdate) if yr is None: @@ -1104,6 +1709,16 @@ def adddate(self, cdate, yr, mn = 0, dy = 0, tofmt = None): # add given hours to the initial date and time def addhour(self, sdate, stime, nhour): + """Add a number of hours to a date+time pair, adjusting the date when needed. + + Args: + sdate (str | any | None): Starting date string. + stime (str | any | None): Starting time string in 'HH:…' format. + nhour (int | str): Hours to add (may be negative). + + Returns: + list: [new_date_str, new_time_str]. + """ if nhour and isinstance(nhour, str): nhour = int(nhour) if sdate and not isinstance(sdate, str): sdate = str(sdate) if stime and not isinstance(stime, str): stime = str(stime) @@ -1128,6 +1743,22 @@ def addhour(self, sdate, stime, nhour): # add given years, months, days and hours to the initial date and hour def adddatehour(self, sdate, nhour, yr, mn, dy, hr = 0): + """Add years, months, days, and hours to a date+hour pair. + + The hour increment hr is combined with nhour, then overflow/underflow is + carried into dy before calling adddate(). + + Args: + sdate (str | any | None): Starting date string. + nhour (int | str | None): Starting hour value. + yr (int): Years to add. + mn (int): Months to add. + dy (int): Days to add. + hr (int): Hours to add; default 0. + + Returns: + list: [new_date_str, new_hour_int]. + """ if sdate and not isinstance(sdate, str): sdate = str(sdate) if hr: if nhour != None: @@ -1148,6 +1779,24 @@ def adddatehour(self, sdate, nhour, yr, mn, dy, hr = 0): # add given yyyy, mm, dd, hh, nn, ss to sdatetime # if nf, add fraction of month only def adddatetime(self, sdatetime, yy, mm, dd, hh, nn, ss, nf = 0): + """Add year/month/day/hour/minute/second offsets to a datetime string. 
+ + When nf > 1, the month increment is applied as fractional months via + addmonth() before any remaining date arithmetic. + + Args: + sdatetime (str | any): Starting datetime in 'YYYY-MM-DD HH:MM:SS' format. + yy (int): Years to add. + mm (int): Months to add (or fractional months when nf > 1). + dd (int): Days to add. + hh (int): Hours to add. + nn (int): Minutes to add. + ss (int): Seconds to add. + nf (int): Month fraction denominator; 0 or 1 = whole months. + + Returns: + str: Resulting datetime string in 'YYYY-MM-DD HH:MM:SS' format. + """ if sdatetime and not isinstance(sdatetime, str): sdatetime = str(sdatetime) (sdate, stime) = re.split(' ', sdatetime) if hh or nn or ss: (sdate, stime) = self.addtime(sdate, stime, hh, nn, ss) @@ -1157,22 +1806,24 @@ def adddatetime(self, sdatetime, yy, mm, dd, hh, nn, ss, nf = 0): if yy or mm or dd: sdate = self.adddate(sdate, yy, mm, dd) return "{} {}".format(sdate, stime) - # add given yyyy, mm, dd, hh, nn, ss to sdatetime - # if nf, add fraction of month only - def adddatetime(self, sdatetime, yy, mm, dd, hh, nn, ss, nf = 0): - if sdatetime and not isinstance(sdatetime, str): sdatetime = str(sdatetime) - (sdate, stime) = re.split(' ', sdatetime) - if hh or nn or ss: (sdate, stime) = self.addtime(sdate, stime, hh, nn, ss) - if nf: - sdate = self.addmonth(sdate, mm, nf) - mm = 0 - if yy or mm or dd: sdate = self.adddate(sdate, yy, mm, dd) - return "{} {}".format(sdate, stime) - # add given hours, minutes and seconds to the initial date and time def addtime(self, sdate, stime, h, m, s): + """Add hour, minute, and second offsets to a date+time pair. + + Normalises overflow/underflow across seconds → minutes → hours → days. + + Args: + sdate (str | any | None): Starting date string. + stime (str | any | None): Starting time in 'HH:MM:SS' format. + h (int): Hours to add. + m (int): Minutes to add. + s (int): Seconds to add. + + Returns: + list: [new_date_str, new_time_str] in 'YYYY-MM-DD' and 'HH:MM:SS' format. 
+ """ if sdate and not isinstance(sdate, str): sdate = str(sdate) - if stime and not isinstance(stime, str): sdate = str(stime) + if stime and not isinstance(stime, str): stime = str(stime) ups = (60, 60, 24) tms = [0, 0, 0, 0] # (sec, min, hour, day) if s: tms[0] += s @@ -1200,6 +1851,18 @@ def addtime(self, sdate, stime, h, m, s): # add time interval array to datetime # opt = -1 - minus, 0 - begin time, 1 - add (default) def addintervals(self, sdatetime, intv, opt = 1): + """Apply a time-interval array to a datetime string. + + Args: + sdatetime (str | any): Starting datetime in 'YYYY-MM-DD HH:MM:SS' format. + intv (list | None): Interval values [yy, mm, dd, hh, nn, ss, nf]; missing + positions default to 0. + opt (int): 1 = add (default), -1 = subtract, 0 = advance one second first + (to move from end of current period to start of next). + + Returns: + str: Resulting datetime string. + """ if not isinstance(sdatetime, str): sdatetime = str(sdatetime) if not intv: return sdatetime tv = [0]*7 @@ -1219,6 +1882,17 @@ def addintervals(self, sdatetime, intv, opt = 1): # end of period if days == 0 # nf - number of fractions of a month, for unit of 'M' only def enddate(self, sdate, days, unit, nf = 0): + """Adjust a date to the end (or a specified day) of its year/month/week period. + + Args: + sdate (str | any | None): Input date string. + days (int | str | None): Target day within the period; 0 = last day of period. + unit (str): Period unit — 'Y' (year), 'M' (month), or 'W' (week). + nf (int): Month fraction denominator for unit='M'; 0 or 1 = whole months. + + Returns: + str: Adjusted date string in 'YYYY-MM-DD' format. 
+ """ if sdate and not isinstance(sdate, str): sdate = str(sdate) if days and isinstance(days, str): days = int(days) if not (unit and unit in 'YMW'): return sdate @@ -1269,6 +1943,15 @@ def enddate(self, sdate, days, unit, nf = 0): # adjust end time to the specified h/n/s for frequency of hour/mimute/second def endtime(self, stime, unit): + """Adjust a time string to the end of its hour, minute, or second period. + + Args: + stime (str | any | None): Input time string; defaults to '00:00:00' when falsy. + unit (str): Period unit — 'H' (hour), 'N' (minute), or 'S' (second, no-op). + + Returns: + str: Adjusted time string in 'HH:MM:SS' format. + """ if stime and not isinstance(stime, str): stime = str(stime) if not (unit and unit in 'HNS'): return stime if stime: @@ -1282,10 +1965,23 @@ def endtime(self, stime, unit): elif unit != 'S': tm[0] = 23 tm[1] = tm[2] = 59 - return "{:02}:{:02}:{:02}".format(tm[0], tm[1]. tm[2]) + return "{:02}:{:02}:{:02}".format(tm[0], tm[1], tm[2]) # adjust end time to the specified h/n/s for frequency of year/month/week/day/hour/mimute/second def enddatetime(self, sdatetime, unit, days = 0, nf = 0): + """Adjust a datetime string to the end of the given calendar/time period. + + Delegates to enddate() for date units (Y/M/W) and endtime() for time units (H/N/S). + + Args: + sdatetime (str | any | None): Input datetime in 'YYYY-MM-DD HH:MM:SS' format. + unit (str): Period unit — Y, M, W, D, H, N, or S. + days (int): Target day for date period adjustment; 0 = last day. + nf (int): Month fraction denominator; 0 or 1 = whole months. + + Returns: + str: Adjusted datetime string in 'YYYY-MM-DD HH:MM:SS' format. 
+ """ if sdatetime and not isinstance(sdatetime, str): sdatetime = str(sdatetime) if not (unit and unit in 'YMWDHNS'): return sdatetime (sdate, stime) = re.split(' ', sdatetime) @@ -1297,7 +1993,19 @@ def enddatetime(self, sdatetime, unit, days = 0, nf = 0): # get the string length dynamically @staticmethod - def get_column_length(colname, values): + def get_column_length(colname, values): + """Return the display width needed for a column, based on its values. + + Starts from the length of the column title (or 2 when colname is None) and + expands to accommodate the longest non-newline value string. + + Args: + colname (str | None): Column header label. + values (iterable): Column values to measure. + + Returns: + int: Maximum display width for the column. + """ clen = len(colname) if colname else 2 # initial column length as the length of column title for val in values: if val is None: continue @@ -1310,7 +2018,17 @@ def get_column_length(colname, values): # Function: hour2time() # Return: time string in format of date HH:MM:SS @staticmethod - def hour2time(sdate, nhour, endtime = 0): + def hour2time(sdate, nhour, endtime = 0): + """Build a time string (and optional datetime string) from a date and hour. + + Args: + sdate (str | any | None): Date portion; when truthy, prepended to the time. + nhour (int): Hour of day (0-23). + endtime (int): When non-zero, sets minutes and seconds to 59; else 00. + + Returns: + str: 'YYYY-MM-DD HH:MM:SS' when sdate is given, or 'HH:MM:SS' otherwise. + """ if sdate and not isinstance(sdate, str): sdate = str(sdate) stime = "{:02}:".format(nhour) if endtime: @@ -1326,6 +2044,14 @@ def hour2time(sdate, nhour, endtime = 0): # Return: list of date and hour @staticmethod def time2hour(stime): + """Split a datetime or time string into a [date, hour] pair. + + Args: + stime (str): Time or datetime string; 'YYYY-MM-DD HH:…' or 'HH:…'. + + Returns: + list: [date_str_or_None, hour_int_or_None]. 
+ """ sdate = nhour = None times = stime.split(' ') if len(times) == 2: @@ -1338,6 +2064,16 @@ def time2hour(stime): # get the all column widths @staticmethod def all_column_widths(pgrecs, flds, tdict): + """Return display widths for a list of field code columns in a result dict. + + Args: + pgrecs (dict): Column-oriented dict from pgmget(). + flds (list): Ordered field code letters matching keys in tdict. + tdict (dict): Field-code → (label, full_field_name, …) mapping. + + Returns: + list[int]: Display width for each field in flds (0 when not in tdict). + """ colcnt = len(flds) lens = [0]*colcnt for i in range(colcnt): @@ -1350,6 +2086,16 @@ def all_column_widths(pgrecs, flds, tdict): # check a give value, return 1 if numeric, 0 therwise @staticmethod def pgnum(val): + """Return 1 when val is a valid numeric string, 0 otherwise. + + Recognises integers, decimals, and scientific notation. + + Args: + val: Value to test; coerced to str when not already. + + Returns: + int: 1 if numeric, 0 otherwise. + """ if not isinstance(val, str): val = str(val) ms = re.match(r'^\-{0,1}(\d+|\d+\.\d*|d*\.\d+)([eE]\-{0,1}\d+)*$', val) return 1 if ms else 0 @@ -1357,7 +2103,22 @@ def pgnum(val): # Function: pgcmp(val1, val2) # Return: 0 if both empty or two values are identilcal; -1 if val1 < val2; otherwise 1 @staticmethod - def pgcmp(val1, val2, ignorecase = 0, num = 0): + def pgcmp(val1, val2, ignorecase = 0, num = 0): + """Three-way comparison of two values with optional type normalisation. + + None is considered less than any non-None value. Mismatched types are coerced + to str (default) or int (when num is set). String comparison can be + case-insensitive. + + Args: + val1: First value. + val2: Second value. + ignorecase (int): When non-zero, lowercases strings before comparing. + num (int): When non-zero, coerces strings to int for numeric comparison. + + Returns: + int: 1 if val1 > val2, -1 if val1 < val2, 0 if equal. 
+ """ if val1 is None: if val2 is None: return 0 @@ -1401,6 +2162,14 @@ def pgcmp(val1, val2, ignorecase = 0, num = 0): # Return: final file list with all the subdirectories expanded @staticmethod def recursive_files(infiles): + """Expand a list of file paths, recursively replacing directories with their contents. + + Args: + infiles (list[str]): Input file/directory paths. + + Returns: + list[str]: Flat list of file paths with all directories expanded. + """ ofiles = [] for file in infiles: if op.isdir(file): @@ -1416,6 +2185,19 @@ def recursive_files(infiles): # Return: index if found; -1 otherwise @staticmethod def asearch(lidx, hidx, key, list): + """Binary (or linear) search for an exact key in a sorted list. + + Uses linear search for ranges ≤ 10 elements, binary search otherwise. + + Args: + lidx (int): Inclusive lower index. + hidx (int): Exclusive upper index. + key: Value to search for. + list (list): Sorted list to search within. + + Returns: + int: Index of the matching element, or -1 when not found. + """ ret = -1 if (hidx - lidx) < 11: # use linear search for less than 11 items for midx in range(lidx, hidx): @@ -1423,7 +2205,7 @@ def asearch(lidx, hidx, key, list): ret = midx break else: - midx = (lidx + hidx)/2 + midx = (lidx + hidx) // 2 if key == list[midx]: ret = midx elif key < list[midx]: @@ -1439,6 +2221,20 @@ def asearch(lidx, hidx, key, list): # Return: index if found; -1 otherwise @staticmethod def psearch(lidx, hidx, key, list): + """Binary (or linear) search matching key against regex patterns in a sorted list. + + Uses linear search for ranges ≤ 10 elements, binary search otherwise. + Comparisons use re.search(list[midx], key) for matching. + + Args: + lidx (int): Inclusive lower index. + hidx (int): Exclusive upper index. + key (str): Value to match against patterns. + list (list[str]): Sorted list of regex patterns. + + Returns: + int: Index of the first matching pattern, or -1 when none match. 
+ """ ret = -1 if (hidx - lidx) < 11: # use linear search for less than 11 items for midx in range(lidx, hidx): @@ -1458,6 +2254,21 @@ def psearch(lidx, hidx, key, list): # quicksort for pattern @staticmethod def quicksort(srecs, lo, hi, desc, cnt, nums = None): + """In-place quicksort for a list of record lists. + + Uses the middle element as pivot and recursively sorts sub-ranges. + + Args: + srecs (list): List of row lists; each row ends with a cached original index. + lo (int): Inclusive lower bound. + hi (int): Inclusive upper bound. + desc (list[int]): Per-column sort direction: 1 = ascending, -1 = descending. + cnt (int): Number of sort-key columns (not counting the index column). + nums (list[int] | None): Per-column numeric flag; 1 = numeric comparison. + + Returns: + list: The sorted srecs list. + """ i = lo j = hi mrec = srecs[int((lo+hi)/2)] @@ -1477,9 +2288,21 @@ def quicksort(srecs, lo, hi, desc, cnt, nums = None): if i < hi: srecs = PgUtil.quicksort(srecs, i, hi, desc, cnt, nums) return srecs - # compare two arrays + # compare two arrays @staticmethod def cmp_records(arec, brec, desc, cnt, nums): + """Compare two record lists on the first cnt columns using pgcmp(). + + Args: + arec (list): First record list. + brec (list): Second record list. + desc (list[int]): Per-column direction multipliers (1 or -1). + cnt (int): Number of columns to compare. + nums (list[int] | None): Per-column numeric flag. + + Returns: + int: Negative, zero, or positive comparison result. + """ for i in range(cnt): num = nums[i] if nums else 0 ret = PgUtil.pgcmp(arec[i], brec[i], 0, num) @@ -1490,6 +2313,17 @@ def cmp_records(arec, brec, desc, cnt, nums): # format one floating point value @staticmethod def format_float_value(val, precision = 2): + """Format a byte count as a human-readable string with unit suffix. + + Scales through B, KB, MB, GB, TB, PB with the specified decimal precision. + + Args: + val (int | float | None): Byte count; returns '' when None. 
+ precision (int): Decimal places in the formatted number; default 2. + + Returns: + str: Formatted string like '1.23GB', or '' when val is None. + """ units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB') if val is None: return '' @@ -1505,6 +2339,19 @@ def format_float_value(val, precision = 2): # return 1 if yes, 0 if not; or -1 if file not checkable @staticmethod def is_text_file(fname, blocksize = 256, threshhold = 0.1): + """Determine whether a file is an ASCII text file by sampling its content. + + Reads up to blocksize bytes and rejects the file if it contains null bytes or + if the proportion of non-printable-ASCII characters exceeds threshhold. + + Args: + fname (str): Path to the file to inspect. + blocksize (int): Number of bytes to sample; default 256. + threshhold (float): Maximum allowed fraction of non-text bytes; default 0.1. + + Returns: + int: 1 = text (an empty file counts as text), 0 = binary, -1 = file does not exist or is not a regular file. + """ # File doesn't exist or is not a regular file if not op.exists(fname) or not op.isfile(fname): return -1 if op.getsize(fname) == 0: return 1 # Empty files are considered text