From 0ff26a2acf08619f10d28aa0c790b0e31537ff8a Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Fri, 8 Sep 2023 13:05:17 -0400 Subject: [PATCH 1/2] IN-907 Define classes for XML feeds Why these changes are being introduced: * Improve code coherency by encapsulating functions for creating and running XML feeds in classes based on feed type. How this addresses that need: * Update app module to use XML feed classes * Rename classes in app module: * Writer -> FileWriter * PipeWriter -> ConcurrentFileWriter * FtpFileWriter (fka FTPReader) -> FtpFile * Move tests for helper functions to test_helpers.py Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/IN-907 --- carbon/app.py | 344 +++--------------------------------------- carbon/feed.py | 341 +++++++++++++++++++++++++++++++++++++++++ tests/conftest.py | 47 +++--- tests/test_app.py | 258 ++++++++++++------------------- tests/test_helpers.py | 42 ++++++ 5 files changed, 527 insertions(+), 505 deletions(-) create mode 100644 carbon/feed.py create mode 100644 tests/test_helpers.py diff --git a/carbon/app.py b/carbon/app.py index e2ca243..0415def 100644 --- a/carbon/app.py +++ b/carbon/app.py @@ -3,327 +3,18 @@ import logging import os import threading -from contextlib import closing, contextmanager -from datetime import datetime from ftplib import FTP, FTP_TLS # nosec -from functools import partial -from typing import IO, TYPE_CHECKING, Any +from typing import IO, TYPE_CHECKING -from lxml import etree as ET # nosec -from sqlalchemy import func, select - -from carbon.database import DatabaseEngine, aa_articles, dlcs, orcids, persons -from carbon.helpers import get_group_name, get_hire_date_string, get_initials +from carbon.feed import ArticlesXmlFeed, PeopleXmlFeed if TYPE_CHECKING: - from collections.abc import Callable, Generator + from collections.abc import Callable from socket import socket -logger = logging.getLogger(__name__) - -# variables used in query for retrieving 'people' records -AREAS = ( - "ARCHITECTURE & PLANNING AREA", - "ENGINEERING AREA", - "HUMANITIES, ARTS, & SOCIAL SCIENCES AREA", - "SCIENCE AREA", - "SLOAN SCHOOL OF MANAGEMENT AREA", - "VP RESEARCH", - "CHANCELLOR'S AREA", - "OFFICE OF PROVOST AREA", - "PROVOST AREA", -) -PS_CODES = ( - "CFAN", - "CFAT", - "CFEL", - "CSRS", - "CSRR", - "COAC", - "COAR", - "L303", -) -TITLES = ( - "ADJUNCT ASSOCIATE PROFESSOR", - "ADJUNCT PROFESSOR", - "AFFILIATED ARTIST", - "ASSISTANT PROFESSOR", - "ASSOCIATE PROFESSOR", - "ASSOCIATE PROFESSOR (NOTT)", - "ASSOCIATE PROFESSOR (WOT)", - "ASSOCIATE PROFESSOR OF THE PRACTICE", - "INSTITUTE OFFICIAL - EMERITUS", - "INSTITUTE PROFESSOR (WOT)", - "INSTITUTE PROFESSOR EMERITUS", - "INSTRUCTOR", - "LECTURER", - "LECTURER II", - "POSTDOCTORAL ASSOCIATE", - "POSTDOCTORAL FELLOW", - "PRINCIPAL RESEARCH ASSOCIATE", - "PRINCIPAL RESEARCH ENGINEER", - "PRINCIPAL RESEARCH SCIENTIST", - "PROFESSOR", - "PROFESSOR (NOTT)", - "PROFESSOR (WOT)", - "PROFESSOR EMERITUS", - "PROFESSOR OF THE PRACTICE", - "RESEARCH ASSOCIATE", - "RESEARCH ENGINEER", - "RESEARCH FELLOW", - "RESEARCH SCIENTIST", - "RESEARCH SPECIALIST", - "SENIOR LECTURER", - "SENIOR POSTDOCTORAL ASSOCIATE", - "SENIOR POSTDOCTORAL FELLOW", - "SENIOR RESEARCH ASSOCIATE", - "SENIOR RESEARCH ENGINEER", - "SENIOR RESEARCH SCIENTIST", - "SENIOR RESEARCH SCIENTIST (MAP)", - "SPONSORED RESEARCH TECHNICAL STAFF", - "SPONSORED RESEARCH TECHNICAL SUPERVISOR", - "STAFF AFFILIATE", - "TECHNICAL ASSISTANT", - "TECHNICAL ASSOCIATE", - "VISITING ASSISTANT PROFESSOR", - "VISITING ASSOCIATE PROFESSOR", - "VISITING ENGINEER", - "VISITING LECTURER", - "VISITING PROFESSOR", - "VISITING RESEARCH ASSOCIATE", - "VISITING SCHOLAR", - "VISITING SCIENTIST", - "VISITING SENIOR LECTURER", - "PART-TIME FLEXIBLE/LL", -) - - -def _add_article(xml_file: IO, article: dict[str, Any]) -> None: - """Create an XML element representing an article and write it to an output file. - - The function will create a single 'ARTICLE' element that contains subelements - representing fields in a record from the 'AA_ARTICLE table'. The 'ARTICLE' - element is written to an XML file. - - Args: - xml_file (IO): A file-like object (stream) that writes to an XML file. - article (dict[str, Any]): A record from the AA_ARTICLE table. - """ - record = ET.Element("ARTICLE") - add_child(record, "AA_MATCH_SCORE", str(article["AA_MATCH_SCORE"])) - add_child(record, "ARTICLE_ID", article["ARTICLE_ID"]) - add_child(record, "ARTICLE_TITLE", article["ARTICLE_TITLE"]) - add_child(record, "ARTICLE_YEAR", article["ARTICLE_YEAR"]) - add_child(record, "AUTHORS", article["AUTHORS"]) - add_child(record, "DOI", article["DOI"]) - add_child(record, "ISSN_ELECTRONIC", article["ISSN_ELECTRONIC"]) - add_child(record, "ISSN_PRINT", article["ISSN_PRINT"]) - add_child(record, "IS_CONFERENCE_PROCEEDING", article["IS_CONFERENCE_PROCEEDING"]) - add_child(record, "JOURNAL_FIRST_PAGE", article["JOURNAL_FIRST_PAGE"]) - add_child(record, "JOURNAL_LAST_PAGE", article["JOURNAL_LAST_PAGE"]) - add_child(record, "JOURNAL_ISSUE", article["JOURNAL_ISSUE"]) - add_child(record, "JOURNAL_VOLUME", article["JOURNAL_VOLUME"]) - add_child(record, "JOURNAL_NAME", article["JOURNAL_NAME"]) - add_child(record, "MIT_ID", article["MIT_ID"]) - add_child(record, "PUBLISHER", article["PUBLISHER"]) - xml_file.write(record) - - -def _add_person(xml_file: IO, person: dict[str, Any]) -> None: - """Create an XML element representing a person and write it to an output file. - - The function will create a single 'record' element that contains subelements - representing fields from the 'HR_PERSON_EMPLOYEE_LIMITED', 'HR_ORG_UNIT', - and 'ORCID_TO_MITID' tables. The 'record' element is written to - to an XML file. - - Args: - xml_file (IO): A file-like object (stream) that writes to an XML file. - person (dict[str, Any]): A record from from a data model that includes fields - from the 'HR_PERSON_EMPLOYEE_LIMITED', 'HR_ORG_UNIT', and 'ORCID_TO_MITID' - tables. - """ - record = ET.Element("record") - add_child(record, "field", person["MIT_ID"], name="[Proprietary_ID]") - add_child(record, "field", person["KRB_NAME_UPPERCASE"], name="[Username]") - add_child( - record, - "field", - get_initials(person["FIRST_NAME"], person["MIDDLE_NAME"]), - name="[Initials]", - ) - add_child(record, "field", person["LAST_NAME"], name="[LastName]") - add_child(record, "field", person["FIRST_NAME"], name="[FirstName]") - add_child(record, "field", person["EMAIL_ADDRESS"], name="[Email]") - add_child(record, "field", "MIT", name="[AuthenticatingAuthority]") - add_child(record, "field", "1", name="[IsAcademic]") - add_child(record, "field", "1", name="[IsCurrent]") - add_child(record, "field", "1", name="[LoginAllowed]") - add_child( - record, - "field", - get_group_name(person["DLC_NAME"], person["PERSONNEL_SUBAREA_CODE"]), - name="[PrimaryGroupDescriptor]", - ) - add_child( - record, - "field", - get_hire_date_string(person["ORIGINAL_HIRE_DATE"], person["DATE_TO_FACULTY"]), - name="[ArriveDate]", - ) - add_child( - record, - "field", - person["APPOINTMENT_END_DATE"].strftime("%Y-%m-%d"), - name="[LeaveDate]", - ) - add_child(record, "field", person["ORCID"], name="[Generic01]") - add_child(record, "field", person["PERSONNEL_SUBAREA_CODE"], name="[Generic02]") - add_child(record, "field", person["ORG_HIER_SCHOOL_AREA_NAME"], name="[Generic03]") - add_child(record, "field", person["DLC_NAME"], name="[Generic04]") - add_child(record, "field", person.get("HR_ORG_LEVEL5_NAME"), name="[Generic05]") - xml_file.write(record) - - -def add_child( - parent: ET._Element, # noqa: SLF001 - element_name: str, - element_text: str | None = None, - **kwargs: str, -) -> ET._Element: # noqa: SLF001 - """Add a subelement to an existing element. - - Args: - parent (ET._Element): The root element. - element_name: The name of the subelement. - element_text (str | None, optional): The value stored in the subelement. - Defaults to None. - **kwargs (str): Keyword arguments representing attributes for the subelement. - The 'name' argument is set for 'people' elements. - - Returns: - ET._Element: The subelement. - """ - child = ET.SubElement(parent, element_name, attrib=kwargs) - child.text = element_text - return child - - -@contextmanager -def article_feed(output_file: IO) -> Generator: - """Generate XML feed of articles. - - Args: - output_file (IO): A file-like object (stream) into which normalized XML strings - strings are written. - - Yields: - Generator: A function that adds an 'ARTICLE' element to an 'ARTICLES' - root element. - """ - with ET.xmlfile(output_file, encoding="UTF-8") as xml_file: - xml_file.write_declaration() - with xml_file.element("ARTICLES"): - yield partial(_add_article, xml_file) - - -def articles(engine: DatabaseEngine) -> Generator[dict[str, Any], Any, None]: - """Create a generator of article records. + from carbon.database import DatabaseEngine - Yields: - Generator[dict[str, Any], Any, None]: Results matching the query submitted to - the Data Warehouse for retrieving 'articles' records. - """ - sql = ( - select(aa_articles) - .where(aa_articles.c.ARTICLE_ID.is_not(None)) - .where(aa_articles.c.ARTICLE_TITLE.is_not(None)) - .where(aa_articles.c.DOI.is_not(None)) - .where(aa_articles.c.MIT_ID.is_not(None)) - ) - with closing(engine().connect()) as connection: - result = connection.execute(sql) - for row in result: - yield dict(zip(result.keys(), row, strict=True)) - - -@contextmanager -def person_feed(output_file: IO) -> Generator: - """Generate XML feed of people. - - This is a streaming XML generator for people. Output will be - written to the provided output destination which can be a file - or file-like object. The context manager returns a function - which can be called repeatedly to add a person to the feed:: - - with person_feed(sys.stdout) as f: - f({"MIT_ID": "1234", ...}) - f({"MIT_ID": "5678", ...}) - - Args: - output_file (IO): A file-like object (stream) into which normalized XML - strings are written. - - Yields: - Generator: A function that adds a 'record' element to a 'records' root element. - . - """ - # namespace assigned to the xmlns attribute of the root 'records' element - symplectic_elements_namespace = "http://www.symplectic.co.uk/hrimporter" - qualified_tag_name = ET.QName(symplectic_elements_namespace, tag="records") - nsmap = {None: symplectic_elements_namespace} - - with ET.xmlfile(output_file, encoding="UTF-8") as xml_file: - xml_file.write_declaration() - with xml_file.element(tag=qualified_tag_name, nsmap=nsmap): - yield partial(_add_person, xml_file) - - -def people(engine: DatabaseEngine) -> Generator[dict[str, Any], Any, None]: - """Create a generator of 'people' records. - - Yields: - Generator[dict[str, Any], Any, None]: Results matching the query submitted to - the Data Warehouse for retrieving 'people' records. - """ - sql = ( - select( - persons.c.MIT_ID, - persons.c.KRB_NAME_UPPERCASE, - persons.c.FIRST_NAME, - persons.c.MIDDLE_NAME, - persons.c.LAST_NAME, - persons.c.EMAIL_ADDRESS, - persons.c.DATE_TO_FACULTY, - persons.c.ORIGINAL_HIRE_DATE, - dlcs.c.DLC_NAME, - persons.c.PERSONNEL_SUBAREA_CODE, - persons.c.APPOINTMENT_END_DATE, - orcids.c.ORCID, - dlcs.c.ORG_HIER_SCHOOL_AREA_NAME, - dlcs.c.HR_ORG_LEVEL5_NAME, - ) - .select_from(persons) - .outerjoin(orcids) - .join(dlcs) - .where(persons.c.EMAIL_ADDRESS.is_not(None)) - .where(persons.c.LAST_NAME.is_not(None)) - .where(persons.c.KRB_NAME_UPPERCASE.is_not(None)) - .where(persons.c.KRB_NAME_UPPERCASE != "UNKNOWN") - .where(persons.c.MIT_ID.is_not(None)) - .where(persons.c.ORIGINAL_HIRE_DATE.is_not(None)) - .where( - persons.c.APPOINTMENT_END_DATE # noqa: SIM300 - >= datetime(2009, 1, 1) # noqa: DTZ001 - ) - .where(func.upper(dlcs.c.ORG_HIER_SCHOOL_AREA_NAME).in_(AREAS)) - .where(persons.c.PERSONNEL_SUBAREA_CODE.in_(PS_CODES)) - .where(func.upper(persons.c.JOB_TITLE).in_(TITLES)) - ) - with closing(engine().connect()) as connection: - result = connection.execute(sql) - for row in result: - yield dict(zip(result.keys(), row, strict=True)) +logger = logging.getLogger(__name__) class CarbonFtpsTls(FTP_TLS): @@ -376,7 +67,7 @@ def storbinary( return self.voidresp() -class Writer: +class FileWriter: """A writer that outputs normalized XML strings to a specified file. Use this class to generate either a 'people' or 'articles' feed that is written @@ -393,17 +84,20 @@ def __init__(self, engine: DatabaseEngine, output_file: IO): def write(self, feed_type: str) -> None: """Write the specified feed type to the configured output.""" + xml_feed: PeopleXmlFeed | ArticlesXmlFeed if feed_type == "people": - with person_feed(self.output_file) as f: - for person in people(engine=self.engine): - f(person) + xml_feed = PeopleXmlFeed(engine=self.engine, output_file=self.output_file) + xml_feed.run(nsmap=xml_feed.namespace_mapping) elif feed_type == "articles": - with article_feed(self.output_file) as f: - for article in articles(engine=self.engine): - f(article) + xml_feed = ArticlesXmlFeed(engine=self.engine, output_file=self.output_file) + xml_feed.run() + xml_feed_record_count_message = ( + f"The feed has collected {xml_feed.record_count} '{feed_type}' records." + ) + logger.info(xml_feed_record_count_message) -class PipeWriter(Writer): +class ConcurrentFtpFileWriter(FileWriter): """A read/write carbon.app.Writer for the Symplectic Elements FTP server. This class is intended to provide a buffered read/write connecter. @@ -432,7 +126,7 @@ def write(self, feed_type: str) -> None: pipe.join() -class FtpFileWriter: +class FtpFile: """A file writer for the Symplectic Elements FTP server. The FtpFileWriter will read data from a provided feed and write the contents @@ -503,7 +197,7 @@ def run(self) -> None: with open(read_file, "rb") as buffered_reader, open( write_file, "wb" ) as buffered_writer: - ftp_file = FtpFileWriter( + ftp_file = FtpFile( content_feed=buffered_reader, user=self.config["SYMPLECTIC_FTP_USER"], password=self.config["SYMPLECTIC_FTP_PASS"], @@ -511,7 +205,7 @@ def run(self) -> None: host=self.config["SYMPLECTIC_FTP_HOST"], port=int(self.config["SYMPLECTIC_FTP_PORT"]), ) - PipeWriter( + ConcurrentFtpFileWriter( engine=self.engine, input_file=buffered_writer, ftp_output_file=ftp_file ).write(feed_type=self.config["FEED_TYPE"]) diff --git a/carbon/feed.py b/carbon/feed.py new file mode 100644 index 0000000..13cf24f --- /dev/null +++ b/carbon/feed.py @@ -0,0 +1,341 @@ +from abc import ABC, abstractmethod +from collections.abc import Generator +from contextlib import closing +from datetime import datetime +from typing import IO, Any, ClassVar + +from lxml import etree as ET +from sqlalchemy import func, select +from sqlalchemy.sql.selectable import Select + +from carbon.database import DatabaseEngine, aa_articles, dlcs, orcids, persons +from carbon.helpers import ( + get_group_name, + get_hire_date_string, + get_initials, +) + + +class BaseXmlFeed(ABC): + """Base XML feed class. + + This is the abstract class for creating XML feeds. The following class attributes + are unique per subclass of carbon.feed.BaseXmlFeed: + + root_element_name: The 'tag' assigned to the root Element. + query: The select statmenet submitted to the Data Warehouse to retrieve records. + + Attributes: + engine: A configured carbon.database.DatabaseEngine that can connect to the + Data Warehouse. + output_file: A file-like object (stream) into which normalized XML strings + strings are written. + + """ + + root_element_name: str | ET.QName = "" + query: Select = select() + record_count: int = 0 + + def __init__(self, engine: DatabaseEngine, output_file: IO): + self.engine = engine + self.output_file = output_file + + @property + def records(self) -> Generator[dict[str, Any], Any, None]: + """Create a generator of 'people' or 'article' records from the Data Warehouse. + + Yields: + Generator[dict[str, Any], Any, None]: Records that + match the query submitted to the Data Warehouse. + """ + with closing(self.engine().connect()) as connection: + result = connection.execute(self.query) + for row in result: + self.record_count += 1 + yield dict(zip(result.keys(), row, strict=True)) + + @abstractmethod + def _add_element(self, record: dict[str, Any]) -> None | ET._Element: # noqa: SLF001 + """Create an XML element for a provided record. + + Must be overridden by subclasses. + + Args: + record (dict[str, Any]): A record matching the query submitted to + the Data Warehouse. + + Returns: + None | ET._Element: A record XML element. + """ + + def _add_subelement( + self, + parent: ET._Element, # noqa: SLF001 + element_name: str, + element_text: str | None = None, + **kwargs: str, + ) -> ET._Element: # noqa: SLF001): + """Add an XML subelement to an existing element. + + Args: + parent (ET._Element): The parent element. + element_name (str): The name of the subelement. + element_text (str | None, optional): The value stored in the subelement. + Defaults to None. + **kwargs (str): Keyword arguments representing attributes for the subelement. + The 'name' argument is set for 'people' elements. + + Returns: + ET._Element: A subelement with text. + """ + subelement = ET.SubElement(parent, element_name, attrib=kwargs) + subelement.text = element_text + return subelement + + def run(self, **kwargs: dict[str, Any]) -> None: + """Generate a feed that streams normalized XML strings to an XML file.""" + with ET.xmlfile(self.output_file, encoding="UTF-8") as xml_file: + xml_file.write_declaration() + with xml_file.element(tag=self.root_element_name, **kwargs): + for record in self.records: + element = self._add_element(record) + xml_file.write(element) + + +class ArticlesXmlFeed(BaseXmlFeed): + """Articles XML feed class.""" + + root_element_name = "ARTICLES" + query = ( + select(aa_articles) + .where(aa_articles.c.ARTICLE_ID.is_not(None)) + .where(aa_articles.c.ARTICLE_TITLE.is_not(None)) + .where(aa_articles.c.DOI.is_not(None)) + .where(aa_articles.c.MIT_ID.is_not(None)) + ) + + def _add_element(self, record: dict[str, Any]) -> ET._Element: # noqa: SLF001 + """Create an XML element representing an article. + + The function will create a single 'ARTICLE' element that contains subelements + representing fields in a record from the 'AA_ARTICLE table'. + + Args: + record (dict[str, Any]): A record matching the query submitted to the + Data Warehouse for retrieving 'articles' records. + + Returns: + ET._Element: An articles XML element. + """ + article = ET.Element("ARTICLE") + self._add_subelement(article, "AA_MATCH_SCORE", str(record["AA_MATCH_SCORE"])) + self._add_subelement(article, "ARTICLE_ID", record["ARTICLE_ID"]) + self._add_subelement(article, "ARTICLE_TITLE", record["ARTICLE_TITLE"]) + self._add_subelement(article, "ARTICLE_YEAR", record["ARTICLE_YEAR"]) + self._add_subelement(article, "AUTHORS", record["AUTHORS"]) + self._add_subelement(article, "DOI", record["DOI"]) + self._add_subelement(article, "ISSN_ELECTRONIC", record["ISSN_ELECTRONIC"]) + self._add_subelement(article, "ISSN_PRINT", record["ISSN_PRINT"]) + self._add_subelement( + article, "IS_CONFERENCE_PROCEEDING", record["IS_CONFERENCE_PROCEEDING"] + ) + self._add_subelement(article, "JOURNAL_FIRST_PAGE", record["JOURNAL_FIRST_PAGE"]) + self._add_subelement(article, "JOURNAL_LAST_PAGE", record["JOURNAL_LAST_PAGE"]) + self._add_subelement(article, "JOURNAL_ISSUE", record["JOURNAL_ISSUE"]) + self._add_subelement(article, "JOURNAL_VOLUME", record["JOURNAL_VOLUME"]) + self._add_subelement(article, "JOURNAL_NAME", record["JOURNAL_NAME"]) + self._add_subelement(article, "MIT_ID", record["MIT_ID"]) + self._add_subelement(article, "PUBLISHER", record["PUBLISHER"]) + return article + + +class PeopleXmlFeed(BaseXmlFeed): + """People XML feed class. + + There are several class attributes that are required only for the 'people' XML feed: + + areas, ps_codes, title: A series of tuples containing strings used in + carbon.feed.PeopleXmlFeed.query. + symplectic_elements_namespace: The namespace assigned to the 'xmlns' + attribute of the root 'records' element. + namespace_mapping: A configuration required to clean up the 'xmlns' + attribute of the root 'records' element when serialized. + """ + + areas: tuple[str, ...] = ( + "ARCHITECTURE & PLANNING AREA", + "ENGINEERING AREA", + "HUMANITIES, ARTS, & SOCIAL SCIENCES AREA", + "SCIENCE AREA", + "SLOAN SCHOOL OF MANAGEMENT AREA", + "VP RESEARCH", + "CHANCELLOR'S AREA", + "OFFICE OF PROVOST AREA", + "PROVOST AREA", + ) + ps_codes: tuple[str, ...] = ( + "CFAN", + "CFAT", + "CFEL", + "CSRS", + "CSRR", + "COAC", + "COAR", + "L303", + ) + titles: tuple[str, ...] = ( + "ADJUNCT ASSOCIATE PROFESSOR", + "ADJUNCT PROFESSOR", + "AFFILIATED ARTIST", + "ASSISTANT PROFESSOR", + "ASSOCIATE PROFESSOR", + "ASSOCIATE PROFESSOR (NOTT)", + "ASSOCIATE PROFESSOR (WOT)", + "ASSOCIATE PROFESSOR OF THE PRACTICE", + "INSTITUTE OFFICIAL - EMERITUS", + "INSTITUTE PROFESSOR (WOT)", + "INSTITUTE PROFESSOR EMERITUS", + "INSTRUCTOR", + "LECTURER", + "LECTURER II", + "POSTDOCTORAL ASSOCIATE", + "POSTDOCTORAL FELLOW", + "PRINCIPAL RESEARCH ASSOCIATE", + "PRINCIPAL RESEARCH ENGINEER", + "PRINCIPAL RESEARCH SCIENTIST", + "PROFESSOR", + "PROFESSOR (NOTT)", + "PROFESSOR (WOT)", + "PROFESSOR EMERITUS", + "PROFESSOR OF THE PRACTICE", + "RESEARCH ASSOCIATE", + "RESEARCH ENGINEER", + "RESEARCH FELLOW", + "RESEARCH SCIENTIST", + "RESEARCH SPECIALIST", + "SENIOR LECTURER", + "SENIOR POSTDOCTORAL ASSOCIATE", + "SENIOR POSTDOCTORAL FELLOW", + "SENIOR RESEARCH ASSOCIATE", + "SENIOR RESEARCH ENGINEER", + "SENIOR RESEARCH SCIENTIST", + "SENIOR RESEARCH SCIENTIST (MAP)", + "SPONSORED RESEARCH TECHNICAL STAFF", + "SPONSORED RESEARCH TECHNICAL SUPERVISOR", + "STAFF AFFILIATE", + "TECHNICAL ASSISTANT", + "TECHNICAL ASSOCIATE", + "VISITING ASSISTANT PROFESSOR", + "VISITING ASSOCIATE PROFESSOR", + "VISITING ENGINEER", + "VISITING LECTURER", + "VISITING PROFESSOR", + "VISITING RESEARCH ASSOCIATE", + "VISITING SCHOLAR", + "VISITING SCIENTIST", + "VISITING SENIOR LECTURER", + "PART-TIME FLEXIBLE/LL", + ) + + symplectic_elements_namespace: str = "http://www.symplectic.co.uk/hrimporter" + namespace_mapping: ClassVar[dict] = {None: symplectic_elements_namespace} + + root_element_name: ET.QName = ET.QName(symplectic_elements_namespace, tag="records") + query = ( + select( + persons.c.MIT_ID, + persons.c.KRB_NAME_UPPERCASE, + persons.c.FIRST_NAME, + persons.c.MIDDLE_NAME, + persons.c.LAST_NAME, + persons.c.EMAIL_ADDRESS, + persons.c.DATE_TO_FACULTY, + persons.c.ORIGINAL_HIRE_DATE, + dlcs.c.DLC_NAME, + persons.c.PERSONNEL_SUBAREA_CODE, + persons.c.APPOINTMENT_END_DATE, + orcids.c.ORCID, + dlcs.c.ORG_HIER_SCHOOL_AREA_NAME, + dlcs.c.HR_ORG_LEVEL5_NAME, + ) + .select_from(persons) + .outerjoin(orcids) + .join(dlcs) + .where(persons.c.EMAIL_ADDRESS.is_not(None)) + .where(persons.c.LAST_NAME.is_not(None)) + .where(persons.c.KRB_NAME_UPPERCASE.is_not(None)) + .where(persons.c.KRB_NAME_UPPERCASE != "UNKNOWN") + .where(persons.c.MIT_ID.is_not(None)) + .where(persons.c.ORIGINAL_HIRE_DATE.is_not(None)) + .where( + persons.c.APPOINTMENT_END_DATE # noqa: SIM300 + >= datetime(2009, 1, 1) # noqa: DTZ001 + ) + .where(func.upper(dlcs.c.ORG_HIER_SCHOOL_AREA_NAME).in_(areas)) + .where(persons.c.PERSONNEL_SUBAREA_CODE.in_(ps_codes)) + .where(func.upper(persons.c.JOB_TITLE).in_(titles)) + ) + + def _add_element(self, record: dict[str, Any]) -> ET._Element: # noqa: SLF001 + """Create an XML element representing a person. + + The function will create a single 'record' element that contains subelements + representing fields from the 'HR_PERSON_EMPLOYEE_LIMITED', 'HR_ORG_UNIT', + and 'ORCID_TO_MITID' tables. + + Args: + record (dict[str, Any]): A record matching the query submitted to the + Data Warehouse for retrieving 'people' records. + + Returns: + ET._Element: A person XML element. + """ + person = ET.Element("record") + self._add_subelement(person, "field", record["MIT_ID"], name="[Proprietary_ID]") + self._add_subelement( + person, "field", record["KRB_NAME_UPPERCASE"], name="[Username]" + ) + self._add_subelement( + person, + "field", + get_initials(record["FIRST_NAME"], record["MIDDLE_NAME"]), + name="[Initials]", + ) + self._add_subelement(person, "field", record["LAST_NAME"], name="[LastName]") + self._add_subelement(person, "field", record["FIRST_NAME"], name="[FirstName]") + self._add_subelement(person, "field", record["EMAIL_ADDRESS"], name="[Email]") + self._add_subelement(person, "field", "MIT", name="[AuthenticatingAuthority]") + self._add_subelement(person, "field", "1", name="[IsAcademic]") + self._add_subelement(person, "field", "1", name="[IsCurrent]") + self._add_subelement(person, "field", "1", name="[LoginAllowed]") + self._add_subelement( + person, + "field", + get_group_name(record["DLC_NAME"], record["PERSONNEL_SUBAREA_CODE"]), + name="[PrimaryGroupDescriptor]", + ) + self._add_subelement( + person, + "field", + get_hire_date_string(record["ORIGINAL_HIRE_DATE"], record["DATE_TO_FACULTY"]), + name="[ArriveDate]", + ) + self._add_subelement( + person, + "field", + record["APPOINTMENT_END_DATE"].strftime("%Y-%m-%d"), + name="[LeaveDate]", + ) + self._add_subelement(person, "field", record["ORCID"], name="[Generic01]") + self._add_subelement( + person, "field", record["PERSONNEL_SUBAREA_CODE"], name="[Generic02]" + ) + self._add_subelement( + person, "field", record["ORG_HIER_SCHOOL_AREA_NAME"], name="[Generic03]" + ) + self._add_subelement(person, "field", record["DLC_NAME"], name="[Generic04]") + self._add_subelement( + person, "field", record.get("HR_ORG_LEVEL5_NAME"), name="[Generic05]" + ) + return person diff --git a/tests/conftest.py b/tests/conftest.py index f68979d..6f33cef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -92,31 +92,36 @@ def nonfunctional_engine(): # create XML elements representing 'people' and 'articles' records @pytest.fixture -def articles_element(): - element_maker = ElementMaker() - return element_maker.ARTICLES( - element_maker.ARTICLE( - element_maker.AA_MATCH_SCORE("0.9"), - element_maker.ARTICLE_ID("1234567"), - element_maker.ARTICLE_TITLE( +def articles_element_maker(): + return ElementMaker() + + +@pytest.fixture +def articles_element(articles_element_maker): + articles_elements = [ + articles_element_maker.ARTICLE( + articles_element_maker.AA_MATCH_SCORE("0.9"), + articles_element_maker.ARTICLE_ID("1234567"), + articles_element_maker.ARTICLE_TITLE( "Interaction between hatsopoulos microfluids and " "the Yawning Abyss of Chaos ☈." ), - element_maker.ARTICLE_YEAR("1999"), - element_maker.AUTHORS("McRandallson, Randall M.|Lord, Dark|☭"), - element_maker.DOI("10.0000/1234LETTERS56"), - element_maker.ISSN_ELECTRONIC("0987654"), - element_maker.ISSN_PRINT("01234567"), - element_maker.IS_CONFERENCE_PROCEEDING("0"), - element_maker.JOURNAL_FIRST_PAGE("666"), - element_maker.JOURNAL_LAST_PAGE("666"), - element_maker.JOURNAL_ISSUE("10"), - element_maker.JOURNAL_VOLUME("1"), - element_maker.JOURNAL_NAME("Bunnies"), - element_maker.MIT_ID("123456789"), - element_maker.PUBLISHER("MIT Press"), + articles_element_maker.ARTICLE_YEAR("1999"), + articles_element_maker.AUTHORS("McRandallson, Randall M.|Lord, Dark|☭"), + articles_element_maker.DOI("10.0000/1234LETTERS56"), + articles_element_maker.ISSN_ELECTRONIC("0987654"), + articles_element_maker.ISSN_PRINT("01234567"), + articles_element_maker.IS_CONFERENCE_PROCEEDING("0"), + articles_element_maker.JOURNAL_FIRST_PAGE("666"), + articles_element_maker.JOURNAL_LAST_PAGE("666"), + articles_element_maker.JOURNAL_ISSUE("10"), + articles_element_maker.JOURNAL_VOLUME("1"), + articles_element_maker.JOURNAL_NAME("Bunnies"), + articles_element_maker.MIT_ID("123456789"), + articles_element_maker.PUBLISHER("MIT Press"), ) - ) + ] + return articles_element_maker.ARTICLES(*articles_elements) @pytest.fixture diff --git a/tests/test_app.py b/tests/test_app.py index 93f42c1..3dc76f2 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1,25 +1,11 @@ import os from io import BytesIO -from unittest.mock import patch import pytest -from freezegun import freeze_time from lxml import etree as ET -from carbon.app import ( - FtpFileWriter, - PipeWriter, - Writer, - add_child, - article_feed, - articles, - get_group_name, - get_initials, - people, - person_feed, -) -from carbon.config import load_config_values -from carbon.helpers import sns_log +from carbon.app import ConcurrentFtpFileWriter, FileWriter, FtpFile +from carbon.feed import ArticlesXmlFeed, PeopleXmlFeed pytestmark = pytest.mark.usefixtures("_load_data") symplectic_elements_namespace = "http://www.symplectic.co.uk/hrimporter" @@ -27,95 +13,16 @@ nsmap = {None: symplectic_elements_namespace} -def test_people_generates_people(functional_engine): - people_records = list(people(engine=functional_engine)) - person = people_records[0] - assert person["KRB_NAME_UPPERCASE"] == "FOOBAR" - person = people_records[1] - assert person["KRB_NAME_UPPERCASE"] == "THOR" - - -def test_people_adds_orcids(functional_engine): - people_records = list(people(engine=functional_engine)) - assert people_records[0]["ORCID"] == "http://example.com/1" - - -def test_people_excludes_records_without_email(functional_engine): - people_records = list(people(engine=functional_engine)) - people_without_emails = [ - person for person in people_records if person["EMAIL_ADDRESS"] is None - ] - assert len(people_without_emails) == 0 - - -def test_people_excludes_records_without_last_name(functional_engine): - people_records = list(people(engine=functional_engine)) - people_without_last_names = [ - person for person in people_records if person["LAST_NAME"] is None - ] - assert len(people_without_last_names) == 0 - - -def test_people_excludes_records_without_kerberos(functional_engine): - people_records = list(people(engine=functional_engine)) - people_without_kerberos = [ - person for person in people_records if person["KRB_NAME_UPPERCASE"] is None - ] - assert len(people_without_kerberos) == 0 - - -def test_people_excludes_records_without_mit_id(functional_engine): - people_records = list(people(engine=functional_engine)) - people_without_mit_id = [ - person for person in people_records if person["MIT_ID"] is None - ] - assert len(people_without_mit_id) == 0 - - -def test_initials_returns_first_and_middle(): - assert get_initials("Foo", "Bar") == "F B" - assert get_initials("Foo") == "F" - assert get_initials("F", "B") == "F B" - assert get_initials("Foo-bar", "Gaz") == "F-B G" - assert get_initials("Foo Bar-baz", "G") == "F B-B G" - assert get_initials("Foo", "") == "F" - assert get_initials("Foo", None) == "F" - assert get_initials("Gull-Þóris") == "G-Þ" - assert get_initials("владимир", "ильич", "ленин") == "В И Л" # noqa: RUF001 - assert get_initials("F. M.", "Laxdæla") == "F M L" - - -def test_add_child_adds_child_element(people_element_maker): - xml = people_element_maker.records( - people_element_maker.record("foobar", {"baz": "bazbar"}) - ) - element = ET.Element(_tag=qualified_tag_name, nsmap=nsmap) - add_child(element, "record", "foobar", baz="bazbar") - assert ET.tostring(element) == ET.tostring(xml) - - -def test_writer_writes_person_feed(functional_engine): - output_file = BytesIO() - writer = Writer(functional_engine, output_file) - writer.write("people") - xml = ET.XML(output_file.getvalue()) - xp = xml.xpath( - "/s:records/s:record/s:field[@name='[FirstName]']", - namespaces={"s": "http://www.symplectic.co.uk/hrimporter"}, - ) - assert xp[1].text == "Þorgerðr" - - -def test_pipewriter_writes_person_feed(reader, functional_engine): +def test_concurrent_ftp_file_writer_creates_people_xml_file(reader, functional_engine): read_file, write_file = os.pipe() with open(read_file, "rb") as buffered_reader, open( write_file, "wb" ) as buffered_writer: file = reader(buffered_reader) - writer = PipeWriter( + ftp_file_writer = ConcurrentFtpFileWriter( engine=functional_engine, input_file=buffered_writer, ftp_output_file=file ) - writer.write("people") + ftp_file_writer.write("people") people_element = ET.XML(file.data) people_first_names_xpath = people_element.xpath( "/s:records/s:record/s:field[@name='[FirstName]']", @@ -124,94 +31,127 @@ def test_pipewriter_writes_person_feed(reader, functional_engine): assert people_first_names_xpath[1].text == "Þorgerðr" -def test_ftp_file_writer_sends_file(ftp_server_wrapper): +def test_file_writer_creates_people_xml_file(functional_engine): + file_writer = FileWriter(engine=functional_engine, output_file=BytesIO()) + file_writer.write("people") + people_element = ET.XML(file_writer.output_file.getvalue()) + people_first_names_xpath = people_element.xpath( + "/s:records/s:record/s:field[@name='[FirstName]']", + namespaces={"s": "http://www.symplectic.co.uk/hrimporter"}, + ) + assert people_first_names_xpath[1].text == "Þorgerðr" + + +def test_ftp_file_uploads_file_to_server(ftp_server_wrapper): ftp_socket, ftp_directory = ftp_server_wrapper feed = BytesIO(b"File uploaded to FTP server.") - ftp = FtpFileWriter( + ftp_file = FtpFile( content_feed=feed, user="user", password="pass", # noqa: S106 path="/DEV", port=ftp_socket[1], ) - ftp() + ftp_file() with open(os.path.join(ftp_directory, "DEV")) as file: assert file.read() == "File uploaded to FTP server." -def test_person_feed_uses_namespace(): - output_file = BytesIO() - with person_feed(output_file): - pass - root = ET.XML(output_file.getvalue()) - assert root.tag == "{http://www.symplectic.co.uk/hrimporter}records" +def test_people_xml_feed_adds_subelement(functional_engine, people_element_maker): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + xml = people_element_maker.records( + people_element_maker.record("foobar", {"baz": "bazbar"}) + ) + element = ET.Element(_tag=qualified_tag_name, nsmap=nsmap) + people_xml_feed._add_subelement( # noqa: SLF001 + parent=element, element_name="record", element_text="foobar", baz="bazbar" + ) + assert ET.tostring(element) == ET.tostring(xml) -def test_person_feed_adds_person(people_records): - output_file = BytesIO() - record = people_records[0]["person"].copy() - record |= people_records[0]["orcid"] | people_records[0]["dlc"] - with person_feed(output_file) as write_to_file: - write_to_file(record) - person_element = ET.XML(output_file.getvalue()) - person_first_name_xpath = person_element.xpath( - "/s:records/s:record/s:field[@name='[FirstName]']", - namespaces={"s": "http://www.symplectic.co.uk/hrimporter"}, - ) - assert person_first_name_xpath[0].text == "Foobar" +def test_people_xml_feed_generates_people(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_records = people_xml_feed.records + assert next(people_records)["KRB_NAME_UPPERCASE"] == "FOOBAR" + assert next(people_records)["KRB_NAME_UPPERCASE"] == "THOR" -def test_person_feed_uses_utf8_encoding(people_records): - output_file = BytesIO() - record = people_records[1]["person"].copy() - record |= people_records[1]["orcid"] | people_records[1]["dlc"] - with person_feed(output_file) as write_to_file: - write_to_file(record) - person_element = ET.XML(output_file.getvalue()) - person_first_name_xpath = person_element.xpath( - "/s:records/s:record/s:field[@name='[FirstName]']", - namespaces={"s": "http://www.symplectic.co.uk/hrimporter"}, - ) - assert person_first_name_xpath[0].text == "Þorgerðr" +def test_people_xml_feed_query_adds_orcids(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_records = people_xml_feed.records + assert next(people_records)["ORCID"] == "http://example.com/1" -def test_group_name_adds_faculty(): - assert get_group_name("FOOBAR", "CFAT") == "FOOBAR Faculty" - assert get_group_name("FOOBAR", "CFAN") == "FOOBAR Faculty" +def test_people_xml_feed_query_excludes_records_without_email(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_records = people_xml_feed.records + people_without_emails = [ + person for person in people_records if person["EMAIL_ADDRESS"] is None + ] + assert len(people_without_emails) == 0 -def test_group_name_adds_non_faculty(): - assert get_group_name("FOOBAR", "COAC") == "FOOBAR Non-faculty" +def test_people_xml_feed_query_excludes_records_without_last_name(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_records = people_xml_feed.records + people_without_last_names = [ + person for person in people_records if person["LAST_NAME"] is None + ] + assert len(people_without_last_names) == 0 -def test_articles_generates_articles(functional_engine): - articles_records = list(articles(engine=functional_engine)) - assert "Yawning Abyss of Chaos" in articles_records[0]["ARTICLE_TITLE"] +def test_people_xml_feed_query_excludes_records_without_kerberos(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_records = people_xml_feed.records + people_without_kerberos = [ + person for person in people_records if person["KRB_NAME_UPPERCASE"] is None + ] + assert len(people_without_kerberos) == 0 -def test_article_feed_adds_article(articles_records, articles_element): - output_file = BytesIO() - with article_feed(output_file) as write_to_file: - write_to_file(articles_records[0]) - assert output_file.getvalue() == ET.tostring( - articles_element, encoding="UTF-8", xml_declaration=True - ) +def test_people_xml_feed_query_excludes_records_without_mit_id(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_records = people_xml_feed.records + people_without_mit_id = [ + person for person in people_records if person["MIT_ID"] is None + ] + assert len(people_without_mit_id) == 0 -def test_articles_skips_articles_without_required_fields(functional_engine): - articles_records = list(articles(engine=functional_engine)) - assert len(articles_records) == 1 +def test_people_xml_feed_uses_namespace(functional_engine): + people_xml_feed = PeopleXmlFeed(engine=functional_engine, output_file=BytesIO()) + people_xml_feed.run(nsmap=people_xml_feed.namespace_mapping) + people_element = ET.XML(people_xml_feed.output_file.getvalue()) + assert people_element.tag == "{http://www.symplectic.co.uk/hrimporter}records" -@freeze_time("2023-08-18") -def test_sns_log(caplog, stubbed_sns_client): - config_values = load_config_values() - with patch("boto3.client") as mocked_boto_client: - mocked_boto_client.return_value = stubbed_sns_client - sns_log(config_values, status="start") +def test_article_xml_feed_adds_subelement(articles_element_maker, functional_engine): + articles_xml_feed = ArticlesXmlFeed(engine=functional_engine, output_file=BytesIO()) + xml = articles_element_maker.ARTICLES(articles_element_maker.ARTICLE("foobar")) + element = ET.Element(_tag="ARTICLES") + articles_xml_feed._add_subelement( # noqa: SLF001 + parent=element, element_name="ARTICLE", element_text="foobar" + ) + assert ET.tostring(element) == ET.tostring(xml) - sns_log(config_values, status="success") - assert "Carbon run has successfully completed." in caplog.text - sns_log(config_values, status="fail") - assert "Carbon run has failed." in caplog.text +def test_articles_xml_feed_generates_articles(functional_engine): + articles_xml_feed = ArticlesXmlFeed(engine=functional_engine, output_file=BytesIO()) + articles_records = articles_xml_feed.records + assert "Yawning Abyss of Chaos" in next(articles_records)["ARTICLE_TITLE"] + + +def test_articles_xml_feed_query_excludes_records_without_required_fields( + functional_engine, +): + articles_xml_feed = ArticlesXmlFeed(engine=functional_engine, output_file=BytesIO()) + articles_records = articles_xml_feed.records + articles_without_required_fields = [ + article + for article in articles_records + if article["ARTICLE_ID"] is None + and article["ARTICLE_TITLE"] is None + and article["DOI"] is None + and article["MIT_ID"] is None + ] + assert len(articles_without_required_fields) == 0 diff --git a/tests/test_helpers.py b/tests/test_helpers.py new file mode 100644 index 0000000..4954d7f --- /dev/null +++ b/tests/test_helpers.py @@ -0,0 +1,42 @@ +from unittest.mock import patch + +from freezegun import freeze_time + +from carbon.config import load_config_values +from carbon.helpers import get_group_name, get_initials, sns_log + + +def test_group_name_adds_faculty(): + assert get_group_name("FOOBAR", "CFAT") == "FOOBAR Faculty" + assert get_group_name("FOOBAR", "CFAN") == "FOOBAR Faculty" + + +def test_group_name_adds_non_faculty(): + assert get_group_name("FOOBAR", "COAC") == "FOOBAR Non-faculty" + + +def test_initials_returns_first_and_middle(): + assert get_initials("Foo", "Bar") == "F B" + assert get_initials("Foo") == "F" + assert get_initials("F", "B") == "F B" + assert get_initials("Foo-bar", "Gaz") == "F-B G" + assert get_initials("Foo Bar-baz", "G") == "F B-B G" + assert get_initials("Foo", "") == "F" + assert get_initials("Foo", None) == "F" + assert get_initials("Gull-Þóris") == "G-Þ" + assert get_initials("владимир", "ильич", "ленин") == "В И Л" # noqa: RUF001 + assert get_initials("F. M.", "Laxdæla") == "F M L" + + +@freeze_time("2023-08-18") +def test_sns_log(caplog, stubbed_sns_client): + config_values = load_config_values() + with patch("boto3.client") as mocked_boto_client: + mocked_boto_client.return_value = stubbed_sns_client + sns_log(config_values, status="start") + + sns_log(config_values, status="success") + assert "Carbon run has successfully completed." in caplog.text + + sns_log(config_values, status="fail") + assert "Carbon run has failed." in caplog.text From 2a7e1424f079a305ebaff074c1c0f6310adbed2d Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Tue, 12 Sep 2023 12:39:25 -0400 Subject: [PATCH 2/2] Address comments --- carbon/app.py | 5 ++--- carbon/feed.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/carbon/app.py b/carbon/app.py index 0415def..5b1153c 100644 --- a/carbon/app.py +++ b/carbon/app.py @@ -91,10 +91,9 @@ def write(self, feed_type: str) -> None: elif feed_type == "articles": xml_feed = ArticlesXmlFeed(engine=self.engine, output_file=self.output_file) xml_feed.run() - xml_feed_record_count_message = ( - f"The feed has collected {xml_feed.record_count} '{feed_type}' records." + logger.info( + "The feed has collected %s '%s' records", xml_feed.record_count, feed_type ) - logger.info(xml_feed_record_count_message) class ConcurrentFtpFileWriter(FileWriter): diff --git a/carbon/feed.py b/carbon/feed.py index 13cf24f..8b1b6f8 100644 --- a/carbon/feed.py +++ b/carbon/feed.py @@ -33,7 +33,7 @@ class BaseXmlFeed(ABC): """ - root_element_name: str | ET.QName = "" + root_element_name: str = "" query: Select = select() record_count: int = 0 @@ -241,7 +241,7 @@ class PeopleXmlFeed(BaseXmlFeed): symplectic_elements_namespace: str = "http://www.symplectic.co.uk/hrimporter" namespace_mapping: ClassVar[dict] = {None: symplectic_elements_namespace} - root_element_name: ET.QName = ET.QName(symplectic_elements_namespace, tag="records") + root_element_name: str = str(ET.QName(symplectic_elements_namespace, tag="records")) query = ( select( persons.c.MIT_ID,