In [19]:
import os
import json
import collections
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

import re
import urllib.parse

In [2]:
import spacy
from spacy.language import Language

from spacy_language_detection import LanguageDetector

In [3]:
from html import unescape
import unicodedata

In [4]:
# Path of the JSON file with the database connection settings; get_config()
# reads it and, when it is missing, creates it from config_template().
CONFIG_PATH = "config.json"
# NOTE(review): not referenced in the visible part of this notebook —
# presumably the parquet file the final DataFrame gets written to; confirm.
OUT_FILE = "traintest.blogs.pq"
# Passed as `echo` to sa.create_engine() in get_engine().
VERBOSE = False

In [5]:
# Parsed content of CONFIG_PATH; filled in by the first successful get_config() call.
CONFIG = None
# dbname -> (engine, MetaData) tuple, filled in by get_engine().
ENGINES = {}
# (dbname, tablename) -> reflected sa.Table, filled in by get_table().
TABLES = {}
# sa.Table -> engine; handed to sessionmaker.configure(binds=...) in get_session().
BINDS = {}
# Cached sessionmaker; get_table() resets it to None whenever it loads a new table.
SESSION = None


def config_template():
    """Build the skeleton written to the config file when none exists yet.

    Returns:
        A dict with a single ``"dbs"`` entry that maps each database name
        (``login``, ``sm``, ``exp``, ``ap``, ``blogs``) to its own copy of the
        default connection settings. Values that must be edited by hand are
        set to ``"INVALID"``.
    """
    placeholder = "INVALID"
    conn_defaults = dict(
        dialect="postgresql",
        host="localhost",
        port=5432,
        dbname=placeholder,
        schema="public",
        user=placeholder,
        passwd=placeholder,
    )
    db_names = ("login", "sm", "exp", "ap", "blogs")
    # Every database gets an independent dict so edits do not leak between them.
    return {"dbs": {db_name: dict(conn_defaults) for db_name in db_names}}


def get_config():
    """Return the parsed configuration, loading it from ``CONFIG_PATH`` on first use.

    The result is cached in the module-level ``CONFIG``.

    Raises:
        ValueError: if ``CONFIG_PATH`` does not exist. A template file is
            written to that path first so it can be filled in by hand.
    """
    global CONFIG

    if CONFIG is None:
        if not os.path.exists(CONFIG_PATH):
            with open(CONFIG_PATH, "w") as fout:
                template = json.dumps(config_template(), indent=4, sort_keys=True)
                print(template, file=fout)
            raise ValueError(
                f"config file missing. new file was created at '{CONFIG_PATH}'. "
                "please correct values in file and run again")
        with open(CONFIG_PATH, "r") as fin:
            CONFIG = json.load(fin)
    return CONFIG


def get_engine(dbname):
    """Return the cached ``(engine, MetaData)`` pair for the database ``dbname``.

    On first use the connection settings are read from ``get_config()["dbs"][dbname]``,
    an engine is created (user and password are URL-quoted) and its
    ``schema_translate_map`` is set so that the default schema (``None``) maps
    to the configured ``schema``.
    """
    cached = ENGINES.get(dbname)
    if cached is not None:
        return cached

    conn = get_config()["dbs"][dbname]
    quoted_user = urllib.parse.quote_plus(conn["user"])
    quoted_passwd = urllib.parse.quote_plus(conn["passwd"])
    url = (
        f"{conn['dialect']}://{quoted_user}:{quoted_passwd}"
        f"@{conn['host']}:{conn['port']}/{conn['dbname']}"
    )
    engine = sa.create_engine(url, echo=VERBOSE).execution_options(
        schema_translate_map={None: conn["schema"]})
    pair = (engine, sa.MetaData())
    ENGINES[dbname] = pair
    return pair


def get_table(dbname, tablename):
    """Return the reflected table ``tablename`` of database ``dbname``.

    Tables are cached per ``(dbname, tablename)``. Loading a new table
    registers it in ``BINDS`` and clears the cached ``SESSION`` so that the
    next ``get_session()`` call builds a fresh session factory.
    """
    global SESSION

    cache_key = (dbname, tablename)
    table = TABLES.get(cache_key)
    if table is None:
        # Invalidate the session factory before touching the database.
        SESSION = None
        engine, metadata = get_engine(dbname)
        table = sa.Table(tablename, metadata, autoload_with=engine)
        TABLES[cache_key] = table
        BINDS[table] = engine
    return table


@contextlib.contextmanager
def get_session():
    """Context manager yielding a session bound to every table loaded so far.

    The session factory is created lazily, configured with ``BINDS`` and kept
    in the module-level ``SESSION`` until ``get_table()`` resets it.
    """
    global SESSION

    if SESSION is None:
        factory = sessionmaker()
        factory.configure(binds=BINDS)
        SESSION = factory
    with SESSION() as active_session:
        yield active_session

In [6]:
def clean(text: str) -> str:
    """Decode HTML entities, NFKC-normalize ``text`` and tidy its whitespace.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. unescape HTML entities until the text stops changing (handles
         nested escapes such as ``&amp;amp;``);
      3. apply Unicode NFKC normalization;
      4. run the whitespace rules below. Runs of newlines collapse to one
         newline; lines that contained only spaces/tabs end up as at most
         one blank line.
    """
    text = text.strip()
    while (decoded := unescape(text)) != text:
        text = decoded
    text = unicodedata.normalize("NFKC", text)
    # Order matters: every rule operates on the output of the previous one.
    whitespace_rules = (
        ("\r", "\n"),
        ("\n\n+", "\n"),
        ("\n[ \t]+", "\n"),
        ("[ \t]+", " "),
        ("\n\n\n+", "\n\n"),
    )
    for pattern, replacement in whitespace_rules:
        text = re.sub(pattern, replacement, text)
    return text

def strip_html(text: str) -> str:
    """Remove HTML tags from ``text``, turning ``<br>`` variants into newlines.

    The input is stripped first; ``<br>``, ``<br/>`` and ``<br />`` (lower
    case only) become ``"\\n"``; every remaining tag, including tags whose
    quoted attribute values contain ``>``, is deleted.
    """
    with_breaks = re.sub(r"<br\s*/?\s*>", "\n", text.strip())
    tag_pattern = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    return re.sub(tag_pattern, "", with_breaks)

In [7]:
def get_lang_detector(nlp, name):
    """Factory callback registered as the ``language_detector`` spaCy component.

    ``nlp`` and ``name`` are accepted but not used; the detector is always
    created with a fixed seed of 42.
    """
    detector = LanguageDetector(seed=42)
    return detector

# Load the `en_core_web_sm` spaCy pipeline used for language detection.
spacy_nlp_model = spacy.load("en_core_web_sm")
# Register the factory under the name that add_pipe() refers to on the next line.
Language.factory("language_detector", func=get_lang_detector)
# Append the detector as the last component of the pipeline.
spacy_nlp_model.add_pipe("language_detector", last=True)

def get_lang(doc):
    """Run ``doc`` through ``spacy_nlp_model`` and return its ``._.language`` value.

    Callers in this notebook read the ``"language"`` and ``"score"`` entries
    of the returned object.
    """
    return spacy_nlp_model(doc)._.language

In [8]:
# Reflect the "articles" and "links" tables of the "blogs" database; loading
# them also registers their engine for get_session().
t_blogs = get_table("blogs", "articles")
t_links = get_table("blogs", "links")

In [10]:
def _first_words(text, limit=80):
    """Shorten ``text`` to at most ``limit`` characters without splitting a word.

    Text that already fits is returned unchanged. Longer text is cut at the
    last space inside the first ``limit`` characters; when there is no usable
    space the hard cut at ``limit`` is returned instead.
    """
    if len(text) <= limit:
        return text
    head = text[:limit]
    cut = head.rfind(" ")
    # rfind() returns -1 when no space exists; slicing with -1 would silently
    # drop the last character, and 0 would yield an empty string.
    return head[:cut] if cut > 0 else head


# Preview of the first 10 articles: language check, cleaned title/body and
# the undp.org links attached to each article.
with get_session() as session:
    stmt = sa.select(t_blogs.c.id, t_blogs.c.title, t_blogs.c.content)
    stmt = stmt.limit(10)
    for article_id, title, body in session.execute(stmt):
        info = {"id": f"{article_id}"}
        if body is None or not body.strip():
            print(f"NOTE: skipping {info} (no body)")
            continue
        # NOTE(review): the language is detected on the raw (still HTML) body,
        # before strip_html()/clean() — confirm this is intended.
        lang = get_lang(body)
        if lang["score"] < 0.99:
            shortened = _first_words(body)
            print(f"NOTE: low confidence language {lang} [{info}]: {shortened}")
        if lang["language"] != "en":
            print(f"NOTE: skipping {info} (not english)")
            continue
        body = clean(strip_html(body))
        if title is None or not title.strip():
            # Fall back to the first line of the body. split() also covers a
            # body without any newline, where find("\n") would return -1.
            title = _first_words(body.split("\n", 1)[0])
            print(f"NOTE: missing title [{info}]: {title}")
        title = clean(title)
        print("=TITLE=================")
        print(f"{title} [{info}]")
        print("=TEXT==================")
        print(body)
        print("=LINKS=================")
        lstmt = sa.select(t_links.c.href, t_links.c.linktext)
        lstmt = lstmt.where(t_links.c.article_id == article_id)
        for href, linktext in session.execute(lstmt):
            if "undp.org/" not in href:
                continue
            # A link may have no text at all; treat it as empty instead of
            # crashing inside strip_html().
            label = clean(strip_html(linktext or ""))
            print(f"{label} [{href}]")
        print()

Youth Leadership Programme 6: Decade of Action launch [{'id': '1'}]
UNDP Bahrain, in partnership with the Ministry of Youth and Sport Affairs, is excited to announce the sixth edition of the Youth Leadership Programme (YLP) in the year 2020. This article contains all you need to know about YLP6 and how to be part of it!

What is the Youth Leadership Programme?
The YLP is an initiative under the UNDP Regional Bureau for Arab States, launched in 2015 to build the region's most dynamic network working at the intersection of youth, innovation and sustainable development. The programme aims to empower a generation of young leaders, changemakers, and social innovators, to grow their capacity in tackling the Sustainable Development Goals (SDGs) in their countries and the Arab region.
YLP has run five successful cycles and, as of end of 2019, has worked with over 20,000 youth participants and supported 7,000+ projects. YLP participants have launched and established successful small businesses,

In [66]:
# URL fragments of boilerplate pages; links containing any of them are ignored.
skip_words = [
    "/about-us",
    "/contact-us",
    "/scam-alert",
    "/audit/",
    "/copyright-terms-use",
    "/our-team",
    "/legal-framework",
    "/our-focus",
    "/where-we-work",
    "/careers",
    "/quantum",
]

# Path segments that extract_linktext() removes before turning a URL path
# into words.
stop_parts = [
    "content",
    "home",
    "en",
    "files",
    "press-releases",
    "publications",
]


def _unquote_fully(text):
    """Percent-decode ``text`` repeatedly until it no longer changes."""
    while (decoded := urllib.parse.unquote(text)) != text:
        text = decoded
    return text


def prettify_href(href):
    """Flatten ``href`` into a key: ``/``, ``:`` and ``.`` become ``-``.

    Runs of ``-`` are collapsed, trailing ``-`` is removed, and the result is
    percent-decoded until stable and stripped.
    """
    href = re.sub("--+", "-", href.replace("/", "-").replace(":", "-").replace(".", "-"))
    href = href.rstrip("-")
    return _unquote_fully(href).strip()


def extract_linktext(href):
    """Derive a lower-case fallback link text from the URL path of ``href``.

    Segments listed in ``stop_parts`` are dropped, ``/`` and ``-`` become
    spaces, a trailing ``.html`` / ``.docx`` / ``.pdf`` is removed, and the
    result is passed through ``prettify_href()``.
    """
    path = f"{urllib.parse.urlparse(href).path}"
    for sp in stop_parts:
        path = path.replace(f"/{sp}/", "/")
    res = re.sub(r"\s+", " ", path.replace("/", " ").replace("-", " ")).strip().lower()
    for extension in (".html", ".docx", ".pdf"):
        res = res.removesuffix(extension)
    return prettify_href(res)


# The helpers above are defined before the loop that uses them so the cell
# also runs on a fresh kernel.
link_lookup = collections.defaultdict(set)
with get_session() as session:
    lstmt = sa.select(t_links.c.href, t_links.c.linktext)
    for href, linktext in session.execute(lstmt):
        if "undp.org/" not in href or any(sw in href for sw in skip_words):
            continue
        linktext = (linktext or "").strip()
        if not linktext:
            continue
        linktext = _unquote_fully(linktext).removesuffix("READ MORE")
        href_key = prettify_href(href)
        # The lookup is keyed by the prettified href, so that is the key to
        # test when deciding whether the path-derived text is still missing.
        if href_key not in link_lookup:
            link_lookup[href_key].add(extract_linktext(href))
        link_lookup[href_key].add(linktext.strip())


# Link texts (compared in lower case) that say nothing about the target page.
bad_linktexts = [
    "home",
    "here",
    "",
]

# Drop the uninformative texts, then drop every href that is left without text.
link_lookup = {
    href: kept_texts
    for href, ltexts in link_lookup.items()
    if (kept_texts := [text for text in ltexts if text.lower() not in bad_linktexts])
}
len(link_lookup)

1332

In [67]:
# link_lookup

{'https-www-arabstates-undp-org-content-rbas-en-home-democratic-governance-and-peacebuilding-youth-leadership-programme0-html': ['regional programme',
  'Youth Leadership Programme',
  'UNDP Regional Bureau for Arab States',
  'rbas democratic governance and peacebuilding youth leadership programme0'],
 'https-www-bh-undp-org-content-bahrain-en-home-sustainable-development-goals-html': ['Sustainable Development Goals (SDGs)',
  'bahrain sustainable development goals'],
 'https-www-bh-undp-org-content-bahrain-en-home-presscenter-articles-2019-ylp_5-html': ['We did this by',
  'bahrain presscenter articles 2019 ylp_5'],
 'https-feature-undp-org-coronavirus-vs-inequality-?utm_source=social&utm_medium=undp&utm_campaign=covid19-inequality': ['coronavirus vs inequality',
  'the long-term development challenges in mind'],
 'https-www-bh-undp-org-content-undp-en-home-2030-agenda-for-sustainable-development-html': ['undp 2030 agenda for sustainable development',
  '2030 Agenda for Sustainable D

In [None]:
LTEXT_CAND = "linktext_candidates.json"
with open(LTEXT_CAND) as

In [None]:
# NOTE(review): all_tags, tables, is_train, is_test and t_users are not defined
# in the visible part of this notebook; they must exist before this cell runs.
# TODO: port the article loop of the preview cell (t_blogs / t_links) into this
# builder; the commented-out copy of that loop was removed from this cell.

# NOTE: no country for now. Set to True to look up the owner's country and
# add a "country" column again.
WITH_COUNTRY = False


def _first_words(text, limit=80):
    """Shorten ``text`` to at most ``limit`` characters without splitting a word.

    Text that already fits is returned unchanged. Longer text is cut at the
    last space inside the first ``limit`` characters; when there is no usable
    space the hard cut at ``limit`` is returned instead.
    """
    if len(text) <= limit:
        return text
    head = text[:limit]
    cut = head.rfind(" ")
    # rfind() returns -1 when no space exists; slicing with -1 would silently
    # drop the last character, and 0 would yield an empty string.
    return head[:cut] if cut > 0 else head


# Column name -> list of values; every kept row appends one value per column.
content = {
    "id": [],
    "stage": [],
    "db": [],
    "title": [],
    "text": [],
    "lang": [],
}
if WITH_COUNTRY:
    content["country"] = []
ctags = {}  # tag id -> tag column name
ctagnames = []  # tag column names, each listed once
for (tag_id, tag_name) in all_tags:
    col_name = f"tag_{tag_name}"
    ctags[tag_id] = col_name
    # Two tag ids sharing a name must share one column; a duplicate entry
    # would append twice per row and break the column lengths.
    if col_name not in content:
        ctagnames.append(col_name)
        content[col_name] = []
country_cache = {}  # owner uuid -> iso3 code
with get_session() as session:
    for (t_name, t_pads, t_tagging) in tables:
        stmt = sa.select(t_pads.c.id, t_pads.c.title, t_pads.c.full_text, t_pads.c.owner)
        stmt = stmt.where(t_pads.c.status >= 2)
        cur_ix = 0
        for pad_id, title, full_text, owner_uuid in session.execute(stmt):
            # Counted before any skip, so it is the row position in the query result.
            cur_ix += 1
            # A missing text is treated as empty and then skipped as "short body".
            body = clean(strip_html(full_text or ""))
            info = {"id": f"{pad_id}", "db": f"{t_name}"}
            if len(body) <= 80:
                print(f"short body ({len(body)}). skipping... [{info}]: {body}")
                continue
            stage = "validation"
            if t_name == "sm":
                if cur_ix in is_train:
                    stage = "train"
                if cur_ix in is_test:
                    stage = "test"
            if WITH_COUNTRY:
                country = country_cache.get(owner_uuid)
                if country is None:
                    cstmt = sa.select(t_users.c.iso3).where(t_users.c.uuid == owner_uuid)
                    country = session.execute(cstmt).scalar_one()
                    country_cache[owner_uuid] = country
                content["country"].append(country)
            content["id"].append(pad_id)
            content["stage"].append(stage)
            content["db"].append(t_name)
            if title is None or not title.strip():
                # Fall back to the first line of the body. split() also covers
                # a body without any newline.
                title = _first_words(body.split("\n", 1)[0])
                print(f"missing title [{info}]: {title}")
            content["title"].append(clean(title))
            content["text"].append(body)
            lang = get_lang(body)
            if lang["score"] < 0.99:
                shortened = _first_words(body)
                print(f"low confidence language {lang} [{info}]: {shortened}")
            content["lang"].append(lang["language"])
            # Start with every tag unset, then flip the ones attached to this pad.
            for cname in ctagnames:
                content[cname].append(False)
            tstmt = sa.select(t_tagging.c.tag_id)
            tstmt = tstmt.where(t_tagging.c.pad == pad_id)
            for tag in session.execute(tstmt):
                cname = ctags.get(tag[0])
                if cname is not None:
                    content[cname][-1] = True
base_columns = ["stage", "id", "db"] + (["country"] if WITH_COUNTRY else []) + ["lang", "title", "text"]
df = pd.DataFrame(content, columns=base_columns + sorted(ctagnames))
# Tag columns that are constant (all True or all False) carry no signal.
final_tags = []
for cname in ctagnames:
    if df[cname].all() or not df[cname].any():
        print(f"drop {cname}")
        del df[cname]
    else:
        final_tags.append(cname)