In [1]:
import os
import json
import collections
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

import re
import urllib.parse

In [2]:
import spacy
from spacy.language import Language

from spacy_language_detection import LanguageDetector

In [3]:
from html import unescape
import unicodedata

In [4]:
# JSON file with the database connection settings; read (or created) by get_config().
CONFIG_PATH = "config.json"
# Output path for the JSON dump of the collected link-text candidates.
LTEXT_CAND = "linktext_candidates.json"
# Output path of the parquet file written from the final DataFrame.
OUT_FILE = "traintest.blogs.pq"
# Passed as the `echo` argument to sa.create_engine in get_engine().
VERBOSE = False

In [5]:
# Parsed content of CONFIG_PATH; filled lazily by get_config().
CONFIG = None
# dbname -> (engine, MetaData) tuples cached by get_engine().
ENGINES = {}
# (dbname, tablename) -> reflected sa.Table objects cached by get_table().
TABLES = {}
# sa.Table -> engine mapping; handed to the session factory as `binds`.
BINDS = {}
# Cached session factory; get_table() resets it to None whenever it loads a new table.
SESSION = None


def config_template():
    """Build the placeholder configuration written when no config file exists.

    Returns:
        A dict with a single "dbs" key mapping each known database name to
        its own independent copy of the default connection settings.
    """
    db_names = ("login", "sm", "exp", "ap", "blogs")
    placeholder = dict(
        dialect="postgresql",
        host="localhost",
        port=5432,
        dbname="INVALID",
        schema="public",
        user="INVALID",
        passwd="INVALID",
    )
    # Every database gets a separate dict so editing one never affects another.
    return {"dbs": {db_name: dict(placeholder) for db_name in db_names}}


def get_config():
    """Return the parsed configuration, loading it from CONFIG_PATH on first use.

    The result is cached in the module-level CONFIG variable.

    Raises:
        ValueError: if CONFIG_PATH does not exist. A template file is written
            to that path first so that it can be filled in by hand.
    """
    global CONFIG

    if CONFIG is None:
        if not os.path.exists(CONFIG_PATH):
            template_text = json.dumps(config_template(), indent=4, sort_keys=True)
            with open(CONFIG_PATH, "w") as fout:
                print(template_text, file=fout)
            raise ValueError(
                f"config file missing. new file was created at '{CONFIG_PATH}'. "
                "please correct values in file and run again")
        with open(CONFIG_PATH, "r") as fin:
            CONFIG = json.load(fin)
    return CONFIG


def get_engine(dbname):
    """Return the cached ``(engine, MetaData)`` pair for the database ``dbname``.

    On first use the connection settings are read from the "dbs" section of
    the configuration, an engine is created from them and the pair is stored
    in ENGINES.
    """
    cached = ENGINES.get(dbname)
    if cached is not None:
        return cached
    conn = get_config()["dbs"][dbname]
    # Credentials are percent-encoded before being embedded in the URL.
    user = urllib.parse.quote_plus(conn["user"])
    passwd = urllib.parse.quote_plus(conn["passwd"])
    url = f"{conn['dialect']}://{user}:{passwd}@{conn['host']}:{conn['port']}/{conn['dbname']}"
    # Tables without an explicit schema are mapped to the configured schema.
    engine = sa.create_engine(url, echo=VERBOSE).execution_options(
        schema_translate_map={None: conn['schema']})
    entry = (engine, sa.MetaData())
    ENGINES[dbname] = entry
    return entry


def get_table(dbname, tablename):
    """Return the reflected table ``tablename`` of database ``dbname``.

    Tables are cached in TABLES. Loading a new table also records its engine
    in BINDS and clears the cached session factory (SESSION).
    """
    global SESSION

    cache_key = (dbname, tablename)
    table = TABLES.get(cache_key)
    if table is None:
        SESSION = None
        engine, metadata = get_engine(dbname)
        table = sa.Table(tablename, metadata, autoload_with=engine)
        TABLES[cache_key] = table
        BINDS[table] = engine
    return table


@contextlib.contextmanager
def get_session():
    """Context manager yielding a session created from the cached factory.

    The factory is created on demand, configured with BINDS and stored in
    SESSION for reuse.
    """
    global SESSION

    factory = SESSION
    if factory is None:
        factory = sessionmaker()
        factory.configure(binds=BINDS)
        SESSION = factory
    with factory() as db_session:
        yield db_session

In [6]:
def clean(text: str) -> str:
    """Normalize ``text``: decode HTML entities, apply NFKC and tidy whitespace.

    The input is stripped, HTML entities are unescaped repeatedly until the
    text stops changing, the result is NFKC-normalized, and then a fixed
    sequence of whitespace substitutions is applied.
    """
    text = text.strip()
    decoded = unescape(text)
    while decoded != text:
        text = decoded
        decoded = unescape(text)
    text = unicodedata.normalize("NFKC", text)
    # The substitutions are order-dependent; they run top to bottom.
    substitutions = (
        ("\r", "\n"),
        ("\n\n+", "\n"),
        ("\n[ \t]+", "\n"),
        ("[ \t]+", " "),
        ("\n\n\n+", "\n\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def strip_html(text: str) -> str:
    """Remove markup from ``text``: ``<br>`` variants become newlines and every other tag is deleted."""
    with_breaks = re.sub(r"<br\s*/?\s*>", "\n", text.strip())
    # The tag pattern tolerates ">" characters inside quoted attribute values.
    tag_pattern = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    return re.sub(tag_pattern, "", with_breaks)

In [7]:
def get_lang_detector(nlp, name):
    """Create the language detector component.

    ``nlp`` and ``name`` are accepted but not used; the detector is always
    built with a fixed seed of 42.
    """
    detector = LanguageDetector(seed=42)
    return detector

# Load the "en_core_web_sm" spaCy pipeline.
spacy_nlp_model = spacy.load("en_core_web_sm")
# Register get_lang_detector as the factory named "language_detector" ...
Language.factory("language_detector", func=get_lang_detector)
# ... and append that component as the last step of the pipeline.
spacy_nlp_model.add_pipe("language_detector", last=True)

def get_lang(doc):
    """Run ``doc`` through the spaCy pipeline and return its ``language`` extension value.

    Callers in this notebook read the "language" and "score" entries of the
    returned value.
    """
    return spacy_nlp_model(doc)._.language

In [8]:
# Reflect the "articles" and "links" tables of the "blogs" database.
t_blogs = get_table("blogs", "articles")
t_links = get_table("blogs", "links")

In [9]:
# Preview: print the cleaned title, body and UNDP links of the first 10 articles.
with get_session() as session:
    stmt = sa.select(t_blogs.c.id, t_blogs.c.title, t_blogs.c.content)
    stmt = stmt.limit(10)
    for row in session.execute(stmt):
        article_id = row[0]
        info = {"id": f"{article_id}"}
        body = row[2]
        if body is None or not body.strip():
            print(f"NOTE: skipping {info} (no body)")
            continue
        lang = get_lang(body)
        if lang["score"] < 0.99:
            shortened = body[:80]
            # rfind returns -1 when there is no space; only cut at a real
            # word boundary instead of silently dropping the last character.
            cut = shortened.rfind(" ")
            if cut > 0:
                shortened = shortened[:cut]
            print(f"NOTE: low confidence language {lang} [{info}]: {shortened}")
        if lang["language"] != "en":
            print(f"NOTE: skipping {info} (not english)")
            continue
        body = clean(strip_html(body))
        title = row[1]
        if title is None or not title.strip():
            # Use the first line of the body (at most 80 characters) as title.
            # split() also handles a body without any newline, where
            # body[:body.find("\n")] would have dropped the last character.
            fallback = body.split("\n", 1)[0][:80]
            # NOTE(review): the last word is dropped even when the line was
            # not truncated at 80 characters -- confirm this is intended.
            cut = fallback.rfind(" ")
            if cut > 0:
                fallback = fallback[:cut]
            title = fallback
            print(f"NOTE: missing title [{info}]: {title}")
        title = clean(title)
        print("=TITLE=================")
        print(f"{title} [{info}]")
        print("=TEXT==================")
        print(body)
        print("=LINKS=================")
        lstmt = sa.select(t_links.c.href, t_links.c.linktext)
        lstmt = lstmt.where(t_links.c.article_id == article_id)
        for link in session.execute(lstmt):
            href = link[0]
            if not href or "undp.org/" not in href:
                continue
            # The link text may be missing; treat it as empty instead of
            # crashing inside strip_html.
            linktext = clean(strip_html(link[1] or ""))
            print(f"{linktext} [{href}]")
        print()

Youth Leadership Programme 6: Decade of Action launch [{'id': '1'}]
UNDP Bahrain, in partnership with the Ministry of Youth and Sport Affairs, is excited to announce the sixth edition of the Youth Leadership Programme (YLP) in the year 2020. This article contains all you need to know about YLP6 and how to be part of it!

What is the Youth Leadership Programme?
The YLP is an initiative under the UNDP Regional Bureau for Arab States, launched in 2015 to build the region's most dynamic network working at the intersection of youth, innovation and sustainable development. The programme aims to empower a generation of young leaders, changemakers, and social innovators, to grow their capacity in tackling the Sustainable Development Goals (SDGs) in their countries and the Arab region.
YLP has run five successful cycles and, as of end of 2019, has worked with over 20,000 youth participants and supported 7,000+ projects. YLP participants have launched and established successful small businesses,

In [10]:
# Links whose href contains any of these substrings are ignored when
# collecting link texts.
skip_words = [
    "/about-us",
    "/contact-us",
    "/scam-alert",
    "/audit/",
    "/copyright-terms-use",
    "/our-team",
    "/legal-framework",
    "/our-focus",
    "/where-we-work",
    "/careers",
    "/quantum",
]


# Path segments that extract_linktext removes (matched as "/<segment>/")
# before turning a URL path into text.
stop_parts = [
    "content",
    "home",
    "en",
    "files",
    "press-releases",
    "publications",
]


# Link texts that are discarded; compared against the lowercased text.
bad_linktexts = [
    "home",
    "here",
    "",
]


def extract_linktext(href):
    """Derive a lowercase, space-separated text from the path of ``href``.

    Segments listed in ``stop_parts`` are removed, slashes and dashes become
    spaces, a trailing ".html", ".docx" or ".pdf" is dropped, and the result
    is passed through ``prettify_href``.
    """
    path = f"{urllib.parse.urlparse(href).path}"
    for part in stop_parts:
        path = path.replace(f"/{part}/", "/")
    words = path.replace("/", " ").replace("-", " ")
    text = re.sub(r"\s+", " ", words).strip().lower()
    for extension in (".html", ".docx", ".pdf"):
        text = text.removesuffix(extension)
    return prettify_href(text)


def prettify_href(href):
    """Turn ``href`` into a dash-separated key.

    "/", ":" and "." become "-", runs of dashes collapse to one, trailing
    dashes are removed, and the result is percent-decoded repeatedly until it
    stops changing. Surrounding whitespace is stripped at the end.
    """
    dashed = href
    for separator in ("/", ":", "."):
        dashed = dashed.replace(separator, "-")
    result = re.sub("--+", "-", dashed).rstrip("-")
    decoded = urllib.parse.unquote(result)
    while decoded != result:
        result = decoded
        decoded = urllib.parse.unquote(result)
    return result.strip()


# Collect, per normalized UNDP href, the set of candidate link texts.
link_lookup = collections.defaultdict(set)
with get_session() as session:
    lstmt = sa.select(t_links.c.href, t_links.c.linktext)
    for link in session.execute(lstmt):
        href = link[0]
        if not href or "undp.org/" not in href or any(sw in href for sw in skip_words):
            continue
        linktext = link[1]
        if not linktext:
            continue
        linktext = linktext.strip()
        if not linktext:
            continue
        # Percent-decode repeatedly until the text stops changing.
        while True:
            prev_linktext = linktext
            linktext = urllib.parse.unquote(linktext)
            if linktext == prev_linktext:
                break
        linktext = linktext.removesuffix("READ MORE")
        href_key = prettify_href(href)
        # The dict is keyed by the normalized href, so the membership test
        # must use href_key; testing the raw href was effectively always true.
        if href_key not in link_lookup:
            # Seed each new entry with a text derived from the URL path.
            link_lookup[href_key].add(extract_linktext(href))
        link_lookup[href_key].add(linktext.strip())

# Drop unwanted link texts and entries left without any text. The texts are
# sorted so that the resulting lists (and the JSON written from them) do not
# depend on set iteration order.
link_lookup = {
    href_norm: texts_norm
    for href_norm, texts_norm in (
        (
            href_key,
            sorted(text for text in ltexts if text.lower() not in bad_linktexts),
        )
        for href_key, ltexts in link_lookup.items()
    )
    if texts_norm
}
len(link_lookup)

1332

In [11]:
# link_lookup

In [12]:
# Persist the link-text candidates as indented JSON with sorted keys,
# followed by a trailing newline.
with open(LTEXT_CAND, "w") as fout:
    json.dump(link_lookup, fout, indent=2, sort_keys=True)
    fout.write("\n")

In [13]:
# Column-oriented accumulator for the final DataFrame: one list per column,
# all lists grow by one entry per accepted article.
content = {
    "id": [],
    "stage": [],
    "db": [],
    # "country": [],  # NOTE: no country for now
    "title": [],
    "text": [],
    "lang": [],
}
ctags = {}  # not used in this cell
ctagnames = []
# One boolean "tag_<href key>" column per entry of link_lookup.
for tag_name in link_lookup.keys():
    col_name = f"tag_{tag_name}"
    ctagnames.append(col_name)
    content[col_name] = []
country_cache = {}  # not used in this cell
with get_session() as session:
    stmt = sa.select(t_blogs.c.id, t_blogs.c.title, t_blogs.c.content)
    for row in session.execute(stmt):
        article_id = row[0]
        info = {"id": f"{article_id}"}
        db = "blogs"
        stage = "validation"
        body = row[2]
        if body is None or not body.strip():
            print(f"NOTE: skipping {info} (no body)")
            continue
        if len(body) <= 80:
            print(f"NOTE: short body ({len(body)}). skipping... [{info}]: {body}")
            continue
        lang = get_lang(body)
        if lang["score"] < 0.99:
            shortened = body[:80]
            # rfind returns -1 when there is no space; only cut at a real
            # word boundary instead of silently dropping the last character.
            cut = shortened.rfind(" ")
            if cut > 0:
                shortened = shortened[:cut]
            print(f"NOTE: low confidence language {lang} [{info}]: {shortened}")
        if lang["language"] != "en":
            print(f"NOTE: skipping {info} (not english)")
            continue
        body = clean(strip_html(body))
        title = row[1]
        if title is None or not title.strip():
            # Use the first line of the body (at most 80 characters) as title.
            # split() also handles a body without any newline, where
            # body[:body.find("\n")] would have dropped the last character.
            fallback = body.split("\n", 1)[0][:80]
            # NOTE(review): the last word is dropped even when the line was
            # not truncated at 80 characters -- confirm this is intended.
            cut = fallback.rfind(" ")
            if cut > 0:
                fallback = fallback[:cut]
            title = fallback
            print(f"NOTE: missing title [{info}]: {title}")
        title = clean(title)
        content["id"].append(article_id)
        content["stage"].append(stage)
        content["db"].append(db)
        content["title"].append(title)
        content["text"].append(body)
        content["lang"].append(lang["language"])
        # Start with every tag unset for this article ...
        for cname in ctagnames:
            content[cname].append(False)
        lstmt = sa.select(t_links.c.href)
        lstmt = lstmt.where(t_links.c.article_id == article_id)
        for link in session.execute(lstmt):
            if not link[0]:
                # A missing href cannot be normalized; skip it.
                continue
            href_key = prettify_href(link[0])
            href_col_name = f"tag_{href_key}"
            if href_col_name not in content:
                continue
            # ... and flip the tags of the known hrefs the article links to.
            content[href_col_name][-1] = True

df = pd.DataFrame(
    content,
    columns=[
        "stage",
        "id",
        "db",
        # "country",
        "lang",
        "title",
        "text"
    ] + sorted(ctagnames))
# Tag columns that are constant (all True or all False) are dropped.
final_tags = []
for cname in ctagnames:
    if df[cname].all() or not df[cname].any():
        print(f"drop {cname}")
        del df[cname]
    else:
        final_tags.append(cname)

NOTE: skipping {'id': '5'} (no body)
NOTE: skipping {'id': '7'} (no body)
NOTE: skipping {'id': '8'} (no body)
NOTE: skipping {'id': '9'} (no body)
NOTE: missing title [{'id': '11'}]: UNDP learning network: Accelerator
NOTE: skipping {'id': '12'} (no body)
NOTE: skipping {'id': '246'} (no body)
NOTE: skipping {'id': '18'} (no body)
NOTE: skipping {'id': '21'} (no body)
NOTE: skipping {'id': '22'} (no body)
NOTE: skipping {'id': '26'} (no body)
NOTE: skipping {'id': '27'} (no body)
NOTE: skipping {'id': '28'} (no body)
NOTE: skipping {'id': '29'} (no body)
NOTE: skipping {'id': '30'} (no body)
NOTE: skipping {'id': '31'} (no body)
NOTE: skipping {'id': '33'} (no body)
NOTE: skipping {'id': '34'} (no body)
NOTE: skipping {'id': '35'} (no body)
NOTE: skipping {'id': '36'} (no body)
NOTE: skipping {'id': '74'} (not english)
NOTE: skipping {'id': '39'} (no body)
NOTE: skipping {'id': '40'} (no body)
NOTE: skipping {'id': '41'} (no body)
NOTE: skipping {'id': '42'} (no body)
NOTE: skipping {

In [14]:
# Display the assembled DataFrame.
df

Unnamed: 0,stage,id,db,lang,title,text,tag_http-europeandcis-undp-org-blog-2012-04-05-the-right-to-health-care-a-myth-for-illegal-migrant-workers,tag_http-europeandcis-undp-org-blog-2015-07-01-all-aboard-the-big-data-express,tag_http-europeandcis-undp-org-blog-2015-09-23-big-data-for-development-what-were-doing-today,tag_http-europeandcis-undp-org-blog-2015-11-25-diagnose-and-treat-measuring-a-countrys-pulse-with-social-media,...,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-bfad30d14f82a381caf5fe0e57de2629ec45777fa64f19d0e15917d5e0bbbd4e-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-c540c6463debd324d1b9a1852adab342668038f888027d3edf4a061ea21d9bcd-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-ceb2924d1c61cca4285b9694e6c36fb6d7a7a807d6413ff04292971fd7fa0c67-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-d55b55db6a2f78767e9c275709a407f51fd8bd8c26cf588a1031de5df1924a35-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-undp_cy_fs_afendrika-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-undp_cy_fs_agiosevlalios-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-cy-undp_cy_fs_panagiaevangelisitria-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-gh-5645b993bb692ca561ff9e33ec45ccf089b01340107982f1dde2e9adafd050a7-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-sv-UNDP-RBLAC-GobernanzaEfectivaSPA-pdf,tag_https-www-undp-org-zh-china
0,validation,1,blogs,en,Youth Leadership Programme 6: Decade of Action...,"UNDP Bahrain, in partnership with the Ministry...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,validation,2,blogs,en,Youth Leadership Programme 8 Launch: Youth Tra...,In partnership with the Ministry of Youth and ...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,validation,3,blogs,en,Launching the Youth Leadership Programme 7: Cr...,In partnership with the Ministry of Youth and ...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,validation,4,blogs,en,COVID-19 Socio-Economic Impact Assessment in B...,"Updated on August 16, 2021\nAbout the SEIA Ini...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,validation,6,blogs,en,Youth Leadership Programme 9 Launch: Together ...,In partnership with the Ministry of Youth Affa...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,validation,685,blogs,en,Changing behaviour: lessons in community waste...,"By Allen Anie, Head of Experimentation, UNDP A...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
243,validation,688,blogs,en,Harnessing Africa’s creativity and innovation ...,"Photo © Leika Production\n\nOn April 21, we ce...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
244,validation,691,blogs,en,Up-scaling Innovation towards the SDGs with Mi...,Female waterpreneurs dispensing treated water ...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
245,validation,692,blogs,en,Youth Innovation for Human and Planetary Health,Felix in his vegetable greenhouse. Photo: Prai...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Write the final DataFrame to OUT_FILE in parquet format.
df.to_parquet(OUT_FILE)

In [16]:
# Show the (rows, columns) shape of the final DataFrame.
df.shape

(247, 315)