In [1]:
import os
import json
import collections
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

import re
import urllib.parse

In [2]:
import spacy
from spacy.language import Language

from spacy_language_detection import LanguageDetector

In [3]:
from html import unescape
import unicodedata

In [4]:
# JSON file with the database connection settings; read (or created) by get_config().
CONFIG_PATH = "config.json"
# Destination of the JSON dump of link_lookup (link-text candidates per link).
LTEXT_CAND = "linktext_candidates.json"
# Destination of the final parquet dataset.
OUT_FILE = "traintest.blogs.pq"
# Forwarded as `echo` to sa.create_engine() in get_engine().
VERBOSE = False

In [5]:
# Module-level caches, filled lazily by the helper functions defined below.
CONFIG = None  # parsed config file; set by get_config()
ENGINES = {}  # dbname -> (engine, metadata); set by get_engine()
TABLES = {}  # (dbname, tablename) -> reflected table; set by get_table()
BINDS = {}  # table -> engine; passed to the session factory in get_session()
SESSION = None  # cached session factory; get_table() resets it when a new table is added


def config_template():
    """Build the skeleton configuration written when no config file exists.

    Returns:
        A dict with a single "dbs" entry that maps each known database name
        to its own copy of the placeholder connection settings.
    """
    placeholder = {
        "dialect": "postgresql",
        "host": "localhost",
        "port": 5432,
        "dbname": "INVALID",
        "schema": "public",
        "user": "INVALID",
        "passwd": "INVALID",
    }
    db_names = ("login", "sm", "exp", "ap", "blogs")
    # each database gets an independent copy of the placeholder settings
    return {"dbs": {db_name: dict(placeholder) for db_name in db_names}}


def get_config():
    """Return the parsed configuration, reading it from disk on first use.

    The result is cached in the CONFIG global. If no file exists at
    CONFIG_PATH, a template is written there first.

    Raises:
        ValueError: if the config file was missing (after writing the template).
    """
    global CONFIG

    if CONFIG is None:
        if not os.path.exists(CONFIG_PATH):
            # leave a fill-in-the-blanks file behind so the user can fix it up
            with open(CONFIG_PATH, "w") as template_file:
                template_text = json.dumps(
                    config_template(), indent=4, sort_keys=True)
                print(template_text, file=template_file)
            raise ValueError(
                f"config file missing. new file was created at '{CONFIG_PATH}'. "
                "please correct values in file and run again")
        with open(CONFIG_PATH, "r") as config_file:
            CONFIG = json.load(config_file)
    return CONFIG


def get_engine(dbname):
    """Return the cached ``(engine, metadata)`` pair for a configured database.

    On first use the connection URL is assembled from the "dbs" entry of the
    config (user and password are URL-quoted), the engine is created with
    ``echo=VERBOSE`` and given a ``schema_translate_map`` from ``None`` to the
    configured schema, and the pair is stored in ENGINES.

    Args:
        dbname: key into the "dbs" section of the config.
    """
    cached = ENGINES.get(dbname)
    if cached is not None:
        return cached
    db = get_config()["dbs"][dbname]
    user = urllib.parse.quote_plus(db["user"])
    passwd = urllib.parse.quote_plus(db["passwd"])
    url = f"{db['dialect']}://{user}:{passwd}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = sa.create_engine(url, echo=VERBOSE).execution_options(
        schema_translate_map={None: db['schema']})
    pair = (engine, sa.MetaData())
    ENGINES[dbname] = pair
    return pair


def get_table(dbname, tablename):
    """Return the reflected table ``tablename`` of database ``dbname``.

    Tables are cached in TABLES. Loading a new table also registers its
    engine in BINDS and clears the cached session factory, so the next
    get_session() call builds a new one.
    """
    global SESSION

    cache_key = (dbname, tablename)
    table = TABLES.get(cache_key)
    if table is None:
        # BINDS is about to change; drop the cached session factory
        SESSION = None
        engine, metadata = get_engine(dbname)
        table = sa.Table(tablename, metadata, autoload_with=engine)
        TABLES[cache_key] = table
        BINDS[table] = engine
    return table


@contextlib.contextmanager
def get_session():
    """Context manager yielding a session bound via the BINDS mapping.

    The session factory is cached in SESSION and created on demand (on first
    use, or after get_table() reset it). The yielded session is managed by its
    own ``with`` block and therefore closed when this context exits.
    """
    global SESSION

    if SESSION is None:
        fresh_factory = sessionmaker()
        fresh_factory.configure(binds=BINDS)
        SESSION = fresh_factory
    factory = SESSION
    with factory() as db_session:
        yield db_session

In [6]:
def clean(text: str) -> str:
    """Normalize raw text: unescape HTML entities, NFKC-normalize, tidy whitespace.

    Steps, in order:
      1. strip surrounding whitespace;
      2. unescape HTML entities repeatedly until the text stops changing
         (handles multiply-escaped input such as ``&amp;amp;``);
      3. apply Unicode NFKC normalization;
      4. apply the whitespace substitutions listed below, one after another.
    """
    current = text.strip()
    previous = None
    while previous != current:
        previous = current
        current = unescape(current)
    current = unicodedata.normalize("NFKC", current)
    substitutions = (
        ("\r", "\n"),            # carriage returns become newlines
        ("\n\n+", "\n"),         # runs of newlines collapse to one
        ("\n[ \t]+", "\n"),      # drop indentation after a newline
        ("[ \t]+", " "),         # runs of blanks collapse to one space
        ("\n\n\n+", "\n\n"),     # cap newline runs re-created by the steps above
    )
    for pattern, replacement in substitutions:
        current = re.sub(pattern, replacement, current)
    return current

def strip_html(text: str) -> str:
    """Remove HTML tags from ``text``, turning ``<br>`` variants into newlines.

    The input is stripped of surrounding whitespace first. The ``<br>`` match
    is case-sensitive; any other tag (including quoted attribute values that
    contain ``>``) is removed without replacement.
    """
    with_breaks = re.sub(r"<br\s*/?\s*>", "\n", text.strip())
    tag_pattern = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    return re.sub(tag_pattern, "", with_breaks)

In [7]:
def get_lang_detector(nlp, name):
    """Factory callback that creates the language-detection component.

    Both arguments are accepted but not used; the detector is always created
    with a fixed seed of 42.
    """
    detector = LanguageDetector(seed=42)
    return detector

# Load the "en_core_web_sm" spaCy pipeline; get_lang() runs documents through it.
spacy_nlp_model = spacy.load("en_core_web_sm")
# Register get_lang_detector as the factory for the "language_detector" component.
Language.factory("language_detector", func=get_lang_detector)
# Append the detector as the last component of the pipeline.
spacy_nlp_model.add_pipe("language_detector", last=True)

def get_lang(doc):
    """Detect the language of the text ``doc``.

    Texts longer than 1,000,000 characters are truncated to that length and
    then cut back to the last space before being processed.

    Returns:
        The ``_.language`` extension value of the processed document; callers
        in this notebook read its "language" and "score" entries.
    """
    char_limit = 1000000
    if len(doc) > char_limit:
        head = doc[:char_limit]
        doc = head[:head.rfind(" ")]
    return spacy_nlp_model(doc)._.language

In [8]:
# Reflect the two tables of the "blogs" database used by the cells below:
# articles (id, title, content) and their links (article_id, href, linktext).
t_blogs = get_table("blogs", "articles")
t_links = get_table("blogs", "links")

In [9]:
# Preview cell: print title, cleaned text and UNDP links of the first 10 articles.
with get_session() as session:
    stmt = sa.select(t_blogs.c.id, t_blogs.c.title, t_blogs.c.content)
    stmt = stmt.limit(10)
    for row in session.execute(stmt):
        article_id = row[0]
        info = {"id": f"{article_id}"}
        body = row[2]
        # some rows carry a literal "null" prefix in front of the content
        body = None if body is None else body.removeprefix("null")
        if body is None or not body.strip():
            print(f"NOTE: skipping {info} (no body)")
            continue
        lang = get_lang(body)
        if lang["score"] < 0.99:
            shortened = body[:80]
            cut = shortened.rfind(" ")
            # rfind returns -1 without a space; slicing with it would drop a character
            if cut >= 0:
                shortened = shortened[:cut]
            print(f"NOTE: low confidence language {lang} [{info}]: {shortened}")
        if lang["language"] != "en":
            print(f"NOTE: skipping {info} (not english)")
            continue
        body = clean(strip_html(body))
        title = row[1]
        if title is None or not title.strip():
            # fall back to the first line of the body (the whole body if it has
            # no newline), limited to 80 characters
            first_line = body.split("\n", 1)[0]
            fallback = first_line[:80]
            if len(first_line) > 80:
                # only back up to a word boundary when text was actually cut off
                cut = fallback.rfind(" ")
                if cut > 0:
                    fallback = fallback[:cut]
            title = fallback
            print(f"NOTE: missing title [{info}]: {title}")
        title = clean(title)
        print("=TITLE=================")
        print(f"{title} [{info}]")
        print("=TEXT==================")
        print(body)
        print("=LINKS=================")
        lstmt = sa.select(t_links.c.href, t_links.c.linktext)
        lstmt = lstmt.where(t_links.c.article_id == article_id)
        for link in session.execute(lstmt):
            href = link[0]
            if "undp.org/" not in href:
                continue
            # linktext may be missing; show such links with an empty text
            raw_linktext = link[1]
            linktext = "" if raw_linktext is None else clean(strip_html(raw_linktext))
            print(f"{linktext} [{href}]")
        print()

Nilab’s Unwavering Spirit: Volunteering for Change [{'id': '1'}]
Nilab raising awareness on SDGs during the social event organised by UNDP Afghanistan. © UNDP Afghanistan / S. Omer Sadaat / 2018
Imagine a situation where just going to school puts your life at risk.
Under Taliban rule in Afghanistan, this was the situation for Nilab Aria and many others before 2001. In second grade, she had to be homeschooled by her aunt and mother. Nilab had been forced to wear a burqa since she was eight years old.
Today, Nilab is a United Nations Volunteer. Her passion for education and professionalism started back then.
Hidden from sight, and from the Taliban, she took short courses in English and Science in a basement, where she studied by candlelight. As an additional precaution, the students were forced to pretend that that they were taking Islamic studies, the only studies permitted under the Taliban regime.
One day, however, someone informed the Taliban, and the underground classes were closed 

In [10]:
# Links whose href contains any of these substrings are ignored when
# link_lookup is built.
skip_words = [
    "/about-us",
    "/contact-us",
    "/scam-alert",
    "/audit/",
    "/copyright-terms-use",
    "/our-team",
    "/legal-framework",
    "/our-focus",
    "/where-we-work",
    "/careers",
    "/quantum",
]


# Path segments that extract_linktext() removes (as "/<segment>/") before
# turning a URL path into a link text.
stop_parts = [
    "content",
    "home",
    "en",
    "files",
    "press-releases",
    "publications",
]


# Link texts (compared lower-cased) that are dropped from link_lookup.
bad_linktexts = [
    "home",
    "here",
    "",
]


def extract_linktext(href):
    """Derive a readable link text from the path component of ``href``.

    Every ``/<part>/`` for the entries of ``stop_parts`` is collapsed to ``/``,
    slashes and dashes become spaces, whitespace is squeezed, and the result is
    lower-cased. A trailing ".html", ".docx" and ".pdf" are then removed (in
    that order) before the text is passed through prettify_href().
    """
    path = f"{urllib.parse.urlparse(href).path}"
    for part in stop_parts:
        path = path.replace(f"/{part}/", "/")
    spaced = path.replace("/", " ").replace("-", " ")
    text = re.sub(r"\s+", " ", spaced).strip().lower()
    for extension in (".html", ".docx", ".pdf"):
        text = text.removesuffix(extension)
    return prettify_href(text)


def prettify_href(href):
    """Turn ``href`` into a dash-separated key.

    "/", ":" and "." are replaced by "-", runs of dashes are collapsed,
    trailing dashes are removed, and the result is percent-decoded repeatedly
    until it stops changing. Surrounding whitespace is stripped at the end.
    """
    dashed = href
    for separator in ("/", ":", "."):
        dashed = dashed.replace(separator, "-")
    current = re.sub("--+", "-", dashed).rstrip("-")
    previous = None
    while previous != current:
        previous = current
        current = urllib.parse.unquote(current)
    return current.strip()


# Collect, for every relevant UNDP link, the set of texts that describe it:
# the text derived from the URL itself plus every link text found in the data.
link_lookup = collections.defaultdict(set)
# Raw hrefs already handled, so the URL-derived text is computed once per href.
# (The dict is keyed by the prettified href, so it cannot be used for this check.)
seen_hrefs = set()
with get_session() as session:
    lstmt = sa.select(t_links.c.href, t_links.c.linktext)
    for link in session.execute(lstmt):
        href = link[0]
        if "undp.org/" not in href or any(sw in href for sw in skip_words):
            continue
        linktext = link[1]
        if not linktext:
            continue
        linktext = linktext.strip()
        if not linktext:
            continue
        # percent-decode until stable (some texts are encoded more than once)
        while True:
            prev_linktext = linktext
            linktext = urllib.parse.unquote(linktext)
            if linktext == prev_linktext:
                break
        linktext = linktext.removesuffix("READ MORE")
        href_key = prettify_href(href)
        if href not in seen_hrefs:
            seen_hrefs.add(href)
            link_lookup[href_key].add(extract_linktext(href))
        link_lookup[href_key].add(linktext.strip())

# Drop uninformative texts and links left without any text. The texts are
# sorted so the result does not depend on set iteration order.
link_lookup = {
    href_norm: texts_norm
    for href_norm, texts_norm in (
        (
            href_key,
            sorted(text for text in ltexts if text.lower() not in bad_linktexts),
        )
        for href_key, ltexts in link_lookup.items()
    )
    if texts_norm
}
len(link_lookup)

3790

In [11]:
# link_lookup

In [12]:
# Persist the link-text candidates as pretty-printed JSON with a trailing newline.
with open(LTEXT_CAND, "w") as fout:
    serialized = json.dumps(link_lookup, indent=2, sort_keys=True)
    fout.write(serialized + "\n")

In [13]:
# Build the dataset: one row per accepted (non-empty, long enough, English)
# article, with one boolean "tag_<link key>" column per entry of link_lookup
# telling whether the article links to it.
content = {
    "id": [],
    "stage": [],
    "db": [],
    # "country": [],  # NOTE: no country for now
    "title": [],
    "text": [],
    "lang": [],
}
ctags = {}  # not used in this cell; kept in case other cells reference it
ctagnames = []
for tag_name in link_lookup.keys():
    col_name = f"tag_{tag_name}"
    ctagnames.append(col_name)
    content[col_name] = []
country_cache = {}  # not used in this cell; kept in case other cells reference it
no_english = 0
no_body = 0
short_body = 0
accepted = 0
total = 0
# every row of this dataset comes from the same source and stage
db = "blogs"
stage = "validation"
with get_session() as session:
    stmt = sa.select(t_blogs.c.id, t_blogs.c.title, t_blogs.c.content)
    for row in session.execute(stmt):
        total += 1
        article_id = row[0]
        info = {"id": f"{article_id}"}
        body = row[2]
        # some rows carry a literal "null" prefix in front of the content
        body = None if body is None else body.removeprefix("null")
        if body is None or not body.strip():
            no_body += 1
            continue
        if len(body) <= 80:
            short_body += 1
            continue
        lang = get_lang(body)
        if lang["score"] < 0.99:
            shortened = body[:80]
            cut = shortened.rfind(" ")
            # rfind returns -1 without a space; slicing with it would drop a character
            if cut >= 0:
                shortened = shortened[:cut]
            print(f"NOTE: low confidence language {lang} [{info}]: {shortened}")
        if lang["language"] != "en":
            no_english += 1
            continue
        body = clean(strip_html(body))
        title = row[1]
        if title is None or not title.strip():
            # fall back to the first line of the body (the whole body if it has
            # no newline), limited to 80 characters
            first_line = body.split("\n", 1)[0]
            fallback = first_line[:80]
            if len(first_line) > 80:
                # only back up to a word boundary when text was actually cut off
                cut = fallback.rfind(" ")
                if cut > 0:
                    fallback = fallback[:cut]
            title = fallback
            print(f"NOTE: missing title [{info}]: {title}")
        title = clean(title)
        accepted += 1
        content["id"].append(article_id)
        content["stage"].append(stage)
        content["db"].append(db)
        content["title"].append(title)
        content["text"].append(body)
        content["lang"].append(lang["language"])
        # start with every tag unset, then flip the ones this article links to
        for cname in ctagnames:
            content[cname].append(False)
        lstmt = sa.select(t_links.c.href)
        lstmt = lstmt.where(t_links.c.article_id == article_id)
        for link in session.execute(lstmt):
            href_key = prettify_href(link[0])
            href_col_name = f"tag_{href_key}"
            if href_col_name not in content:
                continue
            content[href_col_name][-1] = True

df = pd.DataFrame(
    content,
    columns=[
        "stage",
        "id",
        "db",
        # "country",
        "lang",
        "title",
        "text"
    ] + sorted(ctagnames))
# constant tag columns (all True or all False) carry no information
final_tags = []
for cname in ctagnames:
    if df[cname].all() or not df[cname].any():
        print(f"drop {cname}")
        del df[cname]
    else:
        final_tags.append(cname)

# avoid dividing by zero when the articles table is empty
denom = max(total, 1)
print(f"skipped {no_english} ({no_english / denom * 100.0:.2f}%) for no english")
print(f"skipped {no_body} ({no_body / denom * 100.0:.2f}%) for no body")
print(f"skipped {short_body} ({short_body / denom * 100.0:.2f}%) for short body")
print(f"accepted {accepted} ({accepted / denom * 100.0:.2f}%)")
print(f"total {total}")

NOTE: missing title [{'id': '198'}]: UNDP learning network: Accelerator
NOTE: low confidence language {'language': 'UNKNOWN', 'score': 0.0} [{'id': '390'}]: 

រាជធានីភ្នំពេញ ថ្ងៃទី ០៩ ខែ ធ្នូ ឆ្នាំ ២០២១៖
NOTE: missing title [{'id': '439'}]: References
NOTE: missing title [{'id': '1029'}]: UNDP established ‘Innovation for Development’ in 2018. Informed by the
NOTE: low confidence language {'language': 'ru', 'score': 0.5714279787415073} [{'id': '1183'}]: 

Пандемияның халыққа әсерін және үкіметтің халықтың пандемиядан айығуы үшін
NOTE: low confidence language {'language': 'ru', 'score': 0.571427368518357} [{'id': '1184'}]: 
 
Жаңашыл ойлау дегеніміз не? Бұл біздің проблемаларымызды шешу үшін жаңа
NOTE: low confidence language {'language': 'uk', 'score': 0.7142852857506132} [{'id': '1187'}]: 

Халықаралық мүгедектігі бар адамдар күні қарсаңында біз БҰҰ волонтері және
NOTE: low confidence language {'language': 'ru', 'score': 0.7142842491742025} [{'id': '1188'}]: 

Стратегиялық жоспарлау құ

In [14]:
# Display the assembled dataset (one row per accepted article).
df

Unnamed: 0,stage,id,db,lang,title,text,tag_http-europeandcis-undp-org-blog-2012-04-05-the-right-to-health-care-a-myth-for-illegal-migrant-workers,tag_http-europeandcis-undp-org-blog-2015-07-01-all-aboard-the-big-data-express,tag_http-europeandcis-undp-org-blog-2015-09-23-big-data-for-development-what-were-doing-today,tag_http-europeandcis-undp-org-blog-2015-11-25-diagnose-and-treat-measuring-a-countrys-pulse-with-social-media,...,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-np-UNDP-NP-Annual-Report-2020-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-pacific-YouthCoLabEvent_29-31Jan2019-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-pacific-YouthCoLabEvent_29-31Jan2019_0-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-pg-6d8cf1b1c88d2951281c44a2eee0484296484cedd4c6bbe7028e05af915476bb-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-ph-5f4681e429ce32c3081f2dbfe4f2faa0f1204c98fbb3833a690809b50383e45a-pdf,tag_https-www-undp-org-sites-g-files-zskgke326-files-migration-sv-UNDP-RBLAC-GobernanzaEfectivaSPA-pdf,tag_https-www-undp-org-zh-china,tag_https-www-uz-undp-org-content-uzbekistan-en-home-blog-2020-one-weekend-and-600-ways-to-fight-covid-19-html,tag_https-www-vn-undp-org-content-vietnam-en-home-blog-Experimentation-part2-html,tag_https-www1-undp-org-content-oslo-governance-centre-en-home-presscenter-media-contacts-html
0,validation,1,blogs,en,Nilab’s Unwavering Spirit: Volunteering for Ch...,Nilab raising awareness on SDGs during the soc...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,validation,2,blogs,en,Debate for Peace: Events Held in Herat and Jal...,"Zamzama, Laghman University student speaks abo...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,validation,3,blogs,en,The Path to Afghan Peace Economy Within SDGs F...,The fourth day of the week-long roundtable on ...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,validation,7,blogs,en,HOME\nAFGHANISTAN\nPUBLICATIONS\nSALAM ANNUAL ...,Returnees and Internally Displaced Persons are...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,validation,8,blogs,en,HOME\nAFGHANISTAN\nPUBLICATIONS\nSALAM SECOND ...,"In the second quarter of 2018, tangible moment...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,validation,1879,blogs,en,An Experiment on Satellite Remote Sensing of P...,MMDA Officials inspect solid waste collected f...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
763,validation,1880,blogs,en,Zero Extreme Poverty: Moonshot or Suntok sa Bu...,A typical pre-COVID19 scene of congestion in t...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
764,validation,1881,blogs,en,#TawidCOVID Innovation Challenge,This new normal is disproportionately hurting ...,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
765,validation,1882,blogs,en,Local Convergence for Zero Poverty: Experiment...,"Photo by the author in Pinagbuhatan, Pasig som...",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Write the dataset to the parquet file named by OUT_FILE.
df.to_parquet(OUT_FILE)

In [16]:
# (rows, columns) of the dataset that was written.
df.shape

(767, 983)