In [1]:
import os
import json
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

In [2]:
CONFIG_PATH = "config.json"
OUT_FILE = "traintest.pq"
VERBOSE = False

In [3]:
CONFIG = None
ENGINES = {}
TABLES = {}
BINDS = {}
SESSION = None


def config_template():
    default_conn = {
        "dialect": "postgresql",
        "host": "localhost",
        "port": 5432,
        "dbname": "INVALID",
        "schema": "public",
        "user": "INVALID",
        "passwd": "INVALID"
    }
    return {
        "dbs": {
            "login": default_conn.copy(),
            "sm": default_conn.copy(),
            "exp": default_conn.copy(),
            "ap": default_conn.copy()
        }
    }


def get_config():
    global CONFIG
    
    if CONFIG is not None:
        return CONFIG
    if not os.path.exists(CONFIG_PATH):
        with open(CONFIG_PATH, "w") as fout:
            print(json.dumps(config_template(), indent=4, sort_keys=True), file=fout)
        raise ValueError(
            f"config file missing. new file was created at '{CONFIG_PATH}'. "
            "please correct values in file and run again")
    with open(CONFIG_PATH, "r") as fin:
        CONFIG = json.load(fin)
    return CONFIG


def get_engine(dbname):
    res = ENGINES.get(dbname)
    if res is not None:
        return res
    db = get_config()["dbs"][dbname]
    engine = sa.create_engine(
        f"{db['dialect']}://{db['user']}:{db['passwd']}@{db['host']}:{db['port']}/{db['dbname']}",
        echo=VERBOSE)
    engine = engine.execution_options(
        schema_translate_map={None: db['schema']})
    res = engine, sa.MetaData()
    ENGINES[dbname] = res
    return res


def get_table(dbname, tablename):
    global SESSION
    
    key = (dbname, tablename)
    res = TABLES.get(key)
    if res is not None:
        return res
    SESSION = None
    engine, metadata = get_engine(dbname)
    res = sa.Table(
        tablename,
        metadata,
        autoload_with=engine)
    TABLES[key] = res
    BINDS[res] = engine
    return res


@contextlib.contextmanager
def get_session():
    global SESSION
    
    session = SESSION
    if session is None:
        session = sessionmaker()
        session.configure(binds=BINDS)
        SESSION = session
    with session() as res:
        yield res

In [4]:
# global tables
t_tags = get_table("login", "tags")

# solution mapping tables
t_sm_pads = get_table("sm", "pads")
t_sm_tagging = get_table("sm", "tagging")

# action plan tables
t_ap_pads = get_table("ap", "pads")
t_ap_tagging = get_table("ap", "tagging")

# experiments tables
t_exp_pads = get_table("exp", "pads")
t_exp_tagging = get_table("exp", "tagging")

In [5]:
with get_session() as session:
    stmt = sa.select(sa.func.count(t_sm_pads.c.id))
    stmt = stmt.where(t_sm_pads.c.status >= 2)
    sm_pad_count = int(session.execute(stmt).one()[0])
    print(sm_pad_count)

3201


In [6]:
with get_session() as session:
    stmt = sa.select(sa.func.count(t_ap_pads.c.id))
    stmt = stmt.where(t_ap_pads.c.status >= 2)
    ap_pad_count = int(session.execute(stmt).one()[0])
    print(ap_pad_count)

864


In [7]:
with get_session() as session:
    stmt = sa.select(sa.func.count(t_exp_pads.c.id))
    stmt = stmt.where(t_exp_pads.c.status >= 2)
    exp_pad_count = int(session.execute(stmt).one()[0])
    print(exp_pad_count)

197


In [8]:
tags = {}
with get_session() as session:
    stmt = sa.select(t_tags.c.id, t_tags.c.name, t_tags.c.type)
    for row in session.execute(stmt):
        tags[row[0]] = (row[1], row[2])

In [9]:
with get_session() as session:
    stmt = sa.select(t_sm_pads.c.id, t_sm_pads.c.title, t_sm_pads.c.sections, t_sm_pads.c.full_text)
    stmt = stmt.where(t_sm_pads.c.status >= 2)
    stmt = stmt.limit(10)
    for row in session.execute(stmt):
        print("=TITLE=================")
        print(row[1])
        print("=TEXT==================")
        print(row[3])
        print("=TAGS==================")
        tstmt = sa.select(t_sm_tagging.c.tag_id)
        tstmt = tstmt.where(t_sm_tagging.c.pad == row[0])
        for tag in session.execute(tstmt):
            t_name, t_type = tags[tag[0]]
            if t_type == "thematic_areas":
                print(t_name)
        print()

The power of faith facing of the weakness of the means of a young self-taught innovator
The power of faith facing of the weakness of the means of a young self-taught innovator


			Mamadou Saliou Diallo 

Whatsapp Via Moussa CAMARA +224624976073

NO

   In a village where everyone thinks that not going far in their studies is an end in itself, where relentlessness and dedication are not considered as real arguments to support a citizen in his project, this is the decor in which evolves Mamadou Saliou Diallo, an autodidact whose level of study is of the 5th year of primary school; is developing a pico-dam to supply electricity to surrounding villages.

               In the process of developing the energy sector to solve the power problem in Guinea, the government has electrified major cities, prefectures, and sub-prefectures. However, it emerges that despite these enormous efforts, shortcomings remain. So to provide a quick and low-cost solution, our friend “Actor's Name” gave himself

In [10]:
sm_tag_counts = {}
with get_session() as session:
    stmt = sa.select(t_sm_tagging.c.tag_id, sa.func.count(t_sm_tagging.c.tag_id))
    stmt = stmt.group_by(t_sm_tagging.c.tag_id).order_by(sa.func.count(t_sm_tagging.c.tag_id).desc())
    for theme in [True, False]:
        for row in session.execute(stmt):
            t_name, t_type = tags[row[0]]
            if (t_type == "thematic_areas") != theme:
                continue
            print(f"{row[1] / sm_pad_count * 100.0:.2f}% {row[1]} {t_name} ({t_type})")
            sm_tag_counts[t_name] = int(row[1])
        print()

8.84% 283 circular economy (thematic_areas)
7.34% 235 agriculture (thematic_areas)
6.25% 200 waste management (thematic_areas)
6.22% 199 covid-19 response (thematic_areas)
5.62% 180 innovation (thematic_areas)
5.28% 169 health (thematic_areas)
4.81% 154 recycling (thematic_areas)
4.72% 151 environment friendly (thematic_areas)
4.44% 142 entrepreneurship (thematic_areas)
4.40% 141 youth (thematic_areas)
4.03% 129 clean energy (thematic_areas)
4.03% 129 food security (thematic_areas)
3.91% 125 education (thematic_areas)
3.62% 116 technology (thematic_areas)
3.47% 111 energy (thematic_areas)
3.47% 111 climate change (thematic_areas)
3.34% 107 employment (thematic_areas)
3.34% 107 micro small medium enterprises (thematic_areas)
3.19% 102 solid waste management (thematic_areas)
3.12% 100 solar energy (thematic_areas)
3.12% 100 metalic waste (thematic_areas)
2.66% 85 covid-19 (thematic_areas)
2.59% 83 youth and unemployment (thematic_areas)
2.28% 73 access to basic services (thematic_areas)


In [11]:
ap_tag_counts = {}
with get_session() as session:
    stmt = sa.select(t_ap_tagging.c.tag_id, sa.func.count(t_ap_tagging.c.tag_id))
    stmt = stmt.group_by(t_ap_tagging.c.tag_id).order_by(sa.func.count(t_ap_tagging.c.tag_id).desc())
    for theme in [True, False]:
        for row in session.execute(stmt):
            t_name, t_type = tags[row[0]]
            if (t_type == "thematic_areas") != theme:
                continue
            print(f"{row[1] / ap_pad_count * 100.0:.2f}% {row[1]} {t_name} ({t_type})")
            ap_tag_counts[t_name] = int(row[1])
        print()

4.40% 38 circular economy (thematic_areas)
3.70% 32 waste management (thematic_areas)
3.24% 28 community empowerment (thematic_areas)
3.24% 28 behavioral insights (thematic_areas)
3.24% 28 co-creation (thematic_areas)
3.12% 27 climate change (thematic_areas)
3.12% 27 digital transformation (thematic_areas)
3.12% 27 youth and unemployment (thematic_areas)
2.89% 25 public sector innovation (thematic_areas)
2.78% 24 behavioral change (thematic_areas)
2.66% 23 digital inclusion (thematic_areas)
2.55% 22 innovation (thematic_areas)
2.31% 20 access to basic services (thematic_areas)
2.20% 19 portfolio approach (thematic_areas)
2.08% 18 government innovation (thematic_areas)
2.08% 18 entrepreneurship (thematic_areas)
1.97% 17 employment (thematic_areas)
1.97% 17 access to information (thematic_areas)
1.97% 17 blue economy (thematic_areas)
1.97% 17 system transformation (thematic_areas)
1.85% 16 agriculture (thematic_areas)
1.85% 16 informality (thematic_areas)
1.85% 16 community  involment, y

In [12]:
exp_tag_counts = {}
with get_session() as session:
    stmt = sa.select(t_exp_tagging.c.tag_id, sa.func.count(t_exp_tagging.c.tag_id))
    stmt = stmt.group_by(t_exp_tagging.c.tag_id).order_by(sa.func.count(t_exp_tagging.c.tag_id).desc())
    for theme in [True, False]:
        for row in session.execute(stmt):
            t_name, t_type = tags[row[0]]
            if (t_type == "thematic_areas") != theme:
                continue
            print(f"{row[1] / exp_pad_count * 100.0:.2f}% {row[1]} {t_name} ({t_type})")
            exp_tag_counts[t_name] = int(row[1])
        print()

11.17% 22 waste management (thematic_areas)
6.09% 12 behavioral insights (thematic_areas)
5.08% 10 access to information (thematic_areas)
4.06% 8 access to basic services (thematic_areas)
3.55% 7 co-creation (thematic_areas)
3.55% 7 public sector innovation (thematic_areas)
3.05% 6 employment (thematic_areas)
3.05% 6 behavioral change (thematic_areas)
3.05% 6 knowledge management (thematic_areas)
2.54% 5 prototyping (thematic_areas)
2.54% 5 informal businesses (thematic_areas)
2.54% 5 covid-19 response (thematic_areas)
2.54% 5 awareness raising (thematic_areas)
2.54% 5 big data analysis (thematic_areas)
2.54% 5 waste (thematic_areas)
2.54% 5 informality (thematic_areas)
2.54% 5 informal employment (thematic_areas)
2.03% 4 climate change (thematic_areas)
2.03% 4 grassroot solutions (thematic_areas)
2.03% 4 innovation (thematic_areas)
2.03% 4 better livelihood (thematic_areas)
2.03% 4 digital participation (thematic_areas)
2.03% 4 citizen engagment and feedback (thematic_areas)
1.52% 3 d

In [13]:
all_tags = [(tid, tname) for (tid, (tname, ttype)) in tags.items() if ttype == "thematic_areas"]
all_tags

[(736, 'ocean'),
 (747, 'formalising the informal'),
 (757, 'productivity'),
 (1278, 'flooding'),
 (835, 'reducing gender inequalities'),
 (839, 'cooperativism'),
 (842, 'nutrition'),
 (846, 'logistics'),
 (850, 'transportation'),
 (616, 'Acupuntura Urbana'),
 (2, 'access'),
 (141, 'gender focus'),
 (142, 'gender violence'),
 (143, 'good health and well-being'),
 (144, 'governance'),
 (145, 'governance cluster (co)'),
 (146, 'government'),
 (147, 'government innovation'),
 (204, 'pest control'),
 (205, 'planting'),
 (206, 'plastic'),
 (207, 'plastic alternative'),
 (208, 'plastic and pollution'),
 (209, 'plastic waste management'),
 (210, 'policy innovation'),
 (211, 'pollution reduction'),
 (212, 'portfolio approach'),
 (213, 'positive deviance'),
 (214, 'private sector engagement'),
 (215, 'process innovation'),
 (216, 'protecting the environment'),
 (217, 'prototyping'),
 (218, 'public amenities'),
 (219, 'public consultations'),
 (220, 'public health'),
 (221, 'public policy planni

In [14]:
rng = np.random.default_rng(42)

In [15]:
train_size = 1000
test_size = 1000
train_test_ixs = list(rng.choice(list(range(sm_pad_count)), train_size + test_size, replace=False))
is_train = set(train_test_ixs[:train_size])
is_test = set(train_test_ixs[train_size:])

In [16]:
content = {
    "stage": [],
    "db": [],
    "title": [],
    "text": [],
}
ctags = {}
ctagnames = []
for (tag_id, tag_name) in all_tags:
    col_name = f"tag_{tag_name}"
    ctagnames.append(col_name)
    ctags[tag_id] = col_name
    content[col_name] = []
tables = [
    ("sm", t_sm_pads, t_sm_tagging),
    ("ap", t_ap_pads, t_ap_tagging),
    ("exp", t_exp_pads, t_exp_tagging),
]
with get_session() as session:
    for (t_name, t_pads, t_tagging) in tables:
        stmt = sa.select(t_pads.c.id, t_pads.c.title, t_pads.c.full_text)
        stmt = stmt.where(t_pads.c.status >= 2)
        cur_ix = 0
        for row in session.execute(stmt):
            cur_ix += 1
            if t_name == "sm":
                stage = "validation"
                if cur_ix in is_train:
                    stage = "train"
                if cur_ix in is_test:
                    stage = "test"
            else:
                stage = "validation"
            content["stage"].append(stage)
            content["db"].append(t_name)
            content["title"].append(row[1])
            content["text"].append(row[2])
            for cname in ctagnames:
                content[cname].append(False)
            tstmt = sa.select(t_tagging.c.tag_id)
            tstmt = tstmt.where(t_tagging.c.pad == row[0])
            for tag in session.execute(tstmt):
                cname = ctags.get(tag[0])
                if cname is not None:
                    content[cname][-1] = True
df = pd.DataFrame(content, columns=["stage", "db", "title", "text"] + sorted(ctagnames))
final_tags = []
for cname in ctagnames:
    if df[cname].all() or not df[cname].any():
        print(f"drop {cname}")
        del df[cname]
    else:
        final_tags.append(cname)

drop tag_UNDP
drop tag_public_sector
drop tag_private_sector
drop tag_health insurance
drop tag_youth leadership in private sector
drop tag_mutant__entrep
drop tag_automation
drop tag_informal settlement
drop tag_windmill
drop tag_Unemployed
drop tag_early childhood
drop tag_board games
drop tag_Political and Cultural peace
drop tag_digital manufacturing
drop tag_panels
drop tag_ethnomathematics
drop tag_notebooks
drop tag_justice sociale
drop tag_oceans
drop tag_waste2wealth
drop tag_natural ingredient
drop tag_crvs
drop tag_discrimination
drop tag_biogarddener
drop tag_accountability
drop tag_satellite internet
drop tag_learning communities
drop tag_stem
drop tag_project management\
drop tag_homemade
drop tag_reporting
drop tag_systemic design
drop tag_payment system
drop tag_light urban intervention
drop tag_mobile data
drop tag_global information systems(gis)
drop tag__mejora_relaciones_en_la_comuni
drop tag_gender identity
drop tag_user
drop tag_placemaking
drop tag_business conti

In [17]:
df

Unnamed: 0,stage,db,title,text,tag_3d printing,tag_Acupuntura Urbana,tag_Bebidas Tradicionales,tag_Bicicleta,tag_Bicycle,tag_Cadenas de Valor,...,tag_wood gas,tag_wood stove,tag_woodchips,tag_worker safety,tag_youth,tag_youth activism,tag_youth and unemployment,tag_youth empowerment,tag_youth informality,tag_zero waste
0,validation,sm,The power of faith facing of the weakness of t...,The power of faith facing of the weakness of t...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,validation,sm,local three lines power agregetor,local three lines power agregetor\n\n\n\t\t\tD...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,test,sm,Teflon REGULATOR,Teflon REGULATOR\n\n\n\t\t\tThe TEFLON or PTFE...,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,test,sm,Public Lights auto managed,Public Lights auto managed \n\n\n\t\t\tSolutio...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,test,sm,ORGANIC WASTE Matanizer !,ORGANIC WASTE Matanizer !\n\n\n\t\t\tHere is a...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4257,validation,exp,Reducing the Use of Single-Use Plastic Bags in...,Reducing the Use of Single-Use Plastic Bags in...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4258,validation,exp,Using DPPD to identify greater indigenous poli...,Using DPPD to identify greater indigenous poli...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4259,validation,exp,"Local Convergence: Promoting Agile, Adaptive, ...","Local Convergence: Promoting Agile, Adaptive, ...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4260,validation,exp,Marine Litter: Behavioral Insights Experiment ...,Marine Litter: Behavioral Insights Experiment ...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
for cname in final_tags:
    train_cat_count = df.loc[df["stage"] == "train", cname].sum()
    test_cat_count = df.loc[df["stage"] == "test", cname].sum()
    validation_cat_count = df.loc[df["stage"] == "validation", cname].sum()
    text = f"{train_cat_count} {test_cat_count} {validation_cat_count} {cname}"
    print(text)
    if not test_cat_count:
        print("^" * len(text))

7 3 14 tag_ocean
3 1 3 tag_formalising the informal
3 6 6 tag_productivity
1 2 6 tag_flooding
2 2 7 tag_reducing gender inequalities
10 9 11 tag_cooperativism
3 0 3 tag_nutrition
^^^^^^^^^^^^^^^^^^^
1 1 1 tag_logistics
9 7 13 tag_transportation
0 1 1 tag_Acupuntura Urbana
1 1 6 tag_access
4 8 20 tag_gender focus
7 7 13 tag_gender violence
5 10 7 tag_good health and well-being
1 1 8 tag_governance
0 0 1 tag_governance cluster (co)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2 2 7 tag_government
1 1 17 tag_government innovation
0 1 1 tag_pest control
10 8 9 tag_planting
5 4 11 tag_plastic
12 8 16 tag_plastic alternative
2 5 9 tag_plastic and pollution
10 9 17 tag_plastic waste management
0 0 5 tag_policy innovation
^^^^^^^^^^^^^^^^^^^^^^^^^^^
9 4 13 tag_pollution reduction
0 0 15 tag_portfolio approach
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4 2 4 tag_positive deviance
1 1 8 tag_private sector engagement
0 0 5 tag_process innovation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
22 22 28 tag_protecting the environment
4 2 13

In [19]:
df.to_parquet(OUT_FILE)