In [1]:
import os
import json
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

import urllib.parse

In [2]:
CONFIG_PATH = "config.json"
OUT_FILE = "traintest.pq"
VERBOSE = False

In [3]:
CONFIG = None
ENGINES = {}
TABLES = {}
BINDS = {}
SESSION = None


def config_template():
    default_conn = {
        "dialect": "postgresql",
        "host": "localhost",
        "port": 5432,
        "dbname": "INVALID",
        "schema": "public",
        "user": "INVALID",
        "passwd": "INVALID"
    }
    return {
        "dbs": {
            "login": default_conn.copy(),
            "sm": default_conn.copy(),
            "exp": default_conn.copy(),
            "ap": default_conn.copy()
        }
    }


def get_config():
    global CONFIG
    
    if CONFIG is not None:
        return CONFIG
    if not os.path.exists(CONFIG_PATH):
        with open(CONFIG_PATH, "w") as fout:
            print(json.dumps(config_template(), indent=4, sort_keys=True), file=fout)
        raise ValueError(
            f"config file missing. new file was created at '{CONFIG_PATH}'. "
            "please correct values in file and run again")
    with open(CONFIG_PATH, "r") as fin:
        CONFIG = json.load(fin)
    return CONFIG


def get_engine(dbname):
    res = ENGINES.get(dbname)
    if res is not None:
        return res
    db = get_config()["dbs"][dbname]
    user = urllib.parse.quote_plus(db["user"])
    passwd = urllib.parse.quote_plus(db["passwd"])
    engine = sa.create_engine(
        f"{db['dialect']}://{user}:{passwd}@{db['host']}:{db['port']}/{db['dbname']}",
        echo=VERBOSE)
    engine = engine.execution_options(
        schema_translate_map={None: db['schema']})
    res = engine, sa.MetaData()
    ENGINES[dbname] = res
    return res


def get_table(dbname, tablename):
    global SESSION
    
    key = (dbname, tablename)
    res = TABLES.get(key)
    if res is not None:
        return res
    SESSION = None
    engine, metadata = get_engine(dbname)
    res = sa.Table(
        tablename,
        metadata,
        autoload_with=engine)
    TABLES[key] = res
    BINDS[res] = engine
    return res


@contextlib.contextmanager
def get_session():
    global SESSION
    
    session = SESSION
    if session is None:
        session = sessionmaker()
        session.configure(binds=BINDS)
        SESSION = session
    with session() as res:
        yield res

In [4]:
# global tables
t_tags = get_table("login", "tags")

# solution mapping tables
t_sm_pads = get_table("sm", "pads")
t_sm_tagging = get_table("sm", "tagging")

# action plan tables
t_ap_pads = get_table("ap", "pads")
t_ap_tagging = get_table("ap", "tagging")

# experiments tables
t_exp_pads = get_table("exp", "pads")
t_exp_tagging = get_table("exp", "tagging")

In [5]:
with get_session() as session:
    stmt = sa.select(sa.func.count(t_sm_pads.c.id))
    stmt = stmt.where(t_sm_pads.c.status >= 2)
    sm_pad_count = int(session.execute(stmt).one()[0])
    print(sm_pad_count)

3192


In [6]:
with get_session() as session:
    stmt = sa.select(sa.func.count(t_ap_pads.c.id))
    stmt = stmt.where(t_ap_pads.c.status >= 2)
    ap_pad_count = int(session.execute(stmt).one()[0])
    print(ap_pad_count)

863


In [7]:
with get_session() as session:
    stmt = sa.select(sa.func.count(t_exp_pads.c.id))
    stmt = stmt.where(t_exp_pads.c.status >= 2)
    exp_pad_count = int(session.execute(stmt).one()[0])
    print(exp_pad_count)

202


In [8]:
tags = {}
with get_session() as session:
    stmt = sa.select(t_tags.c.id, t_tags.c.name, t_tags.c.type)
    for row in session.execute(stmt):
        tags[row[0]] = (row[1], row[2])

In [9]:
with get_session() as session:
    stmt = sa.select(t_sm_pads.c.id, t_sm_pads.c.title, t_sm_pads.c.sections, t_sm_pads.c.full_text)
    stmt = stmt.where(t_sm_pads.c.status >= 2)
    stmt = stmt.limit(3)
    for row in session.execute(stmt):
        print("=TITLE=================")
        print(row[1])
        print("=TEXT==================")
        print(row[3])
        print("=TAGS==================")
        tstmt = sa.select(t_sm_tagging.c.tag_id)
        tstmt = tstmt.where(t_sm_tagging.c.pad == row[0])
        for tag in session.execute(tstmt):
            t_name, t_type = tags[tag[0]]
            if t_type == "thematic_areas":
                print(t_name)
        print()

The power of faith facing of the weakness of the means of a young self-taught innovator
The power of faith facing of the weakness of the means of a young self-taught innovator


			Mamadou Saliou Diallo 

Whatsapp Via Moussa CAMARA +224624976073

NO

   In a village where everyone thinks that not going far in their studies is an end in itself, where relentlessness and dedication are not considered as real arguments to support a citizen in his project, this is the decor in which evolves Mamadou Saliou Diallo, an autodidact whose level of study is of the 5th year of primary school; is developing a pico-dam to supply electricity to surrounding villages.

               In the process of developing the energy sector to solve the power problem in Guinea, the government has electrified major cities, prefectures, and sub-prefectures. However, it emerges that despite these enormous efforts, shortcomings remain. So to provide a quick and low-cost solution, our friend “Actor's Name” gave himself

In [10]:
sm_tag_counts = {}
with get_session() as session:
    stmt = sa.select(t_sm_tagging.c.tag_id, sa.func.count(t_sm_tagging.c.tag_id))
    stmt = stmt.group_by(t_sm_tagging.c.tag_id).order_by(sa.func.count(t_sm_tagging.c.tag_id).desc())
    for theme in [True, False]:
        print_count = 10
        for row in session.execute(stmt):
            t_name, t_type = tags[row[0]]
            if (t_type == "thematic_areas") != theme:
                continue
            if print_count > 0:
                print(f"{row[1] / sm_pad_count * 100.0:.2f}% {row[1]} {t_name} ({t_type})")
                print_count -= 1
            sm_tag_counts[t_name] = int(row[1])
        print()

8.87% 283 circular economy (thematic_areas)
7.36% 235 agriculture (thematic_areas)
6.27% 200 waste management (thematic_areas)
6.23% 199 covid-19 response (thematic_areas)
5.64% 180 innovation (thematic_areas)
5.29% 169 health (thematic_areas)
4.82% 154 recycling (thematic_areas)
4.73% 151 environment friendly (thematic_areas)
4.48% 143 youth (thematic_areas)
4.45% 142 entrepreneurship (thematic_areas)

38.03% 1214 Sustainable cities and communities (sdgs)
32.49% 1037 Responsible consumption and production (sdgs)
26.69% 852 Decent work and economic growth (sdgs)
26.69% 852 Industry, innovation and infrastructure (sdgs)
24.94% 796 Climate action (sdgs)
22.99% 734 Good health and well-being (sdgs)
16.60% 530 Affordable and clean energy (sdgs)
15.23% 486 No poverty (sdgs)
14.19% 453 Zero hunger (sdgs)
13.25% 423 Reduced innequalities (sdgs)



In [11]:
ap_tag_counts = {}
with get_session() as session:
    stmt = sa.select(t_ap_tagging.c.tag_id, sa.func.count(t_ap_tagging.c.tag_id))
    stmt = stmt.group_by(t_ap_tagging.c.tag_id).order_by(sa.func.count(t_ap_tagging.c.tag_id).desc())
    for theme in [True, False]:
        print_count = 10
        for row in session.execute(stmt):
            t_name, t_type = tags[row[0]]
            if (t_type == "thematic_areas") != theme:
                continue
            if print_count > 0:
                print(f"{row[1] / ap_pad_count * 100.0:.2f}% {row[1]} {t_name} ({t_type})")
                print_count -= 1
            ap_tag_counts[t_name] = int(row[1])
        print()

4.40% 38 circular economy (thematic_areas)
3.71% 32 waste management (thematic_areas)
3.36% 29 co-creation (thematic_areas)
3.24% 28 community empowerment (thematic_areas)
3.24% 28 behavioral insights (thematic_areas)
3.13% 27 youth and unemployment (thematic_areas)
3.13% 27 climate change (thematic_areas)
3.13% 27 digital transformation (thematic_areas)
2.90% 25 public sector innovation (thematic_areas)
2.78% 24 behavioral change (thematic_areas)

41.71% 360 Co-creation (methods)
39.98% 345 Collective Intelligence (methods)
34.41% 297 Solutions Mapping (methods)
30.36% 262 focus group (datasources)
27.93% 241 Prototyping (methods)
27.35% 236 surveys (datasources)
26.88% 232 System Thinking (methods)
26.77% 231 Decent work and economic growth (sdgs)
25.84% 223 Sensemaking (methods)
25.72% 222 direct interviews (datasources)



In [12]:
exp_tag_counts = {}
with get_session() as session:
    stmt = sa.select(t_exp_tagging.c.tag_id, sa.func.count(t_exp_tagging.c.tag_id))
    stmt = stmt.group_by(t_exp_tagging.c.tag_id).order_by(sa.func.count(t_exp_tagging.c.tag_id).desc())
    for theme in [True, False]:
        print_count = 10
        for row in session.execute(stmt):
            t_name, t_type = tags[row[0]]
            if (t_type == "thematic_areas") != theme:
                continue
            if print_count > 0:
                print(f"{row[1] / exp_pad_count * 100.0:.2f}% {row[1]} {t_name} ({t_type})")
                print_count -= 1
            exp_tag_counts[t_name] = int(row[1])
        print()

10.89% 22 waste management (thematic_areas)
6.44% 13 behavioral insights (thematic_areas)
4.95% 10 access to information (thematic_areas)
3.96% 8 access to basic services (thematic_areas)
3.47% 7 co-creation (thematic_areas)
3.47% 7 public sector innovation (thematic_areas)
2.97% 6 knowledge management (thematic_areas)
2.97% 6 employment (thematic_areas)
2.97% 6 behavioral change (thematic_areas)
2.48% 5 covid-19 response (thematic_areas)

50.99% 103 Sustainable cities and communities (sdgs)
39.60% 80 Partnerships for the goals (sdgs)
37.62% 76 Climate action (sdgs)
31.68% 64 Decent work and economic growth (sdgs)
30.20% 61 Reduced innequalities (sdgs)
26.73% 54 Industry, innovation and infrastructure (sdgs)
20.79% 42 Responsible consumption and production (sdgs)
20.79% 42 No poverty (sdgs)
18.81% 38 Peace, justice and strong institutions (sdgs)
16.34% 33 Good health and well-being (sdgs)



In [13]:
all_tags = [(tid, tname) for (tid, (tname, ttype)) in tags.items() if ttype == "thematic_areas"]
all_tags[:10], all_tags[-10:]

([(736, 'ocean'),
  (747, 'formalising the informal'),
  (757, 'productivity'),
  (1278, 'flooding'),
  (835, 'reducing gender inequalities'),
  (839, 'cooperativism'),
  (842, 'nutrition'),
  (846, 'logistics'),
  (850, 'transportation'),
  (616, 'Acupuntura Urbana')],
 [(4048, 'social innovation platform'),
  (4050, 'adolescents'),
  (4061, 'impact investing'),
  (4201, 'energy efficiency audits'),
  (4203, 'households'),
  (4259, 'digital government'),
  (4204, 'thermal camera'),
  (4261, 'digital payments'),
  (4205, 'residential energy efficiency'),
  (3257, 'epower agregetor')])

In [14]:
rng = np.random.default_rng(42)

In [15]:
train_size = 1000
test_size = 1000
train_test_ixs = list(rng.choice(list(range(sm_pad_count)), train_size + test_size, replace=False))
is_train = set(train_test_ixs[:train_size])
is_test = set(train_test_ixs[train_size:])

In [16]:
content = {
    "stage": [],
    "db": [],
    "title": [],
    "text": [],
}
ctags = {}
ctagnames = []
for (tag_id, tag_name) in all_tags:
    col_name = f"tag_{tag_name}"
    ctagnames.append(col_name)
    ctags[tag_id] = col_name
    content[col_name] = []
tables = [
    ("sm", t_sm_pads, t_sm_tagging),
    ("ap", t_ap_pads, t_ap_tagging),
    ("exp", t_exp_pads, t_exp_tagging),
]
with get_session() as session:
    for (t_name, t_pads, t_tagging) in tables:
        stmt = sa.select(t_pads.c.id, t_pads.c.title, t_pads.c.full_text)
        stmt = stmt.where(t_pads.c.status >= 2)
        cur_ix = 0
        for row in session.execute(stmt):
            cur_ix += 1
            if t_name == "sm":
                stage = "validation"
                if cur_ix in is_train:
                    stage = "train"
                if cur_ix in is_test:
                    stage = "test"
            else:
                stage = "validation"
            content["stage"].append(stage)
            content["db"].append(t_name)
            content["title"].append(row[1])
            content["text"].append(row[2])
            for cname in ctagnames:
                content[cname].append(False)
            tstmt = sa.select(t_tagging.c.tag_id)
            tstmt = tstmt.where(t_tagging.c.pad == row[0])
            for tag in session.execute(tstmt):
                cname = ctags.get(tag[0])
                if cname is not None:
                    content[cname][-1] = True
df = pd.DataFrame(content, columns=["stage", "db", "title", "text"] + sorted(ctagnames))
final_tags = []
for cname in ctagnames:
    if df[cname].all() or not df[cname].any():
        print(f"drop {cname}")
        del df[cname]
    else:
        final_tags.append(cname)

drop tag_UNDP
drop tag_public_sector
drop tag_private_sector
drop tag_health insurance
drop tag_youth leadership in private sector
drop tag_mutant__entrep
drop tag_automation
drop tag_informal settlement
drop tag_windmill
drop tag_Unemployed
drop tag_early childhood
drop tag_board games
drop tag_Political and Cultural peace
drop tag_digital manufacturing
drop tag_panels
drop tag_ethnomathematics
drop tag_notebooks
drop tag_justice sociale
drop tag_waste2wealth
drop tag_natural ingredient
drop tag_crvs
drop tag_discrimination
drop tag_biogarddener
drop tag_accountability
drop tag_satellite internet
drop tag_learning communities
drop tag_stem
drop tag_project management\
drop tag_homemade
drop tag_reporting
drop tag_systemic design
drop tag_payment system
drop tag_light urban intervention
drop tag_mobile data
drop tag_global information systems(gis)
drop tag__mejora_relaciones_en_la_comuni
drop tag_gender identity
drop tag_user
drop tag_placemaking
drop tag_business continuity
drop tag_a

In [17]:
df

Unnamed: 0,stage,db,title,text,tag_3d printing,tag_Acupuntura Urbana,tag_Bebidas Tradicionales,tag_Bicicleta,tag_Bicycle,tag_Cadenas de Valor,...,tag_wood gas,tag_wood stove,tag_woodchips,tag_worker safety,tag_youth,tag_youth activism,tag_youth and unemployment,tag_youth empowerment,tag_youth informality,tag_zero waste
0,validation,sm,The power of faith facing of the weakness of t...,The power of faith facing of the weakness of t...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,validation,sm,local three lines power agregetor,local three lines power agregetor\n\n\n\t\t\tD...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,test,sm,Teflon REGULATOR,Teflon REGULATOR\n\n\n\t\t\tThe TEFLON or PTFE...,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,train,sm,Public Lights auto managed,Public Lights auto managed \n\n\n\t\t\tSolutio...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,test,sm,ORGANIC WASTE Matanizer !,ORGANIC WASTE Matanizer !\n\n\n\t\t\tHere is a...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4252,validation,exp,Reducing the Use of Single-Use Plastic Bags in...,Reducing the Use of Single-Use Plastic Bags in...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4253,validation,exp,Using DPPD to identify greater indigenous poli...,Using DPPD to identify greater indigenous poli...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4254,validation,exp,"Local Convergence: Promoting Agile, Adaptive, ...","Local Convergence: Promoting Agile, Adaptive, ...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4255,validation,exp,Marine Litter: Behavioral Insights Experiment ...,Marine Litter: Behavioral Insights Experiment ...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
for cname in final_tags:
    train_cat_count = df.loc[df["stage"] == "train", cname].sum()
    test_cat_count = df.loc[df["stage"] == "test", cname].sum()
    validation_cat_count = df.loc[df["stage"] == "validation", cname].sum()
    text = f"{train_cat_count} {test_cat_count} {validation_cat_count} {cname}"
    print(text)
    if not test_cat_count:
        print("^" * len(text))

5 9 11 tag_ocean
1 1 5 tag_formalising the informal
0 8 8 tag_productivity
1 3 5 tag_flooding
2 1 9 tag_reducing gender inequalities
10 10 10 tag_cooperativism
5 0 2 tag_nutrition
^^^^^^^^^^^^^^^^^^^
0 2 1 tag_logistics
5 13 11 tag_transportation
0 2 0 tag_Acupuntura Urbana
1 0 7 tag_access
^^^^^^^^^^^^^^^^
9 4 19 tag_gender focus
4 5 18 tag_gender violence
7 7 9 tag_good health and well-being
0 0 10 tag_governance
^^^^^^^^^^^^^^^^^^^^^
0 0 1 tag_governance cluster (co)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2 1 8 tag_government
1 1 17 tag_government innovation
0 0 2 tag_pest control
^^^^^^^^^^^^^^^^^^^^^^
8 7 13 tag_planting
6 5 9 tag_plastic
6 16 14 tag_plastic alternative
3 5 8 tag_plastic and pollution
7 11 18 tag_plastic waste management
0 0 5 tag_policy innovation
^^^^^^^^^^^^^^^^^^^^^^^^^^^
5 7 14 tag_pollution reduction
0 0 15 tag_portfolio approach
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1 4 5 tag_positive deviance
2 0 8 tag_private sector engagement
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
0 1 6

In [19]:
df.to_parquet(OUT_FILE)