In [1]:
import pandas as pd
import requests
import json
import re
import sys
import numpy as np
import os
import nltk
import pickle
from sklearn.utils import shuffle
from importlib import reload
from langdetect import detect
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from dateutil.parser import parse
import spacy
from time import time

# Make the project's source tree (../src, relative to this notebook) importable.
sys.path.append('../src')

# Project-local modules. `reload` re-executes an already imported module so that
# edits made to it on disk are picked up without restarting the kernel.
from commons import elastic
from text_processing import text_normalizer
text_normalizer = reload(text_normalizer)
from text_processing import duplicate_finder
from utilities import excel_writer, excel_reader, utils
from text_processing import abbreviations_resolver
abbreviations_resolver = reload(abbreviations_resolver)

# Shared NLP helpers (not referenced by the cells visible in this part of the notebook).
stemmer = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')

# Abbreviation resolver handed to the column fillers further down.
# NOTE(review): the meaning of the empty-list constructor argument is not visible here;
# presumably the resolver's state comes from the saved model loaded below - confirm.
_abbreviations_resolver = abbreviations_resolver.AbbreviationsResolver([])
_abbreviations_resolver.load_model("../model/abbreviations_dicts")



# Prepare data: convert txt files to Excel

In [21]:
# Parse every tagged reference export in the EGM folder into one record per article.
#
# Input format as handled here: each line starts with a two-character tag followed by
# "  - " (e.g. "T1  - <title>"); records are separated by blank lines; keywords start on
# a "KW  - " line and continue on the following untagged lines, one keyword per line.
#
# Result: `all_docs` maps a running integer id to a dict holding, for every mapped
# column, the list of values found, plus "File" = name of the source file without its
# extension.
all_docs = {}
global_id = 0
mappings_for_columns = {
    "T1  - ": "title",
    "A1  - ": "authors",
    "AB  - ": "abstract",
    "DO  - ": "doi",
    "KW  - ": "keywords",
    "PY  - ": "year",
    "UR  - ": "url"
}
egm_folder = "../tmp/folder_with_egm_files/"
keywords_tag = "KW  - "
tag_pattern = re.compile("^[A-Z0-9]{2}  - ")

# Sorted so that record ids (and therefore DataFrame row order) are reproducible;
# os.listdir returns entries in arbitrary order.
for entry in sorted(os.listdir(egm_folder)):
    path = os.path.join(egm_folder, entry)
    if not os.path.isfile(path):
        continue  # ignore sub-folders instead of failing in open()
    # splitext removes only the extension, so names containing dots are kept intact.
    filename = os.path.splitext(entry)[0]
    keywords_started = False
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank line ends the current record (and any open keyword list).
                global_id += 1
                keywords_started = False
                continue
            if global_id not in all_docs:
                all_docs[global_id] = {column: [] for column in mappings_for_columns.values()}
                all_docs[global_id]["File"] = filename
            record = all_docs[global_id]
            if line.startswith(keywords_tag):
                keywords_started = True
                # The tagged line itself carries the first keyword; keep it.
                first_keyword = line[len(keywords_tag):].strip().lower()
                if first_keyword:
                    record["keywords"].append(first_keyword)
                continue
            if keywords_started and tag_pattern.search(line):
                # Any other tag terminates the keyword list.
                keywords_started = False
            if keywords_started:
                record["keywords"].append(line.strip().lower())
                continue
            for start_symbols, column_name in mappings_for_columns.items():
                if start_symbols == keywords_tag:
                    continue
                if line.startswith(start_symbols):
                    # Strip only the leading tag; the same text may occur inside the value.
                    record[column_name].append(line[len(start_symbols):].strip())
    # A file that does not end with a blank line leaves its last record open; close it
    # so the first record of the next file is not merged into it.
    if global_id in all_docs:
        global_id += 1

In [22]:
# Collapse each multi-valued field into a single ";"-separated string.
# Values that are already strings (e.g. "File") are left as they are, so the cell
# can be re-run safely.
for record in all_docs.values():
    for column, value in record.items():
        if type(value) is not str:
            record[column] = ";".join(value)

In [23]:
# One row per parsed article; columns come from the record dictionaries' keys.
article_records = list(all_docs.values())
df = pd.DataFrame(article_records)

In [24]:
def _year_to_int(value):
    """Convert a year cell to ``int`` when possible.

    Returns ``int(value)`` for a non-blank string that parses as an integer.
    Blank strings, non-string cells and strings that are not a single integer
    (for example several years joined with ";") are returned unchanged.
    """
    if not isinstance(value, str) or not value.strip():
        return value
    try:
        return int(value)
    except ValueError:
        return value


# Assign the converted column back instead of writing through `.values`: the array
# returned by `.values` is not guaranteed to be a writable view of the frame
# (it is read-only under pandas Copy-on-Write).
# `astype(object)` keeps the mixed int/str column dtype the per-cell loop produced.
df["year"] = df["year"].map(_year_to_int).astype(object)
In [25]:
# Persist the combined article table so the next section can start from the file.
combined_excel_path = "../tmp/combined_egm_data.xlsx"
combined_writer = excel_writer.ExcelWriter()
combined_writer.save_df_in_excel(df, combined_excel_path)

Saving...
Saved to ../tmp/combined_egm_data.xlsx


# Check Lab studies and Geo names

In [2]:
# Load the combined article table written by the preparation section.
combined_reader = excel_reader.ExcelReader()
df = combined_reader.read_df_from_excel("../tmp/combined_egm_data.xlsx")

Read file ../tmp/combined_egm_data.xlsx: 0.11s
Processed file ../tmp/combined_egm_data.xlsx: 0.06s


In [3]:
# Add an empty "identificators" column to every row.
# NOTE(review): presumably required by the project code that consumes `df` below
# (search engine / column fillers) - confirm against those modules.
df["identificators"] = ""

In [5]:
# Compatibility shim for inputs that use capitalised "Title"/"Abstract" headers and
# have no keywords column. Every step is guarded by a column check, so the cell is a
# no-op on frames that already have the lower-case columns. This keeps Restart & Run
# All working: it no longer wipes parsed keywords or raises KeyError on "Title".
if "keywords" not in df.columns:
    df["keywords"] = ""
for source_column, target_column in (("Title", "title"), ("Abstract", "abstract")):
    if source_column in df.columns:
        df[target_column] = df[source_column]

In [4]:
# Number of articles per source file (sanity check of the combined data).
file_counts = df["File"].value_counts()
file_counts

Included (abstract)      100
Lab studies              100
Non-causal studies       100
High-income countries    100
No intervention          100
Name: File, dtype: int64

In [4]:
from text_processing import search_engine_insensitive_to_spelling, all_column_filler

# Build the inverted index over the loaded articles; it is passed to the column
# fillers in the next cell.
SearchEngine = search_engine_insensitive_to_spelling.SearchEngineInsensitiveToSpelling
search_engine_inverted_index = SearchEngine(load_abbreviations=True)
search_engine_inverted_index.create_inverted_index(df)

Processed 0 articles
Processed 499 articles
Processed 0 abbreviations
Processed 3000 abbreviations
Processed 6000 abbreviations
Processed 9000 abbreviations
Processed 12000 abbreviations
Processed 15000 abbreviations
Processed 18000 abbreviations
Processed 21000 abbreviations
Processed 24000 abbreviations
Processed 27000 abbreviations
Processed 27129 abbreviations


In [5]:
# Settings for the study-type labeller: model folders and the SciBERT vocabulary folder.
study_type_labeller_settings = {
    "column_filler_class": "StudyTypeLabeller",
    "folder": "../model/study_type_multi_new_data_with_keywords_agg_more_sub_groups_more_data_1",
    "meta_folder": "../model/study_type_multi_meta_new_data_with_keywords_agg_more_sub_groups_more_data_1",
    "scibert_model_folder": "../model/scibert_scivocab_uncased",
    "model_with_agg": True,
}

# Column fillers to run, in order. The geo-name finder is currently switched off.
column_filler_settings = {"columns": [
    #{"column_filler_class":"GeoNameFinder", "world_bank_regions_file": "../data/IncomeLevelDivisionCountries.xlsx"},
    study_type_labeller_settings,
]}

_all_column_filler = all_column_filler.AllColumnFiller()
df = _all_column_filler.fill_columns_for_df(
    df,
    search_engine_inverted_index,
    _abbreviations_resolver,
    settings_json=column_filler_settings,
)

Started processing  {'column_filler_class': 'StudyTypeLabeller', 'folder': '../model/study_type_multi_new_data_with_keywords_agg_more_sub_groups_more_data_1', 'meta_folder': '../model/study_type_multi_meta_new_data_with_keywords_agg_more_sub_groups_more_data_1', 'scibert_model_folder': '../model/scibert_scivocab_uncased', 'model_with_agg': True}
High level label: Book chapter
High level label: Field study
High level label: Greenhouse study
High level label: Impact evaluation
High level label: Laboratory study
High level label: Meta analysis
High level label: Modelling study
High level label: Observational study
High level label: Randomized controlled trials
High level label: Review paper
High level label: Systematic review
High level label: Interview
High level label: Survey/Questionnaire
High level label: Experimental research
High level label: Economic simulation
High level label: Landscape/environmental simulation
High level label: Policy analysis
Labelled articles with outcomes: 21

INFO:tensorflow:Using config: {'_model_dir': '../model/study_type_multi_new_data_with_keywords_agg_more_sub_groups_more_data_1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000023F28DA26A0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


Config is done
INFO:tensorflow:Writing example 0 of 500


INFO:tensorflow:Writing example 0 of 500


INFO:tensorflow:*** Example ***


INFO:tensorflow:*** Example ***


INFO:tensorflow:guid: 


INFO:tensorflow:guid: 


INFO:tensorflow:tokens: [CLS] effect of educational program for prevention of health hazards in poultry processing sla ##ughter ##house ' s workers in egypt [SEP] background : poultry workers are exposed to a variety of occupational health hazards on a daily basis . aim ( s ) : to assess the effect of educational program for prevention of health hazards on knowledge and self - reported practices of poultry processing sla ##ughter ##house ' s workers in egypt . method ( s ) : a quasi - experimental one group pretest - post ##test design was utilized . setting ; the study was conducted in large poultry sla ##ughter ##house at el men ##of ##ia govern ##ora ##te . a systematic random sample of 306 poultry workers was selected . two tools were used to collect data , the first is poultry processing sla ##ughter ##house ' s health hazards questionnaire and the other is an observational checklist for poultry work environment . result ( s ) : reveals that 3 . 4 % , 97 . 7 % and 88 . 8 % of work

INFO:tensorflow:tokens: [CLS] effect of educational program for prevention of health hazards in poultry processing sla ##ughter ##house ' s workers in egypt [SEP] background : poultry workers are exposed to a variety of occupational health hazards on a daily basis . aim ( s ) : to assess the effect of educational program for prevention of health hazards on knowledge and self - reported practices of poultry processing sla ##ughter ##house ' s workers in egypt . method ( s ) : a quasi - experimental one group pretest - post ##test design was utilized . setting ; the study was conducted in large poultry sla ##ughter ##house at el men ##of ##ia govern ##ora ##te . a systematic random sample of 306 poultry workers was selected . two tools were used to collect data , the first is poultry processing sla ##ughter ##house ' s health hazards questionnaire and the other is an observational checklist for poultry work environment . result ( s ) : reveals that 3 . 4 % , 97 . 7 % and 88 . 8 % of work

INFO:tensorflow:input_ids: 102 907 131 6336 1618 168 5200 131 947 16227 121 20739 2307 17615 14683 8901 2505 112 5555 121 21946 103 2740 862 20739 5555 220 4724 147 106 3835 131 11069 947 16227 191 106 4122 2525 205 2579 145 112 546 862 147 1285 111 907 131 6336 1618 168 5200 131 947 16227 191 1767 137 1968 579 1214 5423 131 20739 2307 17615 14683 8901 2505 112 5555 121 21946 205 551 145 112 546 862 106 8889 579 1798 482 583 29342 579 1422 5528 899 241 6744 205 2707 1814 111 527 241 2728 121 1135 20739 17615 14683 8901 235 847 1801 1010 426 3141 7936 282 205 106 5158 1533 1498 131 27962 20739 5555 241 2350 205 502 3674 267 501 147 9921 453 422 111 705 165 20739 2307 17615 14683 8901 2505 112 947 16227 5692 137 111 494 165 130 11220 22919 168 20739 697 1451 205 1186 145 112 546 862 8234 198 239 205 286 1863 422 8276 205 450 1863 137 8203 205 493 1863 131 5555 883 11675 615 131 1767 121 382 422 1422 137 589 692 2732 1222 205 969 170 205 239 1863 422 7380 205 158 1863 137 9046 205 239 186

INFO:tensorflow:input_ids: 102 907 131 6336 1618 168 5200 131 947 16227 121 20739 2307 17615 14683 8901 2505 112 5555 121 21946 103 2740 862 20739 5555 220 4724 147 106 3835 131 11069 947 16227 191 106 4122 2525 205 2579 145 112 546 862 147 1285 111 907 131 6336 1618 168 5200 131 947 16227 191 1767 137 1968 579 1214 5423 131 20739 2307 17615 14683 8901 2505 112 5555 121 21946 205 551 145 112 546 862 106 8889 579 1798 482 583 29342 579 1422 5528 899 241 6744 205 2707 1814 111 527 241 2728 121 1135 20739 17615 14683 8901 235 847 1801 1010 426 3141 7936 282 205 106 5158 1533 1498 131 27962 20739 5555 241 2350 205 502 3674 267 501 147 9921 453 422 111 705 165 20739 2307 17615 14683 8901 2505 112 947 16227 5692 137 111 494 165 130 11220 22919 168 20739 697 1451 205 1186 145 112 546 862 8234 198 239 205 286 1863 422 8276 205 450 1863 137 8203 205 493 1863 131 5555 883 11675 615 131 1767 121 382 422 1422 137 589 692 2732 1222 205 969 170 205 239 1863 422 7380 205 158 1863 137 9046 205 239 186

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:*** Example ***


INFO:tensorflow:*** Example ***


INFO:tensorflow:guid: 


INFO:tensorflow:guid: 


INFO:tensorflow:tokens: [CLS] " i matter , i learn , i decide " : an impact evaluation on knowledge , attitudes , and rights to prevent adolescent pregnancy [SEP] adolescent pregnancy is considered a priority public health issue because of its implications in the lives of young mothers , their children , and the well - being of the general population . in this paper , we describe an intervention targeting adolescents ( aged 11 - 19 years old ) in a rural context and estimate its impact on key outcomes relevant to early pregnancy prevention : knowledge and self - efficacy concerning sexual and reproductive health , knowledge of sexual and reproductive rights , and attitudes toward gender roles . our study used a quasi - experimental design comprising 74 ##7 adolescents . three difference - in - differences models ( raw , adjusted , and by exposure level ) with fixed effects estimated the changes in all outcome measures . our results showed that the intervention community had a significa

INFO:tensorflow:tokens: [CLS] " i matter , i learn , i decide " : an impact evaluation on knowledge , attitudes , and rights to prevent adolescent pregnancy [SEP] adolescent pregnancy is considered a priority public health issue because of its implications in the lives of young mothers , their children , and the well - being of the general population . in this paper , we describe an intervention targeting adolescents ( aged 11 - 19 years old ) in a rural context and estimate its impact on key outcomes relevant to early pregnancy prevention : knowledge and self - efficacy concerning sexual and reproductive health , knowledge of sexual and reproductive rights , and attitudes toward gender roles . our study used a quasi - experimental design comprising 74 ##7 adolescents . three difference - in - differences models ( raw , adjusted , and by exposure level ) with fixed effects estimated the changes in all outcome measures . our results showed that the intervention community had a significa

INFO:tensorflow:input_ids: 102 1554 259 4067 422 259 6714 422 259 10130 1554 862 130 2141 2166 191 1767 422 8826 422 137 4040 147 3363 11644 5564 103 11644 5564 165 1574 106 7531 1771 947 3060 923 131 633 5214 121 111 11937 131 3182 8659 422 547 1808 422 137 111 804 579 1558 131 111 1196 1638 205 121 238 1203 422 185 3401 130 3832 6947 8088 145 6485 1021 579 371 1320 4289 546 121 106 7046 2220 137 2812 633 2141 191 1519 2952 2884 147 1926 5564 5200 862 1767 137 1968 579 4684 6275 5471 137 8042 947 422 1767 131 5471 137 8042 4040 422 137 8826 2536 4703 5370 205 580 527 501 106 8889 579 1798 899 12201 7667 30145 8088 205 874 1673 579 121 579 1595 1262 145 6908 422 5731 422 137 214 2718 615 546 190 2612 1056 2595 111 1334 121 355 3095 2554 205 580 545 1367 198 111 3832 2928 883 106 684 3523 121 355 2952 422 137 238 3523 241 2331 121 1052 975 2072 111 3435 579 2718 615 131 3832 1031 147 106 602 2928 205 580 527 2315 1775 198 106 2928 579 791 3832 422 24756 191 6124 5471 2870 422 165 106 67

INFO:tensorflow:input_ids: 102 1554 259 4067 422 259 6714 422 259 10130 1554 862 130 2141 2166 191 1767 422 8826 422 137 4040 147 3363 11644 5564 103 11644 5564 165 1574 106 7531 1771 947 3060 923 131 633 5214 121 111 11937 131 3182 8659 422 547 1808 422 137 111 804 579 1558 131 111 1196 1638 205 121 238 1203 422 185 3401 130 3832 6947 8088 145 6485 1021 579 371 1320 4289 546 121 106 7046 2220 137 2812 633 2141 191 1519 2952 2884 147 1926 5564 5200 862 1767 137 1968 579 4684 6275 5471 137 8042 947 422 1767 131 5471 137 8042 4040 422 137 8826 2536 4703 5370 205 580 527 501 106 8889 579 1798 899 12201 7667 30145 8088 205 874 1673 579 121 579 1595 1262 145 6908 422 5731 422 137 214 2718 615 546 190 2612 1056 2595 111 1334 121 355 3095 2554 205 580 545 1367 198 111 3832 2928 883 106 684 3523 121 355 2952 422 137 238 3523 241 2331 121 1052 975 2072 111 3435 579 2718 615 131 3832 1031 147 106 602 2928 205 580 527 2315 1775 198 106 2928 579 791 3832 422 24756 191 6124 5471 2870 422 165 106 67

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:*** Example ***


INFO:tensorflow:*** Example ***


INFO:tensorflow:guid: 


INFO:tensorflow:guid: 


INFO:tensorflow:tokens: [CLS] effectiveness of the women ' s development team leaders in delivering nutrition education on pulse spr ##out ##ing in southern ethiopia [SEP] effectively implemented nutrition education can provide participants with the knowledge and skills to make healthy food choices in the context of their lifestyle ##s and economic resources . in ethiopia , the government equip ##s health extension workers ( he ##ws ) to provide nutrition education to communities by enabling he ##ws to transfer knowledge to women ' s development team leaders ( wd ##tl ##s ) who in turn share the knowledge with the one - to - five network leaders ( 1 - 5 ##n ##wl ) and members . the objective of this study was to examine the effectiveness of wd ##tl ##s in delivering nutrition education to women as the intervention group ( ig ) . this was compared to having trained he ##ws educ ##ate women directly ( the positive control group , pc ##g ) , and having women receive no specific education 

INFO:tensorflow:tokens: [CLS] effectiveness of the women ' s development team leaders in delivering nutrition education on pulse spr ##out ##ing in southern ethiopia [SEP] effectively implemented nutrition education can provide participants with the knowledge and skills to make healthy food choices in the context of their lifestyle ##s and economic resources . in ethiopia , the government equip ##s health extension workers ( he ##ws ) to provide nutrition education to communities by enabling he ##ws to transfer knowledge to women ' s development team leaders ( wd ##tl ##s ) who in turn share the knowledge with the one - to - five network leaders ( 1 - 5 ##n ##wl ) and members . the objective of this study was to examine the effectiveness of wd ##tl ##s in delivering nutrition education to women as the intervention group ( ig ) . this was compared to having trained he ##ws educ ##ate women directly ( the positive control group , pc ##g ) , and having women receive no specific education 

INFO:tensorflow:input_ids: 102 4826 131 111 2007 2505 112 1120 4832 13189 121 17934 8556 2870 191 4989 8274 521 140 121 7663 25822 103 5419 3812 8556 2870 300 1584 1914 190 111 1767 137 5561 147 2113 4093 2599 7938 121 111 2220 131 547 13379 30113 137 3587 2965 205 121 25822 422 111 4270 6064 30113 947 3840 5555 145 299 4563 546 147 1584 8556 2870 147 5904 214 10619 299 4563 147 2268 1767 147 2007 2505 112 1120 4832 13189 145 19456 6687 30113 546 975 121 3216 4456 111 1767 190 111 482 579 147 579 2539 934 13189 145 158 579 305 30111 18367 546 137 3087 205 111 3201 131 238 527 241 147 4423 111 4826 131 19456 6687 30113 121 17934 8556 2870 147 2007 188 111 3832 583 145 2648 546 205 238 241 1031 147 2773 7222 299 4563 2138 217 2007 2533 145 111 1532 602 583 422 3658 30123 546 422 137 2773 2007 5380 425 1154 2870 145 1980 602 583 422 4996 30123 546 205 106 3306 5460 3303 899 241 501 205 874 1059 8740 123 145 23070 131 17088 2325 546 267 1434 451 1090 179 2350 263 334 111 19456 6687 30113 2

INFO:tensorflow:input_ids: 102 4826 131 111 2007 2505 112 1120 4832 13189 121 17934 8556 2870 191 4989 8274 521 140 121 7663 25822 103 5419 3812 8556 2870 300 1584 1914 190 111 1767 137 5561 147 2113 4093 2599 7938 121 111 2220 131 547 13379 30113 137 3587 2965 205 121 25822 422 111 4270 6064 30113 947 3840 5555 145 299 4563 546 147 1584 8556 2870 147 5904 214 10619 299 4563 147 2268 1767 147 2007 2505 112 1120 4832 13189 145 19456 6687 30113 546 975 121 3216 4456 111 1767 190 111 482 579 147 579 2539 934 13189 145 158 579 305 30111 18367 546 137 3087 205 111 3201 131 238 527 241 147 4423 111 4826 131 19456 6687 30113 121 17934 8556 2870 147 2007 188 111 3832 583 145 2648 546 205 238 241 1031 147 2773 7222 299 4563 2138 217 2007 2533 145 111 1532 602 583 422 3658 30123 546 422 137 2773 2007 5380 425 1154 2870 145 1980 602 583 422 4996 30123 546 205 106 3306 5460 3303 899 241 501 205 874 1059 8740 123 145 23070 131 17088 2325 546 267 1434 451 1090 179 2350 263 334 111 19456 6687 30113 2

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:*** Example ***


INFO:tensorflow:*** Example ***


INFO:tensorflow:guid: 


INFO:tensorflow:guid: 


INFO:tensorflow:tokens: [CLS] impact of a growth enhancement support scheme on coc ##oa yield and income of coc ##oa farmers in os ##un state , nigeria [SEP] background . in an attempt to improve the yield of coc ##oa and farmers income , the federal government of nigeria in 2012 introduced the coc ##oa growth enhancement support ( ges ) scheme that subsid ##ized farm inputs to farmers . this article examines the effects of the scheme on coc ##oa yield and the income of coc ##oa farmers in os ##un state . material and methods . a multi ##stage sampling procedure was used to obtain data from 208 coc ##oa farmers of whom there were 100 participants and 108 non - participants of the scheme . data collected were analyzed using descriptive statistics , the binary logit regression model and the propensity score matching ( ps ##m ) model . results . descriptive statistics revealed no mean difference between some socioeconomic characteristics among the categories of farmers in the study area s

INFO:tensorflow:tokens: [CLS] impact of a growth enhancement support scheme on coc ##oa yield and income of coc ##oa farmers in os ##un state , nigeria [SEP] background . in an attempt to improve the yield of coc ##oa and farmers income , the federal government of nigeria in 2012 introduced the coc ##oa growth enhancement support ( ges ) scheme that subsid ##ized farm inputs to farmers . this article examines the effects of the scheme on coc ##oa yield and the income of coc ##oa farmers in os ##un state . material and methods . a multi ##stage sampling procedure was used to obtain data from 208 coc ##oa farmers of whom there were 100 participants and 108 non - participants of the scheme . data collected were analyzed using descriptive statistics , the binary logit regression model and the propensity score matching ( ps ##m ) model . results . descriptive statistics revealed no mean difference between some socioeconomic characteristics among the categories of farmers in the study area s

INFO:tensorflow:input_ids: 102 2141 131 106 1503 6586 1385 2631 191 7864 8212 2210 137 5021 131 7864 8212 12288 121 3581 164 1098 422 17964 103 2740 205 121 130 5809 147 1658 111 2210 131 7864 8212 137 12288 5021 422 111 8012 4270 131 17964 121 4950 3376 111 7864 8212 1503 6586 1385 145 6194 546 2631 198 16263 645 5947 5671 147 12288 205 238 2148 15817 111 1056 131 111 2631 191 7864 8212 2210 137 111 5021 131 7864 8212 12288 121 3581 164 1098 205 1440 137 1045 205 106 869 7743 3597 2272 241 501 147 831 453 263 20645 7864 8212 12288 131 7861 461 267 1287 1914 137 11008 699 579 1914 131 111 2631 205 453 2760 267 2549 487 10363 4530 422 111 5067 25565 3089 437 137 111 15310 2867 4740 145 1409 30119 546 437 205 545 205 10363 4530 2861 425 1108 1673 467 693 13044 2087 1247 111 4468 131 12288 121 111 527 1590 555 188 5430 1243 422 19323 2899 422 1407 137 2870 205 111 545 911 2861 198 6163 121 1061 4270 3832 3996 422 1899 147 3840 2522 137 1899 147 8388 267 684 10687 131 6163 121 111 6194 263

INFO:tensorflow:input_ids: 102 2141 131 106 1503 6586 1385 2631 191 7864 8212 2210 137 5021 131 7864 8212 12288 121 3581 164 1098 422 17964 103 2740 205 121 130 5809 147 1658 111 2210 131 7864 8212 137 12288 5021 422 111 8012 4270 131 17964 121 4950 3376 111 7864 8212 1503 6586 1385 145 6194 546 2631 198 16263 645 5947 5671 147 12288 205 238 2148 15817 111 1056 131 111 2631 191 7864 8212 2210 137 111 5021 131 7864 8212 12288 121 3581 164 1098 205 1440 137 1045 205 106 869 7743 3597 2272 241 501 147 831 453 263 20645 7864 8212 12288 131 7861 461 267 1287 1914 137 11008 699 579 1914 131 111 2631 205 453 2760 267 2549 487 10363 4530 422 111 5067 25565 3089 437 137 111 15310 2867 4740 145 1409 30119 546 437 205 545 205 10363 4530 2861 425 1108 1673 467 693 13044 2087 1247 111 4468 131 12288 121 111 527 1590 555 188 5430 1243 422 19323 2899 422 1407 137 2870 205 111 545 911 2861 198 6163 121 1061 4270 3832 3996 422 1899 147 3840 2522 137 1899 147 8388 267 684 10687 131 6163 121 111 6194 263

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:*** Example ***


INFO:tensorflow:*** Example ***


INFO:tensorflow:guid: 


INFO:tensorflow:guid: 


INFO:tensorflow:tokens: [CLS] first - line tuberculosis treatment with double - dose rifamp ##icin is well tolerated . [SEP] objective : to compare the occurrence of unf ##avour ##able treatment and safety outcomes of double - dose rifamp ##icin rm ##p ; 20 mg / kg / d , intervention with standard dose 10 mg / kg / d , control in a first - line tuberculosis tb treatment regimen for smear - positive tb patients in bangladesh . design : this was a randomised clinical trial . the primary efficacy and safety endpoints were the occurrence of an unf ##avour ##able treatment outcome death , failure , relapse or loss to follow - up and the occurrence of any serious drug - related adverse event sae . results : in primary efficacy analysis , among 34 ##3 control and 34 ##7 intervention patients , respectively 15 . 5 % and 11 . 8 % had an unf ##avour ##able outcome . in safety analysis , among 34 ##9 intervention and 35 ##2 control patients , respectively 4 . 3 % and 2 . 6 % experienced an sae . 

INFO:tensorflow:tokens: [CLS] first - line tuberculosis treatment with double - dose rifamp ##icin is well tolerated . [SEP] objective : to compare the occurrence of unf ##avour ##able treatment and safety outcomes of double - dose rifamp ##icin rm ##p ; 20 mg / kg / d , intervention with standard dose 10 mg / kg / d , control in a first - line tuberculosis tb treatment regimen for smear - positive tb patients in bangladesh . design : this was a randomised clinical trial . the primary efficacy and safety endpoints were the occurrence of an unf ##avour ##able treatment outcome death , failure , relapse or loss to follow - up and the occurrence of any serious drug - related adverse event sae . results : in primary efficacy analysis , among 34 ##3 control and 34 ##7 intervention patients , respectively 15 . 5 % and 11 . 8 % had an unf ##avour ##able outcome . in safety analysis , among 34 ##9 intervention and 35 ##2 control patients , respectively 4 . 3 % and 2 . 6 % experienced an sae . 

INFO:tensorflow:input_ids: 102 705 579 972 9757 922 190 3917 579 2750 26994 11274 165 804 16539 205 103 3201 862 147 3745 111 5836 131 11490 28907 318 922 137 4104 2952 131 3917 579 2750 26994 11274 4842 30121 1814 1012 1529 1352 5036 1352 128 422 3832 190 1235 2750 566 1529 1352 5036 1352 128 422 602 121 106 705 579 972 9757 6226 922 11685 168 24333 579 1532 6226 568 121 23547 205 899 862 238 241 106 14136 1329 3303 205 111 1916 4684 137 4104 15753 267 111 5836 131 130 11490 28907 318 922 3095 2889 422 3018 422 12269 234 1738 147 589 579 692 137 111 5836 131 843 6083 1698 579 1482 5386 2607 29647 205 545 862 121 1916 4684 669 422 1247 3154 30138 602 137 3154 30145 3832 568 422 1222 884 205 305 1863 137 1021 205 493 1863 883 130 11490 28907 318 3095 205 121 4104 669 422 1247 3154 30141 3832 137 2638 30132 602 568 422 1222 286 205 239 1863 137 170 205 370 1863 5770 130 29647 205 407 1595 267 302 684 205 461 241 106 1357 1268 5836 131 29647 30113 422 4458 214 106 1268 5836 131 7221 6006 

INFO:tensorflow:input_ids: 102 705 579 972 9757 922 190 3917 579 2750 26994 11274 165 804 16539 205 103 3201 862 147 3745 111 5836 131 11490 28907 318 922 137 4104 2952 131 3917 579 2750 26994 11274 4842 30121 1814 1012 1529 1352 5036 1352 128 422 3832 190 1235 2750 566 1529 1352 5036 1352 128 422 602 121 106 705 579 972 9757 6226 922 11685 168 24333 579 1532 6226 568 121 23547 205 899 862 238 241 106 14136 1329 3303 205 111 1916 4684 137 4104 15753 267 111 5836 131 130 11490 28907 318 922 3095 2889 422 3018 422 12269 234 1738 147 589 579 692 137 111 5836 131 843 6083 1698 579 1482 5386 2607 29647 205 545 862 121 1916 4684 669 422 1247 3154 30138 602 137 3154 30145 3832 568 422 1222 884 205 305 1863 137 1021 205 493 1863 883 130 11490 28907 318 3095 205 121 4104 669 422 1247 3154 30141 3832 137 2638 30132 602 568 422 1222 286 205 239 1863 137 170 205 370 1863 5770 130 29647 205 407 1595 267 302 684 205 461 241 106 1357 1268 5836 131 29647 30113 422 4458 214 106 1268 5836 131 7221 6006 

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


INFO:tensorflow:label: [0, 0, 0, 0, 0, 0, 0] (id = [0, 0, 0, 0, 0, 0, 0])


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


Used for model gpu 0

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.


Instructions for updating:
Use keras.layers.dense instead.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


INFO:tensorflow:Restoring parameters from ../model/study_type_multi_new_data_with_keywords_agg_more_sub_groups_more_data_1\model.ckpt-6831


INFO:tensorflow:Restoring parameters from ../model/study_type_multi_new_data_with_keywords_agg_more_sub_groups_more_data_1\model.ckpt-6831


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


KeyboardInterrupt: 

In [7]:
# Write the processed dataframe `df` to an Excel file under ../tmp using the project's ExcelWriter helper.
excel_writer.ExcelWriter().save_df_in_excel(df, "../tmp/combined_data_processed_new.xlsx")

Saving...
Saved to ../tmp/combined_data_processed_new.xlsx


# Lab studies evaluation

In [15]:
# Reference label per record: 1 when it comes from the "Lab studies" file, else 0.
lab_study_true_vals = [
    1 if source_file == "Lab studies" else 0
    for source_file in df["File"].values
]
# Predicted label per record: 1 when "Laboratory study" occurs in the predicted study_type, else 0.
lab_study_true_pred = [
    1 if "Laboratory study" in predicted_study_type else 0
    for predicted_study_type in df["study_type"].values
]

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix (rows: reference class, columns: predicted class) followed by the
# per-class precision/recall report for the lab-study flag.
print(
    confusion_matrix(lab_study_true_vals, lab_study_true_pred),
    classification_report(lab_study_true_vals, lab_study_true_pred),
    sep="\n",
)

[[389  11]
 [ 74  26]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       400
           1       0.70      0.26      0.38       100

   micro avg       0.83      0.83      0.83       500
   macro avg       0.77      0.62      0.64       500
weighted avg       0.81      0.83      0.80       500



# High income countries evaluation

In [9]:
# High-income evaluation, variant keyed on the detailed World Bank income-level names.
# Records from the 'Lab studies' file are left out of the comparison.
geo_locations_info_true_vals = []
geo_locations_info_pred = []
lower_income_levels = (
    'Low-income countries(995$ or less)',
    'Middle-income countries($996 to $3,895)',
    "Upper-middle-income countries ($3,896 to $12,055)",
)
for file_name, income_regions in zip(df["File"].values, df["world_bankdivision_regions"].values):
    if file_name == 'Lab studies':
        continue
    # Reference: 1 only for records taken from the 'High-income countries' file.
    geo_locations_info_true_vals.append(1 if file_name == 'High-income countries' else 0)
    # Prediction: 0 when any non-high income level was extracted, or when nothing was extracted at all.
    if any(level in income_regions for level in lower_income_levels) or len(income_regions) == 0:
        geo_locations_info_pred.append(0)
    else:
        geo_locations_info_pred.append(1)

In [10]:
# High-income evaluation, variant keyed on the aggregated "LMICs" tag.
# Records from the 'Lab studies' file are left out of the comparison.
geo_locations_info_true_vals = []
geo_locations_info_pred = []
for file_name, income_regions in zip(df["File"].values, df["world_bankdivision_regions"].values):
    if file_name == 'Lab studies':
        continue
    # Reference: 1 for records from the 'High-income countries' file, and also for records
    # from other files whose extracted regions contain "Transitional countries" but not "LMICs"
    # (without that second rule every non-high-income-file record would simply be labelled 0).
    counts_as_high_income = file_name == 'High-income countries' or (
        "LMICs" not in income_regions and "Transitional countries" in income_regions
    )
    geo_locations_info_true_vals.append(1 if counts_as_high_income else 0)
    # Prediction: 0 when "LMICs" was extracted or nothing was extracted, otherwise 1.
    predicted_not_high_income = "LMICs" in income_regions or len(income_regions) == 0
    geo_locations_info_pred.append(0 if predicted_not_high_income else 1)

In [9]:
# Baseline: confusion matrix (rows: reference class, columns: predicted class) and
# per-class report for the high-income flag.
from sklearn.metrics import classification_report, confusion_matrix

print(
    confusion_matrix(geo_locations_info_true_vals, geo_locations_info_pred),
    classification_report(geo_locations_info_true_vals, geo_locations_info_pred),
    sep="\n",
)

[[279  21]
 [ 23  77]]
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       300
           1       0.79      0.77      0.78       100

   micro avg       0.89      0.89      0.89       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.89      0.89      0.89       400



In [11]:
# Recalculated after relabelling: transitional countries were not excluded from the
# dataset, so their reference labels were changed before computing these metrics.
from sklearn.metrics import classification_report, confusion_matrix

print(
    confusion_matrix(geo_locations_info_true_vals, geo_locations_info_pred),
    classification_report(geo_locations_info_true_vals, geo_locations_info_pred),
    sep="\n",
)

[[279   6]
 [ 23  92]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       285
           1       0.94      0.80      0.86       115

   micro avg       0.93      0.93      0.93       400
   macro avg       0.93      0.89      0.91       400
weighted avg       0.93      0.93      0.93       400



In [11]:
# Check restricted to records from the 'Included (abstract)' file: how often does the
# extractor (correctly) NOT flag them as high-income?
geo_locations_info_true_vals = []
geo_locations_info_pred = []
lower_income_levels = (
    'Low-income countries(995$ or less)',
    'Middle-income countries($996 to $3,895)',
    "Upper-middle-income countries ($3,896 to $12,055)",
)
for file_name, income_regions in zip(df["File"].values, df["world_bankdivision_regions"].values):
    if file_name != 'Included (abstract)':
        continue
    # Every record reaching this point has File == 'Included (abstract)', so it can never
    # equal 'High-income countries': the reference label is always 0. (The former
    # `== 'High-income countries'` branch was unreachable and has been removed.)
    geo_locations_info_true_vals.append(0)
    # Prediction: 0 when any non-high income level was extracted, or when nothing was extracted at all.
    if any(level in income_regions for level in lower_income_levels) or len(income_regions) == 0:
        geo_locations_info_pred.append(0)
    else:
        geo_locations_info_pred.append(1)

In [37]:
from sklearn.metrics import accuracy_score
# Share of records whose predicted high-income flag equals the reference flag.
# NOTE(review): this uses whichever geo_locations_info_* lists were built most recently
# in the kernel — several cells above assign the same two names.
accuracy_score(geo_locations_info_true_vals, geo_locations_info_pred)

0.97

# Geo names extraction accuracy

In [14]:
# Load the labelled geo-name spreadsheet; its labelled rows are the reference values
# that the extractor output is compared against later in this section.
geo_names_df = excel_reader.ExcelReader().read_df_from_excel("../tmp/labelled_geo_names.xlsx")

Read file ../tmp/labelled_geo_names.xlsx: 0.07s
Processed file ../tmp/labelled_geo_names.xlsx: 0.08s


In [15]:
# Show which columns the labelled spreadsheet provides.
geo_names_df.columns

Index(['abstract', 'comments', 'countries_mentioned', 'country_groups',
       'districts', 'edit_url', 'geo_regions', 'keywords', 'labelled',
       'provinces', 'responsible_person', 'tags', 'title'],
      dtype='object')

In [16]:
# Number of rows with labelled == 1 versus the total number of rows in the spreadsheet.
print("Labelled data: %d All data: %d" % (len(geo_names_df[geo_names_df["labelled"] == 1]), len(geo_names_df)))

Labelled data: 169 All data: 500


In [28]:
# Keep only the rows flagged labelled == 1; their geo columns serve as the reference values.
geo_names_df_with_labelled = geo_names_df[geo_names_df["labelled"] == 1]

In [25]:
# Extractor input: an independent frame holding only the text columns of the labelled
# rows (no reference geo columns), plus an "identificators" column of empty strings.
geo_names_df_wo_labelled = geo_names_df_with_labelled[
    ["title", "abstract", "keywords"]
].assign(identificators="")

In [26]:
from text_processing import search_engine_insensitive_to_spelling
from text_processing import all_column_filler
# Build the spelling-insensitive search engine and index the extractor input frame.
# NOTE(review): load_abbreviations=True presumably also indexes the abbreviation
# dictionary (the cell log reports "Processed N abbreviations") — confirm in the class.
search_engine_inverted_index_geo_names = search_engine_insensitive_to_spelling.SearchEngineInsensitiveToSpelling(
        load_abbreviations = True)
search_engine_inverted_index_geo_names.create_inverted_index(geo_names_df_wo_labelled)

Processed 0 articles
Processed 168 articles
Processed 0 abbreviations
Processed 3000 abbreviations
Processed 6000 abbreviations
Processed 9000 abbreviations
Processed 12000 abbreviations
Processed 15000 abbreviations
Processed 18000 abbreviations
Processed 21000 abbreviations
Processed 24000 abbreviations
Processed 27000 abbreviations
Processed 27129 abbreviations


In [27]:
# Run the GeoNameFinder column filler over the extractor input frame; the frame it
# returns replaces geo_names_df_wo_labelled.
# NOTE(review): the geo columns compared against the labels afterwards (countries_mentioned,
# provinces, districts, ...) are presumably added by this call — confirm in GeoNameFinder.
_all_column_filler = all_column_filler.AllColumnFiller()
geo_names_df_wo_labelled = _all_column_filler.fill_columns_for_df(
        geo_names_df_wo_labelled, search_engine_inverted_index_geo_names, _abbreviations_resolver, settings_json = {"columns":[
            {"column_filler_class":"GeoNameFinder", "world_bank_regions_file": "../data/IncomeLevelDivisionCountries.xlsx"},
        ]})

Started processing  {'column_filler_class': 'GeoNameFinder', 'world_bank_regions_file': '../data/IncomeLevelDivisionCountries.xlsx'}
Read file ../data/GeoRegions.xlsx: 0.02s
Processed file ../data/GeoRegions.xlsx: 0.00s
Read file ../data/map_additional_regions.xlsx: 0.01s
Processed file ../data/map_additional_regions.xlsx: 0.00s
Read file ../data/IncomeLevelDivisionCountries.xlsx: 0.01s
Processed file ../data/IncomeLevelDivisionCountries.xlsx: 0.00s
Read file ../data/map_additional_country_names.xlsx: 0.00s
Processed file ../data/map_additional_country_names.xlsx: 0.00s
Read file ../data/map_country_adjectives.xlsx: 0.01s
Processed file ../data/map_country_adjectives.xlsx: 0.01s
Read file ../data/map_currencies.xlsx: 0.01s
Processed file ../data/map_currencies.xlsx: 0.00s
Read file ../data/map_country_groups.xlsx: 0.01s
Processed file ../data/map_country_groups.xlsx: 0.01s
Read file ../data/map_country_groups_income_level.xlsx: 0.00s
Processed file ../data/map_country_groups_income_lev

In [30]:
def get_mean_intersection_over_union(predicted_vals, true_vals):
    """Return the mean Jaccard index (intersection over union) of paired collections.

    Args:
        predicted_vals: iterable of iterables of hashable predicted items, one per record.
        true_vals: iterable of iterables of hashable reference items, one per record.

    Returns:
        The average of |pred ∩ true| / |pred ∪ true| over the pairs produced by zipping
        the two inputs (extra items of the longer input are ignored). A pair in which
        both collections are empty counts as a perfect match (1). Returns 0.0 when
        there are no pairs at all.
    """
    n_pairs = 0
    iou_total = 0.0
    for pred, true in zip(predicted_vals, true_vals):
        pred_set, true_set = set(pred), set(true)
        n_pairs += 1
        union_size = len(pred_set | true_set)
        # An empty union means both sides are empty, i.e. they agree completely.
        iou_total += len(pred_set & true_set) / union_size if union_size else 1
    if not n_pairs:
        return 0.0
    return iou_total / n_pairs

# Mean intersection-over-union between extracted and labelled values, one score per geo column.
# Rows of the two frames are paired by position.
for column in ('countries_mentioned', 'country_groups', 'districts', 'geo_regions', 'provinces'):
    print("Column: ", column)
    extracted_vals = geo_names_df_wo_labelled[column].values
    labelled_vals = geo_names_df_with_labelled[column].values
    print(get_mean_intersection_over_union(extracted_vals, labelled_vals))

Column:  countries_mentioned
0.9119822485207102
Column:  country_groups
0.945759368836292
Column:  districts
0.9644970414201184
Column:  geo_regions
0.9452662721893491
Column:  provinces
0.935897435897436


In [31]:
# Work on a separate copy so the title-based variant below does not alter geo_names_df_wo_labelled.
geo_names_df_wo_labelled_restricted_by_title = geo_names_df_wo_labelled.copy()

In [32]:
from text_processing import search_engine_insensitive_to_spelling
from text_processing import all_column_filler
# Second search engine, configured with columns_to_process=["title"] so that only the
# title column is indexed (NOTE(review): exact effect of this option lives in the class — confirm).
search_engine_inverted_index_geo_names_title = search_engine_insensitive_to_spelling.SearchEngineInsensitiveToSpelling(
        load_abbreviations = True, columns_to_process=["title"])
search_engine_inverted_index_geo_names_title.create_inverted_index(geo_names_df_wo_labelled_restricted_by_title)

Processed 0 articles
Processed 168 articles
Processed 0 abbreviations
Processed 3000 abbreviations
Processed 6000 abbreviations
Processed 9000 abbreviations
Processed 12000 abbreviations
Processed 15000 abbreviations
Processed 18000 abbreviations
Processed 21000 abbreviations
Processed 24000 abbreviations
Processed 27000 abbreviations
Processed 27129 abbreviations


In [34]:
# Run GeoNameFinder again, this time on the title only. With prefix_for_columns="title_"
# the results land in separate title_* columns (visible in the head() output of the next
# cell), leaving the full-text geo columns in place.
# NOTE(review): "check_only_country_names_and_divisions" presumably narrows the lookup to
# country and administrative-division names — confirm in GeoNameFinder.
_all_column_filler = all_column_filler.AllColumnFiller()
geo_names_df_wo_labelled_restricted_by_title = _all_column_filler.fill_columns_for_df(
        geo_names_df_wo_labelled_restricted_by_title, search_engine_inverted_index_geo_names_title, _abbreviations_resolver, settings_json = {"columns":[
            {"column_filler_class":"GeoNameFinder", "world_bank_regions_file": "../data/IncomeLevelDivisionCountries.xlsx",
            "columns_to_process":["title"], "prefix_for_columns": "title_", "check_only_country_names_and_divisions": True},
        ]})

Started processing  {'column_filler_class': 'GeoNameFinder', 'world_bank_regions_file': '../data/IncomeLevelDivisionCountries.xlsx', 'columns_to_process': ['title'], 'prefix_for_columns': 'title_', 'check_only_country_names_and_divisions': True}
Read file ../data/GeoRegions.xlsx: 0.01s
Processed file ../data/GeoRegions.xlsx: 0.01s
Read file ../data/map_additional_regions.xlsx: 0.01s
Processed file ../data/map_additional_regions.xlsx: 0.00s
Read file ../data/IncomeLevelDivisionCountries.xlsx: 0.01s
Processed file ../data/IncomeLevelDivisionCountries.xlsx: 0.01s
Read file ../data/map_additional_country_names.xlsx: 0.01s
Processed file ../data/map_additional_country_names.xlsx: 0.00s
Read file ../data/map_country_adjectives.xlsx: 0.01s
Processed file ../data/map_country_adjectives.xlsx: 0.01s
Read file ../data/map_currencies.xlsx: 0.01s
Processed file ../data/map_currencies.xlsx: 0.00s
Read file ../data/map_country_groups.xlsx: 0.01s
Processed file ../data/map_country_groups.xlsx: 0.01s
R

In [35]:
# Preview the first rows: full-text geo columns side by side with the title_* columns.
geo_names_df_wo_labelled_restricted_by_title.head()

Unnamed: 0,title,abstract,keywords,identificators,countries_mentioned,country_codes,provinces,districts,country_groups,geo_regions,world_bankdivision_regions,title_countries_mentioned,title_country_codes,title_provinces,title_districts,title_country_groups,title_geo_regions,title_world_bankdivision_regions
331,The effect of education based on reasoned acti...,Introduction: Studies showed that about 25 per...,analytical techniques;behavior;studies;man;hom...,,[Iran],[IR],[Iran/Fars],[Iran/Fars/Sepidan],[developing countries],"[Middle East and North Africa, Asia]",[LMICs],[Iran],[IR],[Iran/Fars],[Iran/Fars/Sepidan],[],[Middle East and North Africa],[LMICs]
332,Motivational interview on having Pap test amon...,"Background. Cognitive and mental factors, such...",counseling;fisher exact test;health attitudes;...,,[Iran],[IR],[Iran/Markazi],[Iran/Markazi/Shazand],[],[Middle East and North Africa],[LMICs],[],[],[],[],[],[],[]
333,Effectiveness of intervention due to feedback ...,Background: The limited supply of red blood ce...,red blood cell transfusion;blood transportatio...,,[Iran],[IR],[Iran/Razavi Khorasan],[Iran/Razavi Khorasan/Mashhad],[],[Middle East and North Africa],[LMICs],[],[],[],[],[],[],[]
334,Evaluation of the clinical pharmacist role in ...,Background Lower urinary tract symptoms due to...,"aged;aged, 80 and over;humans;jordan;*lower ur...",,[Jordan],[JO],[Jordan/Amman],[Jordan/Amman/Amman],[],[Middle East and North Africa],[LMICs],[],[],[],[],[],[],[]
335,Use of propensity score matching to create cou...,The design of HIV prevention trials in the con...,hiv prevention;vaccine effectiveness;hepatitis...,,[Uganda],[UG],[],[],[],[Sub-Saharan Africa],[LMICs],[],[],[],[],[],[],[]


In [37]:
# For records whose title names at least one country, trust the title: replace the
# full-text countries with the title countries and keep only those provinces/districts
# that belong to one of these countries. Records without a title country stay unchanged.
#
# The new per-row lists are collected first and written back as whole columns. Assigning
# through `Series.values[i] = ...` only reaches the frame while `.values` happens to be a
# writable view of it; under pandas Copy-on-Write that array is read-only.
restricted_df = geo_names_df_wo_labelled_restricted_by_title
updated_columns = {
    column: list(restricted_df[column].values)
    for column in ["countries_mentioned", "provinces", "districts"]
}
for i, title_countries in enumerate(restricted_df["title_countries_mentioned"].values):
    if not title_countries:
        continue
    updated_columns["countries_mentioned"][i] = title_countries
    for column in ["provinces", "districts"]:
        # Entries are "/"-separated paths starting with the country name
        # (e.g. "Iran/Fars/Sepidan"), so the first segment identifies the country.
        updated_columns[column][i] = [
            place
            for place in updated_columns[column][i]
            if place.split("/")[0] in title_countries
        ]
for column, column_values in updated_columns.items():
    # dtype=object keeps each cell a plain Python list, whatever the list lengths are.
    restricted_df[column] = pd.Series(column_values, index=restricted_df.index, dtype=object)

In [38]:
# Same per-column mean intersection-over-union as before, now for the title-prioritised extraction.
# Rows of the two frames are paired by position.
for column in ('countries_mentioned', 'country_groups', 'districts', 'geo_regions', 'provinces'):
    print("Column: ", column)
    extracted_vals = geo_names_df_wo_labelled_restricted_by_title[column].values
    labelled_vals = geo_names_df_with_labelled[column].values
    print(get_mean_intersection_over_union(extracted_vals, labelled_vals))

Column:  countries_mentioned
0.8982448725363706
Column:  country_groups
0.945759368836292
Column:  districts
0.9704142011834319
Column:  geo_regions
0.9452662721893491
Column:  provinces
0.9437869822485208
