In [2]:
import os

# Run configuration: target database name, server URL, and on-disk dataset locations.
project_name = "biorxiv"  # appended to conn_string below to form the full database URL
conn_string = "postgresql://postgres@fonduer-postgres-dev:5432/"

dataset_path = "data/"
export_path = os.path.join(dataset_path, "contrisnap.json")  # file handed to parse_export below
documents_path = os.path.join(dataset_path, "converted_contri")  # directory handed to the document preprocessor in a later cell


# Parse the annotation export; `export` is reused by later cells
# (n-gram sizes, labelled spans, and the ToFonduer converter).
from LabelstudioToFonduer.to_fonduer import parse_export
export = parse_export(export_path)

# NOTE(review): presumably creates the project database when it does not exist yet —
# confirm against LabelstudioToFonduer.fonduer_tools.
from LabelstudioToFonduer.fonduer_tools import save_create_project
save_create_project(conn_string=conn_string, project_name=project_name)


# Point Fonduer's logging at <dataset_path>/logs, then open a session on
# conn_string + project_name; `session` is shared by every later cell.
from fonduer import Meta, init_logging
init_logging(log_dir=os.path.join(dataset_path, "logs"))
session = Meta.init(conn_string + project_name).Session()


[2023-05-23 12:08:17,252][INFO] fonduer.meta:49 - Setting logging directory to: data/logs/2023-05-23_12-08-17
[2023-05-23 12:08:17,254][INFO] fonduer.meta:134 - Connecting user:postgres to fonduer-postgres-dev:5432/biorxiv
[2023-05-23 12:08:17,447][INFO] fonduer.meta:162 - Initializing the storage schema


In [3]:
# Read the converted documents from documents_path.
# NOTE(review): max_docs=5 presumably caps how many documents are read — confirm.
from LabelstudioToFonduer.document_processor import My_HTMLDocPreprocessor
from fonduer.parser import Parser
doc_preprocessor = My_HTMLDocPreprocessor(documents_path, max_docs=5)

# Custom lingual parser configured with a list of dotted tokens.
# NOTE(review): split_exceptions presumably lists tokens that must not trigger a
# sentence split — confirm in LabelstudioToFonduer.lingual_parser.
from LabelstudioToFonduer.lingual_parser import ModifiedSpacyParser
exceptions = [".NET", "Sr.", ".WEB", ".de", "Jr.", "Inc.", "Senior.", "p.", "m."]
my_parser = ModifiedSpacyParser(lang="en", split_exceptions=exceptions)

# Build the corpus parser around the custom lingual parser; flatten is passed as an
# empty list (NOTE(review): presumably so that no HTML tags are flattened — confirm).
corpus_parser = Parser(session, 
    lingual_parser=my_parser, 
    structural=True, 
    lingual=True, 
    flatten=[])
    
# Parse the documents with parallelism=8.
corpus_parser.apply(doc_preprocessor, parallelism=8)

from fonduer.parser.models import Document, Sentence

# Report how many Document and Sentence rows the session can see after parsing.
print(f"Documents: {session.query(Document).count()}")
print(f"Sentences: {session.query(Sentence).count()}")

# Documents ordered by name; `docs` is the input to the extraction and labelling cells.
docs = session.query(Document).order_by(Document.name).all()

[2023-05-23 12:08:19,466][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/3 [00:00<?, ?it/s]

Documents: 3
Sentences: 13945


In [4]:
# One mention class per label handled in this notebook.
from fonduer.candidates.models import mention_subclass
AuthShort = mention_subclass("AuthShort")
Task = mention_subclass("Task")

# N-gram search spaces: n_min is export.ngrams(label)[0]; n_max is export.ngrams(label)[1] plus 5.
# NOTE(review): presumably ngrams() yields the (min, max) span lengths found in the
# annotations and the +5 is slack for tokenization differences — confirm.
from fonduer.candidates import MentionNgrams
AuthShort_ngrams = MentionNgrams(n_max=export.ngrams("Author short")[1] + 5, n_min=export.ngrams("Author short")[0])
Task_ngrams = MentionNgrams(n_max=export.ngrams("Task")[1] + 5, n_min=export.ngrams("Task")[0])


# Labelled spans per label; the matcher functions below test span membership in these.
# NOTE(review): `lable_entitis` is the method name as called here — presumably the
# library's own spelling; check before renaming.
from fonduer.candidates.matchers import LambdaFunctionMatcher
authshort = export.lable_entitis("Author short")
task = export.lable_entitis("Task")


def is_AuthShort(mention):
    """Tell whether a mention's span is one of the labelled "Author short" spans.

    Args:
        mention: Object exposing ``get_span()``; its result is looked up in the
            module-level ``authshort`` collection.

    Returns:
        bool: ``True`` when the span is in ``authshort``, ``False`` otherwise.
        The previous version evaluated a bare ``False`` in the ``else`` branch
        without returning it, so non-matches yielded ``None``.
    """
    return mention.get_span() in authshort


def is_task(mention):
    """Tell whether a mention's span is one of the labelled "Task" spans.

    Args:
        mention: Object exposing ``get_span()``; its result is looked up in the
            module-level ``task`` collection.

    Returns:
        bool: ``True`` when the span is in ``task``, ``False`` otherwise.
        The previous version evaluated a bare ``False`` in the ``else`` branch
        without returning it, so non-matches yielded ``None``.
    """
    return mention.get_span() in task


# Wrap the two span-membership predicates as Fonduer matchers, one per mention class.
authshort_matcher = LambdaFunctionMatcher(func=is_AuthShort)
task_matcher = LambdaFunctionMatcher(func=is_task)

In [5]:
from fonduer.candidates import MentionExtractor
from fonduer.candidates.models import Mention

# The three lists are positional: mention class, n-gram space and matcher at the
# same index are passed together (AuthShort first, Task second).
mention_extractor = MentionExtractor(
    session,
    [AuthShort, Task],
    [AuthShort_ngrams, Task_ngrams],
    [authshort_matcher, task_matcher],
)

mention_extractor.apply(docs)

# Per-class counts. These were previously stored and printed as "titles"/"dates",
# which does not describe the AuthShort/Task mention classes queried here.
num_authshort = session.query(AuthShort).count()
num_task = session.query(Task).count()

print(
    f"Total Mentions: {session.query(Mention).count()} "
    f"({num_authshort} AuthShort, {num_task} Task)"
)

[2023-05-23 12:11:03,265][INFO] fonduer.candidates.mentions:467 - Clearing table: auth_short
[2023-05-23 12:11:03,277][INFO] fonduer.candidates.mentions:467 - Clearing table: task
[2023-05-23 12:11:03,279][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/3 [00:00<?, ?it/s]

Total Mentions: 87 (79 titles, 8 dates)


In [6]:
# Candidate class built from the two mention classes, AuthShort first and Task second.
from fonduer.candidates.models import candidate_subclass
AuthShortTask = candidate_subclass("AuthShortTask", [AuthShort, Task])


# Extract AuthShortTask candidates from the parsed documents.
from fonduer.candidates import CandidateExtractor
candidate_extractor = CandidateExtractor(session, [AuthShortTask])
candidate_extractor.apply(docs)

# Converter built from the parsed export and the Fonduer session; later cells read
# its `gold_table` and pass its `is_gold` as the labelling function.
from LabelstudioToFonduer.to_fonduer import ToFonduer
converter = ToFonduer(label_studio_export=export, fonduer_session=session)

[2023-05-23 12:11:24,319][INFO] fonduer.candidates.candidates:137 - Clearing table auth_short_task (split 0)
[2023-05-23 12:11:24,331][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Display the converter's gold table as it stands right after the converter is built.
converter.gold_table

[]

In [8]:
from fonduer.supervision.models import GoldLabel
from fonduer.supervision import Labeler
labeler = Labeler(session, [AuthShortTask])

# Apply converter.is_gold as the only labelling function, targeting the GoldLabel table.
# NOTE(review): lfs is a list of lists — presumably one inner list per candidate class
# given to Labeler (here only AuthShortTask) — confirm against the Fonduer docs.
labeler.apply(
    docs=docs,
    lfs=[[converter.is_gold]],
    table=GoldLabel,
    train=True,
    parallelism=8,
)

[2023-05-23 12:11:27,900][INFO] fonduer.supervision.labeler:330 - Clearing Labels (split ALL)
  query = self.session.query(table).filter(table.candidate_id.in_(sub_query))
[2023-05-23 12:11:27,910][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Fetch the extracted candidates and the gold labels stored for them.
# Both results are indexed with [0] in the summary cell further down.
train_cands = candidate_extractor.get_candidates()
all_gold = labeler.get_gold_labels(train_cands)

  .filter(candidate_class.id.in_(sub_query))


In [27]:
# Display the converter's gold table again (same expression as the earlier inspection cell).
converter.gold_table

[]

In [17]:
# Show at most the first five candidates of each inner list instead of the repr of
# every candidate, which floods the cell output. The nesting of train_cands
# (a list of candidate lists) is kept, and an empty train_cands displays as [].
[cands[:5] for cands in train_cands]

[[AuthShortTask(AuthShort(SpanMention("RA", sentence=19607, chars=[67,68], words=[11,11])), Task(SpanMention("designed the methodology and the analysis", sentence=20541, chars=[32,72], words=[12,17]))),
  AuthShortTask(AuthShort(SpanMention("RA", sentence=19607, chars=[67,68], words=[11,11])), Task(SpanMention("analysed the data", sentence=20542, chars=[5,21], words=[2,4]))),
  AuthShortTask(AuthShort(SpanMention("RA", sentence=19607, chars=[67,68], words=[11,11])), Task(SpanMention("contributed to data analysis", sentence=20543, chars=[25,52], words=[10,13]))),
  AuthShortTask(AuthShort(SpanMention("RA", sentence=19607, chars=[67,68], words=[11,11])), Task(SpanMention("were involved in designing the original studies for which the present data were collected", sentence=20544, chars=[47,135], words=[20,33]))),
  AuthShortTask(AuthShort(SpanMention("RA", sentence=19607, chars=[67,68], words=[11,11])), Task(SpanMention("participated in data collection", sentence=20545, chars=[46,76], word

In [None]:
# Reload the candidates and their gold labels.
train_cands = candidate_extractor.get_candidates()
all_gold = labeler.get_gold_labels(train_cands)


# Summary line: sum of the first gold-label array next to the number of export documents.
print("Gold labels found:", all_gold[0].sum(), "from", len(export.documents))
print("Documents successfully transfered:")

# Keep the candidates of the first list whose paired gold label is truthy,
# then print the document name of each one followed by the whole list.
golds = [cand for label, cand in zip(all_gold[0], train_cands[0]) if label]
for gold_cand in golds:
    print(gold_cand.document.name)
print(golds)

(1, 4)