In [2]:
!createdb -h fonduer-postgres-dev -U postgres jkracht
#!dropdb -h fonduer-postgres-dev -U postgres jkracht

In [244]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import subprocess
import logging
from pprint import pprint

subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'nltk'])

from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')

from fonduer.candidates.matchers import LambdaFunctionMatcher, Union, Intersect, RegexMatchSpan

from fonduer import Meta, init_logging
from fonduer.parser.preprocessors import HTMLDocPreprocessor
from fonduer.parser import Parser
from fonduer.parser.models import Document, Sentence

PARALLEL = 3
ATTRIBUTE = "jkracht"
conn_string = f'postgresql://postgres@fonduer-postgres-dev:5432/{ATTRIBUTE}'

init_logging(log_dir="logs")

session = Meta.init(conn_string).Session()

# Document parser
docs_path = "data/test_collection1/"
doc_preprocessor = HTMLDocPreprocessor(docs_path)

corpus_parser = Parser(session, structural=True, lingual=True)
corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[2023-05-24 21:03:04,370][INFO] fonduer.meta:53 - Logging was already initialized to use logs/2023-05-24_09-02-18.  To configure logging manually, call fonduer.init_logging before initialiting Meta.
[2023-05-24 21:03:04,373][INFO] fonduer.meta:135 - Connecting user:postgres to fonduer-postgres-dev:5432/jkracht
[2023-05-24 21:03:04,373][INFO] fonduer.meta:162 - Initializing the storage schema
[2023-05-24 21:03:04,420][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/1 [00:00<?, ?it/s]

In [245]:
from fonduer.parser.models import Document, Section, Table, Cell, Paragraph, Sentence, Figure, Caption
print(f"Num Docs: {session.query(Document).count()}")

print(f"Num Sections: {session.query(Section).count()}")
print(f"Num Tables: {session.query(Table).count()}")
print(f"Num Cells: {session.query(Cell).count()}")
print(f"Num Sentences: {session.query(Sentence).count()}")
print(f"Num Paragraphs: {session.query(Paragraph).count()}")

print(f"Num Figures: {session.query(Figure).count()}")
print(f"Num Captions: {session.query(Caption).count()}")

Num Docs: 1
Num Sections: 1
Num Tables: 0
Num Cells: 0
Num Sentences: 8
Num Paragraphs: 4
Num Figures: 0
Num Captions: 0


In [250]:
# mention
from fonduer.candidates.models import mention_subclass
from fonduer.candidates import MentionExtractor
from fonduer.candidates import MentionNgrams
from fonduer.candidates.mentions import MentionSentences, MentionCaptions, MentionParagraphs, MentionDocuments, MentionTables
from fonduer.utils.data_model_utils.textual import get_neighbor_sentence_ngrams, get_left_ngrams
Task = mention_subclass("Task")


list_of_headlines = ["Acknowledgements", "Acknowledgement", "acknowledgements", "acknowledgement", 
                     #"Contributions", "Contribution", "contribution", "contributions",
                     #"Credits", "Credit", "credits", "credit",
                     "Überschrift 1"]
    
def mention_span_in_acknowledments_matches_verb(mention):
    span_string = mention.get_span()

    try:
        # get last paragraphs first sentence (headline of the paragraph)
        headline_of_last_paragraph = session.query(Paragraph).get(mention.sentence.paragraph_id-1).sentences[0].text

        #x = get_neighbor_sentence_ngrams(mention, attrib="words", n_min=1, n_max=1)
        x = get_left_ngrams(mention, attrib="words", n_min=1, n_max=1)
        print("x\n")
        print(span_string)
        print(list(x))
        #get_neighbor_sentence_ngrams()
        # sentence_of_span = mention.sentence.text
        # x = get_left_ngrams(mention, n = 1)
        # print(sentence_of_span.x)


        # check if last headline is listed to extract mentions of
        #if headline_of_last_paragraph in list_of_headlines:
        if any(option in headline_of_last_paragraph for option in list_of_headlines):

            #test if span is a verb
            for word in wn.synsets(span_string):
                if word.pos() == "v": # and word.name().split(".")[0] == span_string.lower():
                    return True # case: span is a ver in a wanted paragraph

            return False # case: span is not a verb
        
        else:
            return False # case: span not in wanted paragraph
    except:
        return False # case: no prior paragraph


    

matcher_task1 = LambdaFunctionMatcher(func = mention_span_in_acknowledments_matches_verb)


#task_sentences = MentionSentences() ## hier muss noch die Mention-Space eingetragen werden
#task_paragraph = MentionParagraphs()
task_space = MentionNgrams(n_min=1, n_max=1)

matchers_task = Intersect(
    matcher_task1,
)

docs = session.query(Document).all()

mention_extractor = MentionExtractor(
    session,
    [Task],
    [task_space], #task_sentences #task_space
    [matchers_task],
    parallelism=PARALLEL,
)

mention_extractor.apply(docs, parallelism=PARALLEL, clear=True)
print(
    f"Number of Tasks: {session.query(Task).count()}",
)


[2023-05-24 21:12:11,148][INFO] fonduer.candidates.mentions:467 - Clearing table: task
[2023-05-24 21:12:11,150][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/1 [00:00<?, ?it/s]

x

2
x

3
x

4
x

5
x

6
x

!
x

7
x

8
x

9
x

10
x

!
x

11
x

12
x

!
x

13
x

14
x

!
x

11
x

12
x

13
x

14
x

!
x

15
x

16
x

17
x

18
x

19
x

20
x

.
Number of Tasks: 0


In [219]:
for i in session.query(Document).all():
    print(i)

Document test copy
Document test copy 2
Document test


In [220]:
a = 0
for mention in session.query(Task).all():
    print(mention.context.get_span())
    #print("\n")
    if a > 100:
        break
    a += 1

will
Blub


In [216]:
a = session.query(Paragraph).get(3325)

print(dir(a))

['__bool__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']


In [193]:
for a in session.query(Paragraph).all():
    print(a.id)
    print(a.sentences[0].text)

28984
Überschrift 1
28987
I have this paragraph!
28989
Überschrift 2
28992
Diesen Abschnitt will ich nicht haben!
28985
Überschrift 1
28988
I got this paragraph!
28990
Überschrift 2
28993
Diesen Abschnitt will ich nicht haben!
28991
Überschrift 1
28994
I want this paragraph!
28995
Überschrift 2
28997
Diesen Abschnitt will ich nicht haben!


In [229]:
a = session.query(Task).all()
for b in a:
    print(b.id)


# b = a.sentences[0].text

get_neighbor_sentence_ngrams(b)

1869
1870


<generator object get_neighbor_sentence_ngrams at 0x7f456fb490d0>

In [214]:
a = session.query(Sentence).get(29011)
a.text
a.get_neighbor_sentence_ngrams(window = 1, n= 1)

'Bla Blub.'

In [202]:
mention_id = 1
window = 2  # Die Anzahl der vorherigen und nachfolgenden Sätze, die berücksichtigt werden sollen
n = 3  # Die Anzahl der Wörter in jedem n-Gramm

mention = session.query(Task).filter(Task.id == mention_id).first()
sentence = session.query(Sentence).filter(Sentence.id == mention.sentence_id).first()

ngrams = sentence.get_neighbor_sentence_ngrams(window=window, n=n)

AttributeError: 'NoneType' object has no attribute 'sentence_id'

In [37]:
# Testbereich

def mention_span_in_acknowledments_matches_verb(mention):
    # read in span/word
    span_string = mention

    # Task(SpanMention("RefWorks Tagged", sentence=61332, chars=[0,14], words=[0,1]))
    
    for word in span_string.split(): # mention.get_span()[0]
        #print(word)
        word_filtered = word.replace("!","")
        try:
            if wn.synsets(word_filtered)[0].pos() == "v":
                #print(word)
                return word_filtered
        except:
            pass
    return False

a = "I played soccer!"

In [38]:
mention_span_in_acknowledments_matches_verb(a)

'played'

In [109]:
wn.synsets("want")[0].pos()

'n'

In [124]:
for a in wn.synsets("want"):
    print(a.pos())
    print(a.name().split(".")[0])

n
privation
n
lack
n
need
n
wish
v
desire
v
want
v
want
v
want
v
want


In [None]:
    sentence_of_span = mention.sentence.text
    #print(sentence_of_span)
    #x = mention._get_table()
    # print(mention.sentence.paragraph_id)
    # #print(mention.document_id)
    # print("x")
    # print(mention.sentence.paragraph_id-1)
    #print(mention.sentence.paragraph_id-1)
    # a = session.query(Paragraph).get(mention.sentence.paragraph_id-1)
    #print(a.sentences[0].text)

In [186]:

# if any in 

a = ["abc", "bbb", "adf"]
headline_of_last_paragraph = "Acknowledgment"

if any(option in headline_of_last_paragraph for option in list_of_headlines):
    print("T")
else:
    print("F")

F
