# Imports

In [1]:
import pandas as pd

import spacy
from spacy import displacy

from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
from spacy.tokens import Token

# Register a custom boolean `ignore` attribute on Token (accessed as `tok._.ignore`).
# force=True overwrites an existing registration, so re-running this cell does not raise.
Token.set_extension("ignore", default=False, force=True)

# Load the spaCy pipeline whose POS tags the matcher patterns below rely on.
nlp = spacy.load("en_core_web_trf")

- Wir definieren unseren Beispielsatz:

In [26]:
# Example message with an opening greeting ("Hi Jonas,") and a closing formula ("Greetings Dominik").
sentence = "Hi Jonas, we are now authorized in d-bru-04 and p-bru-01 on Grafana. Entry best not via direct link, but via Rancher>Monitoring>Grafana Greetings Dominik"
sentence

'Hi Jonas, we are now authorized in d-bru-04 and p-bru-01 on Grafana. Entry best not via direct link, but via Rancher>Monitoring>Grafana Greetings Dominik'

# Generate Rules
- Anwendung der Sprachpipeline

In [27]:
# Run the pipeline on the example; this single Doc is shared (and mutated) by the matcher cells below.
doc = nlp(sentence)

In [28]:
# Visualize the dependency parse of the example to help when designing matcher patterns.
displacy.render(doc, style='dep')

In [29]:
def set_ignore(matcher, doc, id, matches):
    """Matcher ``on_match`` callback that flags matched tokens as ignorable.

    Walks over every ``(match_id, start, end)`` triple in ``matches`` and sets
    the custom ``ignore`` extension to ``True`` on each token of
    ``doc[start:end]``.

    Args:
        matcher: Unused; present only to satisfy the callback signature.
        doc: Sliceable token container; its tokens are mutated in place.
        id: Unused; every entry of ``matches`` is processed on each call.
        matches: Iterable of three-item ``(match_id, start, end)`` tuples.
    """
    flagged_tokens = (
        token
        for _match_id, first, stop in matches
        for token in doc[first:stop]
    )
    for token in flagged_tokens:
        token._.ignore = True

## Grußformeln (Ende)

In [30]:
# Token-level matcher for closing formulas (sign-offs).
match_texts = Matcher(nlp.vocab)
pattern = [
    # adjective + noun + proper noun (presumably e.g. "Kind regards <Name>" — TODO confirm tagging)
    [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "PROPN"}],
    # noun + proper noun; this is the variant that matched "Greetings Dominik" in the example
    [{"POS": "NOUN"}, {"POS": "PROPN"}],
    ]
# NOTE(review): neither pattern is anchored to the end of the text, so any
# NOUN+PROPN pair inside a message would be flagged as well — confirm this is intended.
# set_ignore is invoked as the on_match callback and flags the matched tokens on the Doc.
match_texts.add("Grußformeln", pattern, on_match=set_ignore)

In [31]:
# Calling the matcher is not read-only: it fires the registered on_match callback
# (set_ignore), which marks the matched tokens on `doc`.
matches = match_texts(doc)

print(matches)

# Print each match as: hash, rule name, token start, token end, matched text.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Look up the rule name for the hashed match ID
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

[(1981270878604495065, 32, 34)]
1981270878604495065 Grußformeln 32 34 Greetings Dominik


## Begrüßungsformeln (EN)

In [32]:
# A fresh Matcher replaces the closing-formula matcher previously bound to `match_texts`;
# ignore flags already set on `doc` stay in place.
match_texts = Matcher(nlp.vocab)
# Lower-cased words accepted as the first token of an opening greeting.
# NOTE(review): "olla" may be a typo for "hola" — confirm before changing, it alters matching.
hello_synoym = ["hello", "hi", "greetings", "welcome", "hey", "olla", "hi-ya", "howdy"]

# Each variant: greeting word + (proper noun | adverb | pronoun) + punctuation, e.g. "Hi Jonas,".
pattern = [
    [{"LOWER": {"IN": hello_synoym}}, {"POS": "PROPN"}, {"IS_PUNCT": True}],  
    [{"LOWER": {"IN": hello_synoym}}, {"POS": "ADV"}, {"IS_PUNCT": True}],
    [{"LOWER": {"IN": hello_synoym}}, {"POS": "PRON"}, {"IS_PUNCT": True}],
]

# set_ignore is invoked as the on_match callback and flags the matched tokens on the Doc.
match_texts.add("Begrusungformeln", pattern, on_match=set_ignore) 

In [33]:
# Running the greeting matcher also triggers set_ignore, flagging the greeting tokens on `doc`.
matches = match_texts(doc)

print(matches)

# Print each match as: hash, rule name, token start, token end, matched text.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Look up the rule name for the hashed match ID
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

[(1917771495493254596, 0, 3)]
1917771495493254596 Begrusungformeln 0 3 Hi Jonas,


## Resultat

In [34]:
# `match_texts` is bound to the greeting matcher at this point, so only that rule is
# reported here. The closing-formula tokens are still flagged on `doc` from the earlier
# run of the first matcher, which is why both parts disappear from the cleaned text.
matches = match_texts(doc)

print(matches)

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Look up the rule name for the hashed match ID
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

[(1917771495493254596, 0, 3)]
1917771495493254596 Begrusungformeln 0 3 Hi Jonas,


In [35]:
# Rebuild the text from every token that no matcher callback flagged,
# keeping each token's own trailing whitespace so spacing stays intact.
toks = [tok.text + tok.whitespace_ for tok in doc if not tok._.ignore]
cleaned_text = "".join(toks)
# Capitalize the first character. Slicing (instead of cleaned_text[0]) keeps this
# from raising IndexError when every token was ignored and the text is empty.
cleaned_text = cleaned_text[:1].upper() + cleaned_text[1:]
print(cleaned_text)

We are now authorized in d-bru-04 and p-bru-01 on Grafana. Entry best not via direct link, but via Rancher>Monitoring>Grafana 


# Multiprocessing

In [36]:
# docs = nlp.pipe(sentences['Original Text'], n_process=4)

# Augmentation

In [37]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

Back-translation does not work

In [38]:
# aug = naw.BackTranslationAug(
#     from_model_name='facebook/wmt19-en-de', 
#     to_model_name='facebook/wmt19-de-en'
# )

# back_translated = sentences["Original Text EN"].apply(lambda x: aug.augment(x)[0])

# print(back_translated)

# sentences_back_translated = sentences
# sentences_back_translated['Original Text EN'] = back_translated

# sentences_backtrans = sentences.append(sentences_back_translated)
# sentences.info()

Use Synonyms instead

In [39]:
# NOTE(review): `sentences` is never assigned in the visible cells, so this raises
# NameError on a fresh kernel. The cells below index it like a DataFrame with an
# "Original Text EN" column — it has to be loaded before this point.
sentences

NameError: name 'sentences' is not defined

In [None]:
# Synonym augmentation: replace at most two words per text with WordNet synonyms.
aug = naw.SynonymAug(aug_src='wordnet', aug_max=2)

# Request up to 4 augmented variants per English text; each cell then holds a list.
synonym_words = sentences["Original Text EN"].apply(lambda x: aug.augment(x, n=4))

sentences_synonym_words = sentences.copy()
sentences_synonym_words['Original Text EN'] = synonym_words
# explode() turns each list of variants into one row per variant.
sentences_synonym_words = sentences_synonym_words.explode('Original Text EN')
# reset_index() without drop=True keeps the source row label as an 'index' column.
sentences_synonym_words = sentences_synonym_words.reset_index()

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat with default arguments yields the same stacked frame.
sentences_synonym = pd.concat([sentences, sentences_synonym_words])

sentences_synonym

  sentences_synonym = sentences.append(sentences_synonym_words)


Unnamed: 0,ID,Bereinigter Text,Tags/Kategorie,Eingereicht von,Original Text,Original Text EN,index
0,1,Die Tabelle “Subscriptions” im Schema “CRM” en...,"DWH, SQL",,"Hey, du kannst die Tabelle “Subscriptions” im ...","Hey, you can use the ""Subscriptions"" table in ...",
1,2,Maximal 3 Urlaubstage mit ins nächste Jahr neh...,HR,,"Hallo Marcel, hier noch eine kurze Info für di...","Hello Marcel, here is a short info for you: Yo...",
2,3,CR bedeutet Conversion Rate.,"Allgemeine Definitionen, CRM",,"Hi Marcel, CR bedeutet in unserem Kontext Conv...","Hi Marcel, CR in our context means conversion ...",
3,4,AE bedeutet Account Executive.,"Allgemeine Definitionen, CRM",,"AE = Account Executive, also Sales Mitarbeiter...","AE = Account Executive, i.e. Sales employees w...",
4,5,Mit “Abrechnungsdatum” ist das Attribut “Purch...,"Allgemeine Definitionen, DWH",,"Hi Marcel, ja genau mit “Abrechnungsdatum” ist...","Hi Marcel, yes exactly with ""Billing Date"" the...",
...,...,...,...,...,...,...,...
103,26,Wie kann ich die Berechtigungen / Sap-User bea...,"HR, SAP",,"Hallo Milagros, ich habe aktuell nur einen Nut...","Hi Milagros, I currently have only one substan...",25.0
104,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had an Azure D...",26.0
105,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had an Azure D...",26.0
106,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had an Azure D...",26.0


Swap parts

In [None]:
# Word-order augmentation: randomly swap at most two words per text.
aug = naw.RandomWordAug(action="swap", aug_max=2)

# Request up to 4 swapped variants per text (originals and synonym variants alike).
swapped_words = sentences_synonym["Original Text EN"].apply(lambda x: aug.augment(x, n=4))

sentences_swapped_words = sentences_synonym.copy()
sentences_swapped_words['Original Text EN'] = swapped_words
# explode() turns each list of variants into one row per variant.
sentences_swapped_words = sentences_swapped_words.explode('Original Text EN')
# The frame already carries an 'index' column, so reset_index() stores the
# previous row labels in a new 'level_0' column instead.
sentences_swapped_words = sentences_swapped_words.reset_index()

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat with default arguments yields the same stacked frame.
sentences_synonym_swap = pd.concat([sentences_synonym, sentences_swapped_words])

sentences_synonym_swap

  sentences_synonym_swap = sentences_synonym.append(sentences_swapped_words)


Unnamed: 0,ID,Bereinigter Text,Tags/Kategorie,Eingereicht von,Original Text,Original Text EN,index,level_0
0,1,Die Tabelle “Subscriptions” im Schema “CRM” en...,"DWH, SQL",,"Hey, du kannst die Tabelle “Subscriptions” im ...","Hey, you can use the ""Subscriptions"" table in ...",,
1,2,Maximal 3 Urlaubstage mit ins nächste Jahr neh...,HR,,"Hallo Marcel, hier noch eine kurze Info für di...","Hello Marcel, here is a short info for you: Yo...",,
2,3,CR bedeutet Conversion Rate.,"Allgemeine Definitionen, CRM",,"Hi Marcel, CR bedeutet in unserem Kontext Conv...","Hi Marcel, CR in our context means conversion ...",,
3,4,AE bedeutet Account Executive.,"Allgemeine Definitionen, CRM",,"AE = Account Executive, also Sales Mitarbeiter...","AE = Account Executive, i.e. Sales employees w...",,
4,5,Mit “Abrechnungsdatum” ist das Attribut “Purch...,"Allgemeine Definitionen, DWH",,"Hi Marcel, ja genau mit “Abrechnungsdatum” ist...","Hi Marcel, yes exactly with ""Billing Date"" the...",,
...,...,...,...,...,...,...,...,...
535,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had an Azure D...",26.0,106.0
536,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that had you an Azure D...",26.0,107.0
537,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that had you an Azure D...",26.0,107.0
538,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had an Azure D...",26.0,107.0


In [None]:
# Count the distinct English texts currently in the augmented frame.
sentences_synonym_swap['Original Text EN'].nunique()

664

Language Model Magic

In [None]:
# Contextual augmentation: substitute words using predictions from a RoBERTa model.
aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base', action="substitute")

# Request up to 4 substituted variants for every text collected so far.
magic_words = sentences_synonym_swap["Original Text EN"].apply(lambda x: aug.augment(x, n=4))

sentences_magic_words = sentences_synonym_swap.copy()
sentences_magic_words['Original Text EN'] = magic_words
# explode() turns each list of variants into one row per variant.
sentences_magic_words = sentences_magic_words.explode('Original Text EN')
# Drop the old 'level_0' first: with 'index' already present, reset_index() needs
# that name free to store the previous row labels.
sentences_magic_words = sentences_magic_words.drop(columns=['level_0'])
sentences_magic_words = sentences_magic_words.reset_index()

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat with default arguments yields the same stacked frame.
sentences_synonym_swap_magic = pd.concat([sentences_synonym_swap, sentences_magic_words])

sentences_synonym_swap_magic

Unnamed: 0,ID,Bereinigter Text,Tags/Kategorie,Eingereicht von,Original Text,Original Text EN,index,level_0
0,1,Die Tabelle “Subscriptions” im Schema “CRM” en...,"DWH, SQL",,"Hey, du kannst die Tabelle “Subscriptions” im ...","Hey, you can use the ""Subscriptions"" table in ...",,
1,2,Maximal 3 Urlaubstage mit ins nächste Jahr neh...,HR,,"Hallo Marcel, hier noch eine kurze Info für di...","Hello Marcel, here is a short info for you: Yo...",,
2,3,CR bedeutet Conversion Rate.,"Allgemeine Definitionen, CRM",,"Hi Marcel, CR bedeutet in unserem Kontext Conv...","Hi Marcel, CR in our context means conversion ...",,
3,4,AE bedeutet Account Executive.,"Allgemeine Definitionen, CRM",,"AE = Account Executive, also Sales Mitarbeiter...","AE = Account Executive, i.e. Sales employees w...",,
4,5,Mit “Abrechnungsdatum” ist das Attribut “Purch...,"Allgemeine Definitionen, DWH",,"Hi Marcel, ja genau mit “Abrechnungsdatum” ist...","Hi Marcel, yes exactly with ""Billing Date"" the...",,
...,...,...,...,...,...,...,...,...
2695,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had an Azure D...",26.0,538.0
2696,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said that you had got Enterp...",26.0,539.0
2697,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Niklas, Stefan said before you had an Azure...",26.0,539.0
2698,27,Azure DevOps Lizenz. Wie war da der Prozess? ...,HR,,"Hi Niklas, Stefan meinte, dass du für deinen P...","Hi Dan, Stefan said that you had our Enterpris...",26.0,539.0


In [None]:
# Persist the original rows together with all augmented variants.
# The path is relative to the notebook's working directory.
sentences_synonym_swap_magic.to_excel('../data/demo_data_augmented.xlsx')