In [1]:
# markdown seemingly unused parts
import os
from pathlib import Path
from collections import defaultdict

In [2]:
import spacy
# Load the spaCy pipeline named "pl_core_news_sm"; later cells call `nlp(text)`
# to turn raw text into parsed documents.
nlp = spacy.load("pl_core_news_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def load_data(datadir):
    """Parse every ``.txt`` file directly inside ``datadir`` with the global ``nlp``.

    Args:
        datadir: a ``pathlib.Path`` (anything exposing ``iterdir()``) pointing
            at the directory to scan. Sub-directories are not descended into.

    Returns:
        A list of ``(stem, doc)`` tuples, one per text file, where ``stem`` is
        the file name without its suffix and ``doc`` is whatever ``nlp``
        returns for the file's text. Order follows ``iterdir()``.
    """
    data = []
    for fpath in datadir.iterdir():
        if fpath.suffix != '.txt':
            continue
        # Explicit UTF-8: the platform default encoding is locale dependent and
        # would garble Polish diacritics on non-UTF-8 systems.
        # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
        with open(fpath, encoding='utf-8') as f:
            # Collapse the file to one line: each line is stripped and the
            # pieces are joined with single spaces.
            text = " ".join(line.strip() for line in f)
        data.append((fpath.stem, nlp(text)))
    return data

# Parse the raw text files into (name, doc) pairs.
# NOTE(review): `data` is rebound several times further down the notebook.
data = load_data(Path('./data/raw'))

from transformers import MT5ForConditionalGeneration, T5Tokenizer

def load_model(modelpath):
    """Load an mT5 model together with its tokenizer.

    Args:
        modelpath: identifier or path handed unchanged to both
            ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        MT5ForConditionalGeneration.from_pretrained(modelpath),
        T5Tokenizer.from_pretrained(modelpath),
    )

# Load the "google/mt5-small" checkpoint.
# NOTE(review): neither `model` nor `tokenizer` is used again in the visible
# part of this notebook - confirm they are still needed.
model, tokenizer = load_model("google/mt5-small")

In [8]:
import plwn
# Left disabled on purpose; presumably fetches the model loaded below - TODO confirm.
# plwn.download()
# Load the wordnet stored at ./default_model into `wn`.
wn = plwn.load("./default_model")

In [9]:
# Sanity check: fetch one lexical unit for the lemma 'pies' and print it.
# The third argument is presumably the variant number (the output reads "pies.2").
lex = wn.lexical_unit('pies', plwn.PoS.noun_pl, 2)
print(lex)

pies.2(21:zw)


In [12]:
import plwn


class WordNet:
    """Convenience wrapper around a wordnet loaded with ``plwn``.

    On construction every lexical unit is indexed by ``(lemma, POS)`` so that
    the senses of a token can be looked up without scanning the whole wordnet.
    """

    # Translates spaCy coarse POS tags into the upper-cased POS labels used as
    # index keys; any other value is passed through unchanged.
    pos_mapping = {
        "VERB": "CZASOWNIK",
        "NOUN": "RZECZOWNIK",
        "ADJ": "PRZYMIOTNIK",
        "ADV": 'PRZYSŁÓWEK',
    }

    def __init__(self, wn_path):
        """Load the wordnet at ``wn_path`` and build the ``(lemma, POS)`` index."""
        self.wn = plwn.load(wn_path)
        self.senses = self.wn.lexical_units()

        self.index_by_lemma = defaultdict(set)

        # NOTE(review): lexical_units() is called a second time rather than
        # reusing self.senses; kept as-is because its return type (reusable
        # sequence vs. one-shot iterator) is not visible here.
        for lexicalunit in self.wn.lexical_units():
            self.index_by_lemma[
                (lexicalunit.lemma, lexicalunit.pos.short_value.upper())
            ].add(lexicalunit)

    def get_senses(self, lemma=None, pos=None):
        """Return the lexical units recorded for ``lemma`` with part of speech ``pos``.

        Args:
            lemma: base form to look up.
            pos: POS label; spaCy tags listed in ``pos_mapping`` are translated
                first, other values are used verbatim.

        Returns:
            The set of matching lexical units (an empty set when there are
            none). When ``lemma`` or ``pos`` is missing/empty, all lexical
            units of the wordnet are returned instead.
        """
        if lemma and pos:
            # adding spacy name handling
            plwn_pos = self.pos_mapping.get(pos, pos)
            # Use .get() rather than indexing: indexing a defaultdict inserts an
            # empty set for every unknown key, so the index would grow with each
            # failed lookup and callers would receive the stored object itself.
            return self.index_by_lemma.get((lemma, plwn_pos), set())
        return self.senses

    def get_sense_by_id(self, lemma, synid):
        """Find the lexical unit with ``lemma`` inside the synset ``synid``.

        Returns:
            A ``(synset, lexical_unit)`` tuple for the first unit of the synset
            whose lemma equals ``lemma``; ``None`` when the synset does not
            exist or contains no such unit.
        """
        try:
            synset = self.wn.synset_by_id(synid)
        except plwn.exceptions.SynsetNotFound:
            return None
        for lexicalunit in synset.lexical_units:
            if lexicalunit.lemma == lemma:
                return synset, lexicalunit
        return None

In [13]:
# Alternative source (disabled): the dated database dump.
# wordnet = WordNet('./data/plwn-15012022.db')
# Indexed wrapper used by the statistics cells below.
wordnet = WordNet('./default_model')

# NOTE(review): `wn` is loaded from the dated .db dump while `wordnet` wraps
# './default_model' - confirm the two are meant to come from different sources.
wn = plwn.load('./data/plwn-15012022.db')

In [13]:
def polysemy_stats(wordnet, data):
    """Count the wordnet senses of every (lemma, POS) pair found in ``data``.

    Args:
        wordnet: object exposing ``get_senses(lemma, pos)``.
        data: iterable of ``(name, doc)`` pairs; each ``doc`` exposes ``sents``,
            an iterable of sentences that are themselves iterables of tokens
            carrying ``lemma_`` and ``pos_``.

    Returns:
        Dict mapping ``(lemma, pos)`` to the number of senses returned by the
        wordnet (0 when it knows none). The document name is not used.
    """
    sense_counts = {}
    for _name, doc in data:
        for sentence in doc.sents:
            for tok in sentence:
                key = (tok.lemma_, tok.pos_)
                sense_counts[key] = len(wordnet.get_senses(*key))
    return sense_counts

In [None]:
for artist

In [14]:
import re
def clean_song_text(lyrics: str) -> str:
    """Strip scraper boilerplate from a song's lyrics and lowercase them.

    Steps, in order:
      1. Drop everything up to and including the first occurrence of the word
         "Lyrics" (the page header). Text without that marker is kept whole.
      2. Lowercase the text.
      3. Remove the trailing "<number>Embed" footer (the number is optional).
         Only a footer at the very end is removed, so numbers inside the
         lyrics no longer truncate the song.
      4. Remove anything in square brackets, e.g. "[Verse 1]".

    Args:
        lyrics: raw lyrics text as scraped.

    Returns:
        The cleaned, lowercased lyrics.
    """
    header_marker = 'Lyrics'
    marker_pos = lyrics.find(header_marker)
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently chop off the first characters of the song.
    if marker_pos != -1:
        lyrics = lyrics[marker_pos + len(header_marker):]
    lyrics = lyrics.lower()
    # Anchored at the end of the text so that only the footer is stripped.
    lyrics = re.sub(r'\d*embed\s*$', '', lyrics)
    # Non-greedy so each bracketed tag is removed separately.
    return re.sub(r'\[.*?\]', '', lyrics)

In [15]:
# import one of the Polish artists
# concatenate all of their songs and put them into a single doc
# just destroyed the first one from the list!
import os
import pickle
# Directory holding one pickled artist object per file.
data_path = Path('scraped_data/artists_pl/')
# NOTE(review): os.listdir returns entries in arbitrary order, so index 1 is
# not guaranteed to select the same artist on every machine.
artists = os.listdir(data_path)
artist_path = artists[1]
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by our own scraper.
# The context manager closes the file handle, which the bare open() leaked.
with open(data_path / artist_path, 'rb') as artist_file:
    artist = pickle.load(artist_file)
# Clean each song, join them with newlines and parse the result as one document.
all_songs = '\n'.join(clean_song_text(song.lyrics) for song in artist.songs)
cur_doc = nlp(all_songs)



In [37]:
# Wrap the single parsed document in the (name, doc) list shape expected by
# polysemy_stats; the name is a placeholder.
data = [('dummy_name', cur_doc)]
stats_with_zeros = polysemy_stats(wordnet, data)
# Keep only the (lemma, POS) pairs for which the wordnet has at least one sense.
stats = {k:v for k,v in stats_with_zeros.items() if v}


In [33]:
# Rank the (lemma, POS) pairs by number of senses, most polysemous first.
sorted_stats = sorted(stats.items(), key=lambda i:i[1], reverse=True )
sorted_stats

[(('mieć', 'VERB'), 30),
 (('iść', 'VERB'), 30),
 (('czysty', 'ADJ'), 29),
 (('ciągnąć', 'VERB'), 29),
 (('odbić', 'VERB'), 27),
 (('brać', 'VERB'), 26),
 (('wziąć', 'VERB'), 25),
 (('zostawić', 'VERB'), 25),
 (('wychodzić', 'VERB'), 25),
 (('wyjście', 'NOUN'), 25),
 (('wybijać', 'VERB'), 25),
 (('pozostawić', 'VERB'), 24),
 (('stawiać', 'VERB'), 24),
 (('lecieć', 'VERB'), 24),
 (('wyjść', 'VERB'), 24),
 (('bić', 'VERB'), 23),
 (('beat', 'VERB'), 23),
 (('zejść', 'VERB'), 23),
 (('dojść', 'VERB'), 22),
 (('ciężki', 'ADJ'), 22),
 (('wejście', 'NOUN'), 22),
 (('przejść', 'VERB'), 21),
 (('słaby', 'ADJ'), 21),
 (('linia', 'NOUN'), 21),
 (('wbijać', 'VERB'), 21),
 (('przechodzić', 'VERB'), 21),
 (('mocny', 'ADJ'), 20),
 (('cut', 'NOUN'), 20),
 (('przebić', 'VERB'), 20),
 (('chodzić', 'VERB'), 19),
 (('palić', 'VERB'), 19),
 (('blok', 'NOUN'), 19),
 (('otwarty', 'ADJ'), 19),
 (('daleki', 'ADJ'), 18),
 (('ciężko', 'ADV'), 18),
 (('bliski', 'ADJ'), 17),
 (('stać', 'VERB'), 17),
 (('zbierać', 

In [22]:
type(None)

NoneType

In [None]:
polysemy_stats(wordnet, data)
# disambiguate and count in how many senses a given word appears in the corpus

"""
dokument                    słowo           domena                                      znaczenie   lemat   ile_sensów_w_korpusie   abstrakcyjny_hiperonim  aspekty wydźwięk    emocje  wartości_fundamentalne
OPS_PSL_25.01.2018_O_M_m	zlecenie	    zdarzenia
OPS_PSL_25.01.2018_O_M_m	Sądu	        grupy ludzi i rzeczy
OPS_PSL_25.01.2018_O_M_m	Rejonowego	    przymiotniki relacyjne (rzeczownikowe)
OPS_PSL_25.01.2018_O_M_m	Wydziału	    związek miedzy ludźmi, rzeczami lub ideami
OPS_PSL_25.01.2018_O_M_m	Karnego	        przymiotniki jakościowe
OPS_PSL_25.01.2018_O_M_m	dnia	        czas i stosunki czasowe
OPS_PSL_25.01.2018_O_M_m	obserwacji	    związane z myśleniem
OPS_PSL_25.01.2018_O_M_m	sądowo	        przymiotniki relacyjne (rzeczownikowe)
OPS_PSL_25.01.2018_O_M_m	psychiatrycznej	przymiotniki relacyjne (rzeczownikowe)
OPS_PSL_25.01.2018_O_M_m	terminie	    czas i stosunki czasowe
OPS_PSL_25.01.2018_O_M_m	Psychiatrii	    miejsca i umiejscowienie
OPS_PSL_25.01.2018_O_M_m	Sądowej	        przymiotniki relacyjne (rzeczownikowe)
OPS_PSL_25.01.2018_O_M_m	Instytutu	    związek miedzy ludźmi, rzeczami lub ideami
"""

# registers: extract them from the new dump

{('Sporządzona', 'ADJ'): 0,
 ('na', 'ADP'): 0,
 ('zlecenie', 'NOUN'): 6,
 ('sąd', 'NOUN'): 6,
 ('rejonowy', 'ADJ'): 1,
 ('dla', 'ADP'): 0,
 ('Warszawa', 'PROPN'): 0,
 ('Mokotowa', 'PROPN'): 0,
 ('VIII', 'ADJ'): 0,
 ('wydział', 'NOUN'): 4,
 ('karny', 'ADJ'): 4,
 ('dotyczyć', 'ADJ'): 0,
 ('[', 'PUNCT'): 0,
 ('OFFENDER', 'PROPN'): 0,
 (']', 'PUNCT'): 0,
 ('urodzonego', 'ADJ'): 0,
 ('dzień', 'NOUN'): 4,
 ('DATE', 'X'): 0,
 ('po', 'ADP'): 0,
 ('obserwacja', 'NOUN'): 5,
 ('sądowy', 'ADJ'): 4,
 ('-', 'PUNCT'): 0,
 ('psychiatryczny', 'ADJ'): 5,
 ('przeprowadzić', 'ADJ'): 0,
 ('w', 'ADP'): 0,
 ('termin', 'NOUN'): 3,
 ('od', 'ADP'): 0,
 ('03', 'ADJ'): 0,
 ('.', 'X'): 0,
 ('10', 'ADJ'): 0,
 ('.', 'PUNCT'): 0,
 ('2017', 'ADJ'): 0,
 ('rok', 'X'): 0,
 ('do', 'ADP'): 0,
 ('27', 'ADJ'): 0,
 ('.', 'ADJ'): 0,
 ('Klinice', 'PROPN'): 0,
 ('psychiatria', 'NOUN'): 2,
 ('instytut', 'NOUN'): 2,
 ('i', 'CCONJ'): 0,
 ('Neurologia', 'PROPN'): 0,
 ('opinia', 'NOUN'): 3,
 ('zostać', 'AUX'): 0,
 ('sporządzić', 'ADJ

In [3]:
def disamb_fwns(wordnet, data):
    """Assign each token the sense with the lowest variant number.

    Args:
        wordnet: object exposing ``get_senses(lemma, pos)``.
        data: iterable of ``(name, doc)`` pairs; each ``doc`` exposes ``sents``.

    Yields:
        ``(name, token, domain)`` for every token that has at least one sense,
        where ``domain`` is ``.domain.value`` of the sense with the smallest
        ``variant`` (the first such sense on ties). Tokens without senses are
        skipped.
    """
    for fname, doc in data:
        for sentence in doc.sents:
            for token in sentence:
                candidates = wordnet.get_senses(token.lemma_, token.pos_)
                if candidates:
                    first_sense = min(candidates, key=lambda sense: sense.variant)
                    yield fname, token, first_sense.domain.value

In [39]:
# Dump one "<document> <token> <domain>" line per disambiguated token.
# Explicit UTF-8: the domain labels and tokens contain Polish diacritics and
# the platform default encoding is locale dependent.
with open('domains.txt', 'w', encoding='utf-8') as ofile:
    for fname, token, domain in disamb_fwns(wordnet, data):
        ofile.write(f"{fname} {token} {domain}\n")

### *2nd Approach*
- Use disambiguation toolkit and apply it to CCL data
- Generate new stats based on disambiguation results

In [16]:
from xml.dom.minidom import parse
def load_data(datadir):
    """Lazily parse the WSD-annotated XML files found directly in ``datadir``.

    A file qualifies when its suffix is ``.xml`` and its stem contains
    ``.wsd`` (for example ``report.wsd.xml``).

    Args:
        datadir: directory to scan (a ``pathlib.Path``).

    Yields:
        ``(stem, dom)`` pairs where ``dom`` is the ``xml.dom.minidom`` document
        parsed from the file.
    """
    for xml_path in datadir.iterdir():
        is_wsd_xml = xml_path.suffix == '.xml' and '.wsd' in xml_path.stem
        if is_wsd_xml:
            yield (xml_path.stem, parse(str(xml_path)))

def load_data(datadir: Path):
    for fpath in datadir.iterdir():
        if fpath.suffix !=

In [5]:
def get_pos(ctag):
    """Map an NKJP tag onto a numeric part-of-speech code.

    Only the grammatical class (the text before the first ``:``) is
    considered.

    Returns:
        ``"2"`` for nouns, ``"4"`` for adjectives, ``"1"`` for verbs,
        ``"3"`` for adverbs, and ``None`` for every other class.
    """
    code_by_class = {
        **dict.fromkeys(('subst', 'depr', 'brev'), "2"),
        **dict.fromkeys(('adj', 'adja', 'adjp', 'adjc'), "4"),
        **dict.fromkeys(
            (
                'fin', 'bedzie', 'praet', 'impt',
                'inf', 'pcon', 'pant', 'imps',
                'winien', 'pred', 'pact', 'ppas',
                'ger',
            ),
            "1",
        ),
        'adv': "3",
    }
    grammatical_class = ctag.split(':')[0]
    return code_by_class.get(grammatical_class)

In [6]:
def get_lemma(base, props):
    """Return the lemma of a token.

    A token may belong to a multiword expression; in that case the text of the
    first property whose ``key`` attribute is ``mwe_base`` wins. Otherwise the
    text of the ``base`` node is used. ``None`` is returned when the ``base``
    text cannot be read.
    """
    for prop in props:
        if prop.getAttribute('key') == 'mwe_base':
            return prop.firstChild.nodeValue
    try:
        lemma = base.firstChild.nodeValue
    except Exception:
        lemma = None
    return lemma

In [7]:
def get_sense(props):
    """Return the text of the first ``sense:ukb:syns_id`` property.

    Best effort: ``None`` is returned when no such property exists or when any
    error occurs while reading the properties.
    """
    try:
        for prop in props:
            if prop.getAttribute('key') == 'sense:ukb:syns_id':
                return prop.firstChild.nodeValue
    except Exception:
        return None
    return None

In [8]:
def preproc(fpath, fdom):
    """Extract the sense-annotated tokens of a parsed document.

    Args:
        fpath: name of the document; not used by the function.
        fdom: DOM document whose ``tok`` elements are scanned.

    Yields:
        ``(lemma, pos, sense)`` for every token that has a sense property.
        Only the first ``lex`` element of a token is read, and it must have
        exactly two child nodes (base and ctag).
    """
    for tok in fdom.getElementsByTagName('tok'):
        first_lex = tok.getElementsByTagName('lex')[0]
        base_node, ctag_node = first_lex.childNodes
        tok_pos = get_pos(ctag_node.firstChild.nodeValue)

        tok_props = list(tok.getElementsByTagName('prop'))
        tok_lemma = get_lemma(base_node, tok_props)
        tok_sense = get_sense(tok_props)

        if not tok_sense:
            continue
        yield (tok_lemma, tok_pos, tok_sense)

In [9]:
def get_hypernym(synset):
    """Return the first object related to ``synset`` via a 'hiponimia' relation.

    Best effort: ``None`` is returned when there is no such relation or when
    any error occurs while reading the relations.
    """
    try:
        for relation, target in synset.related_pairs():
            if relation.name == 'hiponimia':
                return target
    except Exception:
        return None
    return None
        

In [10]:
def decode_pos(pos):
    """Translate a numeric part-of-speech code into its short name.

    Returns:
        ``'verb'`` for ``'1'``, ``'noun'`` for ``'2'``, ``'adv'`` for ``'3'``,
        ``'adj'`` for ``'4'``, and ``None`` for anything else.
    """
    names_by_code = (
        ('2', 'noun'),
        ('4', 'adj'),
        ('1', "verb"),
        ('3', "adv"),
    )
    for code, name in names_by_code:
        if pos == code:
            return name
    return None

In [14]:
# Peek at the usage notes of one sense of the noun 'dom'. The senses come back
# as a set, so which sense is picked here is arbitrary.
sense = next(iter(wordnet.get_senses('dom', 'NOUN')))
sense.usage_notes

('og.', 'og.')

In [None]:
# Export one TSV row per sense-annotated token: wordnet metadata of the
# disambiguated sense plus up to three levels of hypernyms. Missing values are
# written as the literal string 'null'.
data = load_data(Path('./data/preproc'))
header = [
    "filename", "lemma", "pos", "verb_aspect", "num_senses", "sentiment",
    "emotions", "valuations", "synset", "domain",
    "1st-hypernym", "2nd-hypernym", "3rd-hypernym", "register",
]
# Explicit UTF-8 so the Polish text is written identically on every platform
# and matches the UTF-8 default that pd.read_csv uses when reading it back.
with open('wordnet-analysis.tsv', 'w', encoding='utf-8') as ofile:
    ofile.write('\t'.join(header))
    ofile.write('\n')
    for fname, fdom in data:
        for lemma, pos, sense in preproc(fname, fdom):
            pos = decode_pos(pos)
            if pos is None:
                # The tag was not a noun/verb/adjective/adverb, so there is no
                # POS to look up; record missing values instead of crashing on
                # None.upper().
                pos = 'null'
                num_senses = 'null'
            else:
                num_senses = len(wordnet.get_senses(lemma, pos.upper()))

            try:
                synset, sense = wordnet.get_sense_by_id(lemma, sense)
            except Exception:
                # Best effort: get_sense_by_id returns None when the synset or
                # the lemma is unknown (unpacking then fails); skip the token.
                continue

            # Walk up to three steps along the hypernym chain.
            hypernym_1st = get_hypernym(synset)
            hypernym_2nd = get_hypernym(hypernym_1st) if hypernym_1st else None
            hypernym_3rd = get_hypernym(hypernym_2nd) if hypernym_2nd else None

            aspect = sense.verb_aspect or 'null'
            hypernym_1st = hypernym_1st.short_str() if hypernym_1st else 'null'
            hypernym_2nd = hypernym_2nd.short_str() if hypernym_2nd else 'null'
            hypernym_3rd = hypernym_3rd.short_str() if hypernym_3rd else 'null'

            sentiment = sense.emotion_markedness.name if sense.emotion_markedness else 'null'
            # Multi-valued fields are joined with '|'.
            emotions = '|'.join(emotion.name for emotion in sense.emotion_names) if sense.emotion_names else 'null'
            valuations = '|'.join(value.name for value in sense.emotion_valuations) if sense.emotion_valuations else 'null'

            ofile.write(f"{fname}\t{lemma}\t{pos}\t{aspect}\t{num_senses}\t{sentiment}\t{emotions}\t{valuations}\t{synset.short_str()}\t{sense.domain.value}\t{hypernym_1st}\t{hypernym_2nd}\t{hypernym_3rd}\t{sense.usage_notes}\n")


In [None]:
import pandas as pd

In [None]:
# Read the exported table back as a DataFrame.
# NOTE(review): this rebinds `data`, which earlier cells use for document
# collections. The saved `data.columns` output further down lists 9 columns
# while the export cell writes a 14-column header - the stored outputs look
# stale; re-run to confirm.
data = pd.read_csv('./wordnet-analysis.tsv', sep="\t")

In [None]:
data.columns

Index(['filename', 'lemma', 'pos', 'sentiment', 'emotions', 'valuations',
       'synset', 'domain', 'abstract_hypernym'],
      dtype='object')

In [None]:
data["domain"].value_counts()

przymiotniki jakościowe                                                   41770
czasowniki stanowe                                                        23428
ludzie                                                                    20091
PWN: all adverbs                                                          18181
czas i stosunki czasowe                                                   16130
zdarzenia                                                                 13637
związane z myśleniem                                                      12940
czasowniki mówienia, śpiewania itp.                                       12694
czasowniki myślenia (szeroko rozumianego)                                 12562
czasowniki oznacz. wydarzenie i działania społeczne i polityczne          12480
sytuacje statyczne (stany)                                                 9290
miejsca i umiejscowienie                                                   8745
związek miedzy ludźmi, rzeczami lub idea

In [None]:
# Count rows per (filename, domain) and pivot the domains into columns;
# combinations that never occur are filled with 0.
domain_stats = data.groupby(['filename', 'domain']).size().unstack(fill_value=0)

In [None]:
domain_stats

domain,PWN: all adverbs,cechy ludzi i zwierząt,cel działania,czas i stosunki czasowe,czasowniki akumulatywne,czasowniki delimitatywne,czasowniki jedzenia,czasowniki myślenia (szeroko rozumianego),"czasowniki mówienia, śpiewania itp.","czasowniki oznacz. kontakt fizyczny (dotykanie, uderzenie, rycie itp.)",...,sytuacje statyczne (stany),"uczucia, odczucia i emocje",wytwory ludzkie (nazwy),zdarzenia,zjawiska naturalne,"zmiana wielkości, temeraturym natężenia, itp.",zwierzęta,związane z myśleniem,związane z porozumiewaniem się,"związek miedzy ludźmi, rzeczami lub ideami"
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OPS_PSLG_07.10.2015_A_K_m,350,78,32,202,1,0,1,279,202,71,...,266,80,117,311,19,125,0,209,92,219
OPS_PSLS_17.05.2018_A_M_m,135,32,9,84,0,0,5,92,108,24,...,42,7,60,53,6,31,1,95,29,33
OPS_PSLS_27.10.2015_A_M_m,104,69,11,112,0,0,0,98,50,12,...,64,20,28,133,5,64,1,148,50,85
OPS_PSL_03.03.2015_A_M_m,78,61,5,59,0,0,0,54,48,14,...,75,20,24,94,11,56,0,78,54,62
OPS_PSL_04.04.2016_A_K_m,112,34,7,99,0,0,5,84,114,19,...,29,19,46,59,6,42,0,87,47,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SO_V_PS_17.09.2010_A_M_m,37,28,1,37,0,0,12,24,21,11,...,24,6,12,22,3,33,0,22,7,24
SO_V_PS_22.05.2015_A_M_m,33,28,1,14,0,0,1,18,37,3,...,19,17,10,17,4,14,0,32,8,10
SO_V_PS_23.07.2012_O_M_m,95,32,1,77,0,0,5,53,87,17,...,23,8,47,76,8,30,1,56,36,76
SO_V_PS_26.11.2014_O_M_m,67,51,9,59,0,0,4,44,53,11,...,38,12,48,64,5,20,3,58,38,57


In [None]:
# Count rows per (filename, sentiment) and pivot the sentiment labels into
# columns; combinations that never occur are filled with 0.
sentiment_stats = data.groupby(['filename', 'sentiment']).size().unstack(fill_value=0)

In [None]:
sentiment_stats

sentiment,ambiguous,strong_negative,strong_positive,weak_negative,weak_positive
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OPS_PSLG_07.10.2015_A_K_m,413,165,73,192,416
OPS_PSLS_17.05.2018_A_M_m,82,26,11,55,157
OPS_PSLS_27.10.2015_A_M_m,164,50,28,47,211
OPS_PSL_03.03.2015_A_M_m,109,67,16,59,202
OPS_PSL_04.04.2016_A_K_m,88,37,19,78,132
...,...,...,...,...,...
SO_V_PS_17.09.2010_A_M_m,36,18,3,29,45
SO_V_PS_22.05.2015_A_M_m,15,17,4,30,38
SO_V_PS_23.07.2012_O_M_m,60,37,8,74,104
SO_V_PS_26.11.2014_O_M_m,79,47,6,40,113


In [6]:
import plwn
# Load the wordnet from 'default_model'; the dated .db dump is kept below as a
# disabled alternative.
wn = plwn.load('default_model')
# wn = plwn.load('./data/plwn-15012022.db')

In [7]:
from collections import defaultdict
# Group all lexical units by the name of their part of speech.
lexicalunits = defaultdict(list)
for lu in wn.lexical_units():
    lexicalunits[lu.pos.name].append(lu)

In [8]:
# Take the first verb as a sample unit to inspect (rebinds the loop variable `lu`).
lu = lexicalunits['verb'][0]

In [9]:
lu

<LexicalUnit id='d44b223d-aac4-11ed-aae5-0242ac130002' lemma='administrować' pos=<PoS.verb: 'czasownik'> variant=1>