# Morfologicka anotacia

In [1]:
%pip install stanza

Note: you may need to restart the kernel to use updated packages.


In [3]:
import stanza
import tqdm
import os
import pandas as pd
stanza.download('sk')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 22.7MB/s]                    
2023-10-27 09:50:39 INFO: Downloading default packages for language: sk (Slovak) ...


2023-10-27 09:50:40 INFO: File exists: C:\Users\jakub\stanza_resources\sk\default.zip
2023-10-27 09:50:42 INFO: Finished downloading models and saved to C:\Users\jakub\stanza_resources.


In [3]:
nlp_sk = stanza.Pipeline('sk', processors='tokenize,lemma,pos', verbose=False, use_gpu=True)

In [6]:
def get_text(path):
    files = os.listdir(path)
    #files.remove('.ipynb_checkpoints')
    docs = []
    files_names = []
    for i, fname in enumerate(files):
        with open(path+fname, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        file_name = fname[:-4]
        all_text = ' '.join(lines)
        docs.append(all_text)
        files_names.append(file_name)
    return docs

def get_file_names(path):
    files = os.listdir(path)
    #files.remove('.ipynb_checkpoints')
    #docs = []
    files_names = []
    for i, fname in enumerate(files):
        with open(path+fname, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        file_name = fname[:-4]
        all_text = ' '.join(lines)
        #docs.append(all_text)
        files_names.append(file_name)
    return files_names

In [7]:
texty_df = pd.DataFrame({'text':get_text('martak_texty/'), 'file_name':get_file_names('martak_texty/')})
texty_df.head()

Unnamed: 0,text,file_name
0,﻿\n \n \n \n \n \n \n \n \n \n \n pre 2. roční...,Chémia pre 2. ročník gymnázia so štvorročným š...
1,﻿SLOVENSKÝ\n JAZYK pre 3. ročník základných šk...,Slovenský jazyk pre 3. ročník ZŠ
2,﻿\n \n \n \n \n \n \n \n k HUPSOVMU\n šlabikár...,Pracovný zošit k Hupsovmu šlabikáru Lipka pre ...
3,﻿\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...,Dejepis pre 8. ročník ZŠ a 3. ročník gymnázia ...
4,﻿\n \n \n \n \n SLOVENSKÝ\n JAZYK pre 2. roční...,Slovenský jazyk pre 2. ročník ZŠ


## POS-taging

In [10]:
def tokenization(data):
    df_text = pd.DataFrame(columns=['document','word','lemma','pos'])
    for i in tqdm.notebook.trange(len(data),desc='Text tokenization'):
        try:
            doc = nlp_sk(data.loc[i, "text"])
        except AssertionError:
            continue
        else:
            filename = data.loc[i, "file_name"]
            for sentence in doc.sentences:
                for word in sentence.words:
                    df_text = pd.concat([df_text, pd.DataFrame.from_records([{ 
                        'document': filename,
                        'word': word.text,
                        'lemma': word.lemma,
                        'pos': word.pos
                    }])], ignore_index=True)
    return df_text

In [11]:
textMR = tokenization(texty_df)

Text tokenization:   0%|          | 0/33 [00:00<?, ?it/s]

In [24]:
textMR.to_csv('martak_texty/text_MR.csv', sep=';', encoding='utf8', decimal=',')

In [12]:
# count each lexical category
def morphostats(data):
    df_stat = pd.DataFrame(columns=['document','NOUN','ADJ','VERB','DET','ADP','PROPN','CCONJ','SCONJ','INTJ','ADV','PRON',
                                'AUX','NUM','PART','PUNCT','SPACE','X'])
    for doc in data['document'].unique():
        dfp = data.loc[data['document']==doc]
        df_stat = pd.concat([df_stat, pd.DataFrame.from_records([{ 
            'document': doc,
            'NOUN': len(dfp.loc[dfp['pos']=='NOUN']),
            'ADJ': len(dfp.loc[dfp['pos']=='ADJ']),
            'VERB': len(dfp.loc[dfp['pos']=='VERB']),
            'DET': len(dfp.loc[dfp['pos']=='DET']),
            'ADP': len(dfp.loc[dfp['pos']=='ADP']),
            'PROPN': len(dfp.loc[dfp['pos']=='PROPN']),
            'CCONJ': len(dfp.loc[dfp['pos']=='CCONJ']),
            'SCONJ': len(dfp.loc[dfp['pos']=='SCONJ']),
            'INTJ': len(dfp.loc[dfp['pos']=='INTJ']),
            'ADV': len(dfp.loc[dfp['pos']=='ADV']),
            'PRON': len(dfp.loc[dfp['pos']=='PRON']),
            'AUX': len(dfp.loc[dfp['pos']=='AUX']),
            'PART': len(dfp.loc[dfp['pos']=='PART']),
            'PUNCT': len(dfp.loc[dfp['pos']=='PUNCT']),
            'SPACE': len(dfp.loc[dfp['pos']=='SPACE']),
            'X': len(dfp.loc[dfp['pos']=='X']),
            'NUM': len(dfp.loc[dfp['pos']=='NUM'])
        }])], ignore_index=True)
    return df_stat

In [13]:
text_stats_MR = morphostats(textMR)

In [15]:
text_stats_MR.to_csv('martak_texty/text_stats_MR.csv', sep=';', encoding='utf8', decimal=',')

In [16]:
import re
import string
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jupyter-
[nltk_data]     lbenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
def analyze_senteces(df, cat):
    long_sent = 0
    short_sent = 0
    count_words = 0
    count_char = 0
    list_text = []
    for i in df.index:
        sentences = nltk.sent_tokenize(df.at[i,'text'])
        for sen in sentences:
            tex = re.sub('['+string.punctuation+']', '', str(sen))
            count_char += len(tex.replace(" ",""))
            rtext = re.sub('['+string.punctuation+']', '', str(sen)).split()
            for w in rtext:
                if w not in list_text:
                    list_text.append(w)
            if len(rtext)>=10:
                long_sent += 1
            else:
                short_sent += 1
            count_words += len(rtext)
    no_sent = long_sent+short_sent
    print(cat,':')
    print('avg_sent',count_words/no_sent)
    print('avg_word',count_char/count_words)
    print('short_sent',short_sent)
    print('long_sent',long_sent)
    print('no_sent',no_sent)

In [18]:
analyze_senteces(texty_df, 'annual_reports')

annual_reports :
avg_sent 11.735774618052798
avg_word 5.3473517964916235
short_sent 38901
long_sent 36633
no_sent 75534


# POS Tagging for complexity

In [25]:
def xposing(data):
    df_text = pd.DataFrame(columns=['lemma','xpos'])
    for i in tqdm.notebook.trange(len(data),desc='Text tokenization'):
        try:
            doc = nlp_sk(data.loc[i, "text"])
        except AssertionError:
            continue
        else:
            filename = data.loc[i, "file_name"]
            for sentence in doc.sentences:
                for word in sentence.words:
                    df_text = pd.concat([df_text, pd.DataFrame.from_records([{ 
                        'document': filename,
                        'lemma': word.lemma,
                        'pos': word.xpos
                    }])], ignore_index=True)
    return df_text

In [None]:
posMR = xposing(texty_df)

Text tokenization:   0%|          | 0/33 [00:00<?, ?it/s]

In [None]:
posMR = posMR.drop('xpos', axis=1)

In [None]:
posMR.to_csv('martak_texty/posMR.csv', sep=';', encoding='utf8', decimal=',', index = False)