## Teksta sarežģītības līmeņa klasifikācija

In [None]:
# Install the third-party packages this notebook needs (quiet mode).
# NOTE(review): no versions are pinned, so a later run may pull different releases.
!pip -q install -U datasets
!pip -q install stanza

In [None]:
import json
import pandas as pd
import torch
import re
import stanza
import math
import random
from datasets import Dataset

In [None]:
# Download the three input files into the working directory:
# exam texts (JSON), essays (CSV) and the word list used by the classifier (CSV).
!wget -q -O exam_texts.json https://raw.githubusercontent.com/LUMII-AILab/VTI-Data/refs/heads/main/lv-exams/NLG/VISC_LATV.json
!wget -q -O essays.csv https://lava.korpuss.lv/download/essays_20211214.csv
!wget -q -O top_words.csv https://raw.githubusercontent.com/IvoDz/lv-text-complexity/refs/heads/main/wordlist_LVK2022.csv

Sākotnējais teksta sarežģītības novērtēšanas algoritms

In [None]:
# Load the downloaded word list; init_classifier() later turns its rows into a dict.
# NOTE(review): presumably two columns (word, frequency) - confirm against the CSV.
words_df = pd.read_csv("top_words.csv")

In [None]:
# Stanza pipeline for Latvian with only the 'tokenize' processor enabled.
nlp = stanza.Pipeline(lang='lv', processors='tokenize')

# Module-level classifier state; both names are rebound by init_classifier().
word_freq = {}
MAX_LOGFREQ = 1.0

def init_classifier(words_df: pd.DataFrame):
    """Load the word table into the module-level classifier state.

    Rebinds two globals:
      * ``word_freq`` - a dict built from the rows of ``words_df`` (each row
        must therefore hold exactly two values: key and value);
      * ``MAX_LOGFREQ`` - ``log10`` of the largest value plus one, or
        ``log10(2)`` when the table is empty.
    """
    global word_freq, MAX_LOGFREQ
    word_freq = dict(words_df.values)
    largest_value = max(word_freq.values(), default=1)
    MAX_LOGFREQ = math.log10(largest_value + 1)

def tokenize_words_and_sentences(text: str):
    """Split ``text`` into lower-cased alphabetic words and sentence strings.

    Runs the module-level ``nlp`` pipeline once and returns a tuple
    ``(words, sents)``: ``words`` holds the lower-cased text of every token
    whose text is purely alphabetic, ``sents`` holds the text of every
    sentence, both in document order.
    """
    parsed = nlp(text)
    words = []
    sents = []
    for sentence in parsed.sentences:
        sents.append(sentence.text)
        for token in sentence.words:
            # Numbers, punctuation and mixed tokens are not counted as words.
            if token.text.isalpha():
                words.append(token.text.lower())
    return words, sents

def word_rarity(word: str) -> float:
    """Return a rarity score for one word.

    The word is looked up lower-cased in ``word_freq`` (missing words count
    as 1). The score is ``1 - log10(value + 1) / MAX_LOGFREQ``, with negative
    results replaced by 0.0, so larger table values give lower scores.
    """
    count = word_freq.get(word.lower(), 1)
    scaled_log = math.log10(count + 1) / MAX_LOGFREQ
    rarity = 1.0 - scaled_log
    return rarity if rarity > 0.0 else 0.0

def avg_word_rarity(words: list[str]) -> float:
    """Return the mean ``word_rarity`` over the words longer than three characters.

    Words of three characters or fewer are ignored; when none remain the
    result is 0.0.
    """
    scores = [word_rarity(w) for w in words if len(w) > 3]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

def classify(text: str, debug: bool=False) -> dict:
    """
    Classify ``text`` as "viegls" (easy), "vidējs" (medium) or "sarežģīts" (hard).

    Rules, applied in order:
        1. Blank text, or text for which the tokenizer returns no words, gets
           level "nederīgs" (invalid) with ``score`` set to None.
        2. At most 5 words, each either at most 3 characters long or with a
           ``word_freq`` value above 1000: "viegls".
        3. At most 6 words: decided by average word length (<= 6.0 "viegls",
           <= 7.5 "vidējs", longer is "vidējs" when the average rarity is
           below 0.5 and "sarežģīts" otherwise).
        4. Otherwise a weighted score is computed from
             - average word length divided by 7, capped at 1 (weight 0.35),
             - average word rarity (weight 0.2),
             - average sentence length in whitespace tokens divided by 20,
               capped at 1 (weight 0.25),
             - (word count - 2) / 18 clamped to [0, 1] (weight 0.2),
           and mapped to a level: < 0.5 "viegls", < 0.6 "vidējs", else "sarežģīts".

    Returns a dict with "text" (stripped input) and "level". With ``debug`` set
    the dict also carries "score" (0.0 for rule 2, None for rule 3) and, for
    rule 4, the rounded feature values.
    """
    text = text.strip()
    if not text:
        return {"text": text, "level": "nederīgs", "score": None}

    words, sents = tokenize_words_and_sentences(text)
    if not words:
        return {"text": text, "level": "nederīgs", "score": None}

    # Features shared by all of the rules below.
    wc = len(words)
    avg_wlen = sum(map(len, words)) / wc
    avg_slen = sum(len(s.split()) for s in sents if s.strip()) / len(sents)
    rarity = avg_word_rarity(words)
    length_score = max(0.0, min((wc - 2) / 18.0, 1.0))

    def is_basic(word):
        # A word is "basic" when its table value is above 1000 or it is very short.
        return word_freq.get(word, 0) > 1000 or len(word) <= 3

    # Rule 2: very short text made of basic words only.
    if wc <= 5 and all(map(is_basic, words)):
        easy = {"text": text, "level": "viegls"}
        if debug:
            easy["score"] = 0.0
        return easy

    # Rule 3: short text, judged on word length (and rarity for long words).
    if wc <= 6:
        if avg_wlen <= 6.0:
            lvl = "viegls"
        elif avg_wlen <= 7.5 or rarity < 0.5:
            lvl = "vidējs"
        else:
            lvl = "sarežģīts"
        short = {"text": text, "level": lvl}
        if debug:
            short["score"] = None
        return short

    # Rule 4: weighted score; the terms are added in a fixed order so the
    # floating-point result does not depend on the summation strategy.
    wlen_term = 0.35 * min(avg_wlen / 7.0, 1.0)
    rarity_term = 0.2 * rarity
    slen_term = 0.25 * min(avg_slen / 20.0, 1.0)
    size_term = 0.2 * length_score
    score = wlen_term + rarity_term + slen_term + size_term

    lvl = "viegls" if score < 0.5 else ("vidējs" if score < 0.6 else "sarežģīts")

    if not debug:
        return {"text": text, "level": lvl}
    return {
        "text": text,
        "level": lvl,
        "score": round(score, 4),
        "word_count": wc,
        "avg_word_length": round(avg_wlen, 2),
        "avg_sentence_length": round(avg_slen, 2),
        "rarity": round(rarity, 4),
        "length_score": round(length_score, 4),
    }

In [None]:
init_classifier(words_df) # initialise the classifier state from the loaded word list

Eseju tekstu sagatavošana / priekšapstrāde

In [None]:
def slice_from_second_sentence(text, max_words=20):
    """Return a fragment of ``text`` that starts at a random sentence after the first.

    The text is split after '.', '!' or '?' followed by whitespace. A start
    sentence is drawn uniformly from the second sentence onwards, and whole
    sentences are appended while the running word count stays within
    ``max_words``.

    Args:
        text: Input text.
        max_words: Largest number of whitespace-separated words allowed in
            the fragment (default 20).

    Returns:
        The selected sentences joined by single spaces, or None when the text
        has fewer than two sentences or the fragment would hold fewer than two
        words (this includes a start sentence that alone exceeds ``max_words``).
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)

    if len(sentences) < 2:
        return None

    # Index 1 is the second sentence. The lower bound used to be 2, which made
    # random.randint raise ValueError for texts with exactly two sentences.
    start_idx = random.randint(1, len(sentences) - 1)

    result_sentences = []
    total_words = 0

    for sentence in sentences[start_idx:]:
        sentence = sentence.strip()
        word_count = len(sentence.split())

        # Stop before the sentence that would push the fragment over the limit.
        if total_words + word_count > max_words:
            break

        result_sentences.append(sentence)
        total_words += word_count

    if total_words < 2:
        return None

    return ' '.join(result_sentences)

In [None]:
# Load only the corrected essay texts and flatten each one to a single-spaced line.
df = pd.read_csv('essays.csv', usecols=['corrected_text'])
df['corrected_text'] = (
    df['corrected_text']
    .astype(str)
    .str.replace(r'\r?\n', ' ', regex=True)   # line breaks -> spaces
    .str.replace(r'\*', '', regex=True)       # drop asterisks
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)     # collapse whitespace runs
)

# Keep essays with at least two whitespace-separated tokens, then the 350
# shortest by character count. The helper column is added with assign() so it
# is never written onto a filtered view of the frame.
df = (
    df[df['corrected_text'].str.split().str.len() >= 2]
    .assign(char_len=lambda frame: frame['corrected_text'].str.len())
    .sort_values(by='char_len')
    .head(350)
    .drop(columns='char_len')
    .copy()
)

# slice_from_second_sentence() draws from the global `random` generator; seed
# it so that re-running the notebook produces the same fragments (42 matches
# the random_state used for the pandas sampling in this notebook).
random.seed(42)
df['corrected_text'] = df['corrected_text'].apply(slice_from_second_sentence)
df = df.dropna().reset_index(drop=True)

In [None]:
# Label every essay fragment; each classify() result dict becomes one row.
essay_records = [classify(fragment) for fragment in df['corrected_text']]
classified_essays = pd.DataFrame(essay_records)
classified_essays

In [None]:
# Persist the labelled essay fragments (all columns, without the index).
classified_essays.to_csv("essays_labeled.csv", index=False)

Wikipedia tekstu sagatavošana

In [None]:
### taken from https://huggingface.co/datasets/RaivisDejus/latvian-text/blob/main/tools/wikipedia/GetWikipedia.py

from datasets import load_dataset
from tqdm import tqdm

# Snapshot date passed to the dataset loader and used in the output file name.
DATE = "20221120"
# Section headings at which an article is cut off (each listed with and without a leading space).
CUTOFF_SECTIONS = ['Atsauces un piezīmes', 'Atsauces', 'Ārējās saites', 'Literatūra', 'Skatīt arī',
                   ' Atsauces un piezīmes', ' Atsauces', ' Ārējās saites', ' Literatūra', ' Skatīt arī']

# NOTE(review): trust_remote_code=True lets the dataset repository's loading script run locally - confirm this source is trusted.
dataset = load_dataset('joelito/EU_Wikipedias', date=DATE, language="lv", split='train', trust_remote_code=True)
# Only the first 10000 entries are used.
subset = dataset.select(range(10000))

with open(f'wikipedia_{DATE}.txt', 'w', encoding='utf-8') as file:
    for entry in tqdm(subset):
        # Candidate cut positions: the full text length plus the position of
        # every cutoff heading that follows a blank line.
        cutoffs = [len(entry["text"])]
        for section in CUTOFF_SECTIONS:
            idx = entry["text"].find('\n\n' + section)
            if idx != -1:
                cutoffs.append(idx)

        # Write the title, then the text up to the earliest cut position.
        file.write(f'{entry["title"]}\n\n')
        file.write(f'{entry["text"][:min(cutoffs)]}\n\n\n')

print('Done')

In [None]:
# Read the exported Wikipedia text back in, one list item per line.
with open('wikipedia_20221120.txt', 'r', encoding='utf-8') as f:
    lines = list(f)

# Drop blank lines, then keep only lines with at least 10 whitespace-separated tokens.
non_empty_lines = [stripped for stripped in map(str.strip, lines) if stripped]
long_enough_lines = [candidate for candidate in non_empty_lines if len(candidate.split()) >= 10]

# Reproducible sample of up to 5000 lines.
sample_size = min(5000, len(long_enough_lines))
sampled_lines = (
    pd.Series(long_enough_lines)
    .sample(n=sample_size, random_state=42)
    .reset_index(drop=True)
)

df = pd.DataFrame(sampled_lines, columns=['text'])

In [None]:
# Character-class fragments for everything that is allowed to survive cleaning.
LATVIAN_CHARS = 'A-Za-zĀČĒĢĪĶĻŅŠŪŽāčēģīķļņšūž'
DIGITS = '0-9'
PUNCTUATION = r'\.\,\:\;\!\?\-\(\)\'\"'
# Negated class: any character that is not a listed letter, digit, punctuation mark or whitespace.
DISALLOWED_CHARS = r'[^' + LATVIAN_CHARS + DIGITS + PUNCTUATION + r'\s]'


def _has_three_alnum_tokens(line):
    """True when at least three whitespace-separated tokens are purely alphanumeric."""
    alnum_tokens = [tok for tok in line.split() if tok.isalnum()]
    return len(alnum_tokens) >= 3


# Flatten line breaks, remove disallowed characters, then normalise whitespace.
single_line = df['text'].astype(str).str.replace(r'\r?\n', ' ', regex=True)
allowed_only = single_line.str.replace(DISALLOWED_CHARS, '', regex=True)
df['text'] = allowed_only.str.strip().str.replace(r'\s+', ' ', regex=True)

df = df[df['text'].apply(_has_three_alnum_tokens)]

def truncate_words(text):
    """Return the first k words of ``text``, with k drawn at random.

    k is chosen uniformly from 3 up to the smaller of 20 and the number of
    whitespace-separated words. Texts with fewer than three words give None.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    if n_tokens < 3:
        return None
    upper = n_tokens if n_tokens < 20 else 20
    keep = random.randint(3, upper)
    return ' '.join(tokens[:keep])

# truncate_words() draws from the global `random` generator; seed it so the
# chosen fragment lengths are the same on every run (42 matches the
# random_state passed to the pandas sampling below).
random.seed(42)

# assign() builds a new frame instead of writing a column onto the
# boolean-filtered frame, which avoids pandas' chained-assignment warning.
df = (
    df.assign(text=df['text'].apply(truncate_words))
    .dropna()
    .reset_index(drop=True)
)

# Reproducible sample of up to 1000 fragments.
df = df.sample(n=min(1000, len(df)), random_state=42).reset_index(drop=True)

In [None]:
# Label every Wikipedia fragment and keep only the text and its level.
wiki_records = [classify(snippet) for snippet in df['text']]
classified_wiki = pd.DataFrame(wiki_records)[["text", "level"]]
classified_wiki.to_csv("wiki_labeled.csv", columns=["text", "level"], index=False)

Eksāmenu tekstu sagatavošana

In [None]:
# Parse the downloaded exam file.
with open('exam_texts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Take the text-fragment field of every exam item and flatten it to a single-spaced line.
raw_fragments = pd.DataFrame(data["LV_EXAM_SET"])["TEKSTA_FRAGMENTS"]
flat_fragments = raw_fragments.astype(str).str.replace(r'\r?\n', ' ', regex=True)
no_asterisks = flat_fragments.str.replace(r'\*', '', regex=True)
clean_fragments = no_asterisks.str.strip().str.replace(r'\s+', ' ', regex=True)

# One-column frame named 'text', matching the other data sources.
df_exams = clean_fragments.to_frame(name='text')

In [None]:
def until_first_excl_or_period(text):
    """Return ``text`` up to and including its first '.' or '!', whichever comes first.

    Both marks are located before deciding where to cut. The previous loop
    returned at the first '.' even when a '!' appeared earlier in the text.
    Text containing neither mark is returned unchanged.
    """
    found = [pos for pos in (text.find('.'), text.find('!')) if pos != -1]
    if not found:
        return text
    return text[:min(found) + 1]

# Cut every exam fragment with until_first_excl_or_period().
df_exams['text'] = df_exams['text'].apply(until_first_excl_or_period)


def _word_count_in_range(fragment):
    """True for fragments of 2 to 20 whitespace-separated tokens (inclusive)."""
    n_tokens = len(str(fragment).split())
    return 2 <= n_tokens <= 20


df_exams = df_exams[df_exams['text'].apply(_word_count_in_range)].reset_index(drop=True)

In [None]:
# Label every exam fragment and write the result (all columns, no index).
exam_records = [classify(fragment) for fragment in df_exams['text']]
classified_exams = pd.DataFrame(exam_records)
classified_exams.to_csv("exams_labeled.csv", index=False)

Tekstu apvienošana vienā datu kopā

In [None]:
# Reload the three labelled sets from the CSV files written earlier in this notebook.
labeled_exams = pd.read_csv('exams_labeled.csv')
labeled_wiki = pd.read_csv('wiki_labeled.csv')
labeled_essays = pd.read_csv('essays_labeled.csv')

In [None]:
# Stack the three labelled sets into one frame (exams, then wiki, then essays).
# NOTE(review): ignore_index is not set, so each source keeps its own row labels and index values repeat.
labeled_full = pd.concat([labeled_exams, labeled_wiki, labeled_essays])

In [None]:
# Runs of whitespace, quotes, dashes, punctuation and brackets at either end of a text.
LEADING_EDGE_CHARS = r'^[\s\"\'\-\–\.\,\:\;\!\?\(\[]+'
TRAILING_EDGE_CHARS = r'[\s\"\'\-\.\–\,\:\;\!\?\)\]]+$'

trimmed_text = labeled_full['text'].astype(str).str.strip()
trimmed_text = trimmed_text.str.replace(LEADING_EDGE_CHARS, '', regex=True)
labeled_full['text'] = trimmed_text.str.replace(TRAILING_EDGE_CHARS, '', regex=True)

labeled_full['text']

In [None]:
# Re-label the cleaned texts and save only the text and level columns.
final_records = [classify(snippet) for snippet in labeled_full['text']]
final = pd.DataFrame(final_records)
final.to_csv("final.csv", columns=['text','level'], index=False)

In [None]:
# Horizontal bar chart of the number of texts per level.
from matplotlib import pyplot as plt
import seaborn as sns
final.groupby('level').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right borders of the axes.
plt.gca().spines[['top', 'right',]].set_visible(False)

Sākotnējie dati sagatavoti, vēlāk tie koriģēti manuāli.