# Overview of classical NLP techniques for text classification using Python libraries such as datasets, sklearn, and nltk.

In [1]:
%%capture
!pip install datasets fasttext "numpy<2" ftfy nltk pymorphy3 pymorphy3-dicts-uk optuna # for fasttext

Add a HF_TOKEN to run this notebook (key on the left)

### We would use a ukrainian news dataset for clasification task, consisting of approximately 61K news articles.

In [2]:
import datasets

dataset = datasets.load_dataset("shamotskyi/ukr_pravda_2y", split="train")
dataset

Dataset({
    features: ['art_id', 'date_published', 'tags', 'ukr_uri', 'ukr_title', 'ukr_author_name', 'ukr_text', 'ukr_tags', 'ukr_tags_full', 'rus_uri', 'rus_title', 'rus_author_name', 'rus_text', 'rus_tags', 'rus_tags_full', 'eng_uri', 'eng_title', 'eng_author_name', 'eng_text', 'eng_tags', 'eng_tags_full'],
    num_rows: 61629
})

### Sanity check for the data

In [3]:
dataset[0]

{'art_id': 7411594,
 'date_published': '2023-07-16',
 'tags': 'trivoga',
 'ukr_uri': 'https://www.pravda.com.ua/news/2023/07/16/7411594/',
 'ukr_title': '–£ –ö–∏—î–≤—ñ —Ç–∞ –Ω–∏–∑—Ü—ñ –æ–±–ª–∞—Å—Ç–µ–π –Ω–∞ 10 —Ö–≤–∏–ª–∏–Ω –æ–≥–æ–ª–æ—à—É–≤–∞–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É',
 'ukr_author_name': '–ö–∞—Ç–µ—Ä–∏–Ω–∞ –¢–∏—â–µ–Ω–∫–æ',
 'ukr_text': '–£ –ö–∏—î–≤—ñ —Ç–∞ —â–µ –Ω–∏–∑—Ü—ñ —Ä–µ–≥—ñ–æ–Ω—ñ–≤ –£–∫—Ä–∞—ó–Ω–∏ –æ–≥–æ–ª–æ—à—É–≤–∞–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É —á–µ—Ä–µ–∑ –∑–∞–≥—Ä–æ–∑—É –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–æ–≥–æ –æ–∑–±—Ä–æ—î–Ω–Ω—è. –î–∂–µ—Ä–µ–ª–æ: alerts.in.ua, –ü–æ–≤—ñ—Ç—Ä—è–Ω—ñ —Å–∏–ª–∏ –î–µ—Ç–∞–ª—ñ: –ë–ª–∏–∑—å–∫–æ 18:52 —É —Å—Ç–æ–ª–∏—Ü—ñ —Ç–∞ —ñ–Ω—à–∏—Ö —Ä–µ–≥—ñ–æ–Ω–∞—Ö –æ–≥–æ–ª–æ—Å–∏–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É. –ü—ñ–∑–Ω—ñ—à–µ –ü–æ–≤—ñ—Ç—Ä—è–Ω—ñ —Å–∏–ª–∏ –ø–æ–≤—ñ–¥–æ–º–∏–ª–∏, —â–æ –≤ –ß–µ—Ä–Ω—ñ–≥—ñ–≤—Å—å–∫—ñ–π, –ö–∏—ó–≤—Å—å–∫—ñ–π, –°—É–º—Å—å–∫—ñ–π, –ü–æ–ª—Ç–∞–≤—Å—å–∫—ñ–π, –ß–µ—Ä–∫–∞—Å—å–∫—ñ–π, –î–Ω—ñ–ø—Ä–æ–ø–µ—Ç—Ä–æ–≤—Å—å–∫—ñ–π, –î–æ–Ω–µ—Ü—

In [4]:
dataset = dataset.select_columns(["ukr_text", "ukr_tags", "tags", "ukr_tags_full"])
dataset[0]

{'ukr_text': '–£ –ö–∏—î–≤—ñ —Ç–∞ —â–µ –Ω–∏–∑—Ü—ñ —Ä–µ–≥—ñ–æ–Ω—ñ–≤ –£–∫—Ä–∞—ó–Ω–∏ –æ–≥–æ–ª–æ—à—É–≤–∞–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É —á–µ—Ä–µ–∑ –∑–∞–≥—Ä–æ–∑—É –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–æ–≥–æ –æ–∑–±—Ä–æ—î–Ω–Ω—è. –î–∂–µ—Ä–µ–ª–æ: alerts.in.ua, –ü–æ–≤—ñ—Ç—Ä—è–Ω—ñ —Å–∏–ª–∏ –î–µ—Ç–∞–ª—ñ: –ë–ª–∏–∑—å–∫–æ 18:52 —É —Å—Ç–æ–ª–∏—Ü—ñ —Ç–∞ —ñ–Ω—à–∏—Ö —Ä–µ–≥—ñ–æ–Ω–∞—Ö –æ–≥–æ–ª–æ—Å–∏–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É. –ü—ñ–∑–Ω—ñ—à–µ –ü–æ–≤—ñ—Ç—Ä—è–Ω—ñ —Å–∏–ª–∏ –ø–æ–≤—ñ–¥–æ–º–∏–ª–∏, —â–æ –≤ –ß–µ—Ä–Ω—ñ–≥—ñ–≤—Å—å–∫—ñ–π, –ö–∏—ó–≤—Å—å–∫—ñ–π, –°—É–º—Å—å–∫—ñ–π, –ü–æ–ª—Ç–∞–≤—Å—å–∫—ñ–π, –ß–µ—Ä–∫–∞—Å—å–∫—ñ–π, –î–Ω—ñ–ø—Ä–æ–ø–µ—Ç—Ä–æ–≤—Å—å–∫—ñ–π, –î–æ–Ω–µ—Ü—å–∫—ñ–π, –ó–∞–ø–æ—Ä—ñ–∑—å–∫—ñ–π –æ–±–ª–∞—Å—Ç—è—Ö —ñ—Å–Ω—É—î –∑–∞–≥—Ä–æ–∑–∞ –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–æ–≥–æ –æ–∑–±—Ä–æ—î–Ω–Ω—è. –û–¥–Ω–∞–∫ –≤–∂–µ –æ 19:02 —Ç—Ä–∏–≤–æ–≥—É —Å–∫–∞—Å—É–≤–∞–ª–∏ –º–∞–π–∂–µ –≤ —É—Å—ñ—Ö –æ–±–ª–∞—Å—Ç—è—Ö.',
 'ukr_tags': '–ø–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞',
 'tags': 'trivoga',
 'ukr_tags_full':

In [5]:
dataset[4]

{'ukr_text': '5 –≥—Ä—É–¥–Ω—è, —É —Ä—ñ—á–Ω–∏—Ü—é –ø—ñ–¥–ø–∏—Å–∞–Ω–Ω—è –ë—É–¥–∞–ø–µ—à—Ç—Å—å–∫–æ–≥–æ –º–µ–º–æ—Ä–∞–Ω–¥—É–º—É, —Ä–æ—Å—ñ–π—Å—å–∫—ñ –≤—ñ–π—Å—å–∫–∞ –≤–ª—É—á–∏–ª–∏ –≤ –µ–Ω–µ—Ä–≥–µ—Ç–∏—á–Ω—ñ –æ–±‚Äô—î–∫—Ç–∏ —É –ö–∏—ó–≤—Å—å–∫—ñ–π, –í—ñ–Ω–Ω–∏—Ü—å–∫—ñ–π —Ç–∞ –û–¥–µ—Å—å–∫—ñ–π –æ–±–ª–∞—Å—Ç—è—Ö –£–∫—Ä–∞—ó–Ω–∏. –î–∂–µ—Ä–µ–ª–æ: –ø—Ä–µ–º\'—î—Ä-–º—ñ–Ω—ñ—Å—Ç—Ä –£–∫—Ä–∞—ó–Ω–∏ –î–µ–Ω–∏—Å –®–º–∏–≥–∞–ª—å –ü—Ä—è–º–∞ –º–æ–≤–∞: "–¢–µ—Ä–æ—Ä–∏—Å—Ç–∏—á–Ω–∞ –∫—Ä–∞—ó–Ω–∞ –†–æ—Å—ñ—è —Å–ø—Ä–æ–±—É–≤–∞–ª–∞ –∑–Ω–æ–≤—É —Ä–µ–∞–ª—ñ–∑—É–≤–∞—Ç–∏ —Å–≤—ñ–π –∑–ª–æ—á–∏–Ω–Ω–∏–π –ø–ª–∞–Ω ‚Äî –∑–∞–Ω—É—Ä–∏—Ç–∏ –£–∫—Ä–∞—ó–Ω—É –≤ —Ç–µ–º—Ä—è–≤—É —Ç–∞ —Ö–æ–ª–æ–¥. –ó–∞–≤–¥—è–∫–∏ –≥–µ—Ä–æ—ó—á–Ω–∏–º –ó–°–£ —Ç–∞ —Å–∏–ª–∞–º –ü–ü–û –≤–æ—Ä–æ–≥—É –≤–∫–æ—Ç—Ä–µ –Ω–µ –≤–¥–∞–ª–æ—Å—è –∑–∞–¥—É–º–∞–Ω–µ. –ï–Ω–µ—Ä–≥–µ—Ç–∏—á–Ω–∞ —Å–∏—Å—Ç–µ–º–∞ –∫—Ä–∞—ó–Ω–∏ —Ñ—É–Ω–∫—Ü—ñ–æ–Ω—É—î —Ç–∞ –∑–∞–ª–∏—à–∞—î—Ç—å—Å—è —Ü—ñ–ª—ñ—Å–Ω–æ—é. –ë—É–ª–∏ –≤–ª—É—á–∞–Ω–Ω—è –≤ –µ–Ω–µ—Ä–≥–æ–æ–æ–±‚Äô—î–∫—Ç–∏ —É –ö–∏—ó–≤—Å—å–∫—ñ–π, –í—ñ–Ω–Ω–∏—Ü—å–∫—ñ–π, –û–¥–µ—Å—å–∫—ñ–π 

In [6]:
# first basic filter
dataset = dataset.filter(lambda x: isinstance(x["ukr_text"], str))
dataset = dataset.filter(lambda x: isinstance(x["ukr_tags"], str))
dataset

Dataset({
    features: ['ukr_text', 'ukr_tags', 'tags', 'ukr_tags_full'],
    num_rows: 56131
})

In [7]:
dataset = dataset.map(lambda x: {'ukr_tags': [i.strip() for i in x['ukr_tags'].split(',')]}, num_proc=8)
dataset[4]

{'ukr_text': '5 –≥—Ä—É–¥–Ω—è, —É —Ä—ñ—á–Ω–∏—Ü—é –ø—ñ–¥–ø–∏—Å–∞–Ω–Ω—è –ë—É–¥–∞–ø–µ—à—Ç—Å—å–∫–æ–≥–æ –º–µ–º–æ—Ä–∞–Ω–¥—É–º—É, —Ä–æ—Å—ñ–π—Å—å–∫—ñ –≤—ñ–π—Å—å–∫–∞ –≤–ª—É—á–∏–ª–∏ –≤ –µ–Ω–µ—Ä–≥–µ—Ç–∏—á–Ω—ñ –æ–±‚Äô—î–∫—Ç–∏ —É –ö–∏—ó–≤—Å—å–∫—ñ–π, –í—ñ–Ω–Ω–∏—Ü—å–∫—ñ–π —Ç–∞ –û–¥–µ—Å—å–∫—ñ–π –æ–±–ª–∞—Å—Ç—è—Ö –£–∫—Ä–∞—ó–Ω–∏. –î–∂–µ—Ä–µ–ª–æ: –ø—Ä–µ–º\'—î—Ä-–º—ñ–Ω—ñ—Å—Ç—Ä –£–∫—Ä–∞—ó–Ω–∏ –î–µ–Ω–∏—Å –®–º–∏–≥–∞–ª—å –ü—Ä—è–º–∞ –º–æ–≤–∞: "–¢–µ—Ä–æ—Ä–∏—Å—Ç–∏—á–Ω–∞ –∫—Ä–∞—ó–Ω–∞ –†–æ—Å—ñ—è —Å–ø—Ä–æ–±—É–≤–∞–ª–∞ –∑–Ω–æ–≤—É —Ä–µ–∞–ª—ñ–∑—É–≤–∞—Ç–∏ —Å–≤—ñ–π –∑–ª–æ—á–∏–Ω–Ω–∏–π –ø–ª–∞–Ω ‚Äî –∑–∞–Ω—É—Ä–∏—Ç–∏ –£–∫—Ä–∞—ó–Ω—É –≤ —Ç–µ–º—Ä—è–≤—É —Ç–∞ —Ö–æ–ª–æ–¥. –ó–∞–≤–¥—è–∫–∏ –≥–µ—Ä–æ—ó—á–Ω–∏–º –ó–°–£ —Ç–∞ —Å–∏–ª–∞–º –ü–ü–û –≤–æ—Ä–æ–≥—É –≤–∫–æ—Ç—Ä–µ –Ω–µ –≤–¥–∞–ª–æ—Å—è –∑–∞–¥—É–º–∞–Ω–µ. –ï–Ω–µ—Ä–≥–µ—Ç–∏—á–Ω–∞ —Å–∏—Å—Ç–µ–º–∞ –∫—Ä–∞—ó–Ω–∏ —Ñ—É–Ω–∫—Ü—ñ–æ–Ω—É—î —Ç–∞ –∑–∞–ª–∏—à–∞—î—Ç—å—Å—è —Ü—ñ–ª—ñ—Å–Ω–æ—é. –ë—É–ª–∏ –≤–ª—É—á–∞–Ω–Ω—è –≤ –µ–Ω–µ—Ä–≥–æ–æ–æ–±‚Äô—î–∫—Ç–∏ —É –ö–∏—ó–≤—Å—å–∫—ñ–π, –í—ñ–Ω–Ω–∏—Ü—å–∫—ñ–π, –û–¥–µ—Å—å–∫—ñ–π 

In [8]:
count_tags = {}
for tags in dataset['ukr_tags']:
    for tag in tags:
        if tag not in count_tags:
            count_tags[tag] = 0
        count_tags[tag] += 1

count_tags = dict(sorted(count_tags.items(), key=lambda item: item[1], reverse=True))
print("Number of unique tags:", len(count_tags))
count_tags

Number of unique tags: 984


{'–≤—ñ–π–Ω–∞': 24880,
 '–†–æ—Å—ñ—è': 16277,
 '–£–∫—Ä–∞—ó–Ω–∞': 5899,
 '–ó–µ–ª–µ–Ω—Å—å–∫–∏–π': 3832,
 '–°–®–ê': 3744,
 '–æ–∫—É–ø–∞—Ü—ñ—è': 2925,
 '–∑–±—Ä–æ—è': 2637,
 '–ó–±—Ä–æ–π–Ω—ñ —Å–∏–ª–∏': 2510,
 '–ü—É—Ç—ñ–Ω': 2155,
 '–æ–±—Å—Ç—Ä—ñ–ª': 2127,
 '–Ñ–°': 1906,
 '–ö–∏—ó–≤': 1881,
 '–•–µ—Ä—Å–æ–Ω—Å—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 1641,
 '–ì–µ–Ω—à—Ç–∞–±': 1609,
 '–î–æ–Ω–µ—Ü—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 1360,
 '–ù—ñ–º–µ—á—á–∏–Ω–∞': 1338,
 '–ù–ê–¢–û': 1338,
 '–ë—ñ–ª–æ—Ä—É—Å—å': 1333,
 '–°–ë–£': 1312,
 '—Å–∞–Ω–∫—Ü—ñ—ó': 1295,
 '–±–µ–∑–ø—ñ–ª–æ—Ç–Ω–∏–∫–∏': 1227,
 '–í–µ–ª–∏–∫–∞ –ë—Ä–∏—Ç–∞–Ω—ñ—è': 1182,
 '–ü–æ–ª—å—â–∞': 1114,
 '—Ä–æ–∑–≤—ñ–¥–∫–∞': 1112,
 '–•–∞—Ä–∫—ñ–≤—Å—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 1060,
 '–ö—Ä–∏–º': 1050,
 '–≤–∏–±—É—Ö': 1037,
 '–ú–∞—Ä—ñ—É–ø–æ–ª—å': 964,
 '–∂–µ—Ä—Ç–≤–∏': 954,
 '–¥–æ–ø–æ–º–æ–≥–∞ –£–∫—Ä–∞—ó–Ω—ñ': 926,
 '–õ—É–≥–∞–Ω—Å—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 901,
 '–ó–∞–ø–æ—Ä—ñ–∑—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 867,
 '–•–µ—Ä—Å–æ–Ω': 860,
 '–°—É–º—Å—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 814,
 '–ö–∏—ó–≤—Å—å–∫–∞ –æ–±–ª–∞—Å—Ç—å': 803,

In [9]:
fasttext_lang_id_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
import fasttext
import os
if not os.path.exists("lid.176.bin"):
    import urllib.request
    urllib.request.urlretrieve(fasttext_lang_id_url, "lid.176.bin")
ft_lang_id = fasttext.load_model("lid.176.bin")

In [10]:
sentences_to_test = [
    "–•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω?",
    "Who holds this neighborhood?",
    "–•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω? Who holds this neighborhood?",
    "Who holds this neighborhood? –•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω? ",
    "Who holds this neighborhood? –•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω? Maybe some kind of a dog?"
]

for sentence in sentences_to_test:
    print(sentence)
    print(ft_lang_id.predict(sentence, k=3))
    print("=====")


–•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω?
(('__label__uk', '__label__el', '__label__ar'), array([9.99939144e-01, 3.24452776e-05, 3.03752004e-05]))
=====
Who holds this neighborhood?
(('__label__en', '__label__hu', '__label__fr'), array([0.9793033 , 0.00456449, 0.00240685]))
=====
–•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω? Who holds this neighborhood?
(('__label__uk', '__label__en', '__label__el'), array([0.86032069, 0.02375383, 0.01937906]))
=====
Who holds this neighborhood? –•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω? 
(('__label__uk', '__label__en', '__label__el'), array([0.86032081, 0.0237538 , 0.01937906]))
=====
Who holds this neighborhood? –•—Ç–æ —Ç—Ä–∏–º–∞—î —Ü–µ–π —Ä–∞–π–æ–Ω? Maybe some kind of a dog?
(('__label__uk', '__label__en', '__label__ru'), array([0.56096339, 0.22342798, 0.03072331]))
=====


In [11]:
print("Before filtering:", len(dataset))

wrong_language = dataset.filter(lambda x: ft_lang_id.predict(x["ukr_text"].replace("\n", " "))[0][0] != "__label__uk", num_proc=4)
wrong_language



Before filtering: 56131


Filter (num_proc=4):   0%|          | 0/56131 [00:00<?, ? examples/s]

Dataset({
    features: ['ukr_text', 'ukr_tags', 'tags', 'ukr_tags_full'],
    num_rows: 4
})

In [12]:
from IPython.display import display
display(wrong_language[3])
display(wrong_language[2])

#print(wrong_language[2]["ukr_text"])

{'ukr_text': '–ü—Ä–µ–∑–∏–¥–µ–Ω—Ç –£–∫—Ä–∞—ó–Ω–∏ –í–æ–ª–æ–¥–∏–º–∏—Ä –ó–µ–ª–µ–Ω—Å—å–∫–∏–π –∑–∞–∫–ª–∏–∫–∞–≤ —Å–æ–ª–¥–∞—Ç—ñ–≤ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –æ–∫—É–ø–∞–Ω—Ç–∞ –∑–∞–∫—ñ–Ω—á–∏—Ç–∏ –≤—ñ–π–Ω—É —Ç–∞ –ø–æ–≤–µ—Ä–Ω—É—Ç–∏—Å—è –¥–æ–¥–æ–º—É, –±–æ —É–∫—Ä–∞—ó–Ω—Å—å–∫–∏–π –Ω–∞—Ä–æ–¥ —Å—Ç–æ—è—Ç–∏–º–µ –¥–æ –∫—ñ–Ω—Ü—è. –î–∂–µ—Ä–µ–ª–æ: –∑–≤–µ—Ä–Ω–µ–Ω–Ω—è –≥–ª–∞–≤–∏ –¥–µ—Ä–∂–∞–≤–∏ –≤—ñ–¥ 9 –±–µ—Ä–µ–∑–Ω—è –ü—Ä—è–º–∞ –º–æ–≤–∞ (–ó–µ–ª–µ–Ω—Å—å–∫–∏–π –≥–æ–≤–æ—Ä–∏–≤ —Ä–æ—Å—ñ–π—Å—å–∫–æ—é) "–†–æ—Å—Å–∏–π—Å–∫–∏–µ —Å–æ–ª–¥–∞—Ç—ã! –£ –≤–∞—Å –µ—â—ë –µ—Å—Ç—å —à–∞–Ω—Å –≤—ã–∂–∏—Ç—å. –ü–æ—á—Ç–∏ –¥–≤–µ –Ω–µ–¥–µ–ª–∏ –Ω–∞—à–µ–≥–æ —Å–æ–ø—Ä–æ—Ç–∏–≤–ª–µ–Ω–∏—è –ø–æ–∫–∞–∑–∞–ª–∏ –≤–∞–º, —á—Ç–æ –º—ã –Ω–µ —Å–¥–∞–¥–∏–º—Å—è. –ß—Ç–æ –º—ã –±—É–¥–µ–º –≤–æ–µ–≤–∞—Ç—å, –ø–æ–∫–∞ –Ω–µ –≤–µ—Ä–Ω—ë–º —Å–≤–æ—é –∑–µ–º–ª—é –∏ –ø–æ–∫–∞ –Ω–µ –æ—Ç–≤–µ—Ç–∏–º —Å–ø–æ–ª–Ω–∞ –∑–∞ –≤—Å–µ—Ö –Ω–∞—à–∏—Ö —É–±–∏—Ç—ã—Ö –ª—é–¥–µ–π. –ó–∞ —É–±–∏—Ç—ã—Ö –¥–µ—Ç–µ–π. –í—ã –µ—â—ë –º–æ–∂–µ—Ç–µ —Å–ø–∞—Å—Ç–∏—Å—å. –ï—Å–ª–∏ –ø—Ä–æ—Å—Ç–æ —É–π–¥—ë—Ç–µ. –ù–µ –≤–µ—Ä—å—Ç–µ —Å–

{'ukr_text': '–†–æ—Å—ñ–π—Å—å–∫–∏–π –∑–∞–≥–∞—Ä–±–Ω–∏–∫ —É —Ç–µ–ª–µ—Ñ–æ–Ω–Ω—ñ–π —Ä–æ–∑–º–æ–≤—ñ –∑ –¥—Ä—É–∂–∏–Ω–æ—é –ø–æ—Ö–≤–∞–ª–∏–≤—Å—è, —â–æ –Ω–∞–∫—Ä–∞–≤ –¥–ª—è –Ω–µ—ó –∑ —É–∫—Ä–∞—ó–Ω—Å—å–∫–æ–≥–æ –±—É–¥–∏–Ω–∫—É "–∫–æ—Å–º–µ—Ç–∏–∫–∏, –∫—Ä–æ—Å—ñ–≤–∫–∏ —Ñ—ñ—Ä–º–æ–≤—ñ —ñ —è–∫—ñ—Å–Ω–∏—Ö —Ñ—É—Ç–±–æ–ª–æ–∫", –¥—Ä—É–∂–∏–Ω–∞ –æ–∫—É–ø–∞–Ω—Ç–∞ –ø–æ–ø—Ä–æ—Å–∏–ª–∞ –Ω–æ—É—Ç–±—É–∫ —ñ —Å–ø–æ—Ä—Ç–∏–≤–Ω–∏–π –∫–æ—Å—Ç—é–º. –î–∂–µ—Ä–µ–ª–æ: –°–ë–£, —è–∫–∞ –ø—É–±–ª—ñ–∫—É—î –ø–µ—Ä–µ—Ö–æ–ø–ª–µ–Ω—ñ —Ä–æ–∑–º–æ–≤–∏ —Ä–æ—Å—ñ–π—Å—å–∫–∏—Ö –∑–∞–≥–∞—Ä–±–Ω–∏–∫—ñ–≤ –î–µ—Ç–∞–ª—ñ: –û–∫—É–ø–∞–Ω—Ç —Ä–æ–∑–ø–æ–≤—ñ–≤, —â–æ –º–µ—à–∫–∞–Ω—Ü—ñ –¥–æ–º—É, —è–∫–∏–π –≤—ñ–Ω –≥—Ä–∞–±—É–≤–∞–≤, –∂–∏–ª–∏ –¥—É–∂–µ –≥–∞—Ä–Ω–æ ‚Äì —Ä–µ–º–æ–Ω—Ç —É –Ω–∏—Ö –∫—Ä–∞—Å–∏–≤–∏–π, —Ä–µ—á—ñ –≤—Å—ñ —è–∫—ñ—Å–Ω—ñ, –≤–æ–Ω–∏ –ø–∏–ª–∏ –≤—ñ—Ç–∞–º—ñ–Ω–∏ "–¥–æ—Ä–æ–≥—ñ", —ñ –Ω–∞–≤—ñ—Ç—å –º–∞–ª–∏ —Å–∞—É–Ω—É, —É —è–∫—ñ–π –æ–∫—É–ø–∞–Ω—Ç–∏ "–ø–∞—Ä—è—Ç—å—Å—è –≤–∂–µ –¥—Ä—É–≥–∏–π –¥–µ–Ω—å". –î–∞–ª—ñ –ø—Ä–∏–≤–æ–¥–∏–º–æ –¥—ñ–∞–ª–æ–≥ –æ–∫—É–ø–∞–Ω—Ç–∞ –ê–Ω–¥—Ä—ñ—è –∑ –π–æ–≥–æ 

In [13]:
dataset = dataset.filter(lambda x: ft_lang_id.predict(x["ukr_text"].replace("\n", " "))[0][0] == "__label__uk", num_proc=4)
print("After filtering:", len(dataset))
dataset

Filter (num_proc=4):   0%|          | 0/56131 [00:00<?, ? examples/s]

After filtering: 56127


Dataset({
    features: ['ukr_text', 'ukr_tags', 'tags', 'ukr_tags_full'],
    num_rows: 56127
})

In [14]:
# replace with <email>, <url>, <phone>
import re

def replace_sensitive_info(text):
    # Email pattern
    email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # URL pattern
    url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Phone number pattern
    phone = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'

    text = re.sub(email, '<email>', text)
    text = re.sub(url, '<url>', text)
    text = re.sub(phone, '<phone>', text)

    return text
dataset = dataset.map(lambda x: {"ukr_text": replace_sensitive_info(x["ukr_text"])}, num_proc=8)
print(dataset.filter(lambda x: "<email>" in x["ukr_text"], num_proc=4)[0])

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/56127 [00:00<?, ? examples/s]

{'ukr_text': '–ú–æ–±—ñ–ª—ñ–∑–∞—Ü—ñ—è –∂–∏—Ç–µ–ª—ñ–≤ –æ–∫—É–ø–æ–≤–∞–Ω–æ–≥–æ –ö—Ä–∏–º—É –Ω–∞ –≤—ñ–π–Ω—É –ø—Ä–æ—Ç–∏ –£–∫—Ä–∞—ó–Ω–∏ —î –≤–æ—î–Ω–Ω–∏–º –∑–ª–æ—á–∏–Ω–æ–º –†–æ—Å—ñ–π—Å—å–∫–æ—ó –§–µ–¥–µ—Ä–∞—Ü—ñ—ó, –ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä–∞ –ê–† –ö—Ä–∏–º —Ä–∞–¥–∏—Ç—å –∂–∏—Ç–µ–ª—è–º –ø—ñ–≤–æ—Å—Ç—Ä–æ–≤–∞ —É –≤–∏–ø–∞–¥–∫—É –ø—Ä–∏–∑–æ–≤—É –≤ —Ä–æ—Å—ñ–π—Å—å–∫—É –∞—Ä–º—ñ—é —ñ–Ω—Ñ–æ—Ä–º—É–≤–∞—Ç–∏ —ó—ó –ø—Ä–æ —Ü–µ –π –Ω–µ –≤—á–∏–Ω—è—Ç–∏ –∑–ª–æ—á–∏–Ω—ñ–≤ –ø—Ä–æ—Ç–∏ –£–∫—Ä–∞—ó–Ω–∏. –î–∂–µ—Ä–µ–ª–æ: –ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä–∞ –ê–≤—Ç–æ–Ω–æ–º–Ω–æ—ó –†–µ—Å–ø—É–±–ª—ñ–∫–∏ –ö—Ä–∏–º —Ç–∞ –º—ñ—Å—Ç–∞ –°–µ–≤–∞—Å—Ç–æ–ø–æ–ª—å, —ó—ó –∫–µ—Ä—ñ–≤–Ω–∏–∫ –Ü–≥–æ—Ä –ü–æ–Ω–æ—á–æ–≤–Ω–∏–π –ü—Ä—è–º–∞ –º–æ–≤–∞ –ü–æ–Ω–æ—á–æ–≤–Ω–æ–≥–æ: "–ú–æ–±—ñ–ª—ñ–∑–∞—Ü—ñ—è, —è–∫—É –ø—Ä–æ–≤–æ–¥–∏—Ç—å –†–§ –≤ –æ–∫—É–ø–æ–≤–∞–Ω–æ–º—É –ö—Ä–∏–º—É, ‚Äì —á–µ—Ä–≥–æ–≤–∏–π –≤–æ—î–Ω–Ω–∏–π –∑–ª–æ—á–∏–Ω–∏ –ø—Ä–æ—Ç–∏ –≥—Ä–æ–º–∞–¥—è–Ω –£–∫—Ä–∞—ó–Ω–∏, —â–æ –º–µ—à–∫–∞—é—Ç—å –Ω–∞ –ø—ñ–≤–æ—Å—Ç—Ä–æ–≤—ñ". –î–µ—Ç–∞–ª—ñ: –ö–µ—Ä—ñ–≤–Ω–∏–∫ –ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä–∏ –ê–†–ö –∑–∞–∑–Ω–∞—

In [15]:
# lowercase / uppercase?

dataset = dataset.map(lambda x: {"ukr_text": x["ukr_text"].lower()}, num_proc=8)

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

In [16]:
# check chars

def extract_chars(dataset):
    dataset = dataset.map(lambda x: {"chars": set(x["ukr_text"])}, num_proc=8)

    total_chars = set()
    for item in dataset:
        total_chars.update(item["chars"])
    return total_chars

total_chars = extract_chars(dataset)
total_chars


Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

{'\n',
 '\r',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '~',
 '¬£',
 '¬ß',
 '¬´',
 '\xad',
 '¬∞',
 '¬±',
 '¬∑',
 '¬ª',
 '¬ø',
 '√ó',
 '√ü',
 '√†',
 '√°',
 '√¢',
 '√£',
 '√§',
 '√•',
 '√¶',
 '√ß',
 '√®',
 '√©',
 '√™',
 '√´',
 '√¨',
 '√≠',
 '√Æ',
 '√Ø',
 '√∞',
 '√±',
 '√≥',
 '√¥',
 '√µ',
 '√∂',
 '√∏',
 '√π',
 '√∫',
 '√º',
 '√Ω',
 'ƒÅ',
 'ƒÉ',
 'ƒÖ',
 'ƒá',
 'ƒç',
 'ƒè',
 'ƒë',
 'ƒì',
 'ƒó',
 'ƒô',
 'ƒõ',
 'ƒü',
 'ƒ´',
 'ƒ±',
 'ƒ∑',
 'ƒº',
 'ƒæ',
 '≈Ç',
 '≈Ñ',
 '≈Ü',
 '≈à',
 '≈ë',
 '≈ô',
 '≈õ',
 '≈ü',
 '≈°',
 '≈£',
 '≈•',
 '≈´',
 '≈Ø',
 '≈±',
 '≈≥',
 '≈∫',
 '≈º',
 '≈æ',
 '»õ',
 '…ô',
 ' º',
 'Àó',
 'ÀÆ',
 'ÃÅ',
 'Ãá',
 'Ãà',
 'Ãì',
 'Ãß',
 'Ã∂',

In [17]:
len(total_chars)

591

In [18]:
dataset.filter(lambda x: "ü´°" in x["ukr_text"])[0]

Filter:   0%|          | 0/56127 [00:00<?, ? examples/s]

{'ukr_text': '—Ç–µ—Ö–Ω–æ–ª–æ–≥—ñ—á–Ω–∏–π —Ä–µ–ø–æ—Ä—Ç–µ—Ä new york times —Ä–∞–π–∞–Ω –º–∞–∫ –ø–æ–≤—ñ–¥–æ–º–∏–≤, —â–æ –∑ twitter –∑–≤—ñ–ª—å–Ω—è—é—Ç—å—Å—è —Å–æ—Ç–Ω—ñ —Å–ø—ñ–≤—Ä–æ–±—ñ—Ç–Ω–∏–∫—ñ–≤. –¥–∂–µ—Ä–µ–ª–æ: —Ä–∞–π–∞–Ω –º–∞–∫ —É twitter, bbc pink floyd\'s "wish you were here," employees signing off a video call while musk is pitching his vision, a wave of ü´° emojis.inside decision day at twitter where hundreds of people may have just resigned.(w/ @mikeisaac, @dmccabe & @kateconger) <url> –ø—Ä—è–º–∞ –º–æ–≤–∞ —Ä–∞–π–∞–Ω–∞ –º–∞–∫–∞: "–¥–µ–Ω—å —É—Ö–≤–∞–ª–µ–Ω–Ω—è —Ä—ñ—à–µ–Ω—å —É twitter, –∫–æ–ª–∏ —Å–æ—Ç–Ω—ñ –ª—é–¥–µ–π, –º–æ–∂–ª–∏–≤–æ, —â–æ–π–Ω–æ –∑–≤—ñ–ª—å–Ω–∏–ª–∏—Å—è". –¥–µ—Ç–∞–ª—ñ: twitter –ø–æ–≤—ñ–¥–æ–º–∏–≤ —Å–ø—ñ–≤—Ä–æ–±—ñ—Ç–Ω–∏–∫–∞–º, —â–æ –æ—Ñ—ñ—Å–Ω—ñ –±—É–¥—ñ–≤–ª—ñ –∫–æ–º–ø–∞–Ω—ñ—ó –±—É–¥–µ —Ç–∏–º—á–∞—Å–æ–≤–æ –∑–∞–∫—Ä–∏—Ç–æ –¥–æ 21 –ª–∏—Å—Ç–æ–ø–∞–¥–∞. –ø—Ä–∏—á–∏–Ω—É –ø–µ—Ä–µ—ó–∑–¥—É –Ω–µ –≤–∫–∞–∑–∞–Ω–æ, –∑–∞–∑–Ω–∞—á–∞—î bbc. –æ–≥–æ–ª–æ—à–µ–Ω–Ω—è –∑‚Äô—è–≤–∏–ª–æ—Å—è –Ω–∞ —Ç–ª—ñ –ø–

In [19]:
dataset.filter(lambda x: "√ü" in x["ukr_text"])[0]

Filter:   0%|          | 0/56127 [00:00<?, ? examples/s]

{'ukr_text': '–ª—ñ–¥–µ—Ä —î–≤—Ä–µ–π—Å—å–∫–æ—ó –≥—Ä–æ–º–∞–¥–∏ –∞–≤—Å—Ç—Ä—ñ—ó –æ—Å–∫–∞—Ä –¥–æ–π—á –∑–∞—è–≤–∏–≤, —â–æ —É —Å–µ—Ä–µ–¥—É –≤–Ω–æ—á—ñ –Ω–∞ —î–≤—Ä–µ–π—Å—å–∫—ñ–π —á–∞—Å—Ç–∏–Ω—ñ —Ü–µ–Ω—Ç—Ä–∞–ª—å–Ω–æ–≥–æ –∫–ª–∞–¥–æ–≤–∏—â–∞ –≤—ñ–¥–Ω—è —Å—Ç–∞–ª–∞—Å—è –ø–æ–∂–µ–∂–∞, –∞ –∑–æ–≤–Ω—ñ—à–Ω—ñ —Å—Ç—ñ–Ω–∏ —Ä–æ–∑–º–∞–ª—é–≤–∞–ª–∏ —Å–≤–∞—Å—Ç–∏–∫–æ—é. –ø–æ–∂–µ–∂–Ω–∞ —Å–ª—É–∂–±–∞ —Ç–∞ –ø–æ–ª—ñ—Ü—ñ—è –ø—Ä–æ–≤–æ–¥—è—Ç—å —Ä–æ–∑—Å–ª—ñ–¥—É–≤–∞–Ω–Ω—è. –¥–∂–µ—Ä–µ–ª–æ: "—î–≤—Ä–æ–ø–µ–π—Å—å–∫–∞ –ø—Ä–∞–≤–¥–∞" –∑ –ø–æ—Å–∏–ª–∞–Ω–Ω—è–º –Ω–∞ ap –¥–µ—Ç–∞–ª—ñ: –∑–∞ –π–æ–≥–æ —Å–ª–æ–≤–∞–º–∏, –ø–æ–∂–µ–∂–∞ —Å–ø–∞–ª–∏–ª–∞ –≤—Ö—ñ–¥–Ω–∏–π –≤–µ—Å—Ç–∏–±—é–ª—å –¥–æ —Ü–µ—Ä–µ–º–æ–Ω—ñ–∞–ª—å–Ω–æ—ó –∑–∞–ª–∏, –∞–ª–µ –Ω—ñ—Ö—Ç–æ –Ω–µ –ø–æ—Å—Ç—Ä–∞–∂–¥–∞–≤. –∑–∞ –π–æ–≥–æ —Å–ª–æ–≤–∞–º–∏, –ø–æ–∂–µ–∂–Ω–∞ —Å–ª—É–∂–±–∞ —Ç–∞ –ø–æ–ª—ñ—Ü—ñ—è –ø—Ä–æ–≤–æ–¥—è—Ç—å —Ä–æ–∑—Å–ª—ñ–¥—É–≤–∞–Ω–Ω—è. in der nacht wurde am j√ºdischen teil des zentralfriedhofs (iv. tor) ein brand gelegt. der vorraum der zeremonienhalle ist ausgebrannt. an au√üenmauern wurden

In [20]:
import ftfy
def fix_text(text):
    return ftfy.fix_text(text)

dataset = dataset.map(lambda x: {"ukr_text": fix_text(x["ukr_text"])}, num_proc=8)

total_chars = extract_chars(dataset)

print("After fixing:", len(total_chars))
total_chars

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

After fixing: 584


{'\n',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '~',
 '¬£',
 '¬ß',
 '¬´',
 '\xad',
 '¬∞',
 '¬±',
 '¬∑',
 '¬ª',
 '¬ø',
 '√ó',
 '√ü',
 '√†',
 '√°',
 '√¢',
 '√£',
 '√§',
 '√•',
 '√¶',
 '√ß',
 '√®',
 '√©',
 '√™',
 '√´',
 '√¨',
 '√≠',
 '√Æ',
 '√Ø',
 '√∞',
 '√±',
 '√≥',
 '√¥',
 '√µ',
 '√∂',
 '√∏',
 '√π',
 '√∫',
 '√º',
 '√Ω',
 'ƒÅ',
 'ƒÉ',
 'ƒÖ',
 'ƒá',
 'ƒç',
 'ƒè',
 'ƒë',
 'ƒì',
 'ƒó',
 'ƒô',
 'ƒõ',
 'ƒü',
 'ƒ´',
 'ƒ±',
 'ƒ∑',
 'ƒº',
 'ƒæ',
 '≈Ç',
 '≈Ñ',
 '≈Ü',
 '≈à',
 '≈ë',
 '≈ô',
 '≈õ',
 '≈ü',
 '≈°',
 '≈£',
 '≈•',
 '≈´',
 '≈Ø',
 '≈±',
 '≈≥',
 '≈∫',
 '≈º',
 '≈æ',
 '»õ',
 '…ô',
 'Àó',
 'ÀÆ',
 'ÃÅ',
 'Ãá',
 'Ãà',
 'Ãì',
 'Ãß',
 'Ã∂',
 'Œ¨',
 'Œ≠',

In [21]:
def basic_cleaning(text):
    text = text.replace("`", "'")
    text = text.replace(" º", "'")
    text = text.replace("‚Ä¶", "...")

    symbols = {
        "‚Äù": '"',
        "‚Äú": '"',
        "‚Äô": '"',
        "‚Äò": '"',
        "¬´": '"',
        "¬ª": '"',
        "‚Äì": "-",
        "‚Äî": "-",
        "‚Äï": "-",
    }
    for symbol, value in symbols.items():
        text = text.replace(symbol, value)
    return text

dataset = dataset.map(lambda x: {"ukr_text": basic_cleaning(x["ukr_text"])}, num_proc=8)
total_chars = extract_chars(dataset)
print("After basic cleaning:", len(total_chars))

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

After basic cleaning: 578


In [22]:
target_chats = set("–∞–±–≤–≥“ë–¥–µ—î–∂–∑–∏—ñ—ó–π–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—á—à—â—å—é—è' ") # remove emails, urls, phones

def normalize_text(text):
    # leave only allowed characters
    # replace with a space, then remove extra spaces
    for char in text:
        if char.lower() not in target_chats:
            text = text.replace(char, " ")

    text = " ".join(text.split())
    return text
dataset = dataset.map(lambda x: {"ukr_text": normalize_text(x["ukr_text"])}, num_proc=8)
total_chars = extract_chars(dataset)
print("After normalization:", len(total_chars))

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

After normalization: 35


In [23]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download required NLTK data
nltk.download('stopwords')

try:
    # Initialize Ukrainian stemmer and stopwords
    ukrainian_stopwords = set(stopwords.words('ukrainian'))
except:
    print("No ukrainian stopwords in nltk")


No ukrainian stopwords in nltk


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robinhad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
dataset[0]["ukr_text"]

'—É –∫–∏—î–≤—ñ —Ç–∞ —â–µ –Ω–∏–∑—Ü—ñ —Ä–µ–≥—ñ–æ–Ω—ñ–≤ —É–∫—Ä–∞—ó–Ω–∏ –æ–≥–æ–ª–æ—à—É–≤–∞–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É —á–µ—Ä–µ–∑ –∑–∞–≥—Ä–æ–∑—É –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–æ–≥–æ –æ–∑–±—Ä–æ—î–Ω–Ω—è –¥–∂–µ—Ä–µ–ª–æ –ø–æ–≤—ñ—Ç—Ä—è–Ω—ñ —Å–∏–ª–∏ –¥–µ—Ç–∞–ª—ñ –±–ª–∏–∑—å–∫–æ —É —Å—Ç–æ–ª–∏—Ü—ñ —Ç–∞ —ñ–Ω—à–∏—Ö —Ä–µ–≥—ñ–æ–Ω–∞—Ö –æ–≥–æ–ª–æ—Å–∏–ª–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω—É —Ç—Ä–∏–≤–æ–≥—É –ø—ñ–∑–Ω—ñ—à–µ –ø–æ–≤—ñ—Ç—Ä—è–Ω—ñ —Å–∏–ª–∏ –ø–æ–≤—ñ–¥–æ–º–∏–ª–∏ —â–æ –≤ —á–µ—Ä–Ω—ñ–≥—ñ–≤—Å—å–∫—ñ–π –∫–∏—ó–≤—Å—å–∫—ñ–π —Å—É–º—Å—å–∫—ñ–π –ø–æ–ª—Ç–∞–≤—Å—å–∫—ñ–π —á–µ—Ä–∫–∞—Å—å–∫—ñ–π –¥–Ω—ñ–ø—Ä–æ–ø–µ—Ç—Ä–æ–≤—Å—å–∫—ñ–π –¥–æ–Ω–µ—Ü—å–∫—ñ–π –∑–∞–ø–æ—Ä—ñ–∑—å–∫—ñ–π –æ–±–ª–∞—Å—Ç—è—Ö —ñ—Å–Ω—É—î –∑–∞–≥—Ä–æ–∑–∞ –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–æ–≥–æ –æ–∑–±—Ä–æ—î–Ω–Ω—è –æ–¥–Ω–∞–∫ –≤–∂–µ –æ —Ç—Ä–∏–≤–æ–≥—É —Å–∫–∞—Å—É–≤–∞–ª–∏ –º–∞–π–∂–µ –≤ —É—Å—ñ—Ö –æ–±–ª–∞—Å—Ç—è—Ö'

In [25]:
# check words

def extract_words(dataset):
    dataset = dataset.map(lambda x: {"words": set(x["ukr_text"].split(" "))}, num_proc=8)

    total_words = set()
    for item in dataset:
        total_words.update(item["words"])
    return total_words

total_words = extract_words(dataset)

print("Total words:", len(total_words))
total_words

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

Total words: 185678


{'–≤—Å—Ç–∞–≤–ª—è—Ç–∏',
 '–Ω–µ–∑–∞–¥–æ–≤—ñ–ª—å–Ω–æ–º—É',
 '–µ—à–µ–ª–æ–Ω–æ–≤–∞–Ω—É',
 '–Ω–µ–ø—Ä–∞—Ü—é—é—á—É',
 '—Å–∞–Ω–∂–∞—Ä—ñ–≤–∫–∏',
 '–ø—Ä–∏–º—ñ—Ç–∏–≤–Ω–∞',
 '–Ω–µ–ø—Ä–æ–Ω–∏–∫–Ω–∏–º–∏',
 '–ø—Ä–æ–ø–æ—Ä—Ü—ñ–π–Ω–µ',
 '–º–∞—Ç—ñ',
 '–∞–≤—ñ–∞–∫–≤–∏—Ç–æ–∫',
 "–±–µ–∑–∏–º'—è–Ω–µ",
 '–±–∞—Ä–º–µ–Ω–æ–º',
 '–Ω–∞–ø–ª—ñ—á–Ω–∏–∫—ñ–≤',
 '–±–ª–æ–∫—É—é—á—ñ',
 '–¥–∏–∫–∏—Ö',
 '–ø–æ–≤—Ç—Ä—è–Ω—É',
 '–æ—Å–∫–æ–ª–∫–æ–≤—ñ–π',
 '–æ–±–µ—Ä–µ–Ω—á—É–∫–æ–º',
 '–æ–ø–∞–ª—é–≤–∞–ª—å–Ω–æ–≥–æ',
 '–≤–∏–Ω–æ—à—É–≤–∞–ª–∏',
 '—Ä–µ–≥—É–ª—å–æ–≤–∞–Ω–æ–≥–æ',
 '—Å—Ç—Ä–æ–∫–æ–≤–∏–∫–∏',
 '—Å—Ç–æ—Ä–æ–∂–∏—Ç–∏',
 '–∫–æ–Ω—Å—Ç–∞–Ω—Ç–∞',
 '–ø–æ–ª–æ',
 '—á–µ—Ä–µ–º—Å—ñ',
 '—Ä–æ–∑–≥–ª—è–¥–∞—î—Ç–µ',
 '–¥—ñ–ª—î—Ä–±–µ–∫–∞',
 '–Ω–µ–∑–∞–∫–æ–Ω–Ω–æ–º—É',
 '–∑–µ–ª–µ–Ω–æ–º—É',
 '–æ—Ç—Ä–∏–º–∞—î—Ç–µ',
 '–¥–µ–º–æ–∫—Ä–∞—Ç–∏',
 '–æ–∫—Ç–æ–∫–æ–ø—Ç–µ—Ä',
 '–∞—Ç–µ—Å—Ç–∞—Ç',
 '–≥–∞—Ä–∞–Ω—Ç–∏—Ä—É–µ–º',
 '–º–æ—Ä–ø—ñ—Ö–∞–º',
 '–Ω–∞–π–∞–∫—Ç–∏–≤–Ω—ñ—à–æ–≥–æ',
 '–ø—Ä–∏–Ω—Ü–æ–º',
 "–∫–∞–º'—è–Ω—á–∞–Ω",
 '–¥–∏–≤–∞–Ω',
 '–ø–æ–ø–µ—Ä–µ–¥–∂–µ–Ω–Ω—è',
 '—â–∏—Ç–∞–º–∏',
 '–º–∞—Ä

In [26]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer(lang='uk')

def ukrainian_stem(word):
    """Get the normal form (lemma) of a Ukrainian word"""
    parsed = morph.parse(word)[0]
    return parsed.normal_form

for test_words in ["–ø—Ä–∞—Ü—é–≤–∞–≤", "–±–∞–∂–∞–Ω–æ–≥–æ", "—Ä–æ–±–æ—á–∞"]:
    print(test_words, "->", ukrainian_stem(test_words))


–ø—Ä–∞—Ü—é–≤–∞–≤ -> –ø—Ä–∞—Ü—é–≤–∞—Ç–∏
–±–∞–∂–∞–Ω–æ–≥–æ -> –±–∞–∂–∞–Ω–µ
—Ä–æ–±–æ—á–∞ -> —Ä–æ–±–æ—á–∏–π


In [27]:
dataset = dataset.map(lambda x: {"ukr_text": " ".join([ukrainian_stem(word) for word in x["ukr_text"].split(" ")])}, num_proc=8)
total_words = extract_words(dataset)
print("After stemming and stopwords removal:", len(total_words))

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/56127 [00:00<?, ? examples/s]

After stemming and stopwords removal: 78782


In [None]:
dataset[0] # –ú—ñ—Å—Ç–æ –ö–∏–π?

{'ukr_text': '—É –∫–∏–π —Ç–∞ —â–µ –Ω–∏–∑–∫–∞ —Ä–µ–≥—ñ–æ–Ω —É–∫—Ä–∞—ó–Ω–∞ –æ–≥–æ–ª–æ—à—É–≤–∞—Ç–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω–∏–π —Ç—Ä–∏–≤–æ–≥–∞ —á–µ—Ä–µ–∑ –∑–∞–≥—Ä–æ–∑–∞ –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–∏–π –æ–∑–±—Ä–æ—î–Ω–Ω—è –¥–∂–µ—Ä–µ–ª–æ –ø–æ–≤—ñ—Ç—Ä—è–Ω–∏–π —Å–∏–ª–∞ –¥–µ—Ç–∞–ª—å –±–ª–∏–∑—å–∫–æ —É —Å—Ç–æ–ª–∏—Ü—è —Ç–∞ —ñ–Ω—à–∏–π —Ä–µ–≥—ñ–æ–Ω –æ–≥–æ–ª–æ—Å–∏—Ç–∏ –ø–æ–≤—ñ—Ç—Ä—è–Ω–∏–π —Ç—Ä–∏–≤–æ–≥–∞ –ø—ñ–∑–Ω—ñ—à–∏–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–∏–π —Å–∏–ª–∞ –ø–æ–≤—ñ–¥–æ–º–∏—Ç–∏ —â–æ –≤ —á–µ—Ä–Ω—ñ–≥—ñ–≤—Å—å–∫–∏–π –∫–∏—ó–≤—Å—å–∫–∏–π —Å—É–º—Å—å–∫–∏–π –ø–æ–ª—Ç–∞–≤—Å—å–∫–∏–π —á–µ—Ä–∫–∞—Å—å–∫–∏–π –¥–Ω—ñ–ø—Ä–æ–ø–µ—Ç—Ä–æ–≤—Å—å–∫–∏–π –¥–æ–Ω–µ—Ü—å–∫–∏–π –∑–∞–ø–æ—Ä—ñ–∑—å–∫–∏–π –æ–±–ª–∞—Å—Ç—å —ñ—Å–Ω—É–≤–∞—Ç–∏ –∑–∞–≥—Ä–æ–∑–∞ –∑–∞—Å—Ç–æ—Å—É–≤–∞–Ω–Ω—è –±–∞–ª—ñ—Å—Ç–∏—á–Ω–∏–π –æ–∑–±—Ä–æ—î–Ω–Ω—è –æ–¥–Ω–∞–∫ –≤–∂–µ –æ —Ç—Ä–∏–≤–æ–≥–∞ —Å–∫–∞—Å—É–≤–∞—Ç–∏ –º–∞–π–∂–µ –≤ —É–≤–µ—Å—å –æ–±–ª–∞—Å—Ç—å',
 'ukr_tags': ['–ø–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞'],
 'tags': 'trivoga',
 'ukr_tags_full': "[('trivoga', '–ø–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–

In [29]:
dataset = dataset.map(lambda x: {"target": 1 if "–≤—ñ–π–Ω–∞" in x["ukr_tags"] else 0})

target_counts = dataset['target'].count(1), dataset['target'].count(0)
print("Target counts (1, 0):", target_counts)

Map:   0%|          | 0/56127 [00:00<?, ? examples/s]

Target counts (1, 0): (24861, 31266)


In [30]:
# train test split
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_test

DatasetDict({
    train: Dataset({
        features: ['ukr_text', 'ukr_tags', 'tags', 'ukr_tags_full', 'target'],
        num_rows: 44901
    })
    test: Dataset({
        features: ['ukr_text', 'ukr_tags', 'tags', 'ukr_tags_full', 'target'],
        num_rows: 11226
    })
})

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Create document-term matrix using word counts
count_vectorizer = CountVectorizer(max_features=1000)
X_train = count_vectorizer.fit_transform(train_test['train']['ukr_text'])
X_test = count_vectorizer.transform(train_test['test']['ukr_text'])

# Get training and test labels
y_train = train_test['train']['target']
y_test = train_test['test']['target']

# Initialize and train the MultinomialNB classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = nb_classifier.predict(X_test)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print model accuracy
print(f"Model accuracy: {nb_classifier.score(X_test, y_test):.3f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.76      0.74      6296
           1       0.67      0.63      0.65      4930

    accuracy                           0.70     11226
   macro avg       0.70      0.70      0.70     11226
weighted avg       0.70      0.70      0.70     11226

Model accuracy: 0.705


In [32]:
import numpy as np
from sklearn.metrics import classification_report

# Create random predictions
np.random.seed(42)  # for reproducibility
y_pred_random = np.random.random(len(y_test)) >= 0.5  # random binary predictions
y_pred_random = y_pred_random.astype(int)

# Print the classification report for random baseline
print("Random Baseline Classification Report:")
print(classification_report(y_test, y_pred_random))

# Calculate accuracy for random baseline
random_accuracy = (y_test == y_pred_random).mean()
print(f"Random baseline accuracy: {random_accuracy:.3f}")

Random Baseline Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.51      0.54      6296
           1       0.45      0.50      0.47      4930

    accuracy                           0.51     11226
   macro avg       0.51      0.51      0.51     11226
weighted avg       0.51      0.51      0.51     11226

Random baseline accuracy: 0.508


In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

# Create document-term matrix using word counts
count_vectorizer = CountVectorizer(max_features=1000)
X_train = count_vectorizer.fit_transform(train_test['train']['ukr_text'])
X_test = count_vectorizer.transform(train_test['test']['ukr_text'])

# Get training and test labels
y_train = train_test['train']['target']
y_test = train_test['test']['target']

# Initialize and train the RandomForest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, verbose=2)
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Print the classification report
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

# Print model accuracy
print(f"Random Forest accuracy: {rf_classifier.score(X_test, y_test):.3f}")

# Print feature importance (optional)
feature_importance = list(zip(count_vectorizer.get_feature_names_out(), rf_classifier.feature_importances_))
sorted_features = sorted(feature_importance, key=lambda x: x[1], reverse=True)
print("\nTop 10 most important words:")
for word, importance in sorted_features[:10]:
    print(f"{word}: {importance:.4f}")

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   22.0s


building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   53.6s finished
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.83      0.80      6296
           1       0.76      0.68      0.71      4930

    accuracy                           0.76     11226
   macro avg       0.76      0.75      0.75     11226
weighted avg       0.76      0.76      0.76     11226



[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


Random Forest accuracy: 0.762

Top 10 most important words:
–æ–±—Å—Ç—Ä—ñ–ª: 0.0138
—Ä–æ—Å—ñ–π—Å—å–∫–∏–π: 0.0136
–≤—ñ–π—Å—å–∫–æ: 0.0127
–æ–∫—É–ø–∞–Ω—Ç: 0.0126
—î–≤—Ä–æ–ø–µ–π—Å—å–∫–∏–π: 0.0110
–≤–æ—Ä–æ–≥: 0.0082
–≤—ñ–π—Å—å–∫–æ–≤–∏–π: 0.0079
–≤—Ç—Ä–∞—Ç–∞: 0.0076
—Ä–æ—Å—ñ—è–Ω–∏–Ω: 0.0072
–¥–æ: 0.0070


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Create TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train = tfidf_vectorizer.fit_transform(train_test['train']['ukr_text'])
X_test = tfidf_vectorizer.transform(train_test['test']['ukr_text'])

# Get training and test labels
y_train = train_test['train']['target']
y_test = train_test['test']['target']

# Initialize and train the RandomForest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, verbose=2)
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Print the classification report
print("Random Forest with TF-IDF Classification Report:")
print(classification_report(y_test, y_pred))

# Print model accuracy
print(f"Random Forest with TF-IDF accuracy: {rf_classifier.score(X_test, y_test):.3f}")

# Print feature importance
feature_importance = list(zip(tfidf_vectorizer.get_feature_names_out(), rf_classifier.feature_importances_))
sorted_features = sorted(feature_importance, key=lambda x: x[1], reverse=True)
print("\nTop 10 most important words:")
for word, importance in sorted_features[:10]:
    print(f"{word}: {importance:.4f}")

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   23.3s


building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   57.0s finished
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


Random Forest with TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.82      0.79      6296
           1       0.75      0.68      0.72      4930

    accuracy                           0.76     11226
   macro avg       0.76      0.75      0.75     11226
weighted avg       0.76      0.76      0.76     11226



[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


Random Forest with TF-IDF accuracy: 0.761

Top 10 most important words:
—Ä–æ—Å—ñ–π—Å—å–∫–∏–π: 0.0148
–≤—ñ–π—Å—å–∫–æ: 0.0140
–æ–±—Å—Ç—Ä—ñ–ª: 0.0140
–æ–∫—É–ø–∞–Ω—Ç: 0.0125
—î–≤—Ä–æ–ø–µ–π—Å—å–∫–∏–π: 0.0109
–¥–∂–µ—Ä–µ–ª–æ: 0.0105
–≤—ñ–π—Å—å–∫–æ–≤–∏–π: 0.0095
–≤—ñ–π–Ω—É—Ç–∏: 0.0085
–¥–æ: 0.0079
–≤—Ç—Ä–∞—Ç–∞: 0.0079


# Optuna Hyperparameter search

# https://optuna.org/

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
import numpy as np

def objective(trial):
    # Define the hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }
    
    # TF-IDF parameters
    max_features = trial.suggest_int('tfidf_max_features', 500, 3000)
    
    # Create features
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    X = tfidf_vectorizer.fit_transform(train_test['train']['ukr_text'])
    y = np.array(train_test['train']['target'])  # Convert to numpy array
    
    # Create and train model
    rf_classifier = RandomForestClassifier(
        random_state=42,
        **params
    )
    
    # Perform cross-validation
    scores = cross_val_score(rf_classifier, X, y, cv=3, scoring='f1')
    return scores.mean()

# Create and run study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Print results
print("Best trial:")
print("  Value:", study.best_trial.value)
print("  Params:")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

# Train final model with best parameters
best_params = study.best_trial.params

# Split the parameters
tfidf_params = {'max_features': best_params.pop('tfidf_max_features')}

# Create final model with best parameters
final_tfidf = TfidfVectorizer(**tfidf_params)
X_train = final_tfidf.fit_transform(train_test['train']['ukr_text'])
X_test = final_tfidf.transform(train_test['test']['ukr_text'])
y_train = np.array(train_test['train']['target'])
y_test = np.array(train_test['test']['target'])

# Train final model
final_rf = RandomForestClassifier(random_state=42, **best_params)
final_rf.fit(X_train, y_train)

# Evaluate
y_pred = final_rf.predict(X_test)
print("\nFinal Model Performance:")
print(classification_report(y_test, y_pred))

# Print feature importance
feature_importance = list(zip(final_tfidf.get_feature_names_out(), final_rf.feature_importances_))
sorted_features = sorted(feature_importance, key=lambda x: x[1], reverse=True)
print("\nTop 10 most important words:")
for word, importance in sorted_features[:10]:
    print(f"{word}: {importance:.4f}")