# Data Preprocessing

In [112]:
! pip install --upgrade pip
! pip install fasttext
! pip install emoji
! pip install pandas emoji openpyxl



To identify the language of each Telegram post, I applied automatic language detection using a pre-trained fastText model (lid.176.bin), which supports 176 languages. This was necessary to filter and classify posts as Russian, Ukrainian. Before applying language detection, I cleaned the data to avoid errors caused by empty or short texts.

In [113]:
import fasttext
import pandas as pd
import re
import emoji

In [114]:
# Loading the language detection model
lang_model = fasttext.load_model("data/lid.176.ftz")

# Defining language detection function
def detect_language(text):
    if not isinstance(text, str) or len(text.strip()) < 10:
        return "unknown"
    prediction = lang_model.predict(text.replace('\n', ' '), k=1)
    return prediction[0][0].replace("__label__", "")

In [115]:
df_ko = pd.read_csv("data/ua/Posts_ko.csv")
df_ds = pd.read_csv("data/ua/Posts_ds.csv")
df_uo = pd.read_csv("data/ua/Posts_uo.csv")
df_no = pd.read_csv("data/ua/Posts_no.csv")
df_dm = pd.read_csv("data/ru/Posts_dm.csv")
df_cl = pd.read_csv("data/ru/Posts_cl.csv")
df_re = pd.read_csv("data/ru/Posts_re.csv")

In [116]:
df_ko = df_ko[df_ko["text"].notna()]
df_ko["language"] = df_ko["text"].apply(detect_language)

In [117]:
df_ds = df_ds[df_ds["text"].notna()]
df_ds["language"] = df_ds["text"].apply(detect_language)
df_ds

Unnamed: 0,post_id,date,text,views,forwards,channel_id,channel_name,language
0,21462,2025-03-13 21:41:55+00:00,**üåä**** **[**36 –û–ë—Ä–ú–ü**](https://t.me/ua_marin...,311822,370,-1001469021333,DeepStateUA,uk
1,21461,2025-03-13 18:46:49+00:00,**ü§ù –ü–æ–≤–Ω—ñ—Å—Ç—é –∑–≥–æ–¥–Ω—ñ –∑ –¥—É–º–∫–æ—é –Ω–∞—à–∏—Ö –¥—Ä—É–∑—ñ–≤ –∑ –†–ù...,302309,280,-1001469021333,DeepStateUA,uk
2,21460,2025-03-13 17:03:33+00:00,‚öñÔ∏è **–Ñ–°–ü–õ **[**–≤–∏–∑–Ω–∞–≤**](https://hudoc.echr.co...,335154,896,-1001469021333,DeepStateUA,uk
3,21459,2025-03-13 13:48:11+00:00,**üîÑ**** –ú–∞–ø—É –æ–Ω–æ–≤–ª–µ–Ω–æ\n**\n‚öîÔ∏è –í–æ—Ä–æ–≥ –ø—Ä–æ—Å—É–Ω—É–≤—Å—è...,296881,154,-1001469021333,DeepStateUA,uk
4,21458,2025-03-13 12:20:54+00:00,**üü°****–ö–∞–¥—Ä–∏ —É—Ä–∞–∂–µ–Ω–Ω—è –∫–∞—Ü–∞–ø–Ω—ñ –≤ –ë–∞—Å—ñ–≤—Ü—ñ –≤—ñ–¥ –±—ñ...,371653,557,-1001469021333,DeepStateUA,uk
...,...,...,...,...,...,...,...,...
175,21283,2025-02-15 07:38:48+00:00,**üá©üá™**** –ú–µ–¥—ñ–∞–Ω–∞ –æ–ø–∏—Ç—É–≤–∞–Ω—å –ø–µ—Ä–µ–¥ –ü–∞—Ä–ª–∞–º–µ–Ω—Ç—Å—å–∫–∏...,262553,441,-1001469021333,DeepStateUA,uk
176,21282,2025-02-14 23:45:47+00:00,**üîÑ**** –ú–∞–ø—É –æ–Ω–æ–≤–ª–µ–Ω–æ\n**\n‚öîÔ∏è –í–æ—Ä–æ–≥ –ø—Ä–æ—Å—É–Ω—É–≤—Å—è...,272033,115,-1001469021333,DeepStateUA,uk
177,21281,2025-02-14 12:58:01+00:00,üá∑üá∫ **–ü—Ä–æ—Å—É–≤–∞–Ω–Ω—è –∫–∞—Ü–∞–ø—ñ–≤ –Ω–∞ –ø—Ä–∞–≤–æ–º—É –±–µ—Ä–µ–∑—ñ —Ä—ñ—á–∫...,292437,743,-1001469021333,DeepStateUA,uk
178,21280,2025-02-14 12:36:50+00:00,**üïØ –í –£–∫—Ä–∞—ó–Ω—É –ø–æ–≤–µ—Ä–Ω—É–ª–∏ —Ç—ñ–ª–∞ 757 —É–∫—Ä–∞—ó–Ω—Å—å–∫–∏—Ö –ì...,274636,329,-1001469021333,DeepStateUA,uk


In [118]:
df_uo = df_uo[df_uo["text"].notna()]
df_uo["language"] = df_uo["text"].apply(detect_language)
df_uo

Unnamed: 0,post_id,date,text,views,forwards,channel_id,channel_name,language
0,97026,2025-03-13 21:51:13+00:00,‚ùóÔ∏è**–¢—Ä–∞–º–ø –Ω–µ –≤–≤–æ–¥–∏–≤ –Ω–æ–≤–∏—Ö —Å–∞–Ω–∫—Ü—ñ–π –ø—Ä–æ—Ç–∏ —Ä—Ñ: —ó—Ö...,303971,652,-1001233777422,UaOnlii,uk
1,97025,2025-03-13 21:37:21+00:00,**‚ùóÔ∏è–ó–∞–≤–¥—è–∫–∏ –¥—ñ—è–º –¢—Ä–∞–º–ø–∞ –º–∞–π–∂–µ –≤—Å—ñ –≤–µ–ª–∏–∫—ñ —Ä–æ—Å—ñ–π...,292042,934,-1001233777422,UaOnlii,uk
2,97024,2025-03-13 21:24:05+00:00,**‚ùóÔ∏è–°–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–Ω–∏–∫–∞ –°–®–ê –ö–µ–ª–ª–æ–≥–∞ –≤—ñ–¥—Å—Ç–æ—Ä–æ–Ω–∏–ª–∏ ...,276603,688,-1001233777422,UaOnlii,uk
3,97023,2025-03-13 21:13:59+00:00,**‚ùóÔ∏è–°–®–ê –≤—ñ–¥–Ω–æ–≤–ª—é—é—Ç—å –ø–æ—Å—Ç–∞—á–∞–Ω–Ω—è –£–∫—Ä–∞—ó–Ω—ñ –¥–∞–ª–µ–∫–æ–±...,274636,656,-1001233777422,UaOnlii,uk
4,97022,2025-03-13 21:10:44+00:00,**‚ùóÔ∏è–¢–∏—à–∞ –ø—Ä–æ—Ç—Ä–∏–º–∞–ª–∞—Å—è –Ω–µ–¥–æ–≤–≥–æ: –≤—ñ–¥–±—É–ª–∏—Å—è –ø—É—Å–∫–∏...,254215,241,-1001233777422,UaOnlii,uk
...,...,...,...,...,...,...,...,...
2090,94787,2025-02-14 07:17:53+00:00,"**‚ùóÔ∏è–í–Ω–æ—á—ñ –¥—Ä–æ–Ω —Ä—Ñ –∞—Ç–∞–∫—É–≤–∞–≤ –ß–æ—Ä–Ω–æ–±–∏–ª—å—Å—å–∫—É –ê–ï–°, ...",231312,2596,-1001233777422,UaOnlii,uk
2091,94786,2025-02-14 06:40:39+00:00,**–ù–µ –≤–µ–¥—ñ—Ç—å—Å—è –Ω–∞ –µ–º–æ—Ü—ñ–π–Ω—ñ –≥–æ–π–¥–∞–ª–∫–∏. –Ü–Ω—Ç–µ—Ä–µ—Å–∏ –£...,218032,323,-1001233777422,UaOnlii,uk
2092,94785,2025-02-14 06:39:25+00:00,**–¶—ñ–Ω–∏ –Ω–∞ –ª—ñ–∫–∏ –æ—Ñ—ñ—Ü—ñ–π–Ω–æ –∑–Ω–∏–∑—è—Ç—å –Ω–∞ 30% –∑ 1 –±–µ—Ä...,243144,256,-1001233777422,UaOnlii,uk
2093,94784,2025-02-14 06:01:15+00:00,"**‚ùóÔ∏è–¢—Ä–∞–º–ø –ø–µ—Ä–µ–¥–∞—Å—Ç—å –£–∫—Ä–∞—ó–Ω—ñ —è–¥–µ—Ä–Ω—É –∑–±—Ä–æ—é, —è–∫—â–æ...",258946,1286,-1001233777422,UaOnlii,uk


In [119]:
df_no = df_no[df_no["text"].notna()]
df_no["language"] = df_no["text"].apply(detect_language)
df_no

Unnamed: 0,post_id,date,text,views,forwards,channel_id,channel_name,language
0,53962,2025-03-13 21:52:57+00:00,–ì–æ–ª–æ–≤–Ω–µ —Å—Ç–∞–Ω–æ–º –Ω–∞ –∑–∞—Ä–∞–∑: \n\n‚Ä¢ –ö–∞–Ω–∞–¥–∞ [–≤–∏–¥—ñ–ª–∏–ª...,52034.0,53.0,-1001134948258,novinach,uk
1,53961,2025-03-13 21:33:51+00:00,NBC News: —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–Ω–∏–∫–∞ –°–®–ê –ö–µ–ª–ª–æ–≥–∞ [–≤—ñ–¥—Å—Ç–æ...,55178.0,514.0,-1001134948258,novinach,uk
2,53960,2025-03-13 19:42:14+00:00,"–ó–µ–ª–µ–Ω—Å—å–∫–∏–π –∑–∞—è–≤–∏–≤, —â–æ –ø—É—Ç—ñ–Ω –≥–æ—Ç—É—î –≤—ñ–¥–º–æ–≤—É –≤—ñ–¥ ...",55562.0,179.0,-1001134948258,novinach,uk
3,53959,2025-03-13 17:12:42+00:00,"–¢—Ä–∞–º–ø –∑–∞—è–≤–∏–≤, —â–æ –ø—É—Ç—ñ–Ω –∑—Ä–æ–±–∏–≤ ""–¥—É–∂–µ –±–∞–≥–∞—Ç–æ–æ–±—ñ—Ü...",56554.0,146.0,-1001134948258,novinach,uk
4,53958,2025-03-13 15:59:01+00:00,"–ø—É—Ç—ñ–Ω –≤–∏–º–∞–≥–∞—î –≥–∞—Ä–∞–Ω—Ç—ñ–π, —â–æ –ø—ñ–¥ —á–∞—Å 30-–¥–µ–Ω–Ω–æ–≥–æ ...",58506.0,747.0,-1001134948258,novinach,uk
...,...,...,...,...,...,...,...,...
686,53258,2025-02-14 09:46:22+00:00,–°—É—Å–ø—ñ–ª—å–Ω–µ: –£–∫—Ä–∞—ó–Ω–∞ [–¥–æ–æ–ø—Ä–∞—Ü—é–≤–∞–ª–∞](https://susp...,48756.0,133.0,-1001134948258,novinach,uk
695,53249,2025-02-14 09:21:29+00:00,–ó –î–Ω–µ–º –≤—Å—ñ—Ö –∑–∞–∫–æ—Ö–∞–Ω–∏—Ö! –ù–∞–¥—ñ—à–ª—ñ—Ç—å –≤–∞–ª–µ–Ω—Ç–∏–Ω–∫—É —Å–≤...,51155.0,998.0,-1001134948258,novinach,uk
696,53248,2025-02-14 08:19:22+00:00,"""–î—ñ—è"" –¥–æ –î–Ω—è –≤—Å—ñ—Ö –∑–∞–∫–æ—Ö–∞–Ω–∏—Ö [–ø—Ä–µ–∑–µ–Ω—Ç—É–≤–∞–ª–∞](htt...",55985.0,1490.0,-1001134948258,novinach,uk
697,53247,2025-02-14 07:20:17+00:00,—Ä–æ—Å—ñ–π—Å—å–∫–∏–π –±–µ–∑–ø—ñ–ª–æ—Ç–Ω–∏–∫ –≤–ª—É—á–∏–≤ –ø–æ —Å–∞—Ä–∫–æ—Ñ–∞–≥—É –ß–æ—Ä...,55546.0,1215.0,-1001134948258,novinach,uk


In [120]:
df_dm = df_dm[df_dm["text"].notna()]
df_dm["language"] = df_dm["text"].apply(detect_language)
df_dm

Unnamed: 0,post_id,date,text,views,forwards,channel_id,channel_name,language
0,66717,2025-03-13 20:03:47+00:00,¬´‚Ä¶–ö–æ–ª—å –≤ –≤–µ—Ä—Ö–∞—Ö –±–æ–ª—å—à–∏–µ –ª–∏—Ü–∞\n–ù–µ –Ω–∞—à–ª–∏ –¥–ª—è –º–∏—Ä...,352222.0,275.0,-1001513431778,dva_majors,ru
1,66716,2025-03-13 19:50:18+00:00,**‚ú®**** –°–±–æ—Ä –≤** **–ö—É—Ä—Å–∫—É—é –æ–±–ª–∞—Å—Ç—å! ****‚ú®****\...,553973.0,56.0,-1001513431778,dva_majors,ru
2,66715,2025-03-13 19:40:59+00:00,"**–ö—É—Ä—Å–∫–∞—è –æ–±–ª–∞—Å—Ç—å. –°—É–¥–∂–∞**, **–∫–∞–¥—Ä—ã **[**–ì—Ä—É–ø–ø...",418278.0,1029.0,-1001513431778,dva_majors,ru
3,66714,2025-03-13 19:08:55+00:00,[**–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è**](https://t.me/dva_majors/66712...,405044.0,4466.0,-1001513431778,dva_majors,ru
5,66712,2025-03-13 19:01:43+00:00,üîû **–í—Å–µ–º –±–æ–π—Ü–∞–º –Ω–∞ –ø–µ—Ä–µ–¥–æ–≤–æ–π –∏ –≤ –ø—Ä–∏—Ñ—Ä–æ–Ω—Ç–æ–≤–æ–π ...,315548.0,5767.0,-1001513431778,dva_majors,ru
...,...,...,...,...,...,...,...,...
2170,64517,2025-02-14 06:08:24+00:00,–ù–∞—á–∞–ª–∞—Å—å –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∫ –ø—Ä–æ–¥–≤–∏–∂–µ–Ω–∏—é –ó–∞–ª—É–∂–Ω–æ–≥–æ –≤ ...,314479.0,1330.0,-1001513431778,dva_majors,ru
2171,64516,2025-02-14 04:31:56+00:00,**–ü–µ—Ä–µ–≥–æ–≤–æ—Ä–Ω—ã–π **[**–ø—Ä–æ—Ü–µ—Å—Å**](https://t.me/ne...,318387.0,590.0,-1001513431778,dva_majors,ru
2172,64515,2025-02-14 04:20:56+00:00,**–ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –†–æ—Å—Å–∏–∏:**\n\n–í —Ç–µ—á–µ–Ω–∏–µ –ø—Ä–æ—à–µ–¥—à–µ–π ...,306357.0,240.0,-1001513431778,dva_majors,ru
2173,64514,2025-02-14 03:43:50+00:00,**#–°–≤–æ–¥–∫–∞**** –Ω–∞ —É—Ç—Ä–æ 14 —Ñ–µ–≤—Ä–∞–ª—è 2025 –≥–æ–¥–∞**\n...,629203.0,853.0,-1001513431778,dva_majors,ru


In [121]:
df_cl = df_cl[df_cl["text"].notna()]
df_cl["language"] = df_cl["text"].apply(detect_language)
df_cl

Unnamed: 0,post_id,date,text,views,forwards,channel_id,channel_name,language
0,157779,2025-03-13 23:20:52+00:00,"–û —Ç–µ—Ö, –∫—Ç–æ –∑–∞–Ω–∏–º–∞–ª—Å—è –≤–æ–µ–Ω–Ω—ã–º–∏ –ø—Ä–µ—Å—Ç—É–ø–ª–µ–Ω–∏—è–º–∏ –Ω...",253245.0,210.0,-1001101806611,boris_rozhin,ru
1,157778,2025-03-13 23:12:01+00:00,[–û–ø–µ—Ä–∞—Ç–æ—Ä—ã](https://t.me/mod_russia/50058) FPV...,246294.0,56.0,-1001101806611,boris_rozhin,ru
2,157777,2025-03-13 22:23:01+00:00,**–ï–≤—Ä–æ–ø–µ–π—Å–∫–∏–π –°–æ—é–∑ –æ—á–µ–Ω—å –∏ –æ—á–µ–Ω—å –ø—Ä–æ—Ç–∏–≤–Ω—ã–π \n–Ø...,250403.0,492.0,-1001101806611,boris_rozhin,ru
4,157775,2025-03-13 22:11:29+00:00,**–û—á–µ—Ä–µ–¥–Ω–∞—è —Å–∞–º–æ–¥–µ–ª–∫–∞ —Å –ü–æ–∫—Ä–æ–≤—Å–∫–æ–≥–æ –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏...,254333.0,505.0,-1001101806611,boris_rozhin,ru
5,157774,2025-03-13 21:34:33+00:00,–ü—É—Ç–∏–Ω –ø—Ä–æ–≤–µ–ª —Ç–µ–ª–µ—Ñ–æ–Ω–Ω—ã–µ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã —Å –±–∏–Ω –°–∞–ª–º–∞...,271850.0,158.0,-1001101806611,boris_rozhin,ru
...,...,...,...,...,...,...,...,...
3198,154532,2025-02-14 03:13:01+00:00,–î—Ä–æ–Ω–æ–≤–æ–¥—ã ¬´–†—É–±–∏–∫–æ–Ω–∞¬ª —É–Ω–∏—á—Ç–æ–∂–∏–ª–∏ –±—Ä–æ–Ω–∏—Ä–æ–≤–∞–Ω–Ω—É—é ...,196734.0,78.0,-1001101806611,boris_rozhin,ru
3200,154530,2025-02-14 02:10:15+00:00,**‚†Ä**\n**üî¥**** ** **–°–≤–∏–¥–µ—Ç–µ–ª—å—Å—Ç–≤–∞ –≤–µ—Ä–Ω—É–≤—à–∏—Ö—Å—è ...,187674.0,335.0,-1001101806611,boris_rozhin,ru
3201,154529,2025-02-14 01:36:01+00:00,"–ù–æ–≤—ã–π –∏—Ä–∞–Ω—Å–∫–∏–π —Ä–µ–∞–∫—Ç–∏–≤–Ω—ã–π –ë–ü–õ–ê-–∫–∞–º–∏–∫–∞–¥–∑–µ ""–•–∞–ª–∏...",188367.0,231.0,-1001101806611,boris_rozhin,ru
3202,154528,2025-02-14 00:33:01+00:00,–í —Ä–∞–π–æ–Ω–µ –í–µ–ª–∏–∫–æ–π –ù–æ–≤–æ—Å—ë–ª–∫–∏ –Ω–∞—à–∏ –ø–∞—Ä–Ω–∏ —Å 305 –±—Ä...,190817.0,110.0,-1001101806611,boris_rozhin,ru


In [122]:
df_re = df_re[df_re["text"].notna()]
df_re["language"] = df_re["text"].apply(detect_language)
df_re

Unnamed: 0,post_id,date,text,views,forwards,channel_id,channel_name,language
0,94053,2025-03-13 19:29:58+00:00,**–í–ª–∞–¥–∏–º–∏—Ä –ü—É—Ç–∏–Ω –¥–∏–ø–ª–æ–º–∞—Ç–∏—á–Ω–æ —Ä–∞—Å—Å—Ç–∞–≤–∏–ª –∞–∫—Ü–µ–Ω—Ç...,959162,1495,-1001260622817,readovkanews,ru
1,94052,2025-03-13 18:59:58+00:00,**–ü–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω—ã –¥–æ–∫—É–º–µ–Ω—Ç—ã –Ω–∞ —Å–Ω—è—Ç–∏–µ —Å —Ä–æ–∑—ã—Å–∫–∞ 1...,959836,925,-1001260622817,readovkanews,ru
2,94051,2025-03-13 18:34:58+00:00,**–î–£–ú –†–§ –∏ –¢—É—Ä—Ü–∏—è —Å–Ω–æ–≤–∞ –ø—Ä–∏–∑–Ω–∞—é—Ç—Å—è –≤ –ª—é–±–≤–∏ –¥—Ä—É...,1020364,1215,-1001260622817,readovkanews,ru
4,94049,2025-03-13 17:59:58+00:00,**–†—É—Å—Å–∫–∞—è –∞—Ä–º–∏—è —Ñ–æ—Ä–º–∏—Ä—É–µ—Ç –Ω–æ–≤—ã–π –ø–ª–∞—Ü–¥–∞—Ä–º –∑–∞ –∫–∞...,903210,675,-1001260622817,readovkanews,ru
5,94048,2025-03-13 17:00:31+00:00,"**–ü—É—Ç–∏–Ω —Å–¥–µ–ª–∞–ª –æ—á–µ–Ω—å –º–Ω–æ–≥–æ–æ–±–µ—â–∞—é—â–µ–µ –∑–∞—è–≤–ª–µ–Ω–∏–µ,...",880199,1130,-1001260622817,readovkanews,ru
...,...,...,...,...,...,...,...,...
1034,93006,2025-02-14 07:37:59+00:00,**–ù–∞–∫–∞–Ω—É–Ω–µ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–æ–≤ –≤ –ú—é–Ω—Ö–µ–Ω–µ –ó–µ–ª–µ–Ω—Å–∫–∏–π ¬´—à–æ...,334579,2377,-1001260622817,readovkanews,ru
1035,93005,2025-02-14 07:22:17+00:00,**–¢–∞–π–Ω–∞ –ø–∏—Ä–æ–≥–æ–≤ —Ä–∞—Å–∫—Ä—ã—Ç–∞ ‚Äî —Ä—Ç—É—Ç—å –≤ –±—É–ª–æ—á–∫–∏ –¥–æ–±...,292128,2216,-1001260622817,readovkanews,ru
1036,93004,2025-02-14 06:20:01+00:00,"**–°–®–ê –º–æ–≥—É—Ç –æ—Ç–ø—Ä–∞–≤–∏—Ç—å –≤–æ–π—Å–∫–∞ –Ω–∞ –£–∫—Ä–∞–∏–Ω—É, –µ—Å–ª–∏ ...",307165,1787,-1001260622817,readovkanews,ru
1037,93003,2025-02-14 05:30:07+00:00,"‚ùóÔ∏è**¬´–í—Å–µ–º —Å–ø–∞—Å–∏–±–æ, –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—É –í–ª–∞–¥–∏–º–∏—Ä—É –í–ª–∞–¥–∏–º...",319996,1688,-1001260622817,readovkanews,ru


# Further processing

### Removing duplicates

In [123]:
df_ko = df_ko.drop_duplicates(subset="text")
df_ds = df_ds.drop_duplicates(subset="text")
df_uo = df_uo.drop_duplicates(subset="text")
df_no = df_no.drop_duplicates(subset="text")
df_dm = df_dm.drop_duplicates(subset="text")
df_cl = df_cl.drop_duplicates(subset="text")
df_re = df_re.drop_duplicates(subset="text")

### Normalizing Whitespace

In [124]:
df_ko.loc["text"] = df_ko["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
df_ds.loc["text"] = df_ds["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
df_uo.loc["text"] = df_uo["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
df_no.loc["text"] = df_no["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
df_dm.loc["text"] = df_dm["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
df_cl.loc["text"] = df_cl["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
df_re.loc["text"] = df_re["text"].str.replace(r'\s+', ' ', regex=True).str.strip()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uo.loc["text"] = df_uo["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm.loc["text"] = df_dm["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl.loc["text"] = df_cl["text"].str.replace(r'\s+', ' ', regex=True).str.strip()


### Filtering out non relevant posts from Ukrainian telegram channels

During the preprocessing stage, it's essential to remove posts that do not contribute meaningfully to the framing analysis. In particular, messages containing terms like "–ø–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞" (air raid alert), "—É–≤–∞–≥–∞" (attention), "–≤—ñ–¥–±—ñ–π —Ç—Ä–∏–≤–æ–≥–∏" (end of alert), or similar phrases are often:

Automated system alerts, or

Security and public service announcements

Such messages typically lack political framing, narrative construction, or propaganda elements. Instead, they serve purely informational or emergency-related purposes and may introduce noise or bias into qualitative, discursive, or sentiment-based analyses.

By excluding these entries, the dataset becomes more focused on relevant, communicative content, which strengthens the reliability and interpretability of subsequent framing analysis.




In [125]:
system_keywords = r"–ø–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞|—É–≤–∞–≥–∞|–≤—ñ–¥–±—ñ–π —Ç—Ä–∏–≤–æ–≥–∏|–±–æ—Ç|–Ω–∞–≥–∞–¥—É–≤–∞–Ω–Ω—è"
df_ko = df_ko[~df_ko["text"].str.contains(system_keywords, case=False, na=False)]
df_ds = df_ds[~df_ds["text"].str.contains(system_keywords, case=False, na=False)]
df_uo = df_uo[~df_uo["text"].str.contains(system_keywords, case=False, na=False)]
df_no = df_no[~df_no["text"].str.contains(system_keywords, case=False, na=False)]

### Text Cleaning

In [126]:
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r"http\S+|www\S+|t\.me\S+", "", text)  # removing URLs
    text = re.sub(r"@\w+", "", text)                     # removing mentions
    text = re.sub(r"#\w+", "", text)                     # removing hashtags
    return text

In [127]:
df_ko["text"] = df_ko["text"].apply(clean_text)
df_ds["text"] = df_ds["text"].apply(clean_text)
df_uo["text"] = df_uo["text"].apply(clean_text)
df_no["text"] = df_no["text"].apply(clean_text)
df_dm["text"] = df_dm["text"].apply(clean_text)
df_cl["text"] = df_cl["text"].apply(clean_text)
df_re["text"] = df_re["text"].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm["text"] = df_dm["text"].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl["text"] = df_cl["text"].apply(clean_text)


### Date into separate column

In [128]:
df_ko["date"] = pd.to_datetime(df_ko["date"], errors="coerce")
df_ds["date"] = pd.to_datetime(df_ds["date"], errors="coerce")
df_uo["date"] = pd.to_datetime(df_uo["date"], errors="coerce")
df_no["date"] = pd.to_datetime(df_no["date"], errors="coerce")
df_dm["date"] = pd.to_datetime(df_dm["date"], errors="coerce")
df_cl["date"] = pd.to_datetime(df_cl["date"], errors="coerce")
df_re["date"] = pd.to_datetime(df_re["date"], errors="coerce")
df_ko["date_only"] = df_ko["date"].dt.date
df_ds["date_only"] = df_ds["date"].dt.date
df_uo["date_only"] = df_uo["date"].dt.date
df_no["date_only"] = df_no["date"].dt.date
df_dm["date_only"] = df_dm["date"].dt.date
df_cl["date_only"] = df_cl["date"].dt.date
df_re["date_only"] = df_re["date"].dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm["date"] = pd.to_datetime(df_dm["date"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl["date"] = pd.to_datetime(df_cl["date"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm["date_only"] = df_dm["date"].dt.date
A value is trying to be set on a cop

### Lowercasing

In [129]:
df_ko["text"] = df_ko["text"].str.lower()
df_ds["text"] = df_ds["text"].str.lower()
df_uo["text"] = df_uo["text"].str.lower()
df_no["text"] = df_no["text"].str.lower()
df_dm["text"] = df_dm["text"].str.lower()
df_cl["text"] = df_cl["text"].str.lower()
df_re["text"] = df_re["text"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm["text"] = df_dm["text"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl["text"] = df_cl["text"].str.lower()


# Transforming emotion into a text

In [130]:
# Function to transform emojis
def transform_emojis(text):
    if pd.isnull(text):
        return ""
    return emoji.demojize(text, delimiters=(":"," "))

In [131]:
# Applying transformation to the "text" column
df_ko["text_transformed"] = df_ko["text"].apply(transform_emojis)
df_ds["text_transformed"] = df_ds["text"].apply(transform_emojis)
df_uo["text_transformed"] = df_uo["text"].apply(transform_emojis)
df_no["text_transformed"] = df_no["text"].apply(transform_emojis)
df_dm["text_transformed"] = df_dm["text"].apply(transform_emojis)
df_cl["text_transformed"] = df_cl["text"].apply(transform_emojis)
df_re["text_transformed"] = df_re["text"].apply(transform_emojis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm["text_transformed"] = df_dm["text"].apply(transform_emojis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl["text_transformed"] = df_cl["text"].apply(transform_emojis)


### Saving data to a new file in a folder "data_clean/"