In [None]:
# import dependencies
from fuzzywuzzy import fuzz
import pandas as pd
import pickle
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
from nltk.corpus import wordnet
import json

In [None]:
# Load the shared NLP tools once; they are read as notebook-level globals by later cells.
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline: UserModel reads tokens, stop-word/punctuation flags and entities from it
# NOTE(review): sentiment_pipeline is not referenced in the cells visible here — confirm it is still needed.
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
sia = SentimentIntensityAnalyzer()  # VADER analyzer: UserModel.sentiment_analysis reads its "compound" score

In [None]:
# Read the two scraped channel exports from disk.
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


belumactanews = _read_json(r'../data/bellumactanews1.json')
intelslava = _read_json(r'../data/IntelSlavaZ.json')

In [None]:
# Function to extract text from message
def extract_text(message):
    """Return the plain text of one exported message.

    The ``'text'`` field may be a plain string or a list mixing strings and
    dicts that carry their own ``'text'`` entry (which can happen in Telegram
    exports).

    Args:
        message: dict describing one message; only its ``'text'`` key is read.

    Returns:
        str: the message text. List fragments are joined with a single space;
        a missing or ``None`` text yields ``''``; any other non-string value
        is converted with ``str``.
    """
    text = message.get('text', '')
    # A present-but-null field must not become the literal string 'None',
    # which would pass a later "non-empty" filter.
    if text is None:
        return ''
    if isinstance(text, str):
        return text
    if isinstance(text, list):
        text_parts = []
        for part in text:
            if isinstance(part, str):
                text_parts.append(part)
            elif isinstance(part, dict) and part.get('text') is not None:
                # Coerce so that a non-string fragment cannot break the join.
                text_parts.append(str(part['text']))
        # NOTE(review): fragments are joined with a space, which can insert
        # spaces that were not in the original message — confirm intended.
        return ' '.join(text_parts)
    return str(text)

In [None]:
# Gather the non-empty text of every regular message from both scraped channels.
# Entries whose type is not 'message' and whitespace-only texts are skipped.
messages_intelslava = intelslava.get('messages', [])
# Read the second channel from its own export; reading it from `intelslava`
# again would duplicate that channel and drop this one entirely.
messages_belumacta = belumactanews.get('messages', [])
texts = []
for channel_messages in (messages_intelslava, messages_belumacta):
    for msg in channel_messages:
        if msg.get('type') != 'message':
            continue
        text = extract_text(msg)
        if text.strip():
            texts.append(text)

In [None]:
# One row per collected message text, in a single 'text' column.
scraped_df = pd.DataFrame(texts, columns=['text'])

In [None]:
# Public corpus: only its 'text' column is kept.
public_df = pd.read_csv('../data/russia_ukraine_public.csv')[['text']]

# Stack public and scraped posts, then discard rows with missing text.
telegram_df = pd.concat([public_df, scraped_df]).dropna()
# Flatten every post onto a single line.
telegram_df['text'] = telegram_df['text'].apply(lambda raw: raw.replace('\n', ' '))
# Renumber rows 0..n-1 (concat kept the index of each source frame).
telegram_df = telegram_df.reset_index(drop=True)

In [None]:
def get_synonyms(keywords):
    """Collect the WordNet lemma names of every keyword into one flat list.

    For each keyword, the lemma names of all its synsets are gathered into a
    set (so a name appears once per keyword). The per-keyword lists are then
    concatenated in keyword order, with underscores replaced by spaces.
    Names shared by different keywords are kept once per keyword; a keyword
    that is passed several times contributes only once.

    Args:
        keywords: iterable of words to look up.

    Returns:
        list[str]: all collected lemma names.
    """
    lemmas_by_keyword = {}
    for keyword in keywords:
        lemma_names = set()
        for synset in wordnet.synsets(keyword):
            lemma_names.update(lemma.name() for lemma in synset.lemmas())
        lemmas_by_keyword[keyword] = list(lemma_names)

    return [
        lemma_name.replace('_', ' ')
        for lemma_names in lemmas_by_keyword.values()
        for lemma_name in lemma_names
    ]

In [None]:
class UserModel:
    """Rule-based relevance scorer for a single post.

    ``score_post`` adds up five components: a short-post penalty, a keyword
    score, a named-entity score, a weighted sentiment score and a link bonus.
    The class relies on the notebook-level ``nlp`` (spaCy pipeline), ``sia``
    (VADER analyzer) and ``fuzz`` (fuzzywuzzy) objects.
    """

    def __init__(self, min_length, name_weight, sentiment_weight, keywords, eval_mode = True):
        """Store the scoring configuration.

        Args:
            min_length: posts with at most this many whitespace-separated
                words receive the length penalty.
            name_weight: points added per ORG/GPE entity found in the post.
            sentiment_weight: multiplier applied to the VADER compound score.
            keywords: keyword strings to search for in each post.
            eval_mode: when True, ``score_post`` prints every score component.
        """
        self.min_length = min_length
        self.name_weight = name_weight
        self.sentiment_weight = sentiment_weight
        self.keywords = keywords
        self.eval_mode = eval_mode

    def preprocess_text(self, text, doc=None):
        """Return the lower-cased tokens of ``text`` without stop words or punctuation.

        Args:
            text: raw post text.
            doc: optional result of ``nlp(text)``; pass it to avoid parsing
                the same text a second time.

        Returns:
            list[str]: remaining token texts, lower-cased, in document order.
        """
        if doc is None:
            doc = nlp(text)
        return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]

    def count_keyword_matches(self, text, threshold=80, doc=None):
        """Count how many of ``self.keywords`` occur in ``text``.

        A keyword is counted once when either
        * its lower-cased form equals one of the whitespace-separated,
          lower-cased words of the text, or
        * its fuzzy ratio against any preprocessed token is at least
          ``threshold``.

        Args:
            text: raw post text.
            threshold: minimum ``fuzz.ratio`` value that counts as a match.
            doc: optional result of ``nlp(text)`` to reuse.

        Returns:
            int: number of matching keywords. Duplicate entries in
            ``self.keywords`` are each counted.
        """
        raw_words = set(text.lower().split())
        tokens = set(self.preprocess_text(text, doc=doc))
        match_count = 0
        for keyword in self.keywords:
            needle = keyword.lower()
            # The exact-word check does not depend on the preprocessed tokens,
            # so it is evaluated on its own: it must still work when every
            # token was removed as a stop word or punctuation.
            if needle in raw_words or any(fuzz.ratio(needle, token) >= threshold for token in tokens):
                match_count += 1
        return match_count

    def extract_named_entities(self, text, doc=None):
        """Return the text of every entity labelled ``ORG`` or ``GPE``.

        Args:
            text: raw post text.
            doc: optional result of ``nlp(text)`` to reuse.

        Returns:
            list[str]: entity texts in document order (duplicates kept).
        """
        if doc is None:
            doc = nlp(text)
        return [ent.text for ent in doc.ents if ent.label_ in ["ORG", "GPE"]]

    def sentiment_analysis(self, text):
        """Return the ``"compound"`` value of ``sia.polarity_scores(text)``."""
        sentiment = sia.polarity_scores(text)
        return sentiment["compound"]

    def check_for_links(self, text):
        """Return True when ``text`` contains ``"http"`` (which also covers ``"https"``)."""
        return "http" in text

    def score_post(self, post):
        """Compute the relevance score of ``post``.

        Components:
        * length: -5 when the post has at most ``min_length`` words, else 0;
        * keywords: 2 per matched keyword, or -10 when none matched;
        * entities: ``name_weight`` per ORG/GPE entity;
        * sentiment: compound sentiment times ``sentiment_weight``;
        * link: 5 when the post contains a link and the keyword score is positive.

        Args:
            post: raw post text.

        Returns:
            The sum of the five components.
        """
        # Parse once and share the result between the keyword and entity steps.
        doc = nlp(post)

        post_length = len(post.split())
        length_score = -5 if post_length <= self.min_length else 0
        keyword_matches = self.count_keyword_matches(post, doc=doc)
        keyword_score = 2 * keyword_matches if keyword_matches > 0 else -10
        entity_names = self.extract_named_entities(post, doc=doc)
        name_score = len(entity_names) * self.name_weight
        sentiment_score = self.sentiment_analysis(post) * self.sentiment_weight
        link_score = 5 if (self.check_for_links(post) and keyword_score > 0) else 0
        total_score = (
            length_score +
            keyword_score +
            name_score +
            sentiment_score +
            link_score
        )
        if self.eval_mode:
            print('length_score:',length_score, 'keyword_score:',keyword_score, 'name_score:',name_score, 'sentiment_score:',sentiment_score)
            print('link_score:',link_score)
        return total_score

# Seed keywords, expanded below with the WordNet lemma names returned by get_synonyms.
# NOTE(review): 'ukranian'/'ukranians' look like misspellings of 'ukrainian(s)', and
# 'putin'/'Putin' become the same string once count_keyword_matches lower-cases them,
# so a matching token is counted for both entries — confirm this is intended.
keywords = ['russia', 'ukraine', 'ukranian', 'russian', 'ukranians', 'russians', '🇷🇺', '🇷🇺🇺🇦', '⚡️', 'war', '🇺🇦', 'putin', 'Putin', 'Zelenskyy'] 
synonyms = get_synonyms(keywords)
keywords = keywords + synonyms
# eval_mode is left at its default (True), so score_post prints each score component below.
user_model = UserModel(min_length=5, name_weight=1, sentiment_weight=-3, keywords=keywords)


# Spot-check: print and score 10 randomly sampled posts.
# No random_state is passed, so a different sample is drawn on every run.
posts = telegram_df.sample(n=10)['text']
for post in posts:
    print(post)
    score = user_model.score_post(post)
    print(f"Post relevance score: {score}")
    print('---------')

In [None]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Same configuration as the spot-check model, but silent (no per-post printing).
user_model = UserModel(
    min_length=5,
    name_weight=1,
    sentiment_weight=-3,
    keywords=keywords,
    eval_mode=False,
)

# Score every post; the result is stored in a new 'reward' column.
telegram_df['reward'] = telegram_df['text'].progress_apply(user_model.score_post)

In [None]:
# Persist the scored frame as a pickle.
# Note that the target path has no file extension.
with open('../data/telegram_df', 'wb') as file:
    pickle.dump(telegram_df, file)