In [2]:
from fuzzywuzzy import fuzz
import pandas as pd
import pickle
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
from nltk.corpus import wordnet



In [2]:
# Shared NLP resources, created once at module level and reused by later cells.
# spaCy English pipeline; UserModel calls it as nlp(text) for tokens and entities.
nlp = spacy.load("en_core_web_sm")
# Hugging Face sentiment-analysis pipeline.
# NOTE(review): not referenced by any of the visible cells — confirm it is still
# needed, since loading the model is presumably expensive.
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
# VADER analyzer; UserModel.sentiment_analysis reads its "compound" score.
sia = SentimentIntensityAnalyzer()

In [4]:
# Load the post texts. Only the 'text' column is kept downstream, so it is the
# only column read from disk. Rows without text are dropped, newlines inside a
# post are flattened to spaces, and the index is renumbered from 0.
telegram_df = (
    pd.read_csv('russia_ukraine_public.csv', usecols=['text'])
    .dropna()
    # Literal (non-regex) replacement, vectorised instead of a per-row lambda.
    .assign(text=lambda frame: frame['text'].str.replace('\n', ' ', regex=False))
    .reset_index(drop=True)
)

In [5]:
def get_synonyms(keywords):
    """Collect WordNet lemma names for every keyword as one flat list.

    For each keyword, the lemma names of all its WordNet synsets are gathered
    (unique per keyword), underscores are replaced by spaces, and the
    per-keyword lists are concatenated.

    Args:
        keywords: Iterable of words to look up in WordNet.

    Returns:
        A list of lemma-name strings. Names are de-duplicated within a single
        keyword only, so the same name can appear more than once when several
        keywords share it. The order within one keyword's names is arbitrary
        because they pass through a set.
    """
    lemmas_by_keyword = {}
    for keyword in keywords:
        # A repeated keyword simply overwrites its earlier (identical) entry.
        lemmas_by_keyword[keyword] = list({
            lemma.name()
            for synset in wordnet.synsets(keyword)
            for lemma in synset.lemmas()
        })

    flattened = []
    for lemma_names in lemmas_by_keyword.values():
        # WordNet joins multi-word lemmas with '_'; turn them into plain phrases.
        flattened.extend(name.replace('_', ' ') for name in lemma_names)

    return flattened

In [8]:
class UserModel:
    """Heuristic relevance scorer for a single text post.

    The total score is the sum of five components: a short-post penalty, a
    keyword score, a named-entity score, a weighted sentiment score and a
    link bonus (see ``score_post``).

    Relies on the module-level ``nlp`` (spaCy pipeline), ``sia`` (VADER
    analyzer) and ``fuzz`` (fuzzywuzzy) objects.
    """

    def __init__(self, min_length, name_weight, sentiment_weight, keywords, eval_mode = True):
        """Store the scoring parameters.

        Args:
            min_length: Posts with at most this many whitespace-separated
                words receive the short-post penalty.
            name_weight: Score added per extracted ORG/GPE entity.
            sentiment_weight: Multiplier applied to the VADER compound score.
            keywords: Iterable of keyword strings to look for in each post.
            eval_mode: When true, ``score_post`` prints the score components.
        """
        self.min_length = min_length
        self.name_weight = name_weight
        self.sentiment_weight = sentiment_weight
        self.keywords = keywords
        self.eval_mode = eval_mode

    def preprocess_text(self, text, doc=None):
        """Return the lowercased tokens of ``text`` minus stop words and punctuation.

        Args:
            text: Raw post text.
            doc: Optional already-parsed spaCy doc for ``text``; when omitted
                the text is parsed here.
        """
        if doc is None:
            doc = nlp(text)
        return [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]

    def count_keyword_matches(self, text, threshold=80, doc=None):
        """Count how many entries of ``self.keywords`` occur in ``text``.

        A keyword counts once if its lowercased form either equals a
        whitespace-separated token of the lowercased text, or reaches
        ``threshold`` in ``fuzz.ratio`` against any preprocessed token.

        Args:
            text: Raw post text.
            threshold: Minimum ``fuzz.ratio`` score treated as a match.
            doc: Optional already-parsed spaCy doc for ``text``.

        Returns:
            Number of matching keywords (each keyword entry counted at most once).
        """
        vanilla_tokens = set(text.lower().split())
        processed_tokens = set(self.preprocess_text(text, doc=doc))
        match_count = 0
        for keyword in self.keywords:
            needle = keyword.lower()
            # The exact check runs independently of the fuzzy loop, so it still
            # applies when preprocessing removed every token, and it compares
            # lowercase with lowercase so capitalised keywords can match.
            if needle in vanilla_tokens or any(
                fuzz.ratio(needle, token) >= threshold for token in processed_tokens
            ):
                match_count += 1

        return match_count

    def extract_named_entities(self, text, doc=None):
        """Return the text of every entity in ``text`` labelled ORG or GPE.

        Args:
            text: Raw post text.
            doc: Optional already-parsed spaCy doc for ``text``.
        """
        if doc is None:
            doc = nlp(text)
        return [ent.text for ent in doc.ents if ent.label_ in ["ORG", "GPE"]]

    def sentiment_analysis(self, text):
        """Return the VADER ``compound`` polarity score of ``text``."""
        return sia.polarity_scores(text)["compound"]

    def check_for_links(self, text):
        """Return True if ``text`` contains the substring ``http``."""
        # "https" always contains "http", so one substring test covers both.
        return "http" in text

    def score_post(self, post):
        """Compute the relevance score of ``post``.

        Components:
            * length: -5 when the post has at most ``min_length`` words, else 0.
            * keyword: 2 per matched keyword, or -10 when nothing matched.
            * name: ``name_weight`` per ORG/GPE entity.
            * sentiment: compound sentiment times ``sentiment_weight``.
            * link: 5 when the post contains a link and the keyword score is
              positive, else 0.

        Returns:
            The sum of the five components.
        """
        # Parse once and share the doc; the keyword and entity steps would
        # otherwise each run the spaCy pipeline on the same text.
        doc = nlp(post)

        post_length = len(post.split())
        length_score = -5 if post_length <= self.min_length else 0
        keyword_matches = self.count_keyword_matches(post, doc=doc)
        keyword_score = 2 * keyword_matches if keyword_matches > 0 else -10
        entity_names = self.extract_named_entities(post, doc=doc)
        name_score = len(entity_names) * self.name_weight
        sentiment_score = self.sentiment_analysis(post) * self.sentiment_weight
        link_score = 5 if (self.check_for_links(post) and keyword_score > 0) else 0
        total_score = (
            length_score +
            keyword_score +
            name_score +
            sentiment_score +
            link_score
        )
        if self.eval_mode:
            print('length_score:',length_score, 'keyword_score:',keyword_score, 'name_score:',name_score, 'sentiment_score:',sentiment_score)
            print('link_score:',link_score)
        return total_score

# Seed keyword list, expanded below with the WordNet lemma names returned by
# get_synonyms.
# NOTE(review): 'ukranian' / 'ukranians' look like misspellings of
# 'ukrainian(s)'; left unchanged here because editing the list changes scores —
# confirm whether the fuzzy matching is meant to cover this.
keywords = ['russia', 'ukraine', 'ukranian', 'russian', 'ukranians', 'russians', '🇷🇺', '🇷🇺🇺🇦', '⚡️', 'war', '🇺🇦', 'putin', 'Putin', 'Zelenskyy'] 
synonyms = get_synonyms(keywords)
keywords = keywords + synonyms
# eval_mode defaults to True, so each scored post also prints its components.
user_model = UserModel(min_length=5, name_weight=1, sentiment_weight=-3, keywords=keywords)


# Spot-check the scorer on a few posts. The fixed seed makes the same posts
# appear on every run, and min() keeps the cell working on a frame with fewer
# than 10 rows.
posts = telegram_df.sample(n=min(10, len(telegram_df)), random_state=42)['text']
for post in posts:
    print(post)
    score = user_model.score_post(post)
    print(f"Post relevance score: {score}")
    print('---------')

🇷🇺🇺🇦 a large video of the assault on the checkpoint of the armed forces of ukraine by the combined assault group of the 4th brigade of the lpr army and one of the lars. after processing positions with artillery, attack aircraft go to enemy positions, where they crush resistance and clean up those who did not have time to escape.
length_score: 0 keyword_score: 8 name_score: 0 sentiment_score: 2.7243
link_score: 0
Post relevance score: 10.7243
---------
🇺🇸🇷🇺🇺🇦the ukrainian government is hiding the number of people killed in the war, us presidential candidate donald trump said.  “this war should not have happened. but it did. and it’s harder to negotiate now than it was in the beginning. millions of people have died. and that number is only going to increase. the death toll is going to be higher than people can imagine. if you look at the destruction of buildings all over ukraine, the number is going to be higher. they’re lying when they say the numbers now, trying to make them small. the

In [15]:
# Enable the progress_apply method on pandas objects.
tqdm.pandas()
# Rebuild the scorer with eval_mode off so the full pass does not print the
# per-post score components.
user_model = UserModel(
    min_length=5,
    name_weight=1,
    sentiment_weight=-3,
    keywords=keywords,
    eval_mode=False,
)
# Score every post; the bound method is passed directly as the per-row callable.
telegram_df['user_score_1'] = telegram_df['text'].progress_apply(user_model.score_post)

100%|████████████████████████████████████████████████████████████████████████| 152609/152609 [1:10:41<00:00, 35.98it/s]


In [16]:
#with open('telegram_df', 'wb') as file:
#    pickle.dump(telegram_df, file)