# Inferring sentiment for the rest of the articles

In [2]:
import numpy as np
import pandas as pd
import re
import sqlite3
from digest_utils import strpdict
from itertools import chain
from collections import Counter
import requests
from tqdm import tqdm
import pickle
from pathlib import Path

np.random.seed(173)

In [3]:
# Connect to local db
db_armenpress = sqlite3.connect("Armenpress/scraping.db")
db_newsam = sqlite3.connect("Newsam/scraping.db")
db_tertam = sqlite3.connect("Tertam/scraping.db")

# loading data into dataframes
df_armenpress = pd.read_sql_query("select * from article", db_armenpress)
df_newsam = pd.read_sql_query("select * from article", db_newsam)
df_tertam = pd.read_sql_query("select * from article", db_tertam)

# parsing datetimes
df_tertam.time = pd.to_datetime(df_tertam.time)
df_newsam.time = pd.to_datetime(df_newsam.time)
df_armenpress.time = pd.to_datetime(df_armenpress.time)

In [4]:
# join all datasets
df = pd.concat([df[["time", "feedback", "content"]] for df in [df_armenpress, df_newsam, df_tertam]], axis=0)
df.feedback.str.replace("\n", "")
(df_feedback := df[~df.feedback.isna()].copy())

Unnamed: 0,time,feedback,content
0,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Serz...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President..."
1,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Davi...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co..."
2,2009-06-04 00:00:00,"{""entities"": [\n {""name"": ""World Bank"", ""sent...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme..."
3,2008-12-30 00:00:00,"{\n ""entities"": [\n {""name"": ""CoE Congress...","YEREVAN, JUNE 1, ARMENPRESS:The observing miss..."
4,2009-06-01 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Matt...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o..."
...,...,...,...
84103,2022-01-20 13:03:00,"{\n ""entities"": [\n {\n ""name"": ""EBRD...",The European Bank for Reconstruction and Devel...
84114,2022-03-10 16:40:00,"{\n ""entities"": [\n {\n ""name"": ""Dmyt...",Ukrainian Foreign Minister Dmytro Kuleba says ...
84204,2022-11-17 13:23:00,"{\n ""entities"": [\n {\n ""name"": ""Phil...",U.S. Secretary of State's adviser on Caucasus ...
84216,2022-12-13 13:48:00,"{\n ""entities"": [\n {\n ""name"": ""Aray...","On December 13, President of the Artsakh Repub..."


In [5]:
# parse feedbacks
df_feedback.feedback = df_feedback.feedback.apply(strpdict)

unexpected indent (<unknown>, line 20)
'{' was never closed (<unknown>, line 33)
'{' was never closed (<unknown>, line 18)
'{' was never closed (<unknown>, line 18)
'[' was never closed (<unknown>, line 2)
'{' was never closed (<unknown>, line 28)
'{' was never closed (<unknown>, line 8)
'{' was never closed (<unknown>, line 13)
'{' was never closed (<unknown>, line 13)


In [6]:
# filter down to usable feedbacks
(df_feedback := df_feedback[
    ~df_feedback.feedback.isna() & (
        df_feedback.feedback.apply(
            lambda x: hasattr(x, "keys") and "entities" in x.keys()
            )
        )
    ]
)

Unnamed: 0,time,feedback,content
0,2009-06-08 00:00:00,"{'entities': [{'name': 'Serzh Sargsyan', 'sent...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President..."
1,2009-06-08 00:00:00,"{'entities': [{'name': 'David Hakhverdyan', 's...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co..."
2,2009-06-04 00:00:00,"{'entities': [{'name': 'World Bank', 'sentimen...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme..."
3,2008-12-30 00:00:00,{'entities': [{'name': 'CoE Congress of Local ...,"YEREVAN, JUNE 1, ARMENPRESS:The observing miss..."
4,2009-06-01 00:00:00,"{'entities': [{'name': 'Matthew Bryza', 'senti...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o..."
...,...,...,...
84103,2022-01-20 13:03:00,"{'entities': [{'name': 'EBRD', 'sentiment': 'p...",The European Bank for Reconstruction and Devel...
84114,2022-03-10 16:40:00,"{'entities': [{'name': 'Dmytro Kuleba', 'senti...",Ukrainian Foreign Minister Dmytro Kuleba says ...
84204,2022-11-17 13:23:00,"{'entities': [{'name': 'Philip Reeker', 'senti...",U.S. Secretary of State's adviser on Caucasus ...
84216,2022-12-13 13:48:00,"{'entities': [{'name': 'Arayik Harutyunyan', '...","On December 13, President of the Artsakh Repub..."


In [7]:
# words to count for bag of words
words_to_count = Counter()
ent_name = df_feedback.feedback.iloc[0]["entities"][0]["name"]
feedback = df_feedback.feedback.iloc[0]["entities"][0]["explanation"]
article_sentences = list(chain.from_iterable(
    [partition.split(".") for partition in df_feedback.content.iloc[0].split(":")]
))
article_sentences = [s.replace(",", "").lower() for s in article_sentences]
relevant_sentances = [feedback] + [
    sent
    for sent
    in article_sentences
    if ent_name.lower() in sent
]

relevant_words = list(
    chain.from_iterable(
        [re.findall(r"\w+", sent) for sent in relevant_sentances]
    )
)

# Split the sentances that mentio the entity into words
words_to_count += Counter(relevant_words)
words_to_count

Counter({'the': 10,
         'of': 4,
         'as': 3,
         'received': 2,
         'armenian': 2,
         'president': 2,
         'serzh': 2,
         'sargsyan': 2,
         'director': 2,
         'france': 2,
         'telecom': 2,
         'orange': 2,
         'company': 2,
         'that': 2,
         'in': 2,
         'mentioned': 1,
         'Armenian': 1,
         'President': 1,
         'who': 1,
         'executives': 1,
         'today': 1,
         'olaf': 1,
         'swantee': 1,
         'executive': 1,
         'vice': 1,
         'and': 1,
         'bruno': 1,
         'dutua': 1,
         'chief': 1,
         'armenia': 1,
         'we': 1,
         'are': 1,
         'interested': 1,
         'one': 1,
         'world': 1,
         's': 1,
         'leading': 1,
         'operators': 1,
         'telecommunication': 1,
         'sphere': 1,
         'starts': 1,
         'its': 1,
         'activity': 1,
         'our': 1,
         'state': 1,
         'soo

Ignoring stopwords

In [8]:
(english_stopwords := requests.get(
    "https://gist.githubusercontent.com"
    "/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089"
    "c9821dbcf6d0c786c/NLTK's%2520list%2520of%252"
    "0english%2520stopwords"
).content.decode().split("\n"))[40:50]

['was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do']

In [9]:
{
    k.lower(): v
    for k, v in words_to_count.items()
    if (
        k.lower() not in english_stopwords
    ) and (
        k.lower() not in ent_name.lower()
    )
}

{'mentioned': 1,
 'armenian': 2,
 'president': 2,
 'received': 2,
 'executives': 1,
 'today': 1,
 'olaf': 1,
 'swantee': 1,
 'executive': 1,
 'vice': 1,
 'director': 2,
 'france': 2,
 'telecom': 2,
 'orange': 2,
 'company': 2,
 'bruno': 1,
 'dutua': 1,
 'chief': 1,
 'armenia': 1,
 'interested': 1,
 'one': 1,
 'world': 1,
 'leading': 1,
 'operators': 1,
 'telecommunication': 1,
 'sphere': 1,
 'starts': 1,
 'activity': 1,
 'state': 1,
 'soon': 1,
 'possible': 1,
 'pointed': 1,
 'noting': 1,
 'government': 1,
 'ready': 1,
 'assist': 1,
 'solution': 1,
 'issues': 1,
 'arouse': 1,
 'works': 1}

Wrapping it all in a function

In [10]:
words_to_count = {
    "positive": Counter(),
    "negative": Counter(),
    "neutral": Counter(),
}

ent_datas = df_feedback.feedback.iloc[0]["entities"]
def extract_word_counts(obs: dict, content: str):
    """To be applied (not the built-in way) on a pandas series of
    dictionaries containing entity data. Counts the words around
    the mentioned entity and while keeping track of the sentiment

    Args:
        obs (dict): observation that has the key `entities` which
        is a list of dicts that have the keys `name`, `sentiment`,
        and `explanation`.
        content (str): the text of the article.
    """
    if "entities" not in obs:
        print("Obs. missing key entities")

    for ent_data in obs["entities"]:
        if (type(ent_data) is not dict) or any(
            [k not in ent_data for k in ["name", "sentiment"]]
        ) or (
            ent_data["sentiment"].lower() not in ["positive", "negative", "neutral"]
        ):
            continue
        

        ent_name = ent_data["name"].lower()
        feedback = ent_data["explanation"].lower() if "explanation" in ent_data else ""
        sentiment = ent_data["sentiment"].lower()

        # splitting articles into sentences
        article_sentences = list(chain.from_iterable(
            [partition.split(".") for partition in content.split(":")]
        ))
        article_sentences = [s.replace(",", "").lower() for s in article_sentences]
        
        # selecting relevant sentences
        relevant_sentances = [feedback] + [
            sent
            for sent
            in article_sentences
            if ent_name in sent
        ]

        # Splitting into words
        relevant_words = list(
            chain.from_iterable(
                [re.findall(r"\w+", sent) for sent in relevant_sentances]
            )
        )

        # filtering stopwords and entity names
        relevant_words = [
            w for w in relevant_words
            if (w not in english_stopwords) and (
                w not in ent_name
            )
        ]

        # Split the sentances that mentio the entity into words
        words_to_count[sentiment] += Counter(relevant_words)

In [65]:
for i, obs in tqdm(df_feedback.iterrows()):
    extract_word_counts(obs["feedback"], obs["content"])

36610it [08:07, 75.09it/s] 


In [66]:
with open(Path("cache") / "words_count.Counter.pkl", "wb") as f:
    pickle.dump(words_to_count, f)

In [11]:
with open(Path("cache") / "words_count.Counter.pkl", "rb") as f:
    word_sentiments = pickle.load(f)

In [12]:
words = set(
    list(word_sentiments["positive"].keys()) + 
    list(word_sentiments["negative"].keys()) + 
    list(word_sentiments["neutral"].keys())
)

term_sentiment = {}

for word in words:
    term_sentiment[word] = {
        sent: word_sentiments[sent][word] if word in word_sentiments[sent] else 0
        for sent in ["positive", "negative", "neutral"]
    }

term_sentiment

{'mortgaging': {'positive': 0, 'negative': 2, 'neutral': 0},
 'lowers': {'positive': 0, 'negative': 3, 'neutral': 1},
 'round': {'positive': 348, 'negative': 168, 'neutral': 791},
 'plunged': {'positive': 4, 'negative': 31, 'neutral': 28},
 'samuelson': {'positive': 0, 'negative': 1, 'neutral': 0},
 'spectre': {'positive': 0, 'negative': 2, 'neutral': 2},
 'hobbs': {'positive': 3, 'negative': 0, 'neutral': 0},
 'diyarbekir': {'positive': 0, 'negative': 0, 'neutral': 2},
 'terrestrial': {'positive': 1, 'negative': 2, 'neutral': 0},
 'grant': {'positive': 172, 'negative': 66, 'neutral': 176},
 'stimulation': {'positive': 1, 'negative': 0, 'neutral': 3},
 'entailing': {'positive': 0, 'negative': 2, 'neutral': 9},
 'hematological': {'positive': 1, 'negative': 0, 'neutral': 1},
 'aftera': {'positive': 1, 'negative': 2, 'neutral': 0},
 'abdelhak': {'positive': 3, 'negative': 0, 'neutral': 0},
 'rout': {'positive': 2, 'negative': 3, 'neutral': 1},
 '5222408': {'positive': 0, 'negative': 2, 'n

In [13]:
df_feedback

Unnamed: 0,time,feedback,content
0,2009-06-08 00:00:00,"{'entities': [{'name': 'Serzh Sargsyan', 'sent...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President..."
1,2009-06-08 00:00:00,"{'entities': [{'name': 'David Hakhverdyan', 's...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co..."
2,2009-06-04 00:00:00,"{'entities': [{'name': 'World Bank', 'sentimen...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme..."
3,2008-12-30 00:00:00,{'entities': [{'name': 'CoE Congress of Local ...,"YEREVAN, JUNE 1, ARMENPRESS:The observing miss..."
4,2009-06-01 00:00:00,"{'entities': [{'name': 'Matthew Bryza', 'senti...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o..."
...,...,...,...
84103,2022-01-20 13:03:00,"{'entities': [{'name': 'EBRD', 'sentiment': 'p...",The European Bank for Reconstruction and Devel...
84114,2022-03-10 16:40:00,"{'entities': [{'name': 'Dmytro Kuleba', 'senti...",Ukrainian Foreign Minister Dmytro Kuleba says ...
84204,2022-11-17 13:23:00,"{'entities': [{'name': 'Philip Reeker', 'senti...",U.S. Secretary of State's adviser on Caucasus ...
84216,2022-12-13 13:48:00,"{'entities': [{'name': 'Arayik Harutyunyan', '...","On December 13, President of the Artsakh Repub..."


In [15]:
# def predict(content, entity_names)

out = []

for i, row in tqdm(df_feedback.iterrows()):
    article_sents = {}
    for ent in row.feedback["entities"]:
        if any([k not in ent for k in ["name", "sentiment"]]):
            continue

        name = ent["name"].lower()
        actual_sentiment = ent["sentiment"].lower()

        article_sentences = list(chain.from_iterable(
            [partition.split(".") for partition in row.content.lower().split(":")]
        ))
        article_sentences = [s.replace(",", "").lower() for s in article_sentences]
        
        # selecting relevant sentences
        relevant_sentances = [
            sent
            for sent
            in article_sentences
            if name in sent
        ]

        # Splitting into words
        relevant_words = list(
            chain.from_iterable(
                [re.findall(r"\w+", sent) for sent in relevant_sentances]
            )
        )

        # filtering stopwords and entity names
        relevant_words = [
            w for w in relevant_words
            if (w not in english_stopwords) and (
                w not in name
            )
        ]

        counts = [Counter(term_sentiment[w]) for w in relevant_words if w in term_sentiment]
        counter = Counter()
        for count in counts:
            counter += count
        
        article_sents[name] = counter
    out.append(article_sents)

0it [00:00, ?it/s]

36610it [01:08, 536.25it/s]


In [16]:
df_feedback = df_feedback.copy()
df_feedback["predictions_raw"] = out
df_feedback

Unnamed: 0,time,feedback,content,predictions_raw
0,2009-06-08 00:00:00,"{'entities': [{'name': 'Serzh Sargsyan', 'sent...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President...","{'serzh sargsyan': {'positive': 151383, 'negat..."
1,2009-06-08 00:00:00,"{'entities': [{'name': 'David Hakhverdyan', 's...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co...","{'david hakhverdyan': {'positive': 118572, 'ne..."
2,2009-06-04 00:00:00,"{'entities': [{'name': 'World Bank', 'sentimen...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme...","{'world bank': {'positive': 46029, 'negative':..."
3,2008-12-30 00:00:00,{'entities': [{'name': 'CoE Congress of Local ...,"YEREVAN, JUNE 1, ARMENPRESS:The observing miss...",{'coe congress of local and regional authoriti...
4,2009-06-01 00:00:00,"{'entities': [{'name': 'Matthew Bryza', 'senti...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o...","{'matthew bryza': {'positive': 66399, 'negativ..."
...,...,...,...,...
84103,2022-01-20 13:03:00,"{'entities': [{'name': 'EBRD', 'sentiment': 'p...",The European Bank for Reconstruction and Devel...,"{'ebrd': {'positive': 231898, 'negative': 1515..."
84114,2022-03-10 16:40:00,"{'entities': [{'name': 'Dmytro Kuleba', 'senti...",Ukrainian Foreign Minister Dmytro Kuleba says ...,"{'dmytro kuleba': {'positive': 80926, 'negativ..."
84204,2022-11-17 13:23:00,"{'entities': [{'name': 'Philip Reeker', 'senti...",U.S. Secretary of State's adviser on Caucasus ...,"{'philip reeker': {'positive': 86325, 'negativ..."
84216,2022-12-13 13:48:00,"{'entities': [{'name': 'Arayik Harutyunyan', '...","On December 13, President of the Artsakh Repub...","{'arayik harutyunyan': {'positive': 31357, 'ne..."


In [17]:
(pred_tuples := df_feedback.apply(lambda x: [x.predictions_raw[ent["name"].lower()] for ent in x.feedback["entities"] if ("name" in ent) and ("sentiment" in ent)], axis=1))

0        [{'positive': 151383, 'negative': 79200, 'neut...
1        [{'positive': 118572, 'negative': 57729, 'neut...
2        [{'positive': 46029, 'negative': 20284, 'neutr...
3        [{'positive': 20142, 'negative': 11838, 'neutr...
4        [{'positive': 66399, 'negative': 39051, 'neutr...
                               ...                        
84103    [{'positive': 231898, 'negative': 151592, 'neu...
84114    [{'positive': 80926, 'negative': 57119, 'neutr...
84204    [{'positive': 86325, 'negative': 61237, 'neutr...
84216    [{'positive': 31357, 'negative': 21230, 'neutr...
84376    [{'positive': 28165, 'negative': 30966, 'neutr...
Length: 36610, dtype: object

In [18]:
pred_tuples.iloc[0]

[Counter({'neutral': 235799, 'positive': 151383, 'negative': 79200}),
 Counter({'neutral': 261583, 'positive': 168584, 'negative': 80362}),
 Counter({'neutral': 130661, 'positive': 87477, 'negative': 38540}),
 Counter({'neutral': 338797, 'positive': 233735, 'negative': 105654}),
 Counter({'neutral': 129424, 'positive': 83159, 'negative': 42867}),
 Counter({'neutral': 161330, 'positive': 113752, 'negative': 50056}),
 Counter({'neutral': 89991, 'positive': 48603, 'negative': 26690})]

In [19]:
naive_method = pred_tuples.apply(lambda row: [ent.most_common(1)[0][0] if len(ent.most_common(1)) else None for ent in row])
df_feedback["naive_method"] = naive_method
df_feedback["naive_accuracy"] = df_feedback.apply(
    lambda row: np.sum(
        [
            (row.feedback["entities"][i]["sentiment"].lower() == row.naive_method[i])
            if ("entities" in row.feedback) and ("sentiment" in row.feedback["entities"][i])
            else False for i in range(len(row.naive_method))
        ]) / len(row.naive_method)
    , axis=1
)

  lambda row: np.sum(


In [20]:
df_feedback.naive_accuracy.mean()

0.5092995121201346

In [21]:
total_sentiments = Counter()
for term, counts in term_sentiment.items():
    total_sentiments += counts

total_sentiments

Counter({'neutral': 3789399, 'positive': 1910713, 'negative': 1663099})

In [56]:
df_feedback["weighted_method"] = pred_tuples.apply(lambda row: [
    ["positive", "negative", "neutral"][np.argmax([
        int(ent["positive"]*730 / total_sentiments["positive"]),
        int(ent["negative"]*730 / total_sentiments["negative"]),
        int(ent["neutral"]*1000 / total_sentiments["neutral"])
    ])] for ent in row
]) 

df_feedback["weighted_accuracy"] = df_feedback.apply(
    lambda row: np.sum(
        [
            (row.feedback["entities"][i]["sentiment"].lower() == row.weighted_method[i])
            if ("entities" in row.feedback) and ("sentiment" in row.feedback["entities"][i])
            else False for i in range(len(row.weighted_method))
        ]) / len(row.weighted_method)
    , axis=1
)

df_feedback

  lambda row: np.sum(


Unnamed: 0,time,feedback,content,predictions_raw,naive_method,naive_accuracy,weighted_method,weighted_accuracy
0,2009-06-08 00:00:00,"{'entities': [{'name': 'Serzh Sargsyan', 'sent...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President...","{'serzh sargsyan': {'positive': 151383, 'negat...","[neutral, neutral, neutral, neutral, neutral, ...",0.571429,"[neutral, neutral, neutral, positive, neutral,...",0.571429
1,2009-06-08 00:00:00,"{'entities': [{'name': 'David Hakhverdyan', 's...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co...","{'david hakhverdyan': {'positive': 118572, 'ne...","[neutral, neutral, None, neutral, neutral]",0.200000,"[neutral, neutral, positive, positive, neutral]",0.400000
2,2009-06-04 00:00:00,"{'entities': [{'name': 'World Bank', 'sentimen...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme...","{'world bank': {'positive': 46029, 'negative':...","[neutral, neutral, neutral, neutral, neutral, ...",0.117647,"[positive, positive, neutral, neutral, positiv...",0.588235
3,2008-12-30 00:00:00,{'entities': [{'name': 'CoE Congress of Local ...,"YEREVAN, JUNE 1, ARMENPRESS:The observing miss...",{'coe congress of local and regional authoriti...,"[neutral, neutral, neutral, neutral, neutral, ...",0.076923,"[neutral, neutral, neutral, neutral, neutral, ...",0.076923
4,2009-06-01 00:00:00,"{'entities': [{'name': 'Matthew Bryza', 'senti...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o...","{'matthew bryza': {'positive': 66399, 'negativ...","[neutral, neutral, None, neutral]",0.250000,"[neutral, neutral, positive, neutral]",0.250000
...,...,...,...,...,...,...,...,...
84103,2022-01-20 13:03:00,"{'entities': [{'name': 'EBRD', 'sentiment': 'p...",The European Bank for Reconstruction and Devel...,"{'ebrd': {'positive': 231898, 'negative': 1515...","[neutral, neutral, neutral, None, neutral, neu...",0.000000,"[neutral, neutral, positive, positive, neutral...",0.428571
84114,2022-03-10 16:40:00,"{'entities': [{'name': 'Dmytro Kuleba', 'senti...",Ukrainian Foreign Minister Dmytro Kuleba says ...,"{'dmytro kuleba': {'positive': 80926, 'negativ...","[neutral, neutral, neutral, neutral]",0.500000,"[neutral, neutral, neutral, neutral]",0.500000
84204,2022-11-17 13:23:00,"{'entities': [{'name': 'Philip Reeker', 'senti...",U.S. Secretary of State's adviser on Caucasus ...,"{'philip reeker': {'positive': 86325, 'negativ...","[neutral, neutral, neutral, neutral, neutral, ...",0.300000,"[neutral, neutral, neutral, neutral, neutral, ...",0.300000
84216,2022-12-13 13:48:00,"{'entities': [{'name': 'Arayik Harutyunyan', '...","On December 13, President of the Artsakh Repub...","{'arayik harutyunyan': {'positive': 31357, 'ne...","[neutral, neutral, neutral, neutral]",0.750000,"[neutral, neutral, neutral, neutral]",0.750000


In [57]:
df_feedback.weighted_accuracy.mean()

0.5308964166340528

### Estimating sentiment for all articles

In [24]:
(entity_names := pd.read_pickle(Path("cache") / "top_10k_names.series.pkl").iloc[:500])

0                         russia
1                        armenia
2                  united states
3                     azerbaijan
4                         turkey
                 ...            
495    president bashar al-assad
496             russian language
497                    krasnodar
498              serzh sarkisian
499             shakhtar donetsk
Length: 500, dtype: object

In [59]:
df.content = df.content.str.lower()
df["pred"] = [(lambda x: [])(_) for _ in range(len(df))]

In [60]:
df

Unnamed: 0,time,feedback,content,pred
0,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Serz...","yerevan, june 9, armenpress:armenian president...",[]
1,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Davi...","yerevan, june 9, armenpress:director of the co...",[]
2,2009-06-04 00:00:00,"{""entities"": [\n {""name"": ""World Bank"", ""sent...","yerevan, june 5, armenpress:the credit agreeme...",[]
3,2008-12-30 00:00:00,"{\n ""entities"": [\n {""name"": ""CoE Congress...","yerevan, june 1, armenpress:the observing miss...",[]
4,2009-06-01 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Matt...","baku, june 1, armenpress:osce minsk group is o...",[]
...,...,...,...,...
84375,2023-11-20 18:58:00,,ucom now offers the fastest home internet and ...,[]
84376,2023-11-23 15:16:00,"{\n ""entities"": [\n {\n ""name"": ""Yers...","at the beginning of november, tert.am reported...",[]
84377,2023-11-24 21:44:00,,yesterday tert.am wrote that the zangezur co...,[]
84378,2023-11-24 16:00:00,,"after the coronavirus and the war situation, t...",[]


In [61]:
total_found = Counter()

for i, content in enumerate(df.content.to_list()):
    if (i % 100) == 0:
        print(str(total_found) + f"  {i}/{len(df)}", end="\r")
    entities = [
        ent for ent in entity_names.to_list()
        if content is not None and ent in content
    ]

    for entity_name in entities:
        article_sentences = list(chain.from_iterable(
            [partition.split(".") for partition in content.lower().split(":")]
        ))
        article_sentences = [s.replace(",", "").lower() for s in article_sentences]
    
        # selecting relevant sentences
        relevant_sentances = [
            sent
            for sent
            in article_sentences
            if entity_name in sent
        ]


        # Splitting into words
        relevant_words = list(
            chain.from_iterable(
                [re.findall(r"\w+", sent) for sent in relevant_sentances]
            )
        )

        # filtering stopwords and entity names
        relevant_words = [
            w for w in relevant_words
            if (w not in english_stopwords) and (
                w not in entity_name
            )
        ]

        counts = [Counter(term_sentiment[w]) for w in relevant_words if w in term_sentiment]
        counter = Counter()
        for count in counts:
            counter += count
        
        if len(counter.most_common(1)):
            prediction = ["positive", "negative", "neutral"][np.argmax([
                int(counter["positive"]*730 / total_sentiments["positive"]),
                int(counter["negative"]*730 / total_sentiments["negative"]),
                int(counter["neutral"]*1000 / total_sentiments["neutral"])
            ])]
            total_found += {prediction: 1}

            df.iloc[i, 3].append((entity_name, prediction))

Counter({'neutral': 3537606, 'positive': 545721, 'negative': 115134})  447900/447920

In [62]:
df.to_pickle(Path("cache") / "total_inference.df.pkl")