# Inferring sentiment for the rest of the articles

In [59]:
import numpy as np
import pandas as pd
import re
import sqlite3
from digest_utils import strpdict
from itertools import chain
from collections import Counter
import requests
from tqdm import tqdm
import pickle
from pathlib import Path

np.random.seed(173)

In [2]:
# Connect to local db
db_armenpress = sqlite3.connect("Armenpress/scraping.db")
db_newsam = sqlite3.connect("Newsam/scraping.db")
db_tertam = sqlite3.connect("Tertam/scraping.db")

# loading data into dataframes
df_armenpress = pd.read_sql_query("select * from article", db_armenpress)
df_newsam = pd.read_sql_query("select * from article", db_newsam)
df_tertam = pd.read_sql_query("select * from article", db_tertam)

# parsing datetimes
df_tertam.time = pd.to_datetime(df_tertam.time)
df_newsam.time = pd.to_datetime(df_newsam.time)
df_armenpress.time = pd.to_datetime(df_armenpress.time)

In [3]:
# join all datasets
df = pd.concat([df[["time", "feedback", "content"]] for df in [df_armenpress, df_newsam, df_tertam]], axis=0)
df.feedback.str.replace("\n", "")
(df_feedback := df[~df.feedback.isna()].copy())

Unnamed: 0,time,feedback,content
0,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Serz...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President..."
1,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Davi...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co..."
2,2009-06-04 00:00:00,"{""entities"": [\n {""name"": ""World Bank"", ""sent...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme..."
3,2008-12-30 00:00:00,"{\n ""entities"": [\n {""name"": ""CoE Congress...","YEREVAN, JUNE 1, ARMENPRESS:The observing miss..."
4,2009-06-01 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Matt...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o..."
...,...,...,...
84103,2022-01-20 13:03:00,"{\n ""entities"": [\n {\n ""name"": ""EBRD...",The European Bank for Reconstruction and Devel...
84114,2022-03-10 16:40:00,"{\n ""entities"": [\n {\n ""name"": ""Dmyt...",Ukrainian Foreign Minister Dmytro Kuleba says ...
84204,2022-11-17 13:23:00,"{\n ""entities"": [\n {\n ""name"": ""Phil...",U.S. Secretary of State's adviser on Caucasus ...
84216,2022-12-13 13:48:00,"{\n ""entities"": [\n {\n ""name"": ""Aray...","On December 13, President of the Artsakh Repub..."


In [4]:
# parse feedbacks
df_feedback.feedback = df_feedback.feedback.apply(strpdict)

unexpected indent (<unknown>, line 20)
'{' was never closed (<unknown>, line 33)
'{' was never closed (<unknown>, line 18)
'{' was never closed (<unknown>, line 18)
'[' was never closed (<unknown>, line 2)
'{' was never closed (<unknown>, line 28)
'{' was never closed (<unknown>, line 8)
'{' was never closed (<unknown>, line 13)
'{' was never closed (<unknown>, line 13)


In [5]:
# filter down to usable feedbacks
(df_feedback := df_feedback[
    ~df_feedback.feedback.isna() & (
        df_feedback.feedback.apply(
            lambda x: hasattr(x, "keys") and "entities" in x.keys()
            )
        )
    ]
)

Unnamed: 0,time,feedback,content
0,2009-06-08 00:00:00,"{'entities': [{'name': 'Serzh Sargsyan', 'sent...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President..."
1,2009-06-08 00:00:00,"{'entities': [{'name': 'David Hakhverdyan', 's...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co..."
2,2009-06-04 00:00:00,"{'entities': [{'name': 'World Bank', 'sentimen...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme..."
3,2008-12-30 00:00:00,{'entities': [{'name': 'CoE Congress of Local ...,"YEREVAN, JUNE 1, ARMENPRESS:The observing miss..."
4,2009-06-01 00:00:00,"{'entities': [{'name': 'Matthew Bryza', 'senti...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o..."
...,...,...,...
84103,2022-01-20 13:03:00,"{'entities': [{'name': 'EBRD', 'sentiment': 'p...",The European Bank for Reconstruction and Devel...
84114,2022-03-10 16:40:00,"{'entities': [{'name': 'Dmytro Kuleba', 'senti...",Ukrainian Foreign Minister Dmytro Kuleba says ...
84204,2022-11-17 13:23:00,"{'entities': [{'name': 'Philip Reeker', 'senti...",U.S. Secretary of State's adviser on Caucasus ...
84216,2022-12-13 13:48:00,"{'entities': [{'name': 'Arayik Harutyunyan', '...","On December 13, President of the Artsakh Repub..."


In [13]:
# words to count for bag of words
words_to_count = Counter()
ent_name = df_feedback.feedback.iloc[0]["entities"][0]["name"]
feedback = df_feedback.feedback.iloc[0]["entities"][0]["explanation"]
article_sentences = list(chain.from_iterable(
    [partition.split(".") for partition in df_feedback.content.iloc[0].split(":")]
))
article_sentences = [s.replace(",", "").lower() for s in article_sentences]
relevant_sentances = [feedback] + [
    sent
    for sent
    in article_sentences
    if ent_name.lower() in sent
]

relevant_words = list(
    chain.from_iterable(
        [re.findall(r"\w+", sent) for sent in relevant_sentances]
    )
)

# Split the sentances that mentio the entity into words
words_to_count += Counter(relevant_words)
words_to_count

Counter({'the': 10,
         'of': 4,
         'as': 3,
         'received': 2,
         'armenian': 2,
         'president': 2,
         'serzh': 2,
         'sargsyan': 2,
         'director': 2,
         'france': 2,
         'telecom': 2,
         'orange': 2,
         'company': 2,
         'that': 2,
         'in': 2,
         'mentioned': 1,
         'Armenian': 1,
         'President': 1,
         'who': 1,
         'executives': 1,
         'today': 1,
         'olaf': 1,
         'swantee': 1,
         'executive': 1,
         'vice': 1,
         'and': 1,
         'bruno': 1,
         'dutua': 1,
         'chief': 1,
         'armenia': 1,
         'we': 1,
         'are': 1,
         'interested': 1,
         'one': 1,
         'world': 1,
         's': 1,
         'leading': 1,
         'operators': 1,
         'telecommunication': 1,
         'sphere': 1,
         'starts': 1,
         'its': 1,
         'activity': 1,
         'our': 1,
         'state': 1,
         'soo

Ignoring stopwords

In [23]:
(english_stopwords := requests.get(
    "https://gist.githubusercontent.com"
    "/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089"
    "c9821dbcf6d0c786c/NLTK's%2520list%2520of%252"
    "0english%2520stopwords"
).content.decode().split("\n"))[40:50]

['was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do']

In [29]:
{
    k.lower(): v
    for k, v in words_to_count.items()
    if (
        k.lower() not in english_stopwords
    ) and (
        k.lower() not in ent_name.lower()
    )
}

{'mentioned': 1,
 'armenian': 2,
 'president': 2,
 'received': 2,
 'executives': 1,
 'today': 1,
 'olaf': 1,
 'swantee': 1,
 'executive': 1,
 'vice': 1,
 'director': 2,
 'france': 2,
 'telecom': 2,
 'orange': 2,
 'company': 2,
 'bruno': 1,
 'dutua': 1,
 'chief': 1,
 'armenia': 1,
 'interested': 1,
 'one': 1,
 'world': 1,
 'leading': 1,
 'operators': 1,
 'telecommunication': 1,
 'sphere': 1,
 'starts': 1,
 'activity': 1,
 'state': 1,
 'soon': 1,
 'possible': 1,
 'pointed': 1,
 'noting': 1,
 'government': 1,
 'ready': 1,
 'assist': 1,
 'solution': 1,
 'issues': 1,
 'arouse': 1,
 'works': 1}

Wrapping it all in a function

In [64]:
words_to_count = {
    "positive": Counter(),
    "negative": Counter(),
    "neutral": Counter(),
}

ent_datas = df_feedback.feedback.iloc[0]["entities"]
def extract_word_counts(obs: dict, content: str):
    """To be applied (not the built-in way) on a pandas series of
    dictionaries containing entity data. Counts the words around
    the mentioned entity and while keeping track of the sentiment

    Args:
        obs (dict): observation that has the key `entities` which
        is a list of dicts that have the keys `name`, `sentiment`,
        and `explanation`.
        content (str): the text of the article.
    """
    if "entities" not in obs:
        print("Obs. missing key entities")

    for ent_data in obs["entities"]:
        if (type(ent_data) is not dict) or any(
            [k not in ent_data for k in ["name", "sentiment"]]
        ) or (
            ent_data["sentiment"].lower() not in ["positive", "negative", "neutral"]
        ):
            continue
        

        ent_name = ent_data["name"].lower()
        feedback = ent_data["explanation"].lower() if "explanation" in ent_data else ""
        sentiment = ent_data["sentiment"].lower()

        # splitting articles into sentences
        article_sentences = list(chain.from_iterable(
            [partition.split(".") for partition in content.split(":")]
        ))
        article_sentences = [s.replace(",", "").lower() for s in article_sentences]
        
        # selecting relevant sentences
        relevant_sentances = [feedback] + [
            sent
            for sent
            in article_sentences
            if ent_name in sent
        ]

        # Splitting into words
        relevant_words = list(
            chain.from_iterable(
                [re.findall(r"\w+", sent) for sent in relevant_sentances]
            )
        )

        # filtering stopwords and entity names
        relevant_words = [
            w for w in relevant_words
            if (w not in english_stopwords) and (
                w not in ent_name
            )
        ]

        # Split the sentances that mentio the entity into words
        words_to_count[sentiment] += Counter(relevant_words)

In [65]:
for i, obs in tqdm(df_feedback.iterrows()):
    extract_word_counts(obs["feedback"], obs["content"])

36610it [08:07, 75.09it/s] 


In [66]:
with open(Path("cache") / "words_count.Counter.pkl", "wb") as f:
    pickle.dump(words_to_count, f)

In [67]:
with open(Path("cache") / "words_count.Counter.pkl", "rb") as f:
    t = pickle.load(f)