# Cooccurance/relationship mining

In [2]:
import numpy as np
import pandas as pd
import re
import sqlite3
import time
from matplotlib import pyplot as plt
from datetime import datetime
import seaborn as sns
import ast
import json
from collections import defaultdict

np.random.seed(173)

In [3]:
db_armenpress = sqlite3.connect("Armenpress/scraping.db")
db_newsam = sqlite3.connect("Newsam/scraping.db")
db_tertam = sqlite3.connect("Tertam/scraping.db")

In [4]:
df_armenpress = pd.read_sql_query("select * from article", db_armenpress)
df_newsam = pd.read_sql_query("select * from article", db_newsam)
df_tertam = pd.read_sql_query("select * from article", db_tertam)

In [5]:
df_tertam.time = pd.to_datetime(df_tertam.time)
df_newsam.time = pd.to_datetime(df_newsam.time)
df_armenpress.time = pd.to_datetime(df_armenpress.time)

In [6]:
df_armenpress

Unnamed: 0,id,headline,time,content,feedback
0,218011,AREMNIAN PRESIDENT SERZH SARGSYAN RECEIVES EXE...,2009-06-08 00:00:00,"YEREVAN, JUNE 9, ARMENPRESS:Armenian President...","{\n ""entities"": [\n {\n ""name"": ""Serz..."
1,218012,ISSUES ON DEVELOPMENT OF AGRICULTURE SPHERE DI...,2009-06-08 00:00:00,"YEREVAN, JUNE 9, ARMENPRESS:Director of the Co...","{\n ""entities"": [\n {\n ""name"": ""Davi..."
2,218013,CREDIT AGREEMENT OF 25 MILLION USD SIGNED WITH...,2009-06-04 00:00:00,"YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme...","{""entities"": [\n {""name"": ""World Bank"", ""sent..."
3,218014,INTERNATIONAL OBSERVERS REGISTER THAT THE GENE...,2008-12-30 00:00:00,"YEREVAN, JUNE 1, ARMENPRESS:The observing miss...","{\n ""entities"": [\n {""name"": ""CoE Congress..."
4,218015,MATTHEW BRYZA SAYS OSCE MINSK GROUP CO-CHAIRS ...,2009-06-01 00:00:00,"BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o...","{\n ""entities"": [\n {\n ""name"": ""Matt..."
...,...,...,...,...,...
144134,1124334,Armenia and Tunisia to lift visa requirements ...,2023-11-17 11:06:31,"YEREVAN, NOVEMBER 17, ARMENPRESS. Armenia and ...",
144135,1124335,Shoghakat Vardanyan’s war documentary ‘1489’ w...,2023-11-17 11:20:02,"YEREVAN, NOVEMBER 17, ARMENPRESS. Armenian fil...",
144136,1124340,"Daniel Hilaire: “At Fast Bank, we are continuo...",2023-11-17 11:49:17,"YEREVAN, NOVEMBER 17, ARMENPRESS. Digital bank...",
144137,1124341,Yerevan police officer injured while respondin...,2023-11-17 11:51:28,"YEREVAN, NOVEMBER 17, ARMENPRESS. One of the t...",


In [7]:
df = pd.concat([df[["time", "feedback", "content"]] for df in [df_armenpress, df_newsam, df_tertam]], axis=0)
df.feedback.str.replace("\n", "")
(df_feedback := df[~df.feedback.isna()])

Unnamed: 0,time,feedback,content
0,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Serz...","YEREVAN, JUNE 9, ARMENPRESS:Armenian President..."
1,2009-06-08 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Davi...","YEREVAN, JUNE 9, ARMENPRESS:Director of the Co..."
2,2009-06-04 00:00:00,"{""entities"": [\n {""name"": ""World Bank"", ""sent...","YEREVAN, JUNE 5, ARMENPRESS:The credit agreeme..."
3,2008-12-30 00:00:00,"{\n ""entities"": [\n {""name"": ""CoE Congress...","YEREVAN, JUNE 1, ARMENPRESS:The observing miss..."
4,2009-06-01 00:00:00,"{\n ""entities"": [\n {\n ""name"": ""Matt...","BAKU, JUNE 1, ARMENPRESS:OSCE Minsk group is o..."
...,...,...,...
84103,2022-01-20 13:03:00,"{\n ""entities"": [\n {\n ""name"": ""EBRD...",The European Bank for Reconstruction and Devel...
84114,2022-03-10 16:40:00,"{\n ""entities"": [\n {\n ""name"": ""Dmyt...",Ukrainian Foreign Minister Dmytro Kuleba says ...
84204,2022-11-17 13:23:00,"{\n ""entities"": [\n {\n ""name"": ""Phil...",U.S. Secretary of State's adviser on Caucasus ...
84216,2022-12-13 13:48:00,"{\n ""entities"": [\n {\n ""name"": ""Aray...","On December 13, President of the Artsakh Repub..."


In [10]:
def strpdict(string: str) -> dict:
    try:
        return ast.literal_eval(string)
    except Exception as e:
        print(e)

(dicts := df_feedback.feedback.apply(strpdict))

unexpected indent (<unknown>, line 20)
'{' was never closed (<unknown>, line 33)
'{' was never closed (<unknown>, line 18)
'{' was never closed (<unknown>, line 18)
'[' was never closed (<unknown>, line 2)
'{' was never closed (<unknown>, line 28)
'{' was never closed (<unknown>, line 8)
'{' was never closed (<unknown>, line 13)
'{' was never closed (<unknown>, line 13)


0        {'entities': [{'name': 'Serzh Sargsyan', 'sent...
1        {'entities': [{'name': 'David Hakhverdyan', 's...
2        {'entities': [{'name': 'World Bank', 'sentimen...
3        {'entities': [{'name': 'CoE Congress of Local ...
4        {'entities': [{'name': 'Matthew Bryza', 'senti...
                               ...                        
84103    {'entities': [{'name': 'EBRD', 'sentiment': 'p...
84114    {'entities': [{'name': 'Dmytro Kuleba', 'senti...
84204    {'entities': [{'name': 'Philip Reeker', 'senti...
84216    {'entities': [{'name': 'Arayik Harutyunyan', '...
84376    {'entities': [{'name': 'Yershova', 'sentiment'...
Name: feedback, Length: 36827, dtype: object

In [40]:
(dicts := dicts[dicts.apply(lambda x: hasattr(x, "keys") and "entities" in x.keys())])

0        {'entities': [{'name': 'Serzh Sargsyan', 'sent...
1        {'entities': [{'name': 'David Hakhverdyan', 's...
2        {'entities': [{'name': 'World Bank', 'sentimen...
3        {'entities': [{'name': 'CoE Congress of Local ...
4        {'entities': [{'name': 'Matthew Bryza', 'senti...
                               ...                        
84103    {'entities': [{'name': 'EBRD', 'sentiment': 'p...
84114    {'entities': [{'name': 'Dmytro Kuleba', 'senti...
84204    {'entities': [{'name': 'Philip Reeker', 'senti...
84216    {'entities': [{'name': 'Arayik Harutyunyan', '...
84376    {'entities': [{'name': 'Yershova', 'sentiment'...
Name: feedback, Length: 36610, dtype: object

In [59]:
# defining a preset for accounting connections by sentiment
entities = defaultdict(lambda: {
    "positive": 0,
    "negative": 0,
    "neutral": 0,
    "connections": [{}, {}, {}]  # positionally correspondive
})

possible_sentiments = ["positive", "negative", "neutral"]

def account_response(resp: dict):
    # preprocess name and sentiment

    entity_sentiments = {}

    for ent in resp["entities"]:
        if (
            (type(ent) is not dict) or ("sentiment" not in ent) or 
            ((other_sent := ent["sentiment"].strip().lower()) not in possible_sentiments) or
            ("name" not in ent)
        ):
            continue

        entity_sentiments[ent["name"].strip().lower()] = other_sent

    for name, sentiment in entity_sentiments.items():
        # account sentiment
        if sentiment in entities[name]:
            entities[name][sentiment] += 1
        else:
            entities[name][sentiment] = 1

        # account connections by sentiment
        for other_name, other_sentiment in [(n, s) for n, s in entity_sentiments.items() if n != name]:
            if other_name in entities[name]["connections"][possible_sentiments.index(other_sentiment)]:
                entities[name]["connections"][possible_sentiments.index(other_sentiment)][other_name] += 1
            else:
                entities[name]["connections"][possible_sentiments.index(other_sentiment)][other_name] = 1

dicts.apply(account_response)
entities

defaultdict(<function __main__.<lambda>()>,
            {'serzh sargsyan': {'positive': 320,
              'negative': 85,
              'neutral': 691,
              'connections': [{'olaf swantee': 1,
                'france telecom-orange': 1,
                'armenian-french economic cooperation': 1,
                'russia': 182,
                'armenia': 295,
                'edward nalbandyan': 5,
                'seyran ohanyan': 4,
                'dmitry medvedev': 62,
                'armenian president': 3,
                'aram khachatrian': 1,
                'yuri grigorovich': 1,
                'united states': 64,
                'millennium challenge program': 1,
                'georgi boss': 1,
                'kaliningrad': 1,
                'russian citizens': 1,
                'usa': 6,
                'independence day': 1,
                'armenian-american cooperation': 1,
                'armenian community in usa': 1,
                'american-armenian o

Number of entities mentioned at least ten times:

In [60]:
len([name for name, stats in entities.items() if (
    (stats["positive"] if type(stats["positive"]) is int else 0) + 
    (stats["negative"] if type(stats["negative"]) is int else 0) +
    (stats["neutral"] if type(stats["neutral"]) is int else 0)) > 10])

1777

Number of entities mentioned at least once:

In [61]:
len(entities)

56945

In [63]:
import os
from pathlib import Path
import pickle

if not os.path.exists((cache := Path("cache"))):
    os.mkdir(cache)

with open(cache / "entties.dict.pkl", "wb") as f:
    pickle.dump({k: v for k, v in entities.items()}, f)