In [1]:
import json
from tqdm import tqdm
import multiprocessing as mp

from collections import Counter

from declarations_new.ledger import Universe
from declarations_new.entities import EntityUniverse, EntityInstance
from declarations_new.corpus import EmailCorpus, Conversation
from declarations_new.emails import Email
from declarations_new.topics import TopicModel, TopicInstance
from declarations_new.entity_linking import EntityLinker

# Load Conversation objects and construct Triples
---

In [2]:
# Which mailing-list archive to load; the JSON is read from email_data/<mailinglist>/all.json.
mailinglist = "public-credentials" # "ietf-http-wg"

with open(f"email_data/{mailinglist}/all.json") as handle:
    mail_dicts = json.load(handle)

# Flatten the two-level mapping (outer key -> subject -> mails) into one list of
# (subject, mails) tuples; the outer key itself is not needed.
convos = []
for subj_d in mail_dicts.values():
    convos.extend(subj_d.items())

def to_conv(tup):
    """Build a Conversation from one entry of `convos`.

    The tuple is unpacked positionally into `Conversation.from_email_dicts`,
    and whatever that constructor returns is handed back unchanged.
    """
    conversation = Conversation.from_email_dicts(*tup)
    return conversation

# Convert every (subject, mails) tuple into a Conversation, showing a progress bar.
conversations = [to_conv(pair) for pair in tqdm(convos)]

corpus = EmailCorpus.from_conversations(conversations, vectorise_default=True)

# Summary: corpus length and email count, then the first and last timestamps.
print(len(corpus), ", ", corpus.n_emails)
print(corpus.start_time, ", ", corpus.end_time)

100%|██████████| 1888/1888 [00:08<00:00, 214.47it/s]


1888 ,  6260
2014-08-18 23:39:41-04:00 ,  2020-05-13 19:53:51+03:00


# Topic Modelling

In [None]:
# Build the topic model over the corpus. The positional 20 is presumably the number
# of topics (compare the candidate counts tried in the next cell) — TODO confirm.
# NOTE(review): max_iter=1 looks like a quick-run setting; confirm before relying on results.
lda = TopicModel(corpus, 20, max_iter=1)

# Attach topic assignments at both granularities: individual emails and whole conversations.
lda.assign_topics_to_emails()
lda.assign_topics_to_conversations()

In [None]:
# Try the candidate component counts 10, 15, 20 and 25. The result is read with
# .items() in the following cell, so it is a mapping keyed by candidate count.
models = lda.determine_n_components(range(10, 30, 5))

In [None]:
# Pair each candidate count with its model's `bound_` attribute for side-by-side comparison.
[(n_components, model.bound_) for n_components, model in models.items()]

### Collect Conversations and Emails assigned to each Topic


In [None]:
# One empty bucket per topic, kept separately for emails and for conversations.
emails_per_topic = {topic: [] for topic in lda.topics}
convos_per_topic = {topic: [] for topic in lda.topics}

for subject, evidence in Universe.evidenced_by.items():
    # Only entries keyed by a TopicInstance are relevant here.
    if not isinstance(subject, TopicInstance):
        continue
    for item in evidence:
        # Pick the bucket by the kind of evidence; anything else is unexpected.
        if isinstance(item, Conversation):
            buckets = convos_per_topic
        elif isinstance(item, Email):
            buckets = emails_per_topic
        else:
            raise ValueError("Neither Conversation or Email!")
        buckets[subject.topic].append(item)

print("Conversations per Topic:\n\t",
      [(topic.index, len(members)) for topic, members in convos_per_topic.items()], "\n")
print("Emails per Topic:\n\t",
      [(topic.index, len(members)) for topic, members in emails_per_topic.items()])

# Entity Recognition and Linking

In [None]:
linker = EntityLinker()

# Trial run on a small sample: only the first ten registered entities are passed on.
first_entities = list(EntityUniverse.entities.values())[:10]
linker.to_WikiData_entities(first_entities)

In [None]:
# Run the linker over the whole corpus. Judging by the method name this annotates
# email bodies with entities — NOTE(review): confirm against EntityLinker.
linker.enrich_email_bodies(corpus)

In [None]:
# Spot-check: entities attached to the body of the first email in corpus item j.
j = 10
corpus[j][0].body.entities  # to drill into one hit, append: [0].entity.label

In [None]:
# Quick sanity check on the sizes of Universe.evidenced_by, Universe.mentioned_in
# and EntityUniverse.entities.
len(Universe.evidenced_by), len(Universe.mentioned_in), len(EntityUniverse.entities)

In [None]:
# Entities whose concrete class is exactly EntityInstance (subclasses are excluded on purpose).
[entity for entity in EntityUniverse.entities.values() if type(entity) is EntityInstance]

In [None]:
# Entities whose instance_score has been set (i.e. is not None).
[e for e in EntityUniverse.entities.values() if e.instance_score is not None]

In [None]:
# Tally the registered entities by concrete class name.
# Counter is already imported in the notebook's first cell.
Counter(type(entity).__name__ for entity in EntityUniverse.entities.values())

# Neo4j
---

In [4]:
from neo4j import GraphDatabase
from neo4j_defs2 import *
import os

# Connection settings are read from the environment so real credentials never have to
# be committed with the notebook. The fallbacks reproduce the original local values,
# so an unconfigured environment behaves exactly as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "pwd"),
)
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH, encrypted=False)

In [None]:
def clear(tx):
    """Empty the graph by running a single Cypher statement.

    The statement matches every node and detach-deletes it. `tx` is the
    transaction object supplied by the caller; only its `run` method is used,
    and nothing is returned.
    """
    wipe_everything = """MATCH (x)
            DETACH DELETE x"""
    tx.run(wipe_everything)

    
# Rebuild the graph from scratch: wipe it, then write the first 50 corpus items
# (each conversation followed by its emails), then every registered entity.
with driver.session() as session:
    session.write_transaction(clear)
    for conv in tqdm(corpus[:50]):
        session.write_transaction(put_conversation, conv)
        for email in conv:
            session.write_transaction(put_email, email)

    # EntityUniverse.entities is a mapping (every other cell reads it via .values()).
    # Iterating it directly would hand the keys to put_entity and record the key's
    # class name instead of the entity's, so iterate the values explicitly.
    for entity in tqdm(EntityUniverse.entities.values()):
        session.write_transaction(put_entity, entity, entity.__class__.__name__)
            

# with driver.session() as session:
#     session.write_transaction(clear)
#     for c in tqdm(convos[:100]):
#         session.write_transaction(add_conversation, c)
#         session.write_transaction(add_documents, c)
#         session.write_transaction(add_mentions, c)
            
#         for email in c:
#             session.write_transaction(add_person,email.sender)
#             session.write_transaction(add_person,email.receiver)
#             session.write_transaction(add_talked_to,email.sender,email.receiver)
            
# #             session.write_transaction(add_named_entities, email)
            
#         for i in c.interlocutors:
#             session.write_transaction(connect_conversation, i.name, c)
            
            
#     sorted_convos = sorted(convos, key=lambda (c1, c2): c1 <)
#     pairs = zip(sorted_convos[:-1], sorted_convos[1:])
    
#     for c1, c2 in pairs:
#         session.write_transaction(add_earlier_than, email)

---
# Identify Quoted Texts


In [None]:
from Levenshtein import distance as levenshtein

In [None]:
# `convos` holds raw (subject, mail_dicts) tuples, which have no `.body`.
# The parsed Conversation objects live in `conversations`, so iterate those instead.
for e in conversations[1]:
    print(e.body)
    print("____________________\n\n\n\n\n____________________")

In [None]:
i = 3
# Work on the parsed Conversation, not the raw (subject, mail_dicts) tuple in `convos`.
# list() is used because only iteration over a Conversation is known to work;
# it makes the negative index and the slice below safe regardless.
thread = list(conversations[i])
latest, earlier = thread[-1], thread[:-1]

# NOTE(review): assumes Email.body is str-like (supports .split) — confirm.
for line in latest.body.split("\n"):
    if not line.strip():
        continue
    print(":", line)

    # Compare against the earlier emails of the SAME conversation. The original
    # looked at conversation 1 regardless of i.
    for e_ in earlier:
        # An earlier line counts as quoted when its edit distance to `line` is
        # below half the length of the shorter of the two.
        quoted = [
            other for other in e_.body.split("\n")
            if levenshtein(line, other) < (min(len(line), len(other)) / 2)
        ]

        print(quoted)

    print("\n---")


In [None]:
# `convos` holds raw (subject, mail_dicts) tuples, which have neither `.sender` nor
# `.body`. Iterate the parsed Conversation from `conversations` instead.
# NOTE(review): assumes Email exposes `sender` and `receiver` — confirm.
for e in conversations[1]:
    print(e.sender, e.receiver)
    print(e.body)

    print("__________________\n\n\n__________________")