In [1]:
# Bring both dataset helpers into scope with a single import.
from relatio.datasets import list_datasets, load_trump_data

# Print the catalogue of available datasets.
print(list_datasets())

# Fetch the "raw" variant of the Trump dataset.
df = load_trump_data("raw")


    List of available datasets:

    Trump Tweet Archive
    - function call: load_trump_data()
    - format: 'raw', 'split_sentences', 'srl_res'
    - allennlp version: 0.9
    - srl model: srl-model-2018.05.25.tar.gz
    


In [2]:
# Sentence splitting and SRL inputs

# Import only the name that is actually used instead of `import *`.
from relatio.preprocessing import Preprocessor

p = Preprocessor(spacy_model = "en_core_web_lg")

# Demonstrate sentence splitting on the first 100 rows of `df`. The result is
# stored under its own name so that it is not silently overwritten by the
# precomputed data loaded below.
split_sentences_demo = p.split_into_sentences(
    df.iloc[0:100], output_path=None, progress_bar=True
)

# As sentence splitting and SRL are time-consuming, we download the precomputed
# results from the datasets module.

split_sentences = load_trump_data("split_sentences")
# Only the first 1000 SRL results are kept to keep the example fast.
srl_res = load_trump_data("srl_res")[0:1000]

Splitting into sentences...










100%|████████████████████████████████████████| 100/100 [00:00<00:00, 276.88it/s]


In [3]:
import spacy
# spaCy's English stop words are a static set attached to the language class;
# it is the same set exposed as `.Defaults.stop_words` on a loaded English
# pipeline. Importing it directly avoids loading the large `en_core_web_lg`
# model a second time just to read it.
from spacy.lang.en.stop_words import STOP_WORDS

# Sorted so the list is identical from run to run (set iteration order is not).
stopwords = sorted(STOP_WORDS)

In [4]:
# Step 1: pull the selected semantic roles out of the SRL output.
roles, sentence_index = p.extract_roles(
    srl_res,
    progress_bar=True,
    used_roles=["ARG0", "B-V", "B-ARGM-NEG", "B-ARGM-MOD", "ARG1", "ARG2"],
)

# Peek at the first five extracted statements.
for d in roles[:5]:
    print(d)

# Step 2: clean the roles (punctuation, digits, case, lemmas, stop words) and
# keep only the listed part-of-speech tags for each role.
postproc_roles = p.process_roles(
    roles,
    dict_of_pos_tags_to_keep={
        "ARG0": ["NOUN", "PROPN"],
        "B-V": ["VERB"],
        "ARG1": ["NOUN", "PROPN"],
        "ARG2": ["NOUN", "PROPN"],
    },
    stop_words=stopwords,
    lemmatize=True,
    lowercase=True,
    remove_digits=True,
    remove_punctuation=True,
    progress_bar=True,
)

# Peek at the same five statements after cleaning.
for d in postproc_roles[:5]:
    print(d)

Extracting semantic roles...


100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 36128.83it/s]


{'B-V': 'have'}
{'ARG0': 'Republicans and Democrats', 'ARG1': 'our economic problems', 'B-V': 'created'}
{'ARG1': 'I', 'ARG2': 'thrilled to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'was'}
{'ARG1': 'I', 'ARG2': 'to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'thrilled'}
{'ARG1': 'I', 'ARG2': 'back in the Great city of Charlotte , North Carolina', 'B-V': 'be'}
Cleaning semantic roles...


100%|██████████████████████████████████████| 2088/2088 [00:14<00:00, 139.53it/s]

{'B-V': ''}
{'ARG0': 'republicans democrats', 'ARG1': 'problem', 'B-V': 'create'}
{'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america', 'B-V': ''}
{'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america', 'B-V': 'thrill'}
{'ARG1': '', 'ARG2': 'city charlotte north carolina', 'B-V': ''}





In [5]:
# Reuse the Preprocessor `p` created earlier in the notebook rather than
# instantiating a second, identically configured one (which would reload the
# large spaCy model for no benefit).
# Only the first 1000 entries of `split_sentences[1]` are mined to keep the run short.
named_entities = p.mine_entities(split_sentences[1][0:1000], progress_bar = True)

# Show the ten most frequent entities together with their counts.
for n in named_entities.most_common(10):
    print(n)

Mining named entities...


100%|██████████████████████████████████████| 1000/1000 [00:04<00:00, 203.44it/s]

('Pennsylvania', 23)
('Biden', 20)
('BreitbartNews', 19)
('Republicans', 17)
('Joe Biden', 17)
('Georgia', 17)
('Trump', 16)
('Democrats', 13)
('Wisconsin', 10)
('Arizona', 10)





In [6]:
# Lower-case the 100 most frequent entities and drop duplicates created by the
# lower-casing. `dict.fromkeys` keeps first-seen (most-frequent-first) order, so
# the list is reproducible across runs, unlike `list(set(...))` whose order
# changes with string hash randomisation.
known_entities = list(
    dict.fromkeys(entity.lower() for entity, _count in named_entities.most_common(100))
)

In [7]:
# Import only the class that is used instead of `import *`.
from relatio.narrative_models import NarrativeModel

# Deterministic model: known entities are matched via embeddings.
m = NarrativeModel(model_type = 'deterministic',
                   # Role labels must be spelled exactly as they were extracted
                   # ('B-ARGM-MOD', not 'ARGM-MOD'); a misspelt label would not
                   # correspond to any key in the role dictionaries.
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'embeddings',
                   roles_with_embeddings = [['ARG0','ARG1','ARG2']],
                   threshold = 1)

# Kept for symmetry with the other model types; for a deterministic model this
# call only reports that no training is required.
m.train(postproc_roles)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = True)

# With prettify=True each narrative is printed as a plain string.
for n in narratives[0:10]:
    print(n)

No training required: the model is deterministic.


100%|██████████████████████████████████████| 2088/2088 [00:05<00:00, 406.93it/s]


democrats create carney
mike pence
thrill mike pence
north carolina
foxnews
foxnews love state
foxnews cherish mark
foxnews respect court
foxnews america





In [8]:
# Import only the class that is used instead of `import *`.
from relatio.narrative_models import NarrativeModel

# Static model: roles are clustered separately for each group listed in
# `roles_with_embeddings`, with the matching entry of `n_clusters` giving the
# number of clusters for that group (10 for ARG0, 20 for ARG1/ARG2).
m = NarrativeModel(model_type = 'static',
                   # Role labels must be spelled exactly as they were extracted
                   # ('B-ARGM-MOD', not 'ARGM-MOD'); a misspelt label would not
                   # correspond to any key in the role dictionaries.
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'embeddings', # regex breaks down here
                   roles_with_embeddings = [['ARG0'],['ARG1','ARG2']], # [['ARG0','ARG1','ARG2']]
                   embeddings_model = None,
                   threshold = 1,
                   n_clusters = [10,20]) # [100]

m.train(postproc_roles, progress_bar = True, verbose = 0)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = False)

# With prettify=False each narrative is printed as a role -> phrase dictionary.
for n in narratives[0:10]:
    print(n)

Focus on roles: ARG0
Ignoring known entities...
Embedding relevant phrases...
Clustering phrases into 10 clusters...
Labeling the clusters by the most frequent phrases...
Focus on roles: ARG1-ARG2
Ignoring known entities...




Embedding relevant phrases...
Clustering phrases into 20 clusters...




Labeling the clusters by the most frequent phrases...


100%|██████████████████████████████████████| 2088/2088 [00:05<00:00, 408.51it/s]

{'B-V': ''}
{'ARG0': 'democrats', 'ARG1': 'carney', 'B-V': 'create'}
{'ARG2': 'mike pence', 'B-V': ''}
{'ARG2': 'mike pence', 'B-V': 'thrill'}
{'ARG2': 'north carolina', 'B-V': ''}
{'ARG0': 'foxnews', 'B-V': ''}
{'ARG0': 'foxnews', 'ARG1': 'state', 'B-V': 'love'}
{'ARG0': 'foxnews', 'ARG1': 'mark', 'B-V': 'cherish'}
{'ARG0': 'foxnews', 'ARG1': 'court', 'B-V': 'respect'}
{'ARG0': 'foxnews', 'ARG1': 'america', 'B-V': ''}





In [9]:
# Import only the class that is used instead of `import *`.
from relatio.narrative_models import NarrativeModel

# Dynamic model: known entities are matched with the 'regex' strategy.
m = NarrativeModel(model_type = 'dynamic',
                   # Role labels must be spelled exactly as they were extracted
                   # ('B-ARGM-MOD', not 'ARGM-MOD'); a misspelt label would not
                   # correspond to any key in the role dictionaries.
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'B-ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'regex',
                   roles_with_embeddings = [['ARG0','ARG1','ARG2']],
                   threshold = 1)

m.train(postproc_roles, progress_bar = True)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = True)

# With prettify=True each narrative is printed as a plain string.
for n in narratives[0:10]:
    print(n)



100%|██████████████████████████████████████| 2088/2088 [00:03<00:00, 619.96it/s]
100%|██████████████████████████████████████| 2088/2088 [00:03<00:00, 638.81it/s]


democrats create election
north carolina
thrill north carolina
north carolina
people
people love election
people cherish election
people respect election
people america





In [10]:
from collections import Counter

# Tally the narratives made of more than three whitespace-separated tokens.
temp = Counter(n for n in narratives if len(n.split()) > 3)

# Report the ten most frequent ones as (narrative, count) pairs.
for t in temp.most_common(10):
    print(t)

('happen ballot fulton county', 3)
('voter fraud voter fraud', 2)
('ballot fulton county election', 2)
('fake news voter fraud', 2)
('ballot fulton county watch', 2)
('need ballot fulton county', 2)
('voter fraud ballot fulton county', 2)
('election open voter fraud', 2)
('joe ballot fulton county', 2)
('help ballot fulton county', 2)


In [11]:
# Display the labels stored on the most recently trained model `m`.
# NOTE(review): this reaches into the private `_model_obj` attribute; the output
# appears to be a list of {cluster index: label phrase} mappings — confirm
# whether a public accessor exists.
m._model_obj.labels

[{0: 'election',
  1: 'people',
  2: 'voter fraud',
  3: 'envelope',
  4: 'plus amp minus',
  5: 'medium',
  6: 'us_fda',
  7: 'million life',
  8: 'ballot fulton county',
  9: 'fake news',
  10: 'military',
  11: 'hoax',
  12: 'odd',
  13: 'election rigged',
  14: 'european countries',
  15: 'usb drive',
  16: 'patriot',
  17: 'viewer',
  18: 'sampling',
  19: 'vote people',
  20: 'pastordscotts tonight',
  21: 'complete total endorsement',
  22: 'm',
  23: 'taxis',
  24: 'boffo',
  25: 'philadelpiha',
  26: 'great loudobbs',
  27: 'emily murphy'}]

In [12]:
# Add user-written functions for the preprocessor --> discuss with Andrei
# Add graphs
# Add verbs
# Add model validation (/!\ this is really important /!\)

# For the dynamic version, how do we make sure the model doesn't get too massive?
# Should we allow users to cluster verbs though we know it's a bad idea?

In [13]:
# Differences with the previous wrapper
# save stuff in a folder during training phase (output_path)
# handling of verbs (dimension_reduce_verbs)
# fit multiple kmeans models (n_clusters as a list of lists) --> could prove useful for model validation
# document tracking (doc, sentence, statement, narrative)

In [14]:
# test "dynamic" (OK)
# test "deterministic" (OK)
# test differentiated clusters roles_with_embeddings = [['ARG0'], ['ARG1', 'ARG2']] and n_clusters = [20,50] (OK)

# test different spaCy models --> problem with en_core_web_lg --> discuss with Andrei
# test "regex" --> change handling of multiple matches + problem with is_subsequence()

# test different embedding types --> discuss with Andrei