In [1]:
# Browse list of available datasets

from relatio.datasets import list_datasets

print(list_datasets())

# Load an available dataset

from relatio.datasets import load_trump_data

df = load_trump_data("raw")


    List of available datasets:

    Trump Tweet Archive
    - function call: load_trump_data()
    - format: 'raw', 'split_sentences', 'srl_res'
    - allennlp version: 0.9
    - srl model: srl-model-2018.05.25.tar.gz
    


In [2]:
# Process SRL 

from relatio.preprocessing import *

p = Preprocessor(spacy_model = "en_core_web_lg")

split_sentences = p.split_into_sentences(
    df.iloc[0:100], output_path=None, progress_bar=True
)

# As sentence splitting and SRL is time-consuming, we download the results from the datasets module.

split_sentences = load_trump_data("split_sentences")
srl_res = load_trump_data("srl_res")[0:1000]

Splitting into sentences...










100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 276.55it/s]


In [3]:
import spacy
en = spacy.load('en_core_web_lg')
stopwords = list(en.Defaults.stop_words)

In [4]:
roles, sentence_index = p.extract_roles(
    srl_res, 
    used_roles = ["ARG0","B-V","B-ARGM-NEG","B-ARGM-MOD","ARG1","ARG2"],
    progress_bar = True
)

for d in roles[0:5]:
    print(d)

postproc_roles = p.process_roles(roles, 
                                 remove_punctuation = True,
                                 remove_digits = True,
                                 lowercase = True,
                                 lemmatize = True,
                                 stop_words = stopwords,
                                 dict_of_pos_tags_to_keep = {
                                     "ARG0": ['NOUN', 'PROPN'],
                                     "B-V": ['VERB'],
                                     "ARG1": ['NOUN', 'PROPN'],
                                     "ARG2": ['NOUN', 'PROPN']
                                 }, 
                                 progress_bar = True)

for d in postproc_roles[0:5]:
    print(d)

Extracting semantic roles...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 40007.10it/s]


{'B-V': 'have'}
{'ARG0': 'Republicans and Democrats', 'ARG1': 'our economic problems', 'B-V': 'created'}
{'ARG1': 'I', 'ARG2': 'thrilled to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'was'}
{'ARG1': 'I', 'ARG2': 'to be back in the Great city of Charlotte , North Carolina with thousands of hardworking American Patriots who love our Country , cherish our values , respect our laws , and always put AMERICA FIRST', 'B-V': 'thrilled'}
{'ARG1': 'I', 'ARG2': 'back in the Great city of Charlotte , North Carolina', 'B-V': 'be'}
Cleaning semantic roles...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2088/2088 [00:14<00:00, 142.84it/s]

{'B-V': ''}
{'ARG0': 'republicans democrats', 'ARG1': 'problem', 'B-V': 'create'}
{'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america', 'B-V': ''}
{'ARG1': '', 'ARG2': 'city charlotte north carolina thousand patriots country value law america', 'B-V': 'thrill'}
{'ARG1': '', 'ARG2': 'city charlotte north carolina', 'B-V': ''}





In [6]:
p = Preprocessor(spacy_model = "en_core_web_lg")
named_entities = p.mine_entities(split_sentences[1][0:1000], progress_bar = True)
print(named_entities.most_common(10))

Mining named entities...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 224.57it/s]

[('Pennsylvania', 23), ('Biden', 20), ('BreitbartNews', 19), ('Republicans', 17), ('Joe Biden', 17), ('Georgia', 17), ('Trump', 16), ('Democrats', 13), ('Wisconsin', 10), ('Arizona', 10)]





In [7]:
known_entities = list(set([e[0].lower() for e in list(named_entities.most_common(100))]))

In [18]:
from relatio.narrative_models import *
m = NarrativeModel(model_type = 'deterministic',
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'embeddings',
                   roles_with_embeddings = [['ARG0','ARG1','ARG2']],
                   threshold = 1)    

m.train(postproc_roles)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = True)
narratives[0:10]

No training required: the model is deterministic.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2088/2088 [00:04<00:00, 446.50it/s]


['',
 'democrats create carney',
 'mike pence',
 'thrill mike pence',
 'north carolina',
 'foxnews',
 'foxnews love state',
 'foxnews cherish mark',
 'foxnews respect court',
 'foxnews america']

In [32]:
from relatio.narrative_models import *
m = NarrativeModel(model_type = 'static',
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'embeddings', # regex breaks down here
                   roles_with_embeddings = [['ARG0'],['ARG1','ARG2']], # [['ARG0','ARG1','ARG2']]
                   embeddings_model = embeddings_model,
                   threshold = 0.1,
                   n_clusters = [10,20]) # [100]    

m.train(postproc_roles, progress_bar = True, verbose = 0)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = True)
narratives[0:10]

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [22]:
from relatio.narrative_models import *
m = NarrativeModel(model_type = 'dynamic',
                   roles_considered = ['ARG0', 'B-V', 'B-ARGM-NEG', 'ARGM-MOD', 'ARG1', 'ARG2'],
                   roles_with_entities = ['ARG0','ARG1','ARG2'],
                   list_of_known_entities = known_entities,
                   assignment_to_known_entities = 'regex',
                   roles_with_embeddings = [['ARG0','ARG1','ARG2']],
                   threshold = 1)    

m.train(postproc_roles, progress_bar = True,)
narratives = m.predict(postproc_roles, progress_bar = True, prettify = True)
narratives[0:10]



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2088/2088 [00:03<00:00, 635.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2088/2088 [00:03<00:00, 649.62it/s]


['',
 'republicans create election',
 'america',
 'thrill america',
 'north carolina',
 'people',
 'people love election',
 'people cherish election',
 'people respect election',
 'people america']

In [11]:
from collections import Counter
temp = [n for n in narratives if len(n.split()) > 3]
temp = Counter(temp)
temp.most_common(100)

[('joe biden witch hunt', 4),
 ('witch hunt clark county', 3),
 ('clark county clark county', 2),
 ('nate simington clark county', 2),
 ('the fake news media', 2),
 ('wish radical left american', 2),
 ('witch hunt fox news', 2),
 ('new york witch hunt', 2),
 ('joe biden clark county', 2),
 ('clark county witch hunt', 2),
 ('joe biden mike pence', 2),
 ('joe biden new york', 2),
 ('not end witch hunt', 2),
 ('mike pence not allow carney', 2),
 ('fox news joe biden', 2),
 ('not want clark county', 2),
 ('witch hunt witch hunt', 1),
 ('end clark county state', 1),
 ('sudan agree clark county', 1),
 ('pennsylvania prevent clark county', 1),
 ('clark county join mike', 1),
 ('democrats not want clark county', 1),
 ('dominion end mike pence', 1),
 ('clark county reject state', 1),
 ('come mike pence joe biden', 1),
 ('witch hunt not work', 1),
 ('inundate washington radical left', 1),
 ('republicans not want radical left', 1),
 ('radical left steal clark county', 1),
 ('carney find fox news'

In [12]:
m._model_obj.labels

[{0: 'america',
  1: 'joe biden',
  2: 'corruption',
  3: 'republican national committee rnc',
  4: 'vote',
  5: 'patriots',
  6: 'signature',
  7: 'andrew mccabe',
  8: 'people',
  9: 'wisconsin',
  10: 'pennsylvania',
  11: 'ballot',
  12: 'fraud intervention',
  13: 'case',
  14: 'country',
  15: 'voter fraud information',
  16: 'arizona',
  17: 'border',
  18: 'point wisconsin',
  19: 'us_fda democrats',
  20: 'city charlotte north carolina',
  21: 'state sponsors terrorism list',
  22: 'election process',
  23: 'progress',
  24: 'inspection election equipment container election',
  25: 'dominion',
  26: 'tony',
  27: 'gsa dems',
  28: 'fake news medium',
  29: 'china virus',
  30: 'integrity elections united states america',
  31: 'product',
  32: 'woman',
  33: 'team',
  34: 'covid',
  35: 'mask amp',
  36: 'school',
  37: 'daytime',
  38: 'approval',
  39: 'secretarysonny',
  40: 'emergency power',
  41: 'thing respect election philadelphia',
  42: 'sudan',
  43: 'article',
  44

In [None]:
# Add user-written functions for the preprocessor
# Add graphs
# Add verbs
# Add model validation (this is really important /!\)

In [None]:
# Differences with the previous wrapper
# save stuff in a folder during training phase (output_path)
# handling of verbs (dimension_reduce_verbs)
# fit multiple kmeans models (n_clusters as a list of lists) --> could prove useful for model validation
# document tracking (doc, sentence, statement, narrative)

In [None]:
# test "dynamic" (OK)
# test "deterministic" (OK)
# test differentiated clusters roles_with_embeddings = [['ARG0'], ['ARG1', 'ARG2']] and n_clusters = [20,50] (OK)

# test different spacy models --> problem with en_core_web_lg (see with Andrei)
# test "regex" --> change handling of multiple matches + problem with is_subsequence 

# test different embedding types --> see with Andrei