In [1]:
import os
import re
import json
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

import spacy
from spacy.matcher import Matcher

from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

In [79]:
debug = False
articles = {}
stat = { }
for dirpath, subdirs, files in os.walk(r'C:\Users\hugomeyer\Documents\Hackathon\Input\comm_use_subset\comm_use_subset\pdf_json'):
    for x in files:
        if x.endswith(".json"):
            articles[x] = os.path.join(dirpath, x)        
df = pd.read_csv(r'C:\Users\hugomeyer\Documents\Hackathon\Input\metadata.csv')


In [80]:
virus_ref = ['covid-19', 'coronavirus', 'cov-2', 'sars-cov-2', 'sars-cov', 'hcov', '2019-ncov']

season =  [{'Spring': ['spring','flower', 'flowering', 'thawing', 'melting']},
            {'Autumn': ['Autumn','Humid', 'rain']},
            {'Winter': ['Winter', 'cold','solstice', 'polar']},
            {'Summer': ['Summer', 'summery', 'wintry', 'overwinterin','dry','hot']},
            {'Time': ['Time', 'Periodic', 'regular','cylce','year','annual','semester','bimester','calendar','day','night','week']},
            {'Climate': ['weather', 'temperature', 'hot','cold','humid','dry','Moisture','rain','sun','light','wind','rain','snow','monsoon']}]

socials = ['quarantine','gathering','outside','party','bar','restaurant','drinks','running','meeting','park','mobility','cafe','shopping center','theme park','museums','libraries','movie theaters','retail','recreation','beaches','marinas','dog parks','plaza','public garden','train station','subway','bus']
higher_terms = ['over', 'above', 'higher', 'older', '>', 'over', 'less']
lower_terms = ['under', 'below', 'fewer', 'younger', '<', 'under', 'more']

In [81]:
matchers = {    
    "Term Matcher": lambda term: [{'LOWER': t} for t in term.split(' ')],
    "Terms Matcher": lambda terms: [{"LOWER": {"IN": terms } }],
    "Number Suffix Matcher": lambda periods: [
        {'LIKE_NUM': True},
        {"TEXT": {"REGEX": f'({"|".join(periods)})'}}
    ],
    "Number Interval Matcher": lambda periods: [
        {'POS': 'NUM',},
        {'TEXT': {'REGEX': f'({"|".join(periods)})'}, 'OP': '?'},
        {'DEP': 'quantmod', 'OP': '?'},
        {'DEP': 'punct', 'OP': '?'},
        {'DEP': 'prep', 'OP': '?'},
        {'POS': 'NUM'},
        {'TEXT': {'REGEX': f'({"|".join(periods)})'}},
    ],
    "Group Matcher": [
        {"TEXT": {"IN": higher_terms+lower_terms }}
    ]                 
}

In [82]:
def plot_dict(stat, t = 10, sort_values = False, barh = False, width = 20, height = 4, title = ''):
    filtered = dict(stat)
    to_delete = []
    for key in filtered:
        if filtered[key] < t:
            to_delete.append(key)
    for key in to_delete:
        del filtered[key]

    
    if sort_values == False:
        lists = sorted(filtered.items())
    else:
        if sort_values == True:
            lists = sorted(filtered.items(), key = lambda item : item[1])
        else:
            lists = sorted(filtered.items(), key = sort_values)
               
    fig = figure(num=None, figsize=(width, height))
    
    if title != '':
        fig.suptitle(title, fontsize=20)
        
    x, y = zip(*lists) 
    
    if barh == True:
        plt.barh(x, y)
    else:
        plt.bar(x, y)
    plt.show()
    

def merge_keys(mergers, obj):
    result = dict(obj)
    for key, arr in mergers:
        if key not in result:
            result[key] = 0
        for merger in arr:
            if merger in result:
                result[key] = result[key] + result[merger]
                del result[merger]
    return result

def dict_counter(res, arg):
    try:
        key = str(arg)
        res.setdefault(key, 0)
        res[key] = res[key] + 1
    except:
        pass

def numval(val):
    try:
        return int(float(str(val))) 
    except:
        return None
    
def day_value(val, rep = None):
    
    if rep != None:
        val = numval(val.text)
        if val != None and 'week' in rep.text:
            val = val * 7
        return val
    else:
        return None

def report_interval(res, min_val, max_val):       
    if min_val != None and max_val != None:
        for key in range(min_val, max_val):
            res.setdefault(key, 0)
            res[key] = res[key] + 1    

def virus_match(text):
    return len(re.findall(rf'({"|".join(virus_ref)})', text, flags=re.IGNORECASE)) > 0

In [83]:
literature = []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    sha = str(row['sha'])
    if sha != 'nan':
        sha = sha + '.json';
        try:
            found = False
            with open(articles[sha]) as f:
                data = json.load(f)
                #print([item['text'] for item in data['body_text']])
                for key in ['abstract', 'body_text']:
                    if found == False and key in data:
                        for content in data[key]:
                            text = content['text']
                            if virus_match(text) == True:                                
                                literature.append({'file': articles[sha], 'body': text})                                
        except KeyError:
            pass

100%|███████████████████████████████████████████████████████████████████████████| 47298/47298 [02:06<00:00, 375.09it/s]


In [85]:
print(len(literature))

15940


In [90]:
def execute_matches(match_arr, root, sentence, file, index = 0, execution = []):
    key, result = match_arr[0]
    rest = match_arr[1:]
    next_exec = execution + [(key, result, index)]
    if key in root:
        rule = root[key]
        if callable(rule):
            rule( (result, next_exec, sentence, file) )            
        else:
            if 'execute' in rule:
                rule['execute']( (result, next_exec, sentence, file) )
            if len(rest) > 0:
                execute_matches(rest, rule, sentence, file, index+1, next_exec)
    
    if len(rest) > 0:               
        execute_matches(rest, root, sentence, file, index + 1, execution)
        
def merge_dict_values(original, rules, drop = []):
    result = {}
    arr_map = {}
    for key, values in rules:
        for val in values:
            arr_map[val] = key
    
    for key in original.keys():
        new_key = key if key not in arr_map else arr_map[key]        
        if key not in drop and new_key not in drop:
            val = original[key]            
            result[new_key] = val if new_key not in result else result[new_key] + val
            
    return result
    
def merge_matches(matches, doc):
    match_list = []
    current = (None, None, None)
    for match_id, start, end in matches:   
        if match_id != current[0] or current[2] < start:
            if current[0] != None:
                match_list.append(current)
            current = (match_id, start, end)
        elif current[2] < end:
            current = (match_id, current[1], end)
        
    match_list.append(current)
    return match_list;

def match_parser(matcher, doc, rule, file):
    matches = matcher(doc)
    if len(matches)>0:
        to_process = []
        for match_id, start, end in merge_matches(matches, doc):
            string_id = nlp.vocab.strings[match_id]  # Get string representation
            span = doc[start:end]  # The matched span
            to_process.append((string_id, span))
        execute_matches(to_process, rule['root'], doc, file)

def parse_body(matcher, text, rule, file = None, sentence_level = False):
    text = text.lower()
    doc = nlp(text)
    
    if sentence_level == True:    
        for sent in doc.sents:
            sent_doc = nlp(sent.text)
            match_parser(matcher, sent_doc, rule, file)
    else:
        match_parser(matcher, doc, rule, file)

def execute_ruleset(term, rule, sentence_level = False):
    matcher = Matcher(nlp.vocab)
    for name, m in rule["Matchers"]:
        matcher.add(name, None, m)
    print(len(literature))
    for article in tqdm(literature):
#     for article in literature:
        text_list = re.compile("\. ").split(article['body'])
        file = article['file']
        for text in text_list:
            '''
            if callable(term):
                allow = term(text)
            else:
                allow = term == None or term in text
            if allow == True:
                '''
            parse_body(matcher, text, rule, file, sentence_level)        

In [91]:
len(literature)

15940

In [94]:
stat['socials'] = {}

def match(text):
    if virus_match(text) == True:
        return len(re.findall(rf'\ ({"|".join(symptoms)})\ ', text)) > 0
    else:
        return False

    
def social(res):
    ref, agregate, sentence, file = res
    dict_counter(stat['socials'], ref.text)
        


In [97]:
rule_social = {    
    "Matchers": [      
       ("Social Reference", matchers['Terms Matcher'](socials)),
    ],
    "root": {
        "Social Reference": social
    }
}


def social_match(text):
    return len(re.findall(r'social', text)) > 0

execute_ruleset(symptom_match, rule)
#plot_dict(stat['socials'], 50, True, title = "Social")

  0%|                                                                                        | 0/15940 [00:00<?, ?it/s]

15940


  0%|                                                                                | 1/15940 [00:00<53:31,  4.96it/s]


KeyError: 'symptoms'