In [None]:
import pandas as pd
import re
import json
from bs4 import BeautifulSoup
import spacy
import requests
import ast
import os

In [None]:
def _build_annotation_rules():
    """Build and compile the regex rules used by ``annotate_text``.

    Returns:
        list[tuple[str, re.Pattern, int]]: ``(label, compiled_pattern, group)``
        triples in the order in which they are applied. ``group`` is the
        capture group whose span is reported as the label span (0 = whole
        match).
    """
    # All regex fragments are raw strings so that "\d", "\s", "\." reach the
    # regex engine unchanged without relying on invalid string escapes.
    pattern_month_name = "January|February|March|April|May|June|July|August|September|October|November|December"
    # "May" needs no short form; "Jun"/"Jul" are included so abbreviated
    # summer dates such as "12 Jun 2020" are recognised like the other months.
    pattern_month_name_short = r"Jan\.{0,1}|Feb\.{0,1}|Mar\.{0,1}|Apr\.{0,1}|Jun\.{0,1}|Jul\.{0,1}|Aug\.{0,1}|Sep\.{0,1}|Oct\.{0,1}|Nov\.{0,1}|Dec\.{0,1}"

    months = pattern_month_name + "|" + pattern_month_name_short
    month = "(" + months + ")"

    pattern_month_number = r"\d{1,2}"  # "[0-1]?[0-9]"
    days = r"\d{1,2}"  # "[0-3]?[0-9]"
    year = r"\d{2,4}"

    # Separator between day and year in "Month D, YYYY". Accepts "D, YYYY",
    # "D ,YYYY" and "D,YYYY" (the previous pattern only accepted "D ,YYYY").
    day_year_sep = r"\s?,\s?"

    pattern_virus = r"(rabies|rabies virus)"

    # Catches just XX.XX %
    pattern_mort = r"\s((mortality rate|CFR|Case Fatality Rate|IFR|infection fatality rate|fatality rate|death rate)\s.{,40}((below|above)?(?<!\S)(?=.)([0-9]+|([1-9](\d*|\d{0,2}(\.\d{3})*)))?(\.\d*[1-9])?(\sper\scent|\s{,1}%\s{,1}))[^–].{,40})"

    # Mortality X to or - or – Y (%)
    pattern_mort2 = r"\s((mortality rate|CFR|Case Fatality Rate|IFR|infection fatality rate|fatality rate|death rate)\s.{,40}((below|above)?(?<!\S)([0-9]{,2}|([1-9](\d{1}|\d{0,1}(\.\d{3})*)))?(\.\d*[1-9])?(\sto\s|-|–)([0-9]{,2})?(\.\d*)?(%)?).{,40})"

    # XX.XX (per|/ 100,00)
    pattern_inc = r"\s((incidence|incidence rate)\s.{,40}((?<!\S)([0-9]+|([1-9](\d*|\d{0,2}(\.\d{3})*)))?(\.\d*[1-9])?(?!\S)(/(\d{1,3},\d{1,3}|\d{4})|\sper\s(\d{1,3},\d{1,3}|\d{4}))?).{,40})"

    # below|above X
    pattern_inc2 = r"\s((incidence|incidence rate)\s.{,40}((below\s|above\s)(?<!\S)([0-9]+|([1-9](\d*|\d{0,2}(\.\d{3})*)))?(\.\d*[1-9])?(?!\S)(/(\d{1,3},\d{1,3}|\d{4})|\sper\s(\d{1,3},\d{1,3}|\d{4}))?).{,40})"

    # XX,XX and|to|- XX,XX (per 100,000)
    pattern_inc3 = r"\s((incidence|incidence rate)\s.{,40}((?<!\S)([0-9]+|([1-9](\d*|\d{0,2}(\.\d{3})*)))?(\.\d*[1-9])?(?!\S)((\s-\s|\sto\s|\sand\s)(?<!\S)([0-9]+|([1-9](\d*|\d{0,2}(\.\d{3})*)))?(\.\d*[1-9])?(?!\S))(/(\d{1,3},\d{1,3}|\d{4})|\sper\s(\d{1,3},\d{1,3}|\d{4}))?).{,40})"

    pattern_hospi = r"\s((hospitality rate|hospitalization rate)\s.{,40}((below|above)?(?<!\S)(?=.)([0-9]+|([1-9](\d*|\d{0,2}(\.\d{3})*)))?(\.\d*[1-9])?%{0,1}(?!\S)(/(\d{1,3},\d{1,3}|\d{4})|\sper\s(\d{1,3},\d{1,3}|\d{4})|\sof\s(\d{1,3},\d{1,3}|\d{4}))?).{,40})"

    pattern_r = r"\s((r-value|r\svalue|reproduction value|\(Rt\)|reproduction number \(Rt\)|Rt|reproduction\srate|r\(0\)\svalue|r0\svalue|r-\(0\)\svalue|r0-value)\s.{,40}((below|above)?(?<!\S)(?=.)([0-9]+|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S))[^%°]{,40})"

    pattern_r2 = r"\s((r-value|r\svalue|reproduction value|\(Rt\)|reproduction number \(Rt\)|Rt|reproduction\srate|r\(0\)\svalue|r0\svalue|r-\(0\)\svalue|r0-value)\s.{,40}((below|above)?(?<!\S)(?=.)([0-9]+|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)\s{0,1}(and|-)\s{0,1}(?<!\S)(?=.)([0-9]+|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S))[^%°]{,40})"

    pattern_host_1 = r"domestic_dog|canis lupus familiaris|jackal|Canis adustus and C\. mesomelas|mongoose|Herpestes spp|red fox|Vulpes vulpes|ferret badger|Melogale moschata|golden jackals|Canis aureus|raccoon dog|Nyctereutes procyonoides|raccoon|Procyon lotor|grey fox|Urocyon cinereoargenteus|striped skunk|Mephitis mephitis|coyote|Canis latrans|crab\-eating fox|Cerdocyon thous|marmoset|Callithrix jacchus|small Indian mongoose|Herpestes auropunctatus|arctic fox|Alopex lagopus"

    # NOTE(review): "transmited", "transfered" and "spreaded" are misspelled
    # and "domestic_dog" contains an underscore; the strings are kept as-is
    # because the expected input text is not visible here - confirm intent.
    pattern_host_2 = "transmited by|transfered by|infected by|spreaded by"

    pattern_host = "(" + pattern_host_1 + "|" + pattern_host_2 + ")"

    # 10/10/2020 but also 10/October/2020 but also 10/Oct/2020
    # 10.10.2020 but also 10.October.2020 but also 10.Oct.2020
    # 10-10-2020 but also 10-October-2020 but also 10-Oct-2020
    # 10 10 2020 but also 10 October 2020 but also 10 Oct 2020
    date_slash = r"(\d{2,4}/(\d{1,2}|" + months + r")/\d{2,4})"
    date_dot = r"(\d{2,4}\.(\d{1,2}|" + months + r")\.\d{2,4})"
    date_hyphen = r"(\d{2,4}-(\d{1,2}|" + months + r")-\d{2,4})"
    date_whitespace = r"(\d{2,4}\s(\d{1,2}|" + months + r")\s\d{2,4})"

    # 10/10/2020 but also October/10/2020 but also Oct/10/2020
    # 10.10.2020 but also October.10.2020 but also Oct.10.2020
    # 10-10-2020 but also October-10-2020 but also Oct-10-2020
    # 10 10 2020 but also October 10 2020 but also Oct 10 2020
    date_slash_na = r"((\d{2,4}|" + months + r")/\d{1,2}/\d{2,4})"
    date_dot_na = r"((\d{2,4}|" + months + r")\.\d{1,2}\.\d{2,4})"
    date_hyphen_na = r"((\d{2,4}|" + months + r")-\d{1,2}-\d{2,4})"
    date_whitespace_na = r"((\d{2,4}|" + months + r")\s\d{1,2}\s\d{2,4})"

    month_day_year = "(" + month + r"\s" + days + day_year_sep + year + ")"
    month_year = "(" + month + r"\s" + year + ")"

    # Order matters: alternatives are tried left to right.
    date = "|".join([
        month_day_year, month_year,
        date_slash, date_slash_na,
        date_dot, date_dot_na,
        date_hyphen, date_hyphen_na,
        date_whitespace, date_whitespace_na,
    ])

    # range1 = December 14, 2020-January 18, 2021
    range1 = "(" + month + r"\s" + days + r",\s" + year + "-" + month + r"\s" + days + r",\s" + year + ")"

    # range2 = January 2020 to 24 February 2020
    range2 = "(" + month + r"\s" + year + r"\sto\s" + month + r"\s" + year + ")"

    # range3 = Between February 27 and July 22, 2021
    range3 = "(" + "Between" + r"\s" + month + r"\s" + days + r"\sand\s" + month + r"\s" + days + r",\s" + year + ")"

    # range4 = between 1 April 2021 and 30 April 2021
    range4 = "(" + "Between" + r"\s" + days + r"\s" + month + r"\s" + year + r"\sand\s" + days + r"\s" + month + r"\s" + year + ")"

    # range5 = 02 March to 30 May 2021
    range5 = "(" + days + r"\s" + month + r"\sto\s" + days + r"\s" + month + r"\s" + year + ")"

    # range6 = between 29/01/2021–23/02/2021
    range6 = "(" + "Between" + r"\s" + days + "/" + pattern_month_number + "/" + year + "-" + days + "/" + pattern_month_number + "/" + year + ")"

    # range7 = Q1-Q4 2020
    range7 = "(" + r"Q[1-4]-Q[1-4]\s" + year + ")"

    # range8 = 1 January 2020 to 31 May 2020
    range8 = "(" + days + r"\s" + month + r"\s" + year + r"\sto\s" + days + r"\s" + month + r"\s" + year + ")"

    # range9 = from March 2020 to June 2021
    range9 = "(" + "From" + r"\s" + month + r"\sto\s" + month + r"\s" + year + ")"

    # range10 = between 8 and 27 May 2020
    range10 = "(" + "Between" + r"\s" + days + r"\sand\s" + days + r"\s" + month + r"\s" + year + ")"

    # range11 = Between the end of January until the end of February 2021
    range11 = "(" + "Between the end of" + r"\s" + month + r"\suntil the end of\s" + month + r"\s" + year + ")"

    # range12 = beginning of 2021
    range12 = "(" + "Beginning" + r"\sof\s" + year + ")"

    # range13 = From October 2020 onwards
    range13 = "(" + r"From\s" + month + r"\s" + year + r"\sonwards" + ")"

    # range14 = Up to April 2021
    range14 = "(" + r"Up to\s" + month + r"\s" + year + ")"

    # range15 = from January 1, 2020 to July 19, 2021
    range15 = "(" + "From" + r"\s" + month + r"\s" + days + r",\s" + year + r"\sto\s" + month + r"\s" + days + r",\s" + year + ")"

    # range16 = February 4, 2021 through February 22, 2021
    range16 = "(" + month + r"\s" + days + day_year_sep + year + r"\sthrough\s" + month + r"\s" + days + day_year_sep + year + ")"

    range_pattern = "(" + "|".join([
        range1, range2, range3, range4, range5, range6, range7, range8,
        range9, range10, range11, range12, range13, range14, range15, range16,
    ]) + ")"

    # (label, pattern, capture group reported as the span, ignore_case)
    # Every rule except HOST is matched with re.IGNORECASE; HOST is matched
    # exactly as written.
    rule_specs = [
        ("DATE_2_range", range_pattern, 0, True),
        ("VIRUS", pattern_virus, 0, True),
        ("DATE_2", date, 0, True),
        ("RVALUE", pattern_r, 3, True),
        ("RVALUE", pattern_r2, 3, True),
        ("MORTALITY", pattern_mort, 3, True),
        ("MORTALITY", pattern_mort2, 3, True),
        ("INCIDENCE", pattern_inc, 3, True),
        ("INCIDENCE", pattern_inc2, 3, True),
        ("INCIDENCE", pattern_inc3, 3, True),
        ("HOSPITALIZATION", pattern_hospi, 3, True),
        ("HOST", pattern_host, 0, False),
    ]

    # Compile once instead of re-parsing every pattern for every sentence.
    return [
        (label, re.compile(pattern, re.I if ignore_case else 0), group)
        for label, pattern, group, ignore_case in rule_specs
    ]


def _dates_outside_ranges(date_spans, range_spans):
    """Return the date spans that are not swallowed by a date-range span.

    A date is dropped when some range fully covers it and is strictly larger
    (shares at most one boundary with it); dates with exactly the same span
    as a range are kept.
    """
    kept = []
    for date_span in date_spans:
        d_start, d_end = date_span[0], date_span[1]
        swallowed = any(
            r_span[0] <= d_start and d_end <= r_span[1]
            and (r_span[0], r_span[1]) != (d_start, d_end)
            for r_span in range_spans
        )
        if not swallowed:
            kept.append(date_span)
    return kept


def annotate_text(df):
    """Label sentence windows of every article with regex and spaCy entities.

    Each row's ``text`` is split into sentences on ``". "``; for every
    sentence a window consisting of that sentence plus up to the two
    following sentences is built and searched with all regex rules and with
    the spaCy ``en_core_web_sm`` pipeline (``GPE`` and ``LOC`` entities only).

    Args:
        df: DataFrame with the columns ``text``, ``PMCID``, ``FAU``, ``AU``
            and ``AUID``. Rows whose ``text`` is not a string are skipped.

    Returns:
        list[dict]: one dict per window that contains at least one regex hit
        other than a date or the virus name. Keys: ``text`` (the window),
        ``label`` (list of ``[start, end, label]`` with character offsets
        into ``text``), ``PMCID``, ``Full_author``, ``Author``, ``Author_ID``.
    """
    from tqdm import tqdm

    rules = _build_annotation_rules()

    # set the labels for spaCy
    spacy_labels = ["GPE", "LOC"]

    # load spaCy model
    nlp = spacy.load('en_core_web_sm')

    liste_dicts = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Iterate Dataframe"):
        text = row["text"]
        if not isinstance(text, str):
            continue

        # Read once per row; identical for every window of the article.
        metadata = {
            "PMCID": row["PMCID"],
            "Full_author": row["FAU"],
            "Author": row["AU"],
            "Author_ID": row["AUID"],
        }

        sentences = re.split(r'\.\s+', text)
        for k in range(len(sentences)):
            # The sentence with (up to) the two following sentences; slicing
            # shortens the window automatically at the end of the text.
            window = ". ".join(sentences[k:k + 3]) + "."

            entity_spans = []
            date_spans = []
            range_spans = []
            has_target_label = False

            # NOTE(review): every rule is applied to the whole window. The
            # previous `if re.finditer(pattern, first_sentence)` guard was
            # always true (an iterator is truthy), so it never filtered.
            for label, regex, group in rules:
                for match in regex.finditer(window):
                    start, end = match.span(group)
                    if label == "DATE_2":
                        date_spans.append([start, end, "DATE_2"])
                    elif label == "DATE_2_range":
                        # Ranges are emitted under the unified date label.
                        range_spans.append([start, end, "DATE_2"])
                    else:
                        entity_spans.append([start, end, label])
                        if label != "VIRUS":
                            has_target_label = True

            # append the entities found with spaCy to the label list
            for ent in nlp(window).ents:
                if ent.label_ in spacy_labels:
                    entity_spans.append([ent.start_char, ent.end_char, ent.label_])

            # Drop single dates that collide with a larger date range.
            labels = range_spans + _dates_outside_ranges(date_spans, range_spans) + entity_spans

            # Keep the window only if it has a label besides dates/virus.
            if has_target_label:
                entry = {"text": window, "label": labels}
                entry.update(metadata)
                liste_dicts.append(entry)

    return liste_dicts

In [None]:
# Load the article table. annotate_text reads its "text", "PMCID", "FAU",
# "AU" and "AUID" columns.
# NOTE(review): "/path/to/dataframe" looks like a placeholder - point it at the real CSV.
df = pd.read_csv("/path/to/dataframe")

In [None]:
# Run the regex + spaCy labeling over every row. The result is a list of
# dicts, one per sentence window with at least one non-date, non-virus regex hit.
annotations = annotate_text(df)

In [None]:
def create_folders(folder_list, base_dir="../articles"):
    """Create one sub-directory per entry of ``folder_list`` below ``base_dir``.

    Args:
        folder_list: iterable of folder names (relative to ``base_dir``).
        base_dir: parent directory; defaults to ``"../articles"``. It is
            created as well if it does not exist yet.

    Existing directories are left untouched, so the call is safe to repeat.
    """
    for folder in folder_list:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(os.path.join(base_dir, folder), exist_ok=True)

In [None]:
def create_jsonl_file(liste_dicts, filename, output_dir="../labeled_docs"):
    """Write ``liste_dicts`` as a JSON Lines file ``<output_dir>/<filename>.jsonl``.

    Args:
        liste_dicts: iterable of JSON-serialisable dicts; each one becomes
            exactly one line of the file.
        filename: file name without the ``.jsonl`` extension.
        output_dir: target directory; defaults to ``"../labeled_docs"`` and
            is created if missing.

    An existing file of the same name is overwritten.
    """
    # create folder (no error if it already exists)
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, filename + ".jsonl")

    # The context manager closes the file even if serialisation fails.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        # each dict is written into its own line
        for dic in liste_dicts:
            json.dump(dic, output_file)
            output_file.write("\n")

In [None]:
# Write the annotations to ../labeled_docs/labeled_data_with_spacy_1k.jsonl,
# one JSON object per line.
create_jsonl_file(annotations,"labeled_data_with_spacy_1k")