In [1]:
import pandas as pd
import os
import re
import hashlib
import json
from baseline_entity_extraction import baseline_extract_all, load_taxa_data
import spacy

  return torch._C._cuda_getDeviceCount() > 0


# Loading and preprocessing data from sentences_nlp file

In [2]:
def clean_words(words):
    '''
    Perform basic preprocessing on individual words
    '''
    clean_words = []
    for i, w in enumerate(words):
        if w == '.':
            if len(clean_words)>0:
                clean_words[-1] += '.'
        else:
            clean_words.append(w.strip())
    return clean_words

In [3]:
def get_journal_articles(sentences_path):
    """
    Loads and formats sentences_nlp352 json file and converts to a dataframe

    Parameters
    ----------
    path : string
        Path where the individual sentences are stored.

    Returns
    -------
    journal_articles: pd.DataFrame
        pd.DataFrame with cleaned individual sentences for all articles 
    """
    journal_articles = pd.read_csv(sentences_path, 
                                sep='\t',
                                names = ['gddid', 
                                        'sentid',
                                        'wordidx',
                                        'words',
                                        'part_of_speech',
                                        'special_class',
                                        'lemmas',
                                        'word_type',
                                        'word_modified'], 
                                usecols = ['gddid', 'sentid', 'words'])

    journal_articles = journal_articles.replace('"', '', regex = True)\
                                .replace(',--,', '-', regex = True)\
                                .replace('.,/,', '. / ', regex = True)\
                                .replace('\{', '', regex = True)\
                                .replace('}', '', regex = True)\
                                .replace(r'\W{4,}', '', regex=True)\
                                .replace(',,,', 'comma_sym', regex=True)\
                                .replace(',', ' ', regex=True)\
                                .replace('comma_sym', ', ', regex=True)\
                                .replace('-LRB-', '(', regex=True)\
                                .replace('-LSB-', '[', regex=True)\
                                .replace('LRB', '(', regex=True)\
                                .replace('LSB', '[', regex=True)\
                                .replace('-RRB-', ')', regex=True)\
                                .replace('-RSB-', ']', regex=True)\
                                .replace('RRB', ')', regex=True)\
                                .replace('RSB', ']', regex=True)\
                                .replace('-RRB', ')', regex=True)\
                                .replace('-RSB', ']', regex=True)
    
    journal_articles['words']= journal_articles['words'].str.split(" ")
    journal_articles['words'] = journal_articles['words'].apply(clean_words)
    journal_articles['sentence'] = journal_articles['words'].apply(lambda x: ' '.join(map(str, x)))
    
    return journal_articles

# Loading and processing data from bibliography file

In [4]:
def preprocessed_bibliography(path):
    """
    Loads and formats bibliography json file and converts to a dataframe

    Parameters
    ----------
    path : string
        Path where the bibliography database is stored.

    Returns
    -------
    bibliography: pd.DataFrame
        pd.DataFrame with GDD ID and the Digital Object Identifier.
    """
    with open(path, 'r') as f:
        bib_dict = json.load(f)
    
    gdd = []
    doi = []
    
    for article in bib_dict:
        gdd.append(article['_gddid'])
        if "identifier" not in article:
            doi.append("")
        else:
            for iden in article['identifier']:
                if iden['type'] == "doi":
                    doi.append(iden['id'])
    
    return pd.DataFrame({"doi": doi,
                         "gddid": gdd})

In [5]:
journal_articles = get_journal_articles('../data/sentences_nlp352')

In [6]:
journal_articles.head()

Unnamed: 0,gddid,sentid,words,sentence
0,54b4325de138239d8684d7e0,1,"[LES, DISTORSIONS, DE, L'ENREGISTREMENT, POLLI...",LES DISTORSIONS DE L'ENREGISTREMENT POLLINIQUE...
1,54b4325de138239d8684d7e0,2,"[(, de, -)PONEL, PhREILLE, M., 1997.]",( de -)PONEL PhREILLE M. 1997.
2,54b4325de138239d8684d7e0,3,"[Les, distorsions, de, l'enregistrement, polli...",Les distorsions de l'enregistrement pollinique...
3,54b4325de138239d8684d7e0,4,"[[, Distorsions, in, the, pollen, record, of, ...",[ Distorsions in the pollen record of vegetati...
4,54b4325de138239d8684d7e0,5,"[GEOBIOS,, M.S., n, °, 21, :, 195-202.]","GEOBIOS, M.S. n ° 21 : 195-202."


In [7]:
# import bibliography to get the DOIs
bib_df = preprocessed_bibliography("../data/bibjson")

In [8]:
bib_df.head()

Unnamed: 0,doi,gddid
0,10.1016/j.palaeo.2004.07.027,54b4324ae138239d8684a3c1
1,10.1016/0277-3791(92)90024-3,5504acd5e1382326932d887d
2,10.1016/j.epsl.2010.01.007,54fccea7e138239936c6de88
3,10.2307/3515451,571062c9cf58f1419caa214d
4,10.1111/j.1502-3885.1992.tb00030.x,56c16f4acf58f15c72c8ff9a


In [9]:
full_text = journal_articles.groupby("gddid")['sentence'].agg(lambda x: ' '.join(x)).reset_index()
full_text.head()

Unnamed: 0,gddid,sentence
0,54b43244e138239d8684933b,"Palaeogeography, Palaeoclimatology, Palaeoecol..."
1,54b43245e138239d8684949c,"Palaeogeography, Palaeoclimatology, Palaeoecol..."
2,54b43245e138239d86849568,"Palaeogeography, Palaeoclimatology, Palaeoecol..."
3,54b43246e138239d868497cd,"Palaeogeography, Palaeoclimatology, Palaeoecol..."
4,54b43246e138239d86849975,"Palaeogeography, Palaeoclimatology, Palaeoecol..."


# Merging 2 dataframe to get DOI-GDD relation

In [10]:
data = full_text.merge(bib_df, on ='gddid')
data.head()

Unnamed: 0,gddid,sentence,doi
0,54b43244e138239d8684933b,"Palaeogeography, Palaeoclimatology, Palaeoecol...",10.1016/0031-0182(77)90040-2
1,54b43245e138239d8684949c,"Palaeogeography, Palaeoclimatology, Palaeoecol...",10.1016/0031-0182(84)90010-5
2,54b43245e138239d86849568,"Palaeogeography, Palaeoclimatology, Palaeoecol...",10.1016/0031-0182(83)90024-X
3,54b43246e138239d868497cd,"Palaeogeography, Palaeoclimatology, Palaeoecol...",10.1016/0031-0182(89)90008-4
4,54b43246e138239d86849975,"Palaeogeography, Palaeoclimatology, Palaeoecol...",10.1016/0031-0182(92)90137-T


# Writing the RAW text files

In [11]:
for row in full_text.iterrows():
    with open(f"../data/raw/" + row[1]['gddid'] + '.txt', 'w') as f:
        f.write(row[1]['sentence'])

# Splitting training files by sections

In [12]:
# section_pattern = r"[1-9]\.[1-9][0-9]\.? [A-Z][a-zA-Z]{3,}"
patterns = ["Introduction", "Abstract", "Material And Method", "Site Description", "Interpretation", "Results", "Background", "Discussion", "Objectives", "Conclusion"]
endwords = ["Acknowledgement", "Reference"]

In [13]:
nlp = spacy.load("en_core_web_lg")    
taxa, all_taxa_words = load_taxa_data()

In [17]:
def return_json(chunk,
                chunk_local,
                chunk_global,
                chunk_subsection,
                gdd,
                doi):
    # Return the JSON for 1 training file
    # Get all the labels
    training_json = {
        "data": {
            "text": chunk,
            "subsection": chunk_subsection,
            "global_index": chunk_global,
            "local_index": chunk_local,
            "gdd_id": gdd,
            "doi": doi
        },
        "annotations": [{
            "model_version": "pre-labeling",
            "result": []
        }]
    }
    labels = baseline_extract_all(chunk, taxa, all_taxa_words, nlp)
    entities = []
    for label in labels:
        # print(label)
        entities.append({
            "value": {
                "start": label['start'],
                "end": label['start'],
                "text": label['text'],
                "labels": label['labels']
            }}
        )
    training_json['annotations'][0]['result'] = entities

    return training_json

def get_hash(text):
    # Returns a hash key that is used to name the file
    return hashlib.shake_128(text.encode('utf-8')).hexdigest(4)


In [18]:
prefix = "../data/raw"
char_len = 4500    # If a section is very long, each chunk will be approximately char_len in length
min_len = 1500      # If a section is very small (smaller than min_len), then it will be combined with the next section
files = os.listdir(prefix)
tot = 0

for fin in files:
    chunks = []
    chunk_subsection = []
    chunk_local = []
    section_names = []
    indices = []
    local = 0
    subsection = None
        
    with open(f"{prefix}/{fin}", 'r') as f:
        print(fin)
        article = f.readlines()[0]
        sections = []
        # # Look for a sections
        # matches = re.finditer(section_pattern, article)
        # for match in matches:
        #     indices.append(match.start())
        #     section_names.append(match.group())
        
        # # Divide the article into sections
        # # The length of indices is rarely greater than 0, hence we can remove this part
        # if len(indices) > 0:
        #     fin_indices = [0, indices[0]]
        #     last = 0 # Index of the last index added 
            
        #     # Add indices if they are far away
        #     for i in range(1, len(indices)):
        #         if indices[i] - indices[last] < char_len:
        #             continue
        #         else:
        #             fin_indices.append(indices[i])
        #             last = i

        #     for i in range(len(fin_indices)-1):
        #         sections.append(article[fin_indices[i] : fin_indices[i+1]])
                
        #     sections.append(article[fin_indices[-1] : ])
            
        # else:
        #     # The entire article is considered as a single section
        #     sections.append(article)
        #     section_names.append("")
        sections.append(article)
        section_names.append("")
        
        '''
        The above section is useful only to split up extremely long articles. 
        It usually does not find a section in smaller articles and can hence be removed.
        '''
        
        for num, sec in enumerate(sections):
            local = 0
            # If the section length is small, add the entire section as a chunk
            if len(sec) < char_len:
                chunks.append(sec)
                chunk_subsection.append(sec.split(" ")[0])
                chunk_local.append(local)
            else:
                # If it is very long, then split the section based on headers
                cur_para = ""        
                sentences = sec.split('. ')
                
                for si, sent in enumerate(sentences):
                    
                    if subsection == None:
                        subsection = sent.split(" ")[0]
                        
                    # If the paragraph is long enough, add it as a chunk
                    # If the next sentence is very long, add the current paragraph as a chunk and reset cur_para
                    if len(cur_para) > char_len or len(sent) > char_len:
                        chunks.append(cur_para.strip())
                        chunk_subsection.append(subsection)
                        chunk_local.append(local)
                        cur_para = ""
                        local += 1
                    
                    check=True
                    for pat in patterns:
                        # If there is a pattern present:
                        # then everything before the pattern goes in the current para
                        # Text starting from the pattern goes in the next para after
                        if pat in sent or pat.upper() in sent:
                            try:
                                index = sent.index(pat)
                            except ValueError:
                                index = sent.index(pat.upper())
            
                            cur_para += sent[:index]
                            
                            if len(cur_para) > min_len:
                                chunks.append(cur_para.strip())
                                chunk_subsection.append(subsection)
                                chunk_local.append(local)
                                subsection = pat
                                local = 0
                                cur_para = sent[index: ] + '. '
                            else:
                                cur_para += sent[index: ] + '. '
                                
                            check = False
                            break
                            
                    end=False
                    # Check if there is an endwords
                    for pat in endwords:
                        if pat in sent or pat.upper() in sent:
                            end=True
                            break
                    if end:
                        break
                    
                    # If no pattern or ending condition is present in the current sentence, then add it to the current para
                    if check:
                        cur_para += sent + ". "
                if len(cur_para) > 0:
                    chunks.append(cur_para.strip())
                    chunk_subsection.append(subsection)
                    chunk_local.append(local)
    
    # Writing files
    for i, chunk in enumerate(chunks):
        gdd = fin.split('.')[0]
        doi = data[data['gddid'] == gdd].iloc[0]['doi']
        filename = get_hash(chunk)
        
        with open(f"../data/train_files_json/{filename}.json",'w') as fout:
            json_chunk = return_json(chunk, 
                                    chunk_local[i],
                                    i,
                                    chunk_subsection[i],
                                    gdd,
                                    doi)
            json.dump(json_chunk, fout)
        # with open(f"../data/train_files_txt/{gdd}_{i}_{chunk_subsection[i]}_{chunk_local[i]}.txt",'w') as fout:
        #     fout.write(chunk)

55c7e851cf58f1a8110ba2e3.txt
5697ec53cf58f1143ae00811.txt
54b43269e138239d8684f8b5.txt
573bbd6acf58f151fc3e56b7.txt
557af350e1382390b43c7494.txt
54b43248e138239d86849e09.txt
5506a7cde1382326932d9244.txt
5501e1b5e1382326932d7436.txt
54b43265e138239d8684ee5f.txt
54b4325ce138239d8684d5ae.txt
54b4326de138239d8685034c.txt
56818c02cf58f1ba274d4652.txt
5724521fcf58f1bc023df2d2.txt
54b43283e138239d86854158.txt
54e86a77e138237cc9150d9a.txt
54b4325de138239d8684d700.txt
54b4326be138239d86850036.txt
55070990e1382326932d93c8.txt
5507ac25e1382326932d9671.txt
54b43267e138239d8684f39e.txt
562cbce7cf58f10e5cb76e65.txt
570fc4aacf58f109a30a7509.txt
55044f61e1382326932d85d6.txt
54b43269e138239d8684f895.txt
54b43269e138239d8684f9c0.txt
5504ed13e1382326932d8a51.txt
5746564ccf58f1698be7f7c1.txt
54b4326ee138239d8685063a.txt
54b4324ee138239d8684ae59.txt
557c8867e1382390b43c7ba9.txt
56c160f5cf58f15c72c8fe73.txt
56f90dd1cf58f179466c7159.txt
54b43250e138239d8684b23a.txt
570fbe4acf58f10725bd2dc7.txt
54b4326de13823