In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset from it.
# NOTE(review): force_remount=True presumably re-mounts even when a mount already
# exists -- confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import spacy
import pandas as pd
import os, re, json
from spacy.training import offsets_to_biluo_tags
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.tokens import DocBin

In [None]:
!python -m pip install --upgrade --user pandas





In [None]:
!pip show pandas

Name: pandas
Version: 1.1.5
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: 
License: BSD
Location: c:\users\ishan_borker\appdata\roaming\python\python37\site-packages
Requires: numpy, python-dateutil, pytz
Required-by: nlp, odo, seaborn


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [None]:
# Alternative kept for reference: load the transformer pipeline with the
# "ner" and "tagger" components disabled.
# nlp = spacy.load("en_core_web_trf", disable=["ner", "tagger", ])
# A blank English pipeline is used instead; it is only called below to turn raw
# text into Doc objects on which character-offset spans are created.
nlp = spacy.blank("en")

In [None]:
def remove_trail(fname):
    """Return ``fname`` without its final file extension.

    Only the last extension is removed, so names that themselves contain dots
    (e.g. ``"trial.v2_inc.ann"`` -> ``"trial.v2_inc"``) keep their full stem and
    can be turned back into the matching ``.ann`` / ``.txt`` file names.
    A name without any extension is returned unchanged.
    """
    # splitext splits on the last dot only; splitting on every "." and keeping
    # the first piece would truncate dotted stems.
    return os.path.splitext(fname)[0]

def sanitize_txt_file(ehr_lines):
    """Normalise the whitespace around every line of a text blob.

    The input is split on ``"\\n"``, each piece has its leading and trailing
    whitespace removed, and the pieces are re-joined with a space followed by
    a newline.
    """
    trimmed_lines = map(str.strip, ehr_lines.split("\n"))
    return " \n".join(trimmed_lines)

In [None]:
# --- Configuration ---------------------------------------------------------
# When True, keep only rows whose `hrem_idx_start` is null and drop the two
# `hrem_idx_*` columns.
# NOTE(review): presumably these columns mark discontinuous head spans -- confirm.
FILTER_DISCONTINOUS = True
# When True, keep only rows whose head entity type is listed in TOP_N_ENTS.
PROCESS_TOPN_ONLY = True
TOP_N_ENTS = ["Condition", "Scope", "Qualifier", "Value", "Drug", "Procedure", "Measurement", "Temporal", "Observation", "Person"]


# --- Load and filter the relation table --------------------------------------
df = pd.read_csv("/content/drive/MyDrive/chia_clinical_trials/chia_rel_dataset.tsv", sep="\t", header=0)

if FILTER_DISCONTINOUS:
    # Filter and drop in one expression so a fresh frame is produced instead of
    # mutating a slice of the original in place.
    df = df[df['hrem_idx_start'].isnull()].drop(columns=['hrem_idx_start', 'hrem_idx_end'])

if PROCESS_TOPN_ONLY:
    df = df[df['head_ent_type'].isin(TOP_N_ENTS)]

# Upper-case the labels. `.str.upper()` leaves missing values as NaN instead of
# raising, and `assign` returns a new frame, which avoids writing into a
# filtered view (SettingWithCopyWarning).
df = df.assign(head_ent_type=df['head_ent_type'].str.upper())

# NOTE(review): a call to apply_txt_patch() was disabled here; its definition is
# not visible in this notebook.
# apply_txt_patch()

# Unique annotation file names with their extension removed; rows without a
# source file are ignored.
ehr_list = list(map(remove_trail, df["source"].dropna().unique()))


In [None]:
%%time
processed_ner_data = []
failed_ner_data = []
file_count = 0
filtered_spans_count = 0

for ehr_file in ehr_list:
    with open(f"chia_txt/{ehr_file}.txt", 'r') as f:
        con_file_name = f"{ehr_file}.ann"
        ehr_df = df[df["source"]==con_file_name]
        ents = []
        try:
            ehr_lines = f.read()
            doc = nlp(sanitize_txt_file(ehr_lines))
            for index, row in ehr_df.iterrows():
                ents.append(doc.char_span(row["hspan_start"], row["hspan_end"], label=row["head_ent_type"]))
                
                
            #first find where we span failed due to the index error
            failed_spans_idx = [i for i,v in enumerate(ents) if v == None]
            if failed_spans_idx:
                failed_ner_data.append(ehr_df.iloc[failed_spans_idx, :])
                
            #filter none ents
            ents = list(filter(None, ents))
            filtered_spans = filter_spans(ents)
            filtered_spans_count = filtered_spans_count + len(ents) - len(filtered_spans)
            
            doc.set_ents(filtered_spans)
            processed_ner_data.append(doc)
            file_count += 1
        except Exception as e:
            print(e)

print(f"succesfully processed {file_count} files..")

'charmap' codec can't decode byte 0x81 in position 170: character maps to <undefined>
succesfully processed 1885 files..
Wall time: 13.9 s


In [None]:
# Total number of non-None spans that filter_spans discarded across all processed files.
filtered_spans_count

4651

In [None]:
# Labels of every entity that ended up on a processed Doc.
training_data = [ent.label_ for doc in processed_ner_data for ent in doc.ents]
# Rows in the relation table that carry a head entity type (missing values excluded).
total_training_ents = int(df['head_ent_type'].notna().sum())
# Entities actually set on the Docs; a direct length also works when the list is empty.
total_label_ents = len(training_data)

print("total missed ents..", total_training_ents - total_label_ents)

total missed ents.. 7155


In [None]:
# Combine the per-file frames of rows whose spans could not be created.
# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns as `df` when every span was created successfully.
failed_df = pd.concat(failed_ner_data) if failed_ner_data else df.iloc[0:0]
failed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2480 entries, 57 to 20734
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rel_type        2480 non-null   object 
 1   head_ent        2480 non-null   object 
 2   tail_ent        2414 non-null   object 
 3   head_ent_span   2480 non-null   object 
 4   head_ent_txt    2480 non-null   object 
 5   tail_ent_span   2414 non-null   object 
 6   tail_ent_txt    2414 non-null   object 
 7   head_ent_type   2480 non-null   object 
 8   hspan_start     2480 non-null   float64
 9   hspan_end       2480 non-null   float64
 10  tail_ent_type   2414 non-null   object 
 11  tspan_start     2414 non-null   float64
 12  tspan_end       2414 non-null   float64
 13  source          2480 non-null   object 
 14  trem_idx_start  24 non-null     float64
 15  trem_idx_end    24 non-null     float64
dtypes: float64(6), object(10)
memory usage: 329.4+ KB


In [None]:
# Serialise every processed Doc into a single DocBin file on the mounted Drive.
doc_bin = DocBin()
for processed_doc in processed_ner_data:
    doc_bin.add(processed_doc)
# Absolute path, matching the Drive mount point and the location the TSV was
# read from; the previous "./content/..." form resolved relative to the current
# working directory instead of the mounted Drive.
doc_bin.to_disk("/content/drive/MyDrive/chia_clinical_trials/chia_rel_full.spacy")