In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[90]:


import json 
from cassis import * # pip install dkpro-cassis
import os
import subprocess
import numpy as np
import pandas as pd
from pathlib import Path
import zipfile

# Text that identifies this project's export folders inside EXPORT_LOC.
PROJECT_NAME = 'Transferability+-+CDI'
# Directory that is scanned for export folders; exported files are written below it as well.
EXPORT_LOC = './data/'
# Fully-qualified annotation type name passed to `cas.select` when collecting entities.
NE_TYPE = 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity'
# NOTE(review): identical to NE_TYPE and not referenced in the visible cells —
# confirm whether a distinct chunk type was intended here.
CHUNK_TYPE = 'de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity'


In [2]:
def _get_doc_dirs(curation_dir):
    doc_dirs = [f.path for f in os.scandir(curation_dir) if f.is_dir()]
    return doc_dirs

def _find_most_recent_export(export_loc,project_name):
    export_dirs = pd.Series([f.path for f in os.scandir(EXPORT_LOC) if f.is_dir() and PROJECT_NAME in str(f)])

    exports = pd.DataFrame()
    exports['path'] = export_dirs
    exports['date'] = exports['path'].apply(lambda x: int(x[-15:].replace('-','').replace('_','')))
    target_folder = exports.path[exports.date.argmax()]

    curation_dir = Path(target_folder) / 'curation'
    source_dir =  Path(target_folder) / 'source'
    
    doc_dirs = _get_doc_dirs(curation_dir)
    
    return target_folder, curation_dir, source_dir, doc_dirs

# Resolve the most recent export of this project and the paths derived from it.
target_folder, curation_dir, source_dir, doc_dirs = _find_most_recent_export(EXPORT_LOC, PROJECT_NAME)


In [3]:
def _construct_df_of_paths(doc_dirs: list, source_dir):
    docs = pd.DataFrame()
    for doc_dir in doc_dirs:
        zips = pd.Series([f.name for f in os.scandir(doc_dir) if '.zip' in str(f)])
        for z in zips:
            with zipfile.ZipFile(os.path.join(doc_dir,z), 'r') as zip_ref:
                zip_ref.extractall(doc_dir)
            #subprocess.Popen(['unzip',z],cwd=doc_dir)
        doc_dir = Path(doc_dir)
        source_path = source_dir / doc_dir.name
        docs = docs.append([[doc_dir.name, doc_dir / "TypeSystem.xml", doc_dir / "CURATION_USER.xmi", source_path]])

    docs = docs.reset_index().drop(columns=['index'])
    docs.columns = ['doc_name','typesystem_path','annot_path','source_path'] 
        
    return docs

In [4]:
# Build the per-document table of file paths and preview its first rows.
docs = _construct_df_of_paths(doc_dirs, source_dir) 
docs.head()

Unnamed: 0,doc_name,typesystem_path,annot_path,source_path
0,10079105.txt,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...
1,10952564.txt,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...
2,11254479.txt,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...
3,11254650.txt,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...
4,11451694.txt,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...,data\Transferability+-+CDI_curated_documents_2...


In [5]:
def load_cas(typesystem_path, annot_path):
    """Load a CAS from an XMI file together with its type system.

    Both files are opened in binary mode. The type system read from
    ``typesystem_path`` is handed to ``load_cas_from_xmi`` when the
    annotation file at ``annot_path`` is parsed, and the resulting CAS
    object is returned.
    """
    with open(typesystem_path, 'rb') as ts_file:
        ts = load_typesystem(ts_file)
    with open(annot_path, 'rb') as xmi_file:
        return load_cas_from_xmi(xmi_file, typesystem=ts)

In [6]:
def _identify_typenames():
    for t in cas.typesystem.get_types():
        print(t.name)

In [7]:
# For every document, group the covered text of each selected annotation
# under that annotation's `value`; the document name is stored under
# 'doc_name' as a one-element list.
objs = []
for index, doc in docs.iterrows():
    obj = {'doc_name': [doc['doc_name']]}
    cas = load_cas(doc['typesystem_path'], doc['annot_path'])
    for named_entity in cas.select(NE_TYPE):
        entity_type = named_entity.value
        entity_text = named_entity.get_covered_text()
        # Start a new list the first time an entity type is seen.
        obj.setdefault(entity_type, []).append(entity_text)
    objs.append(obj)

In [8]:
import itertools

In [9]:
# Show the value lists collected for the first document.
list(objs[0].values())

[['10079105.txt'],
 ['Clostridium difficile'],
 ['rat'],
 ['neurotensin receptor antagonist',
  'inhibited by the substance P (neurokinin-1) receptor antagonist CP-96,345'],
 ['toxin A-induced'],
 ['SR-48,692']]

In [10]:
# Flatten every document into one dict per combination of its collected
# values (cartesian product across the document's keys).
expanded = [
    dict(zip(obj, combo))
    for obj in objs
    for combo in itertools.product(*obj.values())
]

In [11]:
# Persist the flattened annotation rows as a CSV file.
pd.DataFrame(expanded).to_csv('./data/exported_annotations_cdi.csv')

In [12]:
# Write the per-document annotation dicts as JSON. The context manager makes
# sure the file is flushed and closed instead of leaving the handle open.
with open('./data/exported_annotations_cdi.json','w') as json_file:
    json.dump(objs, json_file)