In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
import nltk
# Make sure the 'punkt' NLTK data package is available locally before tokenizing.
# NOTE(review): presumably this is the resource behind sent_tokenize/word_tokenize
# used in later cells — confirm for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pandas as pd
import json
from nltk.tokenize import sent_tokenize, word_tokenize
import re


# Read the dataset-18 JSON export into memory. The processing functions in this
# notebook read the keys 'full_text', 'title' and 'entities' from each item.
with open('all_cases_output_with_title_18.json', mode='r', encoding='utf-8') as file:
    json_data = json.loads(file.read())
    
def process_texts(json_data, sent_tokenizer=None, word_tokenizer=None):
    """Label every word token of every document with its entity type.

    Each item of ``json_data`` must provide:

    * ``'full_text'`` - the raw document text,
    * ``'title'``     - the part before the first ``'.'`` becomes ``Text_ID``,
    * ``'entities'``  - dicts with a ``'type'`` and a ``'positions'`` string of
      ``"start end"`` character offsets, several spans separated by ``';'``.

    A token receives the type of the first entity (in list order) that has a
    span with ``start <= token_start < end``; otherwise it is labelled ``'O'``.

    Args:
        json_data: iterable of document dicts as described above.
        sent_tokenizer: optional callable ``text -> list[str]``; defaults to
            the ``sent_tokenize`` imported by this notebook.
        word_tokenizer: optional callable ``sentence -> list[str]``; defaults
            to the ``word_tokenize`` imported by this notebook.

    Returns:
        pandas.DataFrame with columns ``Text_ID``, ``Sent_ID`` (1-based
        sentence index within the document), ``Token`` and ``Label``.
    """
    # Resolve the defaults lazily so the NLTK functions are only needed when
    # the caller does not supply replacements.
    if sent_tokenizer is None:
        sent_tokenizer = sent_tokenize
    if word_tokenizer is None:
        word_tokenizer = word_tokenize

    records = []

    for item in json_data:
        full_text = item['full_text']
        text_name = item['title'].split('.')[0]

        # Parse the character spans once per document instead of once per
        # token; the order (entity order, then span order) is preserved so the
        # first match is the same one the per-token parsing would have found.
        spans = []
        for entity in item['entities']:
            for piece in entity['positions'].split(';'):
                bounds = piece.split()
                if not bounds:
                    continue  # tolerate empty segments such as a trailing ';'
                spans.append((int(bounds[0]), int(bounds[1]), entity['type']))

        sent_position = 0
        for sent_id, sentence in enumerate(sent_tokenizer(full_text), start=1):
            current_pos = full_text.find(sentence, sent_position)
            if current_pos == -1:
                # Sentence text not found verbatim: continue from the last
                # known offset and do not bound the token search.
                current_pos = sent_position
                sent_end = len(full_text)
            else:
                sent_end = current_pos + len(sentence)

            for token in word_tokenizer(sentence):
                # Search only inside the current sentence so a token can never
                # be matched against a later part of the document.
                token_start = full_text.find(token, current_pos, sent_end)
                token_len = len(token)
                if token_start == -1 and token in ('``', "''"):
                    # NLTK's word tokenizer rewrites a double quote as `` or
                    # ''; map it back to the single '"' character in the text.
                    token_start = full_text.find('"', current_pos, sent_end)
                    token_len = 1

                label = 'O'
                if token_start != -1:
                    for start, end, entity_type in spans:
                        if start <= token_start < end:
                            label = entity_type
                            break
                    current_pos = token_start + token_len
                # else: the token cannot be located; keep the cursor where it
                # is so the following tokens stay aligned, and label it 'O'.

                records.append((text_name, sent_id, token, label))

            sent_position = current_pos

    return pd.DataFrame(records, columns=['Text_ID', 'Sent_ID', 'Token', 'Label'])
    
    
def process_texts_for_iob(json_data, sent_tokenizer=None, word_tokenizer=None):
    """Label every word token of every document with an IOB entity tag.

    Each item of ``json_data`` must provide:

    * ``'full_text'`` - the raw document text,
    * ``'title'``     - the part before the first ``'.'`` becomes ``Text_ID``,
    * ``'entities'``  - dicts with a ``'type'`` and a ``'positions'`` string of
      ``"start end"`` character offsets, several spans separated by ``';'``.

    For the first span (entity order, then span order) with
    ``start <= token_start < end`` the token is tagged ``'B-<type>'`` when it
    starts exactly at ``start`` and ``'I-<type>'`` otherwise. Tokens matching
    no span are tagged ``'O'``. A token that begins before a span's start is
    not matched by that span.

    Args:
        json_data: iterable of document dicts as described above.
        sent_tokenizer: optional callable ``text -> list[str]``; defaults to
            the ``sent_tokenize`` imported by this notebook.
        word_tokenizer: optional callable ``sentence -> list[str]``; defaults
            to the ``word_tokenize`` imported by this notebook.

    Returns:
        pandas.DataFrame with columns ``Text_ID``, ``Sent_ID`` (1-based
        sentence index within the document), ``Token`` and ``Label``.
    """
    # Resolve the defaults lazily so the NLTK functions are only needed when
    # the caller does not supply replacements.
    if sent_tokenizer is None:
        sent_tokenizer = sent_tokenize
    if word_tokenizer is None:
        word_tokenizer = word_tokenize

    records = []

    for item in json_data:
        full_text = item['full_text']
        text_name = item['title'].split('.')[0]

        # Parse the character spans once per document instead of once per
        # token, keeping entity order and span order intact.
        spans = []
        for entity in item['entities']:
            for piece in entity['positions'].split(';'):
                bounds = piece.split()
                if not bounds:
                    continue  # tolerate empty segments such as a trailing ';'
                spans.append((int(bounds[0]), int(bounds[1]), entity['type']))

        sent_position = 0
        for sent_id, sentence in enumerate(sent_tokenizer(full_text), start=1):
            current_pos = full_text.find(sentence, sent_position)
            if current_pos == -1:
                # Sentence text not found verbatim: continue from the last
                # known offset and do not bound the token search.
                current_pos = sent_position
                sent_end = len(full_text)
            else:
                sent_end = current_pos + len(sentence)

            for token in word_tokenizer(sentence):
                # Search only inside the current sentence so a token can never
                # be matched against a later part of the document.
                token_start = full_text.find(token, current_pos, sent_end)
                token_len = len(token)
                if token_start == -1 and token in ('``', "''"):
                    # NLTK's word tokenizer rewrites a double quote as `` or
                    # ''; map it back to the single '"' character in the text.
                    token_start = full_text.find('"', current_pos, sent_end)
                    token_len = 1

                label = 'O'
                if token_start != -1:
                    for start, end, entity_type in spans:
                        if start <= token_start < end:
                            prefix = 'B-' if start == token_start else 'I-'
                            label = prefix + entity_type
                            break
                    current_pos = token_start + token_len
                # else: the token cannot be located; keep the cursor where it
                # is so the following tokens stay aligned, and tag it 'O'.

                records.append((text_name, sent_id, token, label))

            sent_position = current_pos

    return pd.DataFrame(records, columns=['Text_ID', 'Sent_ID', 'Token', 'Label'])

In [4]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import BertTokenizerFast

def process_texts_for_iob_with_tokenizer(json_data, tokenizer):
    """Build an IOB-tagged table at sub-word (tokenizer) granularity.

    Every document is split into sentences and words with NLTK, each word gets
    an IOB tag from the entity character spans, and the sentence plus its
    word-level tags are handed to ``tokenize_and_preserve_labels`` which
    returns the sub-word tokens and their tags.

    Each item of ``json_data`` must provide ``'full_text'``, ``'title'`` (the
    part before the first ``'.'`` becomes ``Text_ID``) and ``'entities'``
    (dicts with a ``'type'`` and a ``'positions'`` string of ``"start end"``
    character offsets, several spans separated by ``';'``).

    Args:
        json_data: iterable of document dicts as described above.
        tokenizer: object passed through unchanged to
            ``tokenize_and_preserve_labels``.

    Returns:
        pandas.DataFrame with columns ``Text_ID``, ``Sent_ID`` (1-based
        sentence index within the document), ``Token`` and ``Label``.
    """
    records = []

    for item in json_data:
        full_text = item['full_text']
        text_name = item['title'].split('.')[0]

        # Parse the character spans once per document instead of once per
        # token, keeping entity order and span order intact.
        spans = []
        for entity in item['entities']:
            for piece in entity['positions'].split(';'):
                bounds = piece.split()
                if not bounds:
                    continue  # tolerate empty segments such as a trailing ';'
                spans.append((int(bounds[0]), int(bounds[1]), entity['type']))

        sent_position = 0
        for sent_id, sentence in enumerate(sent_tokenize(full_text), start=1):
            current_pos = full_text.find(sentence, sent_position)
            if current_pos == -1:
                # Sentence text not found verbatim: continue from the last
                # known offset and do not bound the token search.
                current_pos = sent_position
                sent_end = len(full_text)
            else:
                sent_end = current_pos + len(sentence)

            sentence_labels = []
            for token in word_tokenize(sentence):
                # Search only inside the current sentence so a token can never
                # be matched against a later part of the document.
                token_start = full_text.find(token, current_pos, sent_end)
                token_len = len(token)
                if token_start == -1 and token in ('``', "''"):
                    # NLTK's word tokenizer rewrites a double quote as `` or
                    # ''; map it back to the single '"' character in the text.
                    token_start = full_text.find('"', current_pos, sent_end)
                    token_len = 1

                label = 'O'
                if token_start != -1:
                    for start, end, entity_type in spans:
                        if start <= token_start < end:
                            prefix = 'B-' if start == token_start else 'I-'
                            label = prefix + entity_type
                            break
                    current_pos = token_start + token_len
                # else: the token cannot be located; keep the cursor where it
                # is. One label per word is still appended so the labels stay
                # aligned with the words of the sentence.

                sentence_labels.append(label)

            tokenized_sentence, new_labels = tokenize_and_preserve_labels(sentence, sentence_labels, tokenizer)
            for piece, piece_label in zip(tokenized_sentence, new_labels):
                records.append((text_name, sent_id, piece, piece_label))

            sent_position = current_pos

    return pd.DataFrame(records, columns=['Text_ID', 'Sent_ID', 'Token', 'Label'])

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer, words=None):
    """Split each word into sub-word tokens and spread its label over them.

    Args:
        sentence: sentence text; it is word-tokenized with ``word_tokenize``
            when ``words`` is not given.
        text_labels: one label per word, in word order.
        tokenizer: object whose ``tokenize(word)`` returns the list of
            sub-word tokens for a word.
        words: optional pre-tokenized words of ``sentence``. Passing them
            avoids tokenizing the sentence a second time.

    Returns:
        ``(tokens, labels)`` - two lists of equal length. The first sub-word
        of a word keeps the word's label. Every further sub-word of a ``B-X``
        word is labelled ``I-X``, and sub-words of any other word repeat the
        word's label. Words for which the tokenizer returns nothing contribute
        neither tokens nor labels. Words and labels are paired with ``zip``,
        so surplus items on either side are ignored.
    """
    if words is None:
        words = word_tokenize(sentence)

    tokenized_sentence = []
    labels = []

    for word, label in zip(words, text_labels):
        pieces = tokenizer.tokenize(word)
        if not pieces:
            continue

        tokenized_sentence.extend(pieces)

        if label.startswith('B-'):
            # Strip only the leading 'B-' so entity types that themselves
            # contain a hyphen are kept whole.
            continuation = 'I-' + label[2:]
        else:
            continuation = label

        labels.append(label)
        labels.extend([continuation] * (len(pieces) - 1))

    return tokenized_sentence, labels

# Instantiate the "bert-base-uncased" fast tokenizer and use it to build the
# sub-word level label table for the currently loaded json_data.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
df = process_texts_for_iob_with_tokenizer(json_data=json_data, tokenizer=tokenizer)



In [None]:
# Word-level IOB table for the same data, kept under its own name. Binding it
# to `df` would overwrite the sub-word level frame that the following cells
# display, merge and save.
df_word_level_18 = process_texts_for_iob(json_data)

In [5]:
# Display the current `df` (the notebook rendering truncates long frames).
df

Unnamed: 0,Text_ID,Sent_ID,Token,Label
0,15939911,1,case,O
1,15939911,1,:,O
2,15939911,1,a,O
3,15939911,1,28,B-Age
4,15939911,1,-,I-Age
...,...,...,...,...
136385,28767567,33,##d,I-Sign_symptom
136386,28767567,33,event,I-Sign_symptom
136387,28767567,33,was,O
136388,28767567,33,presented,O


In [6]:
# Show only the rows that belong to document '15939911'.
df.loc[df['Text_ID'] == '15939911']

Unnamed: 0,Text_ID,Sent_ID,Token,Label
0,15939911,1,case,O
1,15939911,1,:,O
2,15939911,1,a,O
3,15939911,1,28,B-Age
4,15939911,1,-,I-Age
...,...,...,...,...
412,15939911,10,after,I-Date
413,15939911,10,the,O
414,15939911,10,ab,O
415,15939911,10,##lation,O


In [10]:
def merge_wordpieces(token_df):
    """Glue '##'-prefixed continuation pieces back onto the preceding token.

    Rows are read in order. A token that starts with ``'##'`` and belongs to
    the same (``Text_ID``, ``Sent_ID``) as the word currently being assembled
    is appended to that word without the ``'##'`` prefix. Any other token
    starts a new word. A merged word keeps the label of its first piece.

    Args:
        token_df: DataFrame with columns ``Text_ID``, ``Sent_ID``, ``Token``
            and ``Label``.

    Returns:
        pandas.DataFrame with the same four columns, one row per merged word.
        An input without rows yields an empty frame that still has the four
        columns.
    """
    merged_rows = []
    current = None  # [text_id, sent_id, word, label] of the word in progress

    # Iterating the columns directly avoids building a Series per row.
    rows = zip(token_df['Text_ID'], token_df['Sent_ID'], token_df['Token'], token_df['Label'])
    for text_id, sent_id, token, label in rows:
        same_sentence = (
            current is not None
            and current[0] == text_id
            and current[1] == sent_id
        )
        if same_sentence and token.startswith("##"):
            current[2] += token[2:]
            continue

        # A new word begins: flush the finished one (if it is non-empty).
        if current is not None and current[2]:
            merged_rows.append(tuple(current))
        current = [text_id, sent_id, token, label]

    if current is not None and current[2]:
        merged_rows.append(tuple(current))

    return pd.DataFrame(merged_rows, columns=['Text_ID', 'Sent_ID', 'Token', 'Label'])


new_word_labels_df = merge_wordpieces(df)

In [11]:
# Show only the merged rows that belong to document '15939911'.
new_word_labels_df.loc[new_word_labels_df['Text_ID'] == '15939911']

Unnamed: 0,Text_ID,Sent_ID,Token,Label
0,15939911,1,case,O
1,15939911,1,:,O
2,15939911,1,a,O
3,15939911,1,28,B-Age
4,15939911,1,-,I-Age
...,...,...,...,...
325,15939911,10,months,I-Date
326,15939911,10,after,I-Date
327,15939911,10,the,O
328,15939911,10,ablation,O


In [12]:
# Write the merged word-level table to CSV, leaving out the index column.
csv_file_path = 'processed_entities_tokenized.csv'
new_word_labels_df.to_csv(path_or_buf=csv_file_path, index=False)

In [12]:
# Write the current `df` to CSV, leaving out the index column.
csv_file_path = 'version2_processed_entities_with_sentences_iob_18.csv'
df.to_csv(path_or_buf=csv_file_path, index=False)

## Dataset 20

In [19]:
# Read the dataset-20 JSON export; this rebinds json_data, replacing the
# previously loaded data.
with open('all_cases_output_with_title_20.json', mode='r', encoding='utf-8') as file:
    json_data = json.loads(file.read())

In [20]:
# Word-level IOB table for the json_data loaded just above; rebinds `df`.
df = process_texts_for_iob(json_data=json_data)

In [21]:
# Display the current `df` (the notebook rendering truncates long frames).
df

Unnamed: 0,Text_ID,Sent_ID,Token,Label
0,15939911,1,CASE,O
1,15939911,1,:,O
2,15939911,1,A,O
3,15939911,1,28-year-old,B-Age
4,15939911,1,previously,B-History
...,...,...,...,...
98055,28767567,33,unanticipated,I-Sign_symptom
98056,28767567,33,event,I-Sign_symptom
98057,28767567,33,was,O
98058,28767567,33,presented,O


In [22]:
# Write the current `df` to CSV, leaving out the index column.
csv_file_path = 'processed_entities_with_sentences_iob_20.csv'
df.to_csv(path_or_buf=csv_file_path, index=False)