<a href="https://colab.research.google.com/github/LeonGoergen/informationExtraction/blob/main/data_prep/DataPreparationMedical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [None]:
# Mount Google Drive at /content/drive so the dataset files referenced below
# (under /content/drive/MyDrive/...) are reachable. The google.colab module is
# supplied by the Colab runtime, so this cell is expected to run only there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm
import json
import unicodedata
import os
from collections import Counter
import re

# Data Preparation

In [None]:
# Read the corpus and keep only the German sentence ('de') together with its
# entity annotations; every other column is dropped right away.
df = pd.read_json(
    '/content/drive/MyDrive/Masterarbeit/Medical/GERNERMED_dataset.json'
)[['de', 'annotations']]

# Sanity check: show one sample sentence followed by its raw annotations.
print(df['de'].iloc[7], df['annotations'].iloc[7], sep='\n')

Er kann auch wiederkehrende Anfälle haben, die mit ativan IV oder IM behandelt werden sollten und nicht notwendigerweise darauf hindeuten, dass der Patient ins Krankenhaus zurückkehren muss, es sei denn, sie dauern länger als 5 Minuten an oder er hat mehrere wiederkehrende Anfälle oder Komplikationen wie Aspiration.
[{'id': 'T3', 'type': 'Drug', 'spans': [9981, 9987], 'content': 'ativan', 'de_spans': [51, 57]}, {'id': 'T5', 'type': 'Route', 'spans': [9994, 9996], 'content': 'IM', 'de_spans': [66, 68]}, {'id': 'T4', 'type': 'Route', 'spans': [9988, 9990], 'content': 'IV', 'de_spans': [58, 60]}]


In [None]:
def tokenize_and_label(text, annotations):
    """Tokenize ``text`` and derive one BIO tag per token from ``annotations``.

    Tokens are maximal runs of word characters or single punctuation
    characters; whitespace is discarded.

    Args:
        text: The sentence to tokenize.
        annotations: Iterable of dicts. Each must provide ``'type'`` (the
            entity label) and ``'de_spans'`` (a ``[start, end]`` pair of
            character offsets into ``text``, end-exclusive).

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists. ``labels[i]``
        is ``'O'`` or ``'B-<type>'`` / ``'I-<type>'`` for ``tokens[i]``.

    Every token that overlaps an entity span is tagged, and the first such
    token always receives the ``B-`` prefix. This keeps the tag sequence
    well-formed even when the annotated span does not line up exactly with
    token boundaries (e.g. the span starts on whitespace, or covers only part
    of a token such as ``mg`` in ``10mg``). If annotations overlap, the one
    listed later overwrites the earlier tags.
    """
    # A single regex pass gives both the token text and its exact offsets,
    # so there is no need to search the text again for each token.
    matches = list(re.finditer(r'\w+|[^\w\s]', text, re.UNICODE))
    tokens = [match.group() for match in matches]
    token_spans = [match.span() for match in matches]

    labels = ['O'] * len(tokens)

    for annotation in annotations:
        entity_start, entity_end = annotation['de_spans']
        entity_type = annotation['type']

        # Empty or inverted spans cannot cover any token.
        if entity_end <= entity_start:
            continue

        is_first_token = True
        for i, (token_start, token_end) in enumerate(token_spans):
            if token_end <= entity_start:
                continue  # token lies entirely before the entity
            if token_start >= entity_end:
                break  # spans are ordered; nothing further can overlap

            prefix = 'B' if is_first_token else 'I'
            labels[i] = f'{prefix}-{entity_type}'
            is_first_token = False

    return tokens, labels

def tokenize_and_label_row(row):
    """Adapter for ``DataFrame.apply(..., axis=1)``.

    Reads the sentence from ``row['de']`` and its entities from
    ``row['annotations']``, and returns a two-element ``pd.Series`` holding
    the token list followed by the tag list.
    """
    return pd.Series(list(tokenize_and_label(row['de'], row['annotations'])))

# Tokenize every sentence and store the tokens and their BIO tags as two new columns.
df[['tokens', 'ner_tags']] = df.apply(tokenize_and_label_row, axis=1)
# From here on only the token/tag columns are needed; the raw text and
# annotation columns are dropped.
df = df[['tokens', 'ner_tags']]

In [None]:
# Show the tokens and tags of the same sample row that was inspected after loading.
print(df['tokens'].iloc[7])
print(df['ner_tags'].iloc[7])

['Er', 'kann', 'auch', 'wiederkehrende', 'Anfälle', 'haben', ',', 'die', 'mit', 'ativan', 'IV', 'oder', 'IM', 'behandelt', 'werden', 'sollten', 'und', 'nicht', 'notwendigerweise', 'darauf', 'hindeuten', ',', 'dass', 'der', 'Patient', 'ins', 'Krankenhaus', 'zurückkehren', 'muss', ',', 'es', 'sei', 'denn', ',', 'sie', 'dauern', 'länger', 'als', '5', 'Minuten', 'an', 'oder', 'er', 'hat', 'mehrere', 'wiederkehrende', 'Anfälle', 'oder', 'Komplikationen', 'wie', 'Aspiration', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Drug', 'B-Route', 'O', 'B-Route', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Hold out 15% of the sentences for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=1, test_size=0.15)
train.shape[0], test.shape[0]

(7309, 1290)

In [None]:
def count_tags(tags):
    """Count tags by entity type, ignoring the BIO position prefix.

    A leading ``'B-'`` or ``'I-'`` is stripped before counting, so
    ``'B-Drug'`` and ``'I-Drug'`` both count towards ``'Drug'``. Any other
    tag (such as ``'O'``) is counted as-is. Returns a ``Counter``.
    """
    return Counter(
        label[2:] if label.startswith(('B-', 'I-')) else label
        for label in tags
    )

# Accumulate the per-sentence tag counts over the whole training split.
total_counts = Counter()
for tags in train['ner_tags']:
    total_counts += count_tags(tags)

# Tag names are unique keys, so plain tuple ordering sorts the pairs by tag name.
sorted(total_counts.items())

[('Dosage', 10293),
 ('Drug', 13678),
 ('Duration', 1256),
 ('Form', 8881),
 ('Frequency', 14614),
 ('O', 93085),
 ('Route', 4022),
 ('Strength', 9824)]

In [None]:
# Serialise both splits row by row (orient='records') and store them next to
# the source dataset on Drive.
train_json = train.to_json(orient='records')
test_json = test.to_json(orient='records')

with open('/content/drive/MyDrive/Masterarbeit/Medical/train.json', 'w') as train_file:
    train_file.write(train_json)

with open('/content/drive/MyDrive/Masterarbeit/Medical/test.json', 'w') as test_file:
    test_file.write(test_json)