In [None]:
### Goal: extract relevant information from reports and compile it into an SQL database

# Importing modules

In [1]:
!pip install tika

Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-2.6.0-py3-none-any.whl size=32621 sha256=a6d4c604cc7b36951c363d5b1e0af03b4b9c0668081f980e96a1ebd68e2956b5
  Stored in directory: /root/.cache/pip/wheels/5f/71/c7/b757709531121b1700cffda5b6b0d4aad095fb507ec84316d0
Successfully built tika
Installing collected packages: tika
Successfully installed tika-2.6.0


In [42]:
from tika import parser
import numpy as np
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Code

In [3]:
# Reading data
# Parse the report PDF with tika's parser. The returned object is indexed with
# 'content' in the extraction cell to obtain the text that the regexes run on.
# NOTE(review): the log output of this cell shows tika retrieving
# tika-server-standard-2.6.0.jar to /tmp/tika-server.jar, so the first call
# appears to need network access -- confirm for offline runs.
# The path is relative, so 'Mock01.pdf' must be in the working directory.
raw = parser.from_file('Mock01.pdf')

2023-09-26 18:47:55,516 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2023-09-26 18:47:56,138 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2023-09-26 18:47:56,514 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [44]:
### Helper functions
# Function to reduce a list of identical elements to a single element
def collapse_list(list_identical_elements):
  """Collapse a list whose entries are all equal into that single entry.

  Parameters
  ----------
  list_identical_elements : sequence
    Values that are expected to all be equal (e.g. every regex match for
    one field of a report).

  Returns
  -------
  The first element if every element compares equal to it. ``np.nan`` if
  the sequence is empty or if the elements differ; in both of those cases a
  message is printed so the problem is visible in the cell output.
  """
  # An empty input has no element to return; the previous version raised
  # IndexError here because all() over an empty iterable is True.
  if len(list_identical_elements) == 0:
    print('No elements detected')
    return np.nan

  first = list_identical_elements[0]
  if all(x == first for x in list_identical_elements):
    return first

  print(f'Different elements detected: {list_identical_elements}')
  return np.nan

In [54]:
# Extract the fields of interest from the parsed report text into a flat dict.
# Every field is always present in the result; a field that cannot be
# determined is stored as NaN instead of raising or being silently omitted.

# NOTE(review): tika presumably returns None for 'content' when no text could
# be extracted (e.g. a scanned PDF) -- confirm. Fall back to an empty string so
# the regex searches below do not raise TypeError in that case.
content = raw['content'] or ''
extracted_content = {}

# ID in format "Capital_Letter YY.dddddd"
ID_pattern = r'([A-Z]\d{2}\.\d{6})'
ID_matches = re.findall(ID_pattern, content)
# Take the first ID found; NaN when the report contains none.
extracted_content['ID'] = ID_matches[0] if ID_matches else np.nan

# Age of lesion with date format dd.mm.yyyy
# Computed as the whole years from the first to the second date found, and
# only when the report contains exactly two dates.
date_pattern = r'\b\d{2}\.\d{2}\.\d{4}\b'
date_matches = re.findall(date_pattern, content)
extracted_content['age'] = np.nan
if len(date_matches) == 2:
  date_format = "%d.%m.%Y"
  try:
    date1 = datetime.strptime(date_matches[0], date_format)
    date2 = datetime.strptime(date_matches[1], date_format)
  except ValueError:
    # The pattern only checks digit counts, so e.g. "99.99.2020" reaches here.
    print(f'Invalid date detected: {date_matches}')
  else:
    difference = relativedelta(date2, date1)
    extracted_content['age'] = difference.years
else:
  print(f'Expected exactly 2 dates, found {len(date_matches)}: {date_matches}')

# T stage
T_pattern = r'pT([0-4])'
T_matches = re.findall(T_pattern, content)
# Guard against no match so an empty list is never passed to collapse_list.
extracted_content['T'] = collapse_list(T_matches) if T_matches else np.nan

# N stage
# The (?!\d) lookahead rejects a stage digit that is followed by another
# digit, so a report ID such as "N23.437284" is not misread as stage N2.
N_pattern = r'N([0-3])(?!\d)'
N_matches = re.findall(N_pattern, content)
extracted_content['N'] = collapse_list(N_matches) if N_matches else np.nan

# M stage
# Same lookahead as for N: an ID such as "M23.437284" must not count as M2.
M_pattern = r'M([0-2])(?!\d)'
M_matches = re.findall(M_pattern, content)
extracted_content['M'] = collapse_list(M_matches) if M_matches else np.nan

# Diagnosis
# Collect every known diagnosis term that occurs verbatim in the text.
diagnosis_keys = ['Adenokarzinom','Plattenepithel-Karzinom']
Dx_matches = [diagnosis for diagnosis in diagnosis_keys if diagnosis in content]
extracted_content['Diagnosis'] = Dx_matches

print(extracted_content)

{'ID': 'X23.437284', 'age': 63, 'T': '3', 'N': '2', 'M': '0', 'Diagnosis': ['Adenokarzinom']}
