# **NLP for Prostate Cancer Radiology Reports**
Nagel, Meddeb et al.

This script is designed to process and extract specific pieces of clinical information from MRI reports pertaining to the prostate. Utilizing a combination of regular expressions and natural language processing techniques, the script automates the extraction of the following data points:

    Prostate Size: Determines the sagittal and axial dimensions of the prostate and standardizes them in centimeters.
    Prostate Volume: Extracts the volume measurement and provides it in milliliters.
    PSA Value: Detects and appends the PSA (Prostate-Specific Antigen) value in ng/ml.
    Gleason Score: Recognizes the presence of a Gleason score, indicating the aggressiveness of prostate cancer if present.
    Number of Suspected Lesions: Quantifies the number of lesions that are suspected in the MRI report.
    PI-RADS Score: Determines the Prostate Imaging Reporting and Data System (PI-RADS) score for each lesion, which indicates the probability of significant prostate cancer.
    Lesion Location: Identifies the location of each lesion, i.e. whether it lies in the transition zone ("Transitionalzone") or the peripheral zone ("periphere Zone").
    Presence of BPH: Recognizes any mention of Benign Prostatic Hyperplasia (BPH) in the report.
    Other Observations: Captures other clinical notes from the "Beurteilung" section of the report that are not directly related to prostate cancer.



In [None]:
# Environment installation
!pip install python-docx pandas
!pip install spacy
!python -m spacy download de_core_news_sm

In [None]:
# Data from Word to CSV
import os
import pandas as pd
from docx import Document

def read_word_file(file_path):
    """Return the text of a Word document as a single string.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Folder path
main_folder_path = "/content/drive/MyDrive/Prostate_NLP/nlp_prostata"

# Collect the text of all Word documents in the folder and its subfolders.
reports = []
for dirpath, dirnames, filenames in os.walk(main_folder_path):
    # os.walk yields entries in arbitrary order. Sorting the sub-directory
    # list in place (which also fixes the order os.walk descends in) and the
    # file names makes the patient_id assigned below reproducible across runs.
    dirnames.sort()
    for filename in sorted(filenames):
        # "~$name.docx" files are the lock files Word leaves next to an open
        # document; they are not real documents, so skip them.
        if filename.endswith('.docx') and not filename.startswith('~$'):
            file_path = os.path.join(dirpath, filename)
            reports.append(read_word_file(file_path))

# Create a dataframe: one row per report, numbered from 1 in collection order.
df = pd.DataFrame({
    'patient_id': range(1, len(reports) + 1),
    'report': reports
})

# Save the dataframe to a CSV file
df.to_csv("/content/drive/MyDrive/Prostate_NLP/MRI_reports.csv", index=False)


In [None]:
# Load the reports CSV into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/Prostate_NLP/MRI_reports.csv")
# Bare expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,patient_id,report
0,1,Klinische Angaben:\nPatient unter AS bei PCa G...
1,2,Klinische Angaben:\nVor MRT-TRUS-fusionierter ...
2,3,Klinische Angaben:\nV.a. PCA bei PSA Anstieg a...
3,4,"Klinische Angaben:\nPSA-Wert-Erhöhung auf 28,2..."
4,5,Klinische Angaben:\nAzinäres Prostatakarzinom ...
5,6,Klinische Angaben:\nProstata-Ca. unter Active ...
6,7,Klinische Angaben:\nPatient mit PCa unter AS. ...
7,8,Klinische Angaben:\nVor MRT-TRUS-fusionierter ...
8,9,Klinische Angaben:\nPatient kommt zur MRT-Fusi...
9,10,Klinische Angaben:\nV.a. Prostataca. bei Dynam...


In [None]:
import re
import pandas as pd
import spacy

# Load German language model (the spaCy "de_core_news_sm" pipeline).
# extract_information uses it to split report text into sentences.
nlp = spacy.load("de_core_news_sm")

def extract_information(report):
    """Extract structured findings from the text of one prostate MRI report.

    Args:
        report: Full report text. Any non-string value (for example the NaN
            pandas produces for an empty CSV cell) is treated as an empty
            report instead of raising.

    Returns:
        A 7-tuple ``(prostate_volume, psa_value, gleason_score, num_lesions,
        pirads_values, bph_present, observations)``:

        * prostate_volume: first number followed by "ml", as a string with
          a dot as decimal separator, or None.
        * psa_value: first number followed by "ng/ml", same format, or None.
        * gleason_score: digits following "Gleason-Score"/"Gleason Score",
          as a string, or None.
        * num_lesions: number of lesion phrases that are followed by a
          "PI-RADS <n>" on the same line.
        * pirads_values: the PI-RADS digits of those lesions as strings;
          ``["1"]`` when no lesion was found.
        * bph_present: 1 if BPH is mentioned, else 0.
        * observations: sentences of the "Beurteilung:" section that contain
          none of the cancer-related terms.

    Lesion location is not extracted by this function.
    """
    if not isinstance(report, str):
        report = ""

    # German reports write decimals with a comma ("28,2 ng/ml"); matching
    # only \d+ would capture just the digits after the comma.
    number = r'(\d+(?:[.,]\d+)?)'

    # Extract prostate volume (unit spelling "ml"/"mL", optional space)
    volume_match = re.search(number + r'\s*ml\b', report, re.IGNORECASE)
    prostate_volume = volume_match.group(1).replace(',', '.') if volume_match else None

    # Extract PSA value given in ng/ml
    psa_match = re.search(number + r'\s*ng/ml\b', report, re.IGNORECASE)
    psa_value = psa_match.group(1).replace(',', '.') if psa_match else None

    # Extract Gleason Score
    gleason_match = re.search(r'Gleason[- ]Score (\d+)', report)
    gleason_score = gleason_match.group(1) if gleason_match else None

    # Each lesion keyword followed (on the same line, since "." does not match
    # newlines here) by a PI-RADS score counts as one suspected lesion.
    pirads_found = re.findall(
        r'(?:Areal|Läsion|Malignitätssuspekt|Karzinomsuspekt|Karzinomsuspekter Befund).*?PI-RADS (\d+)',
        report,
    )
    num_lesions = len(pirads_found)
    # With no suspicious lesion the report is scored as PI-RADS 1.
    pirads_values = pirads_found if num_lesions else ["1"]

    # Check for BPH presence; the written-out form may be capitalised at the
    # start of a sentence or inflected ("benigner", "benignen").
    bph_mentioned = "BPH" in report or re.search(
        r'benigne\w*\s+Prostatahyperplasie', report, re.IGNORECASE
    )
    bph_present = 1 if bph_mentioned else 0

    # Extract other observations from Beurteilung section, excluding prostate
    # cancer related notes. The section runs up to the first "<digit>." or,
    # when there is none, to the end of the report.
    observations = []
    beurteilung_section = re.search(r'Beurteilung:(.*?)(?=\d\.|\Z)', report, re.DOTALL)
    if beurteilung_section and beurteilung_section.group(1).strip():
        cancer_terms = ["Areal", "Läsion", "Malignitätssuspekt", "Karzinomsuspekt", "Prostatakarzinom", "Karzinomsuspekter Befund", "PI-RADS"]
        for sent in nlp(beurteilung_section.group(1)).sents:
            sentence = sent.text.strip()
            # Whitespace-only "sentences" (bare line breaks) carry no finding.
            if sentence and not any(term in sentence for term in cancer_terms):
                observations.append(sentence)

    return prostate_volume, psa_value, gleason_score, num_lesions, pirads_values, bph_present, observations

# Run extract_information on every report, then transpose the per-report
# result tuples into one sequence per extracted field.
extracted_rows = df['report'].map(extract_information)
(
    volumes,
    psa_values,
    gleason_scores,
    lesion_counts,
    pirads_scores,
    bph_flags,
    other_notes,
) = zip(*extracted_rows)

# Store each field as its own DataFrame column.
df['Prostate_Volume'] = volumes
df['PSA'] = psa_values
df['Gleason_Score'] = gleason_scores
df['Number_of_Suspected_Lesions'] = lesion_counts
df['PI-RADS'] = pirads_scores
df['BPH_Present'] = bph_flags
df['Other_Observations'] = other_notes


In [None]:
# Bare expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,patient_id,report,Prostate_Volume,PSA,Gleason_Score,Number_of_Suspected_Lesions,PI-RADS,Lesion_Location,BPH_Present,Other_Observations
0,1,Klinische Angaben:\nPatient unter AS bei PCa G...,24.0,11.0,,1,[5],[Transitionalzone],1,[]
1,2,Klinische Angaben:\nVor MRT-TRUS-fusionierter ...,55.0,,,2,"[5, 4]",[Transitionalzone],1,[\n]
2,3,Klinische Angaben:\nV.a. PCA bei PSA Anstieg a...,46.0,13.0,,0,[1],[],1,[]
3,4,"Klinische Angaben:\nPSA-Wert-Erhöhung auf 28,2...",30.0,20.0,,1,[5],[],1,[\n\n]
4,5,Klinische Angaben:\nAzinäres Prostatakarzinom ...,75.0,,6.0,0,[1],[],1,[\n\n]
5,6,Klinische Angaben:\nProstata-Ca. unter Active ...,40.0,,,0,[1],[],1,[\n\n]
6,7,Klinische Angaben:\nPatient mit PCa unter AS. ...,30.0,11.0,,2,"[4, 3]",[Transitionalzone],1,[]
7,8,Klinische Angaben:\nVor MRT-TRUS-fusionierter ...,30.0,,,0,[1],[],1,[\n]
8,9,Klinische Angaben:\nPatient kommt zur MRT-Fusi...,68.0,96.0,,1,[4],[],1,[\n]
9,10,Klinische Angaben:\nV.a. Prostataca. bei Dynam...,60.0,8.0,,0,[1],[],1,[\n]
