# Filter Text Data

In [7]:
import pandas as pd

# Load the TCGA report table from the CSV that sits next to the notebook.
df = pd.read_csv('./TCGA_Reports.csv')

# The case id is whatever precedes the first '.' in `patient_filename`.
df['case_id'] = df['patient_filename'].map(lambda name: name.partition('.')[0])
df.head()

Unnamed: 0,patient_filename,text,case_id
0,TCGA-BP-5195.25c0b433-5557-4165-922e-2c1eac9c26f0,Date of Recelpt: Clinical Diagnosis & History:...,TCGA-BP-5195
1,TCGA-D7-8573.b7306a47-697d-4ed3-bbe1-81d49674a8f8,"Material: 1) Material: stomach, Method of coll...",TCGA-D7-8573
2,TCGA-EI-7004.13591eed-30e5-47a3-91be-7a370663d2d4,page 1 / 1. copy No. 3. Examination: Histopath...,TCGA-EI-7004
3,TCGA-EB-A82B.23E186C6-739C-4EF1-8788-79AA89C6E87A,Patient ID: Gross Description: A mass is locat...,TCGA-EB-A82B
4,TCGA-A6-3808.e1505f65-72ef-438d-a5e1-93ed8bf6635d,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...,TCGA-A6-3808


## Filter using string matching

In [4]:
# ref: https://stackoverflow.com/a/6117124
import re

# Terms that filter_using_regex removes from the report text by default.
FILTER_LIST = ['ductal', 'carcinoma', 'invasive', 'lobular', 'infiltrating',
               'adenocarcinoma', 'squamous', 'squamous cell', 'cell']
FILTER_LIST += ['chromophobe', 'clear cell', 'papillary', 'clear']  # RCC specific filter words


def filter_using_regex(text, filter_list=FILTER_LIST):
    """Lower-case ``text`` and delete every occurrence of the filter terms.

    Args:
        text: The string to clean. It is lower-cased before matching.
        filter_list: Iterable of literal terms to remove. Terms are matched
            case-insensitively (they are lower-cased, like the text) and
            literally (regex metacharacters are escaped). Empty terms are
            skipped.

    Returns:
        The lower-cased text with every matched term replaced by ``''``.
        Whitespace around a removed term is left in place. When
        ``filter_list`` contains no usable term, the lower-cased text is
        returned unchanged.

    Notes:
        * Matching is plain substring matching without word boundaries, so a
          term is also removed from inside longer words (with the default
          list, "nuclear" becomes "nu" because "clear" is a term).
          NOTE(review): confirm that this is intended.
        * Alternatives are tried in the order given, so a short term listed
          before a longer one that starts with it (``'squamous'`` before
          ``'squamous cell'``) wins at that position.
    """
    text = text.lower()

    # Normalise the terms the same way as the text, drop empty ones, and
    # de-duplicate while keeping the caller's order (order decides precedence).
    terms = list(dict.fromkeys(term.lower() for term in filter_list if term))
    if not terms:
        # An empty alternation would match the empty string everywhere;
        # there is nothing to remove, so return early.
        return text

    pattern = re.compile("|".join(re.escape(term) for term in terms))
    # Every term maps to the empty string, so no per-match lookup is needed.
    return pattern.sub('', text)

In [6]:
# Store a lower-cased copy of every report with the filter terms removed.
# NOTE(review): the embedding cells further down read a 'text_filtered' column
# from `fdf`, not this 'filtered_text' column on `df` -- confirm how the two relate.
df['filtered_text'] = df['text'].apply(filter_using_regex)
df.head()

Unnamed: 0,patient_filename,text,case_id,filtered_text
0,TCGA-BP-5195.25c0b433-5557-4165-922e-2c1eac9c26f0,Date of Recelpt: Clinical Diagnosis & History:...,TCGA-BP-5195,date of recelpt: clinical diagnosis & history:...
1,TCGA-D7-8573.b7306a47-697d-4ed3-bbe1-81d49674a8f8,"Material: 1) Material: stomach, Method of coll...",TCGA-D7-8573,"material: 1) material: stomach, method of coll..."
2,TCGA-EI-7004.13591eed-30e5-47a3-91be-7a370663d2d4,page 1 / 1. copy No. 3. Examination: Histopath...,TCGA-EI-7004,page 1 / 1. copy no. 3. examination: histopath...
3,TCGA-EB-A82B.23E186C6-739C-4EF1-8788-79AA89C6E87A,Patient ID: Gross Description: A mass is locat...,TCGA-EB-A82B,patient id: gross description: a mass is locat...
4,TCGA-A6-3808.e1505f65-72ef-438d-a5e1-93ed8bf6635d,SPECIMEN. Right colon. CLINICAL NOTES. PRE-OP ...,TCGA-A6-3808,specimen. right colon. clinical notes. pre-op ...


## Generate local (sentence-level) and global (document-level) representations using a fine-tuned sentence transformer based on ModernBERT

GitHub Repo: https://github.com/AnswerDotAI/ModernBERT

Specifically, we used [train_st.py](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_st.py) to fine-tune a sentence transformer on an NVIDIA A100 GPU, and then used the fine-tuned model to encode both sentences and whole documents. The 8192-token context length is useful for encoding long documents.

In [19]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
from tqdm.notebook import tqdm
import numpy as np

In [20]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned checkpoint from disk and move it onto the chosen device.
model = SentenceTransformer('./output/ModernBERT-large/ModernBERT-large-DPR-8e-05/final/').to(device)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


### Global representations

In [21]:
import shutil
import os
# Output directory for the document-level ("global") embedding files.
save_dir = './filtered_lung_notes_embedding/global/'
os.makedirs(name=save_dir, exist_ok=True)  # no error if it already exists

In [22]:
# Encode each report as a single document-level embedding and save it to
# `save_dir` as '<name>.pt'.
# NOTE(review): `fdf` is not defined in any cell visible in this notebook (the
# recorded output shows 0 iterations). It is presumably a DataFrame with
# 'filename' and 'text_filtered' columns -- confirm where it is built so the
# notebook survives Restart & Run All.
for index, row in tqdm(fdf.iterrows(), total=len(fdf)):  # total= gives tqdm a real progress bar
    # Strip the extension (e.g. '.svs') instead of assuming it is 4 characters long.
    name = os.path.splitext(row['filename'])[0]
    global_text = [row['text_filtered']]
    # encode() returns a NumPy array by default; convert it so that this cell
    # writes a torch.Tensor, like the sentence-level cell does.
    embed = torch.as_tensor(model.encode(global_text))
    torch.save(embed, os.path.join(save_dir, f'{name}.pt'))

0it [00:00, ?it/s]

### Local representations

In [23]:
# Output directory for the sentence-level ("local") embedding files.
local_save_dir = './filtered_lung_notes_embedding/local/'
os.makedirs(name=local_save_dir, exist_ok=True)  # no error if it already exists

In [25]:
# Encode each report sentence by sentence and save the stacked embeddings
# (one row per sentence) to `local_save_dir` as '<name>.pt'.
# NOTE(review): `fdf` is not defined in any cell visible in this notebook (the
# recorded output shows 0 iterations). It is presumably a DataFrame with
# 'filename' and 'text_filtered' columns -- confirm where it is built so the
# notebook survives Restart & Run All.
for index, row in tqdm(fdf.iterrows(), total=len(fdf)):  # total= gives tqdm a real progress bar
    # Strip the extension (e.g. '.svs') instead of assuming it is 4 characters long.
    name = os.path.splitext(row['filename'])[0]
    document = row['text_filtered']

    # Naive sentence split on '.'. Whitespace-only fragments (such as the empty
    # string after a trailing period) carry no content and are not embedded.
    sentences = [fragment for fragment in document.split('.') if fragment.strip()]
    if not sentences:
        # Keep at least one row per report, even when the text is blank.
        sentences = [document]

    # One batched encode() call replaces one call per sentence; it returns a
    # (n_sentences, dim) NumPy array by default, which is saved as a tensor.
    embeds = torch.as_tensor(model.encode(sentences))
    torch.save(embeds, os.path.join(local_save_dir, f'{name}.pt'))

0it [00:00, ?it/s]