# Detecting Renal Cysts from CT Reports Using DistilBERT base model (uncased)


In this project, I used the DistilBERT base model to classify CT reports based on the identification of renal cysts. For efficiency, I froze the model's pretrained weights and optimized only the head layer.


## 1. Load Dataset



*   Load dataset from Google Drive
*   Drop unnecessary features
*   Check Data types


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive inside the Colab runtime
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the dataset (replace the placeholder path with the real CSV location)
data = pd.read_csv("path_to_your_dataset.csv")

# Drop the columns that are not used in the rest of the notebook
data = data.drop(
    columns=[
        "MRN",
        "PRODUCT",
        "PATIENT_WEIGHT",
        "NATIONALITY",
        "EPISODE_DATE",
        "PATIENT_HEIGHT",
        "Unnamed: 10",
        "PATIENT_BMI",
    ]
)

# Preview the first rows of the remaining columns
data.head()

In [None]:
# Check the data type of each remaining column
print(data.dtypes)

##  2. Preprocessing

In this step:


*   We convert the data types of all columns
*   We clean the CT_REPORT text
*   We auto-label the data using a regular expression, since the dataset is too large and was not labeled manually.




### Convert data types and remove Y from age column

In [None]:
# Strip the "Y" marker from the age strings so they can be parsed as integers
data["PATIENT_AGE"] = data["PATIENT_AGE"].str.replace("Y", "", regex=False)

# Cast each column to its working type: age -> int, gender and report -> str
data["PATIENT_AGE"] = data["PATIENT_AGE"].astype(int)
for text_column in ("PATIENT_GENDER", "CT_REPORT"):
    data[text_column] = data[text_column].astype(str)

# Confirm the conversions took effect
print(data.dtypes)

### Perform cleaning for CT REPORT

In [None]:
import re


def clean_ct_report(report):
    """
    Clean escape characters and unwanted whitespace from a CT report.

    Args:
        report: The raw report text (a ``str``).

    Returns:
        The report with newlines, tabs, carriage returns and the literal
        ``\\T\\`` marker replaced by spaces, textual ``\\uXXXX`` / ``\\xXX``
        escape sequences removed, runs of whitespace collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # Replace newlines, tabs, carriage returns and the noisy "\T\" marker
    # observed in the reports. The control characters are written as explicit
    # escape sequences so the source stays valid, and the marker is written
    # with escaped backslashes instead of relying on an invalid "\T" escape.
    for noise in ("\n", "\t", "\r", "\\T\\"):
        report = report.replace(noise, " ")

    # Extra step: remove textual Unicode/hex escape sequences, if any
    report = re.sub(r"\\u[0-9A-Fa-f]{4}", "", report)  # Matches \uXXXX
    report = re.sub(r"\\x[0-9A-Fa-f]{2}", "", report)  # Matches \xXX

    # Collapse any run of whitespace into a single space
    report = re.sub(r"\s+", " ", report)

    # Strip leading and trailing spaces, if any
    return report.strip()

# Apply the cleaning function to every report, overwriting the CT_REPORT column
data["CT_REPORT"] = data["CT_REPORT"].apply(clean_ct_report)

# Print the DataFrame to inspect the cleaned reports
print(data)

### Auto labeling data
Since the data is large and not labeled, I used a rule-based approach to label it with a regular expression, where (1: renal cyst, 0: no renal cyst)






In [None]:
# Define a labeling function
def contains_renal_cyst(text):
    """Return 1 if *text* contains the phrase "renal cyst", otherwise 0.

    The search is case-insensitive and matches the phrase anywhere in the
    text, so plurals ("renal cysts") and negated mentions ("no renal cyst")
    are labeled 1 as well.
    """
    match = re.search(r"renal cyst", text, re.IGNORECASE)
    return int(match is not None)

# Apply the labeling function to every report in CT_REPORT and store the result in a new "label" column
data["label"] = data["CT_REPORT"].apply(contains_renal_cyst)

# Print the DataFrame to inspect the generated labels
print(data)


In [None]:
# Keep only the rows labeled 1 (renal cyst) to spot-check the labeling
data_filtered = data.loc[data["label"].eq(1)]
print(data_filtered)

## 3. Preparing For Model Training



*   Split dataset into train-test, with 40% for testing
*   Define model and tokenizer
*   Tokenize data




### Split data set for train-test

In [None]:
from sklearn.model_selection import train_test_split

# Split the reports and labels into train and test sets.
# test_size=0.4 holds out 40% of the rows for testing, stratify keeps the
# label proportions the same in both splits, and random_state=42 makes the
# split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["CT_REPORT"].tolist(),
    data["label"].tolist(),
    test_size=0.4,
    stratify=data["label"],
    random_state=42,
)


### Define Model and Tokenizer



In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
import torch

# Load the pre-trained tokenizer and a two-class sequence classification model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "no renal cyst", 1: "renal cyst"},
    label2id={"no renal cyst": 0, "renal cyst": 1},
)

# Freeze every parameter of the pre-trained base model; parameters outside
# model.base_model are left trainable
model.base_model.requires_grad_(False)

# Print the model architecture
print(model)


### Tokenize data

In [None]:
from torch.utils.data import Dataset

# Tokenize the data
def tokenize_data(texts, labels):
    """Tokenize *texts* with the module-level tokenizer and attach *labels*.

    Args:
        texts: List of report strings to tokenize in a single call.
        labels: Labels for the reports, converted with ``torch.tensor``.

    Returns:
        The tokenizer output (requested as PyTorch tensors, with padding and
        truncation enabled and ``max_length=512``), extended with a
        ``"labels"`` entry.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    encoded = tokenizer(texts, **tokenizer_options)

    # Store the labels next to the encodings so each sample carries its target
    encoded["labels"] = torch.tensor(labels)
    return encoded

# Custom Dataset Class
class CTReportDataset(Dataset):
    """Dataset of CT reports that are tokenized once, at construction time.

    Attributes are the raw ``texts``, the raw ``labels`` and ``encodings``,
    the result of ``tokenize_data(texts, labels)``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels
        self.encodings = tokenize_data(texts, labels)

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding entry *idx* of every field in the encodings."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

# Prepare datasets: each constructor call tokenizes its whole split up front
train_dataset = CTReportDataset(train_texts, train_labels)
test_dataset = CTReportDataset(test_texts, test_labels)


## 4. Training Setup and Execution

In [None]:
from transformers import Trainer,TrainingArguments,DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Pair of per-class scores (one row per sample) and the
            true labels.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of samples
        whose highest-scoring class equals the label.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = (predicted_classes == labels).mean()
    return {"accuracy": accuracy}


# Training arguments: evaluate and checkpoint once per epoch so the best
# checkpoint can be reloaded when training finishes.
# NOTE(review): `eval_strategy` is the argument name in recent transformers
# releases; older releases call it `evaluation_strategy` - confirm against the
# installed version.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",  # kept equal to eval_strategy, as load_best_model_at_end requires
    load_best_model_at_end=True,
)


# Build the Trainer.
# NOTE(review): the test split is passed as eval_dataset, so the per-epoch
# evaluation (and the best-checkpoint selection enabled by
# load_best_model_at_end) uses the same data that is later reported as the
# test result.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

# Run training
trainer.train()

## 5. Model Evaluation

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the test split) and print the metrics
results = trainer.evaluate()
print(results)