In [2]:
!pip install transformers

import torch
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import DataLoader, TensorDataset




In [7]:
# Upload your CSV file to Colab and change the filename
uploaded_file_path = "merged_data_subset.csv"

# Read CSV using Pandas
df = pd.read_csv(uploaded_file_path)

# Extract input data and labels from your DataFrame
input_data = df["Text"].tolist()  # Change "text_column" to the actual column name in your CSV
labels = df["Label"].tolist()  # Change "label_column" to the actual column name in your CSV

input_data = [str(sentence) for sentence in input_data]

# Load pre-trained RoBERTa model and tokenizer


In [13]:
from transformers import RobertaConfig

In [18]:
config = RobertaConfig.from_pretrained('roberta-base', num_labels=9)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize input data and convert labels to tensors
tokenized_input = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt", max_length=512)
input_ids = tokenized_input["input_ids"]
attention_mask = tokenized_input["attention_mask"]
labels = torch.tensor(labels, dtype=torch.long)

# Create DataLoader


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  labels = torch.tensor(labels, dtype=torch.long)


In [19]:
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Device setup



In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop (same as before)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)



In [23]:
from tqdm import tqdm

In [25]:
total_epochs = 1

In [26]:
for epoch in range(total_epochs):
    # Create a progress bar for the epoch
    epoch_progress = tqdm(dataloader, desc=f'Epoch {epoch + 1}/{total_epochs}', dynamic_ncols=True)

    for batch in epoch_progress:
        input_ids_batch, attention_mask_batch, labels_batch = [t.to(device) for t in batch]

        optimizer.zero_grad()
        outputs = model(input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Update the progress bar with the current loss
        epoch_progress.set_postfix({'Loss': loss.item()}, refresh=True)

    # Close the progress bar for the epoch
    epoch_progress.close()

Epoch 1/1: 100%|██████████| 2500/2500 [4:49:21<00:00,  6.94s/it, Loss=0.0176]    


In [27]:
from sklearn.metrics import accuracy_score

In [28]:
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [29]:
all_predictions = []
all_labels = []

In [30]:
for batch in validation_dataloader:
    input_ids_batch, attention_mask_batch, labels_batch = [t.to(device) for t in batch]

    # Forward pass (no gradient calculation needed during evaluation)
    with torch.no_grad():
        outputs = model(input_ids_batch, attention_mask=attention_mask_batch)
    
    # Get predicted labels
    predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

    # Collect predictions and ground truth labels
    all_predictions.extend(predictions)
    all_labels.extend(labels_batch.cpu().numpy())

NameError: name 'validation_dataloader' is not defined

In [None]:
accuracy = accuracy_score(all_labels, all_predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [37]:
input_text = """LPL - LPL-ROHINI (NATIONAL REFERENCE
LAB)
SECTOR - 18, BLOCK -E ROHINI
DELHI 110085
Name
A/c Status
Lab No.
Ref By :
Gender: Age:
Report Status
Reported
Received
Collected
P
:
:
:
:
:
:
: Final
30 Years
15/9/2017 3:37:00PM
15/9/2017 3:59:23PM
135091274 Male
Dr. UNKNWON
Mr. Z836
19/9/2017 7:26:39PM
Test Name Results Units Bio. Ref. Interval
NAFLD FIBROSIS SCORE
(Spectrophotometry)
AST (SGOT) 11.00 U/L <50.00
ALT (SGPT) 12.00 U/L <50.00
Albumin 4.00 g/dL 3.50 - 5.20
Glucose, Fasting 88.00 mg/dL 70.00 - 100.00
Glucose, PP 101.00 mg/dL 70.00 - 140.00
Platelet count 264.00 thou/mm3 150.00 - 450.00
Height 165.00 cm
Weight 56.00 kg
NAFLD score 2.49 <-1.455
Note
1. Enhanced liver fibrosis test may be used to further characterize patients with indeterminate score.
2. The test conducted in serum, plasma & whole blood.
Interpretation
 -----------------------------------------------------------------------------
| NAFLD FIBROSIS SCORE | REMARKS |
|----------------------|------------------------------------------------------|
| < -1.455 | Predictor of absence of significant fibrosis (F0-F2) |
|----------------------|------------------------------------------------------|
| -1.455- 0.675 | Indeterminate |
|----------------------|------------------------------------------------------|
| >0.675 | Predictor of presence of significant fibrosis (F3-F4)|
 -----------------------------------------------------------------------------
Comment
NAFLD fibrosis score is the most studied score which has been extensively validated in a large population of
patients with Non Alcoholic Fatty Liver disease (NAFLD) for detecting significant fibrosis.. NAFLD is the most
common cause of abnormal liver function tests in primary care. It is associated with obesity, insulin
resistance, Diabetes, Dyslipidemia, hypertension and is considered as hepatic manifestation of the Metabolic
syndrome. The pivotal issue in managing patients with NAFLD is the diagnosis of steatohepatitis & fibrosis at
early stage. Liver biopsy (the Gold standard for diagnosing & assessing fibrosis) is not considered appropriate
as first line tool for diagnosing fibrosis in unselected NAFLD patients. This test has high negative predictive
value, thus can be used as first line test to rule out patients without advanced fibrosis thereby preventing
unnecessary secondary care referrals.
Uses
· Metabolic syndrome / Diabetes with suspected NAFLD
PatientReportSCSuperPanel.GENERAL_PANEL_ANALYTE_SC (Version: 6) *135091274*
Page 1 of 2
LPL - LPL-ROHINI (NATIONAL REFERENCE
LAB)
SECTOR - 18, BLOCK -E ROHINI
DELHI 110085
Name
A/c Status
Lab No.
Ref By :
Gender: Age:
Report Status
Reported
Received
Collected
P
:
:
:
:
:
:
: Final
30 Years
15/9/2017 3:37:00PM
15/9/2017 3:59:23PM
135091274 Male
Dr. UNKNWON
Mr. Z836
19/9/2017 7:26:39PM
Test Name Results Units Bio. Ref. Interval
· Fatty liver index >60
· Evidence of hepatic steatosis in USG (other causes of fatty liver disease such as alcoholic liver
disease, viral hepatitis, drugs etc. excluded)"""

tokenized_input = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt", max_length=512)

input_ids = tokenized_input["input_ids"].to(device)
attention_mask = tokenized_input["attention_mask"].to(device)

model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()

print("Predicted Labels:", predicted_labels)


Predicted Labels: [6]
