In [13]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import evaluate
import numpy as np
import os
import pandas as pd
import fitz

In [14]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer for the DistilBERT checkpoint used throughout the notebook.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Source table for the classifier; later cells read its "text" and "label" columns.
waste_data = pd.read_csv("csv_data/waste_40.csv")



In [15]:
def chunk_text(text, tokenizer, max_length=512):
    """Split ``text`` into strings that each fit into one model input.

    The text is tokenized once, the token ids are cut into consecutive windows
    of ``max_length - 2`` ids, and every window is decoded back to a string.

    The returned strings deliberately contain no ``[CLS]``/``[SEP]`` markers.
    Callers pass each chunk back through the tokenizer, which adds those two
    tokens itself; embedding them in the string as well would push a full
    window over ``max_length`` and make truncation drop real content.

    Args:
        text: The document text to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``.
        max_length: Model input size in tokens, including the two positions
            reserved for the special tokens.

    Returns:
        list[str]: The decoded chunks in document order; empty when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    # Reserve two positions for the [CLS] and [SEP] tokens added at encode time.
    chunk_size = max_length - 2
    if chunk_size < 1:
        raise ValueError(f"max_length must be at least 3, got {max_length}")

    # Encode without special tokens to get the raw token ids of the whole text.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    return [
        tokenizer.decode(tokens[start:start + chunk_size], skip_special_tokens=False)
        for start in range(0, len(tokens), chunk_size)
    ]

def expand_dataset(dataset):
    """Turn a dataset of documents into a dataset of model-sized chunks.

    Each example's "text" is split with ``chunk_text`` (using the module-level
    ``tokenizer`` and a 512-token limit), and every resulting chunk becomes its
    own row carrying the "label" of the example it came from.

    Args:
        dataset: Iterable of mappings with "text" and "label" entries.

    Returns:
        A ``Dataset`` built from the collected "text" and "label" columns.
    """
    texts, labels = [], []
    for row in dataset:
        pieces = chunk_text(row["text"], tokenizer, max_length=512)
        texts.extend(pieces)
        # One label per piece, copied from the originating example.
        labels.extend(row["label"] for _ in pieces)

    return Dataset.from_dict({"text": texts, "label": labels})

# Wrap the pandas table in a Hugging Face Dataset so it can be iterated row by row.
dataset = Dataset.from_pandas(waste_data)

# Replace each document with its chunks; every chunk keeps the document's label.
dataset_chunked = expand_dataset(dataset)

# Batch tokenizer used with Dataset.map on the chunked dataset
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch to fixed-length inputs.

    Every sequence is truncated and padded to exactly 512 tokens using the
    module-level ``tokenizer``.
    """
    return tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

# Run the tokenizer over all chunks in batches.
dataset_tokenized = dataset_chunked.map(
    tokenize_function,
    batched=True,
)
# Expose only the model inputs and the label, as torch tensors.
dataset_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

Token indices sequence length is longer than the specified maximum sequence length for this model (11051 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/1008 [00:00<?, ? examples/s]

In [16]:
# Build a DistilBERT classifier with a two-class head, then overwrite its
# weights with the checkpoint stored on disk.
waste_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# map_location remaps the saved tensors onto `device`, so a checkpoint written
# on a GPU machine still loads when only a CPU is available.
waste_model.load_state_dict(torch.load("medical_modle.pth", map_location=device, weights_only=True))
waste_model = waste_model.to(device)

# Set up evaluation metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the class with the highest
            logit along axis 1 is taken as the prediction for each row.

    Returns:
        dict: ``{"accuracy": value}`` as reported by ``accuracy_metric``.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=1)
    result = accuracy_metric.compute(predictions=predicted, references=gold)
    return dict(accuracy=result["accuracy"])


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# NOTE(review): this cell repeats the previous model-loading cell, so running it
# rebuilds waste_model from the checkpoint. The output recorded for this cell is
# a training log, which suggests the training code is missing here — confirm.
waste_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# map_location remaps the saved tensors onto `device`, so a checkpoint written
# on a GPU machine still loads when only a CPU is available.
waste_model.load_state_dict(torch.load("medical_modle.pth", map_location=device, weights_only=True))
waste_model = waste_model.to(device)

# Set up evaluation metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the class with the highest
            logit along axis 1 is taken as the prediction for each row.

    Returns:
        dict: ``{"accuracy": value}`` as reported by ``accuracy_metric``.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=1)
    result = accuracy_metric.compute(predictions=predicted, references=gold)
    return dict(accuracy=result["accuracy"])



  0%|          | 0/630 [00:00<?, ?it/s]

{'loss': 0.7466, 'grad_norm': 7.198878288269043, 'learning_rate': 1.9936507936507938e-05, 'epoch': 0.03}
{'loss': 1.2235, 'grad_norm': 10.171074867248535, 'learning_rate': 1.9873015873015875e-05, 'epoch': 0.06}
{'loss': 0.6589, 'grad_norm': 8.94666862487793, 'learning_rate': 1.980952380952381e-05, 'epoch': 0.1}
{'loss': 0.7798, 'grad_norm': 7.16019344329834, 'learning_rate': 1.9746031746031748e-05, 'epoch': 0.13}
{'loss': 0.647, 'grad_norm': 2.5255496501922607, 'learning_rate': 1.9682539682539684e-05, 'epoch': 0.16}
{'loss': 0.615, 'grad_norm': 2.4629263877868652, 'learning_rate': 1.961904761904762e-05, 'epoch': 0.19}
{'loss': 0.6593, 'grad_norm': 2.811938524246216, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.22}
{'loss': 0.5681, 'grad_norm': 2.1291096210479736, 'learning_rate': 1.9492063492063494e-05, 'epoch': 0.25}
{'loss': 0.5399, 'grad_norm': 2.6759133338928223, 'learning_rate': 1.942857142857143e-05, 'epoch': 0.29}
{'loss': 0.5721, 'grad_norm': 4.5783562660217285, 'learnin

TrainOutput(global_step=630, training_loss=0.09321409591384941, metrics={'train_runtime': 325.0302, 'train_samples_per_second': 31.013, 'train_steps_per_second': 1.938, 'total_flos': 1335271378452480.0, 'train_loss': 0.09321409591384941, 'epoch': 10.0})

## Inference

In [18]:

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Each page's text is stripped of surrounding whitespace; pages that yield
    text are joined with a newline so the last word of one page does not fuse
    with the first word of the next. The document is closed before returning.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The combined page text, or "" when the file cannot be opened or
        read (the error is printed rather than raised).
    """
    try:
        # The context manager releases the file handle even if a page fails.
        with fitz.open(pdf_path) as doc:
            page_texts = [
                doc.load_page(page_num).get_text().strip()
                for page_num in range(doc.page_count)
            ]
    except Exception as e:
        # Best effort: one unreadable file must not stop a batch run.
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

    return "\n".join(page_text for page_text in page_texts if page_text)

def predict_label_for_pdf(pdf_path, model, tokenizer, device, max_length=512):
    """Predict one class index for a PDF by averaging per-chunk logits.

    The PDF text is split into chunks of at most ``max_length`` tokens, each
    chunk is scored by ``model``, and the logits are averaged across chunks
    before taking the argmax.

    Args:
        pdf_path: Path of the PDF to classify.
        model: Sequence classifier whose output exposes ``.logits``.
        tokenizer: Tokenizer used both for chunking and for building inputs.
        device: Torch device the input tensors are moved to.
        max_length: Token limit per chunk and per model input.

    Returns:
        int | None: The predicted class index, or ``None`` when no usable text
        could be obtained from the PDF.
    """
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        print(f"No text extracted from {pdf_path}")
        return None

    # Split the full text into chunks that do not exceed max_length tokens
    chunks = chunk_text(text, tokenizer, max_length=max_length)
    if not chunks:
        # Non-blank text can still tokenize to nothing; torch.cat on an empty
        # list would raise further down.
        print(f"No tokens produced for {pdf_path}")
        return None

    # A model that was just trained may still be in training mode, where
    # dropout makes predictions random; inference must run in eval mode.
    model.eval()

    all_logits = []
    with torch.no_grad():
        for chunk in chunks:
            # Padding and truncation give every chunk the same input size.
            inputs = tokenizer(chunk, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
            inputs = {key: value.to(device) for key, value in inputs.items()}
            all_logits.append(model(**inputs).logits)  # shape [1, num_labels]

    # Aggregate predictions by averaging the logits across chunks.
    # Majority voting on per-chunk labels would be an alternative.
    aggregated_logits = torch.mean(torch.cat(all_logits, dim=0), dim=0, keepdim=True)  # shape [1, num_labels]
    return torch.argmax(aggregated_logits, dim=1).item()

def process_pdfs_after_training(directory_path, output_csv, model, tokenizer, device):
    """Classify every PDF in a directory and write the predictions to a CSV.

    Files are visited in sorted name order and matched on a case-insensitive
    ".pdf" suffix. PDFs for which no prediction can be made are skipped. The
    CSV always has the columns "filename" and "predicted_label" ("YES" for
    class 1, "NO" otherwise), even when no file produced a prediction.

    Args:
        directory_path: Directory to scan for PDF files (not recursive).
        output_csv: Path of the CSV to write; its parent directory is created
            if it does not exist.
        model: Classifier passed through to ``predict_label_for_pdf``.
        tokenizer: Tokenizer passed through to ``predict_label_for_pdf``.
        device: Torch device passed through to ``predict_label_for_pdf``.
    """
    results = []

    # os.listdir returns entries in arbitrary order; sorting keeps the log and
    # the CSV rows stable from run to run.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory_path, filename)
        print(f"Processing {filename}...")

        # Predict label for the PDF (aggregating over chunks)
        predicted_label = predict_label_for_pdf(pdf_path, model, tokenizer, device)
        if predicted_label is None:
            continue  # Skip files with no extracted text

        # Convert numeric prediction to string label
        new_label = "YES" if predicted_label == 1 else "NO"
        results.append({"filename": filename, "predicted_label": new_label})

    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit columns keep the header present when `results` is empty.
    results_df = pd.DataFrame(results, columns=["filename", "predicted_label"])
    results_df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

# Directory of PDFs to classify and the CSV that receives the predictions.
pdfs_dir = "pdf_data/test"
output_csv_path = "csv_data/test_predictions_40.csv"

# Classify every PDF in pdfs_dir with waste_model and write the results.
process_pdfs_after_training(pdfs_dir, output_csv_path, waste_model, tokenizer, device)


Processing abah 2022.pdf...
Processing abdurrahman 2020.pdf...
Processing adebiyi 2020.pdf...
Processing bajyacharya 2021.pdf...
Processing Das 2022.pdf...
Processing Lin 2013.pdf...
Processing tabian 2021.pdf...
Processing timonen 2021.pdf...
Processing uttajug 2021.pdf...
Processing uttajug 2022.pdf...
Processing vreeland 2016.pdf...
Processing wu 2006.pdf...
Processing zak 2021.pdf...
Processing zakey 2008.pdf...
Processing zalakeviciute 2020.pdf...
Processing zalakeviciute 2021.pdf...
Processing zalasiewicz 2019.pdf...
Processing zalel 2015.pdf...
Processing zalzal 2024.pdf...
Processing zhang 2023.pdf...
Predictions saved to csv_data/test_predictions_40.csv
