In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import DataLoader, Dataset

In [41]:
class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

class InferenceModel:
    def __init__(self, model_path, batch_size=1, device="cpu"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
        self.model.eval()
        self.batch_size = batch_size
        self.device = device

    def prepare_examples(self, texts):
        # Tokenize the texts and prepare the dataset
        encodings = self.tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
        return TextDataset(encodings)

    def predict(self, texts):
        dataset = self.prepare_examples(texts)
        predictions = []
        data_loader = DataLoader(dataset, batch_size=self.batch_size)

        for batch in data_loader:
            batch = {k: v.to(self.device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = self.model(**batch)
                logits = outputs.logits
                predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())

        return predictions

In [42]:
# Configuration variables
model_path = "./a_model_dir_dkleczek/bert-base-polish-cased-v1_100"
input_file = "in_c.tsv"
output_file = "out_c_00epochs.tsv"

# Instantiate model
model = InferenceModel(model_path)

# Reading and processing input
with open(input_file, 'r') as file:
    texts = [line.strip().split('\t')[1] for line in file if line.strip()]

In [43]:
# Predicting
predictions = model.predict(texts)

# Writing predictions to output file
with open(output_file, 'w', encoding='utf-8') as file:
    for text, prediction in zip(texts, predictions):
        file.write(f"Text: {text}\n")
        file.write("Predictions:\n")
        for token, label in zip(text.split(), prediction):
            file.write(f"{token}: {label}\n")
        file.write("\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [38]:
label_map = {
    0: "0",  
    1: ":",  
    2: ";",
    3: ",",
    4: ".",
    5: "-",
    6: "...",
    7: "?",
    8: "!"
}

def process_file(input_file_path, output_file_path):
    # Read the input file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    # Prepare to write to the output file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        i = 0
        while i < len(lines):
            if lines[i].startswith("Text:"):
                file.write(lines[i])  # Write the text line as is
                i += 1
                file.write(lines[i])  # Write the "Predictions:" line as is
                i += 1
                while i < len(lines) and ':' in lines[i]:  # Process each prediction line
                    parts = lines[i].strip().split(': ')
                    if parts[1].isdigit():  # Ensure it's a digit to avoid errors
                        label = label_map.get(int(parts[1]), "B")  # Default to "B" if not found
                    else:
                        label = "B"  # Default if not a digit
                    file.write(f"{parts[0]}: {label}\n")
                    i += 1
                file.write("\n")  # Add a newline after each block of predictions
            else:
                i += 1  # Skip any lines that don't start a new text block


In [39]:
# Example usage
input_file_path = output_file
output_file_path = 'cleaned_output_c_100epochs.tsv'
process_file(input_file_path, output_file_path)