In [25]:
import transformers
import accelerate
import outlines
import json
import pandas as pd
import torch
import tqdm
import gc
import ast
from outlines import from_transformers, Generator, models
from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional

In [3]:
# Load the Hugging Face causal LM (bfloat16 weights, device_map="auto") together
# with its tokenizer, then wrap both in an outlines model for structured generation.
model_id = "meta-llama/Llama-3.2-1B"
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.bfloat16,
)
hf_tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = from_transformers(hf_model, hf_tokenizer)

In [31]:
# Pydantic schema that constrains the LLM output to a fixed structure.
# Documented with comments instead of a class docstring so that the JSON schema
# derived from this model stays exactly as it was.
class BlameeDetection(BaseModel):
    # Required: the sentence the model was asked to analyze.
    text: str = Field(description="The exact original sentence being analyzed")
    # Optional: defaults to None when the model supplies no blamee.
    blamee: Optional[str] = Field(
        default=None,
        description="Who or what is being blamed",
    )
    # Optional: defaults to None when the model supplies no argument.
    arguments: Optional[str] = Field(
        default=None,
        description="What the blamee is being blamed for - the specific negative outcome",
    )

In [5]:
# Read the translated annotation table (UTF-8 encoded CSV) into a DataFrame.
annotation_csv_path = "/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/annotation_data_translated_version_03_10.csv"
text_data = pd.read_csv(annotation_csv_path, encoding="utf-8")

In [6]:

# Build a nested lookup: paragraph position -> {sentence position -> sentence}.
# Each "da_segmented_text" cell is a string holding a Python literal (presumably a
# list of sentences — TODO confirm), so it is parsed with ast.literal_eval, which
# only accepts literals and never executes code.
paragraph_entry = {}
for i, text in enumerate(text_data["da_segmented_text"]):
    # Parse the value handed out by the iteration itself. Looking the row up again
    # with text_data.loc[i] would be a *label* lookup driven by a *positional*
    # counter, which only matches when the DataFrame has a default 0..n-1 index.
    da_segmented_sentences = ast.literal_eval(text)
    paragraph_entry[i] = dict(enumerate(da_segmented_sentences))



In [44]:
# Load the merged Label Studio export. The encoding is given explicitly because the
# records contain Danish characters and the platform default encoding is not
# guaranteed to be UTF-8; this also matches how the CSV is read in this notebook.
with open('/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/labelstudio_merged.json', 'r', encoding='utf-8') as file:
    orig_data = json.load(file)

In [50]:
# Display the first loaded record to see which keys each entry carries.
orig_data[0]

{'paragraph': '2',
 'sentence_nr': '235',
 'text': 'Regeringen vil også fortsætte sin offensive  miljøpolitik.',
 'speaker': 'Poul Nyrup Rasmussen',
 'party': 'S'}

In [47]:
# NOTE(review): `json_data` is not assigned in any cell visible in this notebook,
# and the recorded output shows it was a str, so this call raises AttributeError.
# Presumably a leftover experiment — confirm, then remove it or inspect `orig_data`.
json_data.items()

AttributeError: 'str' object has no attribute 'items'

In [None]:
# Collect the raw sentence text of every record loaded into `orig_data`.
# The previous version left the list uninitialised (a syntax error) and iterated
# over `data`, a name that is never defined in the visible cells; the loaded
# records live in `orig_data`, whose entries expose a "text" key.
spoken_sentences = [record["text"] for record in orig_data]

In [32]:
# Create the generator that pairs the loaded model with the BlameeDetection
# schema as its output type.
generator = Generator(
    model,
    BlameeDetection,
)

In [None]:
# Run blamee detection sentence by sentence and persist every valid result.
#
# Output format: JSON Lines — one compact JSON object per line. Appending
# pretty-printed objects back-to-back without a separator produces a file that no
# JSON parser accepts, whereas a line-delimited file can be read back record by
# record even if the run is interrupted half-way.
# The file is opened in append mode, so re-running this cell adds to what is
# already stored in result_blamee_detection.json.
for sentence in tqdm.tqdm(spoken_sentences, desc = "Blamee detection"):
    # The prompt text (including its indentation) is sent to the model verbatim.
    prompt = f"""Perform blamee identification on the following sentence.
    Sentence: {sentence}

    Rules:
    - Identify specifically who or what is being blamed for causing a negative outcome in the above sentence
    - The "text" field must be EXACTLY the sentence provided above - do not modify it
    - Identify a specific part of the above sentence which indicates what the blamee is being accused of
    - Never leave arguments as an empty string

    - Roles:
    - Blamee: The patient receiving the blame (who or what is being blamed)
    - Argument: What is the blamee being blamed for (the negative outcome)"""
    with torch.no_grad():  # inference only: no gradient tracking needed
        # NOTE(review): use_cache=False presumably trades generation speed for
        # lower memory use — confirm it is still needed.
        result = generator(prompt, max_new_tokens=256, use_cache=False)

    # Validate the raw model output against the schema; skip anything malformed.
    try:
        result_out = BlameeDetection.model_validate_json(result)
    except (ValidationError, json.JSONDecodeError):
        print("Skipping invalid entry.")
        continue

    # UTF-8 plus ensure_ascii=False keeps Danish characters readable in the file.
    with open("result_blamee_detection.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(result_out.model_dump(), ensure_ascii=False) + "\n")




Blamee detection:   0%|          | 0/4325 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 1/4325 [00:00<14:51,  4.85it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 2/4325 [00:00<10:49,  6.66it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 3/4325 [00:00<13:32,  5.32it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 4/4325 [00:00<13:53,  5.18it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 5/4325 [00:00<12:42,  5.67it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 6/4325 [00:01<12:43,  5.65it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 

KeyboardInterrupt: 

In [None]:
for inner