In [None]:
import transformers
import accelerate
import outlines
import json
import pandas as pd
import torch
import tqdm
import gc
import ast
from outlines import from_transformers, Generator, models
from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", dtype=torch.bfloat16),
    transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
)
#"meta-llama/Llama-3.2-1B"

In [25]:
# Defining the pydantic class which ensures the structured output from the llm
class BlameeDetection(BaseModel):
    text: str = Field(description="The exact original sentence being analyzed")
    previous: str = Field(description="The exact previous sentence used for context")
    following: str = Field(description="The exact following sentence used for context")
    blamee: Optional[str] = Field(default = None, description="Who or what is being blamed")
    arguments: Optional[str] = Field(default = None, description="What the blamee is being blamed for - the specific negative outcome"
    )

In [5]:
text_data = pd.read_csv("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/annotation_data_translated_version_03_10.csv", encoding='utf-8')

In [6]:

paragraph_entry = {}
for i, text in enumerate(text_data["da_segmented_text"]): #check if i is sctually number


    da_segmented_sentences = ast.literal_eval(text_data.loc[i]["da_segmented_text"])

    sentece_entry = {}
    for p, sentence in enumerate(da_segmented_sentences):
        sentece_entry[p] = sentence
    
    paragraph_entry[i] = sentece_entry



In [31]:
with open('/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/labelstudio_with_metadata.json', 'r') as file:
    orig_data = json.load(file)

In [32]:
orig_data[0]

{'paragraph': '2',
 'sentence_nr': '235',
 'text': 'Regeringen vil også fortsætte sin offensive  miljøpolitik.',
 'speaker': 'Poul Nyrup Rasmussen',
 'party': 'S',
 'preceding_sentence': 'sentence_nr 234 \nDer indføres en belønningsordning for virksomheder, der gør  en særlig ekstraindsats ud over det, loven stiller krav om,  for at forbedre arbejdsmiljøet.',
 'succeeding_sent': 'sentence_nr 236 \nDet er vores næste store felt.',
 'current_speaker_in_government': True,
 'parties_in_government': ['S', 'RV'],
 'date': '1997-10-07 00:00:00'}

In [None]:
spoken_sentences =

for i in range(len(data)):
    text = data[i]["text"]
    speaker = data[i]["speaker"]
    spoken_sentences.append(text)

In [26]:
generator = Generator(model, BlameeDetection)

In [30]:
orig_data[0]

{'paragraph': '2',
 'sentence_nr': '235',
 'text': 'Regeringen vil også fortsætte sin offensive  miljøpolitik.',
 'speaker': 'Poul Nyrup Rasmussen',
 'party': 'S',
 'preceding_sentence': 'Der indføres en belønningsordning for virksomheder, der gør  en særlig ekstraindsats ud over det, loven stiller krav om,  for at forbedre arbejdsmiljøet.',
 'succeeding_sent': 'Det er vores næste store felt.',
 'current_speaker_in_government': True,
 'parties_in_government': ['S', 'RV'],
 'date': '1997-10-07 00:00:00'}

In [None]:
for i in orig_data:
    print(i)
    i["preceding_sentence"] = i["preceding_sentence"].split("\n")[1]
    i["succeeding_sent"] = i["succeeding_sent"].split("\n")[1]

# REMEMBER TO TRY CATCH FOR SENTENCES WHICH EITHER ARE THE START OR THE END OF A PARAGRAPH. 
# ALSO OVERVEJ IF THE SKIPPED LINES WHICH BREAK THE JSON SHOULD BE APPENDED TO A LIST FOR LATER PROCESSING WITH MORE AVAILABLE TOKENS FOR GENERATION.

{'paragraph': '2', 'sentence_nr': '235', 'text': 'Regeringen vil også fortsætte sin offensive  miljøpolitik.', 'speaker': 'Poul Nyrup Rasmussen', 'party': 'S', 'preceding_sentence': 'sentence_nr 234 \nDer indføres en belønningsordning for virksomheder, der gør  en særlig ekstraindsats ud over det, loven stiller krav om,  for at forbedre arbejdsmiljøet.', 'succeeding_sent': 'sentence_nr 236 \nDet er vores næste store felt.', 'current_speaker_in_government': True, 'parties_in_government': ['S', 'RV'], 'date': '1997-10-07 00:00:00'}
{'paragraph': '4', 'sentence_nr': '85', 'text': 'i det synspunkt.  \xa0\xa0\xa0\xa0\xa0I skærende kontrast til det synspunkt står  landbrugsorganisationernes reaktion og Venstres og De  Konservatives finanslovudspil kemisk renset for afgifter som  styringsredskab, altså ligegyldighed over for vandmiljøet for  at tilfredsstille landbrugslobbyens høge.', 'speaker': 'Torben Lund', 'party': 'S', 'preceding_sentence': 'sentence_nr 84 \nJeg er 100 pct. enig', 'succe

IndexError: list index out of range

In [27]:
for sentence in tqdm.tqdm(orig_data, desc = "Blamee detection"):
    prev_sent = sentence["preceding_sentence"]
    sent = sentence["text"]
    suc_sent = sentence["succeeding_sent"]
    prompt = f"""Perform blamee identification on the following sentence based on the context of the previous and following sentence.
    Sentence: {sent}
    Previous sentence: {prev_sent}
    Following sentence: {suc_sent}

    Rules:
    - Identify specifically who or what is being blamed for causing a negative outcome in the above sentence
    - The "text" field must be EXACTLY the sentence provided above - do not modify it
    - Identify a specific part of the above sentence which indicates what the blamee is being accused of
    - Never leave arguments as an empty string
    - Never use "sentence_nr [x]" for classifying blame

    - Roles:
    - Blamee: The patient receiving the blame (who or what is being blamed)
    - Argument: What is the blamee being blamed for (the negative outcome)"""
    with torch.no_grad():  # Disable gradient tracking
        result = generator(prompt, max_new_tokens=256, use_cache=False)
        #print(result)
    try:
        result_out = BlameeDetection.model_validate_json(result)
    except (ValidationError, json.JSONDecodeError):
        print("Skipping invalid entry.")
        continue

    with open("result_blamee_detection.json", "a") as f:
        json.dump(result_out.model_dump(), f, indent=2)




Blamee detection:   0%|          | 0/4325 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 1/4325 [00:00<36:27,  1.98it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 2/4325 [00:01<38:29,  1.87it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 3/4325 [00:01<41:49,  1.72it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 4/4325 [00:02<34:57,  2.06it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 5/4325 [00:02<31:44,  2.27it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 6/4325 [00:02<34:37,  2.08it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   0%|          | 

Skipping invalid entry.


Blamee detection:   1%|          | 41/4325 [00:28<1:25:16,  1.19s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   1%|          | 42/4325 [00:29<1:16:06,  1.07s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   1%|          | 43/4325 [00:29<1:05:45,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   1%|          | 44/4325 [00:30<59:57,  1.19it/s]  Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Blamee detection:   1%|          | 44/4325 [00:30<49:38,  1.44it/s]


KeyboardInterrupt: 