In [None]:
# Install the dependencies listed in requirements_outlines.txt via the %pip magic.
%pip install -r "requirements_outlines.txt"

In [15]:
import transformers
import accelerate
import outlines
import json
import pandas as pd
import torch
import tqdm
import gc
import ast
from outlines import from_transformers, Generator
from pydantic import BaseModel, Field
from typing import List, Optional

In [None]:
#model = from_transformers(
 #   transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", device_map="auto"),
  #  transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct"),
#)

In [None]:
# Cell for releasing the currently loaded model, so that another model can be
# loaded without restarting the kernel.

# Drop the notebook-level references instead of using `del model`, which raises
# NameError when this cell runs before any model has been loaded.
# `generator` is built from `model` further down and presumably keeps a reference
# to it, so it is dropped as well; rebuild it after loading the next model.
for stale_name in ("generator", "model"):
    globals().pop(stale_name, None)
gc.collect()

# Clear PyTorch cache on GPU (skipped entirely on machines without CUDA).
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# This is a comment to test git


In [2]:
def print_gpu_memory():
    """Print the allocated and reserved CUDA memory in GB.

    Prints nothing when CUDA is not available. Returns None.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    print(f"GPU memory allocated: {allocated_gb:.2f} GB")
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"GPU memory reserved: {reserved_gb:.2f} GB")

# Report usage now; call again after loading the model to compare.
print_gpu_memory()

GPU memory allocated: 0.00 GB
GPU memory reserved: 0.00 GB


In [2]:
'''
This is the DeepSeek 14b model, which at first glance seems to perform better than the Llama model. 
Definitely worth considering if this should be used instead.
'''

# Load the causal LM and its tokenizer from the same checkpoint id and wrap both
# with outlines' from_transformers; the result is bound to `model`.
# The loaded objects are passed inline, so `model` is the only notebook-level
# name bound by this cell.
# NOTE(review): no dtype argument is passed to from_pretrained — confirm the
# default precision fits the available GPU memory for a 14B checkpoint.
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", device_map="auto"),
    transformers.AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
)

Fetching 4 files: 100%|██████████| 4/4 [00:37<00:00,  9.31s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.73s/it]


In [3]:
# Pydantic schema for the structured output requested from the LLM: one record
# per analyzed sentence. The class is handed to Generator(model, BlameAnalysis)
# and is also used to validate the raw model output with model_validate_json.
# NOTE(review): the "must not be empty if blame=true" rule exists only in the
# field descriptions below; no validator in this class enforces it, so a record
# with blame=True and blamee/arguments left as None still validates.
class BlameAnalysis(BaseModel):
    # Required: the sentence that was analyzed.
    text: str = Field(description="The exact original sentence being analyzed")
    # Required: the blame / no-blame decision.
    blame: bool = Field(description="Whether blame is present in the sentence")
    # Optional, defaults to None.
    blamee: Optional[str] = Field(
        default=None,
        description="Who or what is being blamed (must not be empty if blame=true)"
    )
    # Optional, defaults to None.
    arguments: Optional[str] = Field(
        default=None,
        description="What the blamee is being blamed for - the specific negative outcome (must not be empty if blame=true)"
    )

In [4]:
# Read the annotation CSV and keep the raw segmented-text column.
# NOTE(review): each entry of this column appears to be a stringified list of
# sentences; the following cell re-reads the same file and flattens it, and
# that flattened list is what the inference loop is meant to iterate over.
data = pd.read_csv(filepath_or_buffer="/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/annotation_data_translated_version_03_10.csv")
sentences = data.loc[:, "da_segmented_text"]

In [16]:

def flatten_segmented_text(segmented_column):
    """Flatten a column of stringified sentence lists into one flat list of str.

    Parameters
    ----------
    segmented_column : iterable
        Entries are expected to be strings holding the repr of a list of
        sentences (e.g. "['A.', 'B.']"). Missing values (NaN/None) are skipped.

    Returns
    -------
    list of str
        Every sentence of every entry, in input order. An entry that parses to
        something other than a list, or that cannot be parsed at all, is kept
        as a single sentence.
    """
    flattened = []
    for text in segmented_column:
        if not isinstance(text, list):
            if pd.isna(text):  # Skip NaN values
                continue
            try:
                # Parse the string representation of a list into an actual list
                text = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # literal_eval can raise any of these on malformed input;
                # fall back to treating the raw value as one sentence.
                pass
        if isinstance(text, list):
            # Coerce items to str so later len()/prompt formatting cannot fail
            # on a stray non-string element.
            flattened.extend(str(item) for item in text)
        else:
            flattened.append(str(text))
    return flattened


data = pd.read_csv("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/annotation_data_translated_version_03_10.csv")

# Parse and flatten the sentences
all_sentences = flatten_segmented_text(data["da_segmented_text"])

sentences = all_sentences

print(f"Total sentences after flattening: {len(sentences)}")
if sentences:
    # Guarded: indexing [0] and max() both raise on an empty list.
    print(f"First sentence: {sentences[0]}")
    print(f"First sentence length: {len(sentences[0])} chars")
    print(f"Longest sentence: {max(len(s) for s in sentences)} chars")

Total sentences after flattening: 399018
First sentence: Mødet er åbnet.
First sentence length: 15 chars
Longest sentence: 1804 chars


In [5]:
# Build the outlines generator once from the loaded model and the BlameAnalysis
# output type (presumably constraining generations to that schema), so the
# per-sentence loop can reuse it.
generator = Generator(model, BlameAnalysis)

In [18]:
# Run blame identification on every sentence and append each validated result
# to result_blame.json as ONE JSON object per line (JSON Lines). Appending
# pretty-printed objects back to back, as before, produced a file that no JSON
# parser could read as a whole.
# Sentences whose output cannot be validated are collected here instead of
# aborting the run, so they can be inspected or retried afterwards.
failed_sentences = []

for sentence in tqdm.tqdm(sentences, desc = "Deepseek blame (GPU)"):
    prompt = f"""Perform blame identification on the following sentence.
    Sentence: {sentence}

    Rules:
    - Start by determining whether blame is present at all in the sentence
    - Identify who is being blamed, what they are being blamed for, and the arguments used
    - Set blame=true ONLY if someone/something is being blamed for causing a negative outcome
    - The "text" field must be EXACTLY the sentence provided above - do not modify it
    - If blame=true, "blamee" must NOT be empty and "arguments" must contain the specific outcome they are blamed for
    - Do not leave arguments as an empty string

    Semantic roles:
    - Blamee: The patient receiving the blame (who or what is being blamed)
    - Argument: What is the blamee being blamed for (the negative outcome)

    Output your analysis in JSON format."""
    with torch.no_grad():  # Disable gradient tracking
        # NOTE(review): use_cache=False presumably disables the key/value cache
        # during generation, which would make each step recompute the whole
        # prefix — confirm this is intentional given the ~7-12 s/it seen above.
        result = generator(prompt, max_new_tokens=256, use_cache=False)

    try:
        # Parse + validate in one step. (The former `data = json.loads(result)`
        # was unused and overwrote the `data` DataFrame loaded from the CSV.)
        result_out = BlameAnalysis.model_validate_json(result)
    except ValueError as err:
        # pydantic's ValidationError is a ValueError. Invalid output can occur
        # e.g. when generation stops at max_new_tokens before the JSON is
        # complete (the model must echo sentences of up to ~1800 chars).
        failed_sentences.append(sentence)
        tqdm.tqdm.write(f"Skipping sentence with invalid model output: {err}")
    else:
        # Append mode: earlier results are kept; re-running adds new lines.
        with open("result_blame.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(result_out.model_dump()) + "\n")
    torch.cuda.empty_cache()


Deepseek blame (GPU):   0%|          | 0/399018 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 1/399018 [00:07<848:40:35,  7.66s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 2/399018 [00:14<768:45:48,  6.94s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 3/399018 [00:24<966:27:47,  8.72s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 4/399018 [00:43<1386:46:50, 12.51s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 5/399018 [00:50<1158:16:41, 10.45s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 6/399018 [00:57<1038:40:46,  9.37s/it]Setting `pad_token_id` to `eos_token_id

KeyboardInterrupt: 

In [10]:
# NOTE(review): the saved output of this cell (36314) differs from the 399018
# sentences reported by the flattening cell — presumably it last ran while
# `sentences` was still the unflattened column. Re-run to refresh.
len(sentences)

36314

In [None]:
# Pretty-print the most recent raw model output for inspection.
# The parsed dict gets its own name so that `data`, the DataFrame loaded from
# the annotation CSV, is not overwritten by running this cell.
last_result = json.loads(result)
print(json.dumps(last_result, indent=2))