In [None]:
%pip install -r "requirements_outlines.txt"

In [1]:
import transformers
import accelerate
import outlines
import json
import pandas as pd
import torch
import tqdm
import gc
import ast
from outlines import from_transformers, Generator
from pydantic import BaseModel, Field
from typing import List, Optional

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Utility cell: release the loaded model so a different one can be loaded
# without restarting the kernel. Safe to run when nothing is loaded yet.

# Drop the notebook-level references. `generator` is built from `model` further
# down and presumably keeps a reference to it, so it is released as well --
# rebuild it after loading the next model. pop() with a default avoids the
# NameError that a bare `del model` raises on a fresh kernel.
for _name in ("generator", "model"):
    globals().pop(_name, None)
gc.collect()

# Clear PyTorch's CUDA cache and reset the peak-memory statistics; skipped on
# machines without CUDA.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


In [None]:
def print_gpu_memory():
    """Print the CUDA memory PyTorch has allocated and reserved.

    Values are divided by 1024**3 and labelled "GB". Nothing is printed when
    CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    print("Cuda available")
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    print(f"GPU memory allocated: {allocated_gb:.2f} GB")
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"GPU memory reserved: {reserved_gb:.2f} GB")

# Run once before and once after loading the model to compare usage.
print_gpu_memory()

GPU memory allocated: 25.47 GB
GPU memory reserved: 25.48 GB


In [2]:
# DeepSeek 14B model. At first glance it seems to perform better than the Llama
# model, so it is definitely worth considering whether to use it instead.
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

# The weights and tokenizer are passed straight into from_transformers so that
# `model` stays the only notebook-level name referring to the loaded model.
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_id, device_map="auto"),
    transformers.AutoTokenizer.from_pretrained(model_id),
)

Fetching 4 files: 100%|██████████| 4/4 [00:40<00:00, 10.21s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.67s/it]


In [None]:
# Danish-language version of the prompt schema class.
# Structured result for one sentence; the field descriptions are written in Danish.
class BlameAnalysis(BaseModel):
    # The sentence being analysed (description: the exact original sentence).
    text: str = Field(description="Den præcise originale sætning som analyseres")
    # Whether blame is present in the sentence.
    blame: bool = Field(description="Hvorvidt skyld er til stede i sætningen eller ej")
    # Who or what is being blamed. Defaults to None; the description says it must
    # not be empty when blame is true, but nothing in this class enforces that.
    blamee: Optional[str] = Field(
        default=None,
        description="Hvem eller hvad bliver beskyldt for noget (Må ikke være tom hvis blame=true)"
    )
    # What the blamee is blamed for (the specific negative outcome). Defaults to
    # None; the same "not empty when blame is true" rule is likewise unenforced here.
    arguments: Optional[str] = Field(
        default=None,
        description="Hvad blamee bliver beskyldt for - det specifikke negative udfald (Må ikke være tom hvis blame=true)"
    )

In [13]:

data = pd.read_csv("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/annotation_data_translated_version_03_10.csv", encoding='utf-8')


def _flatten_segmented_text(raw_values):
    """Flatten stringified sentence lists into one flat list of sentence strings.

    Each non-missing value is parsed with ``ast.literal_eval``. A parsed list
    contributes one sentence per item; any other parsed value, and any value
    that cannot be parsed, contributes a single sentence. Every sentence is
    converted to ``str`` so downstream ``len()`` calls and prompt formatting
    always receive text.

    Args:
        raw_values: Iterable of cell values (strings, or NaN for missing rows).

    Returns:
        list[str]: All sentences in input order.
    """
    flattened = []
    for raw in raw_values:
        if not pd.notna(raw):  # Skip NaN values
            continue
        try:
            # Parse the string representation of a list into an actual list
            parsed = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # literal_eval is documented to raise any of these on malformed
            # input; treat the whole value as one sentence.
            flattened.append(str(raw))
            continue
        if isinstance(parsed, list):
            flattened.extend(str(item) for item in parsed)
        else:
            # If it's not a list, treat it as a single sentence
            flattened.append(str(parsed))
    return flattened


# Parse and flatten the sentences
all_sentences = _flatten_segmented_text(data["da_segmented_text"])

sentences = all_sentences

print(f"Total sentences after flattening: {len(sentences)}")
if sentences:
    # The remaining summary lines index into the list, so they need at least one sentence.
    print(f"First sentence: {sentences[0]}")
    print(f"First sentence length: {len(sentences[0])} chars")
    print(f"Longest sentence: {max(len(s) for s in sentences)} chars")

Total sentences after flattening: 399018
First sentence: Mødet er åbnet.
First sentence length: 15 chars
Longest sentence: 1804 chars


In [6]:
# Build the generator from the loaded model with BlameAnalysis as the output type.
# The inference loop below validates each result string against that schema.
generator = Generator(model, BlameAnalysis)

In [14]:
# Run blame identification on every sentence and append each validated result
# to result_blame.json as one JSON object per line (JSON Lines), so the file
# stays parseable however many results are appended.
# NOTE: the file is opened in append mode; re-running this cell adds to the
# existing file rather than overwriting it.

# (sentence, error message) pairs for outputs that failed validation.
failed_sentences = []

for sentence in tqdm.tqdm(sentences, desc = "Deepseek blame (GPU)"):
    prompt = f"""Udfør skyldidentifikation på følgende sætning.
    Sætning: {sentence}
    Regler:
    - Start med at afgøre, om der overhovedet er skyld til stede i sætningen
    - Identificer hvem der gives skylden, hvad de gives skylden for, og de argumenter der bruges
    - Sæt blame=true KUN hvis nogen/noget gives skylden for at forårsage et negativt udfald
    - "text"-feltet SKAL være PRÆCIS den sætning, der er angivet ovenfor - modificer den ikke
    - Hvis blame=true, må "blamee" IKKE være tom, og "arguments" skal indeholde det specifikke udfald, de gives skylden for
    - Lad ikke arguments være en tom streng

    Semantiske roller:
    - Blamee: Den patient, der modtager skylden (hvem eller hvad gives skylden)
    - Argument: Hvad gives blamee skylden for (det negative udfald)

    Angiv din analyse i JSON-format."""
    with torch.no_grad():  # Disable gradient tracking
        # NOTE(review): use_cache=False turns off the key/value cache in
        # transformers' generate(), which usually makes decoding much slower.
        # Confirm it is really needed for memory reasons.
        result = generator(prompt, max_new_tokens=256, use_cache=False)

    try:
        # Parse and validate in one step; no separate json.loads is needed.
        result_out = BlameAnalysis.model_validate_json(result)
    except ValueError as err:
        # pydantic's ValidationError is a ValueError subclass. Output cut off
        # at max_new_tokens can be invalid JSON; record it and keep going
        # instead of aborting the whole run.
        failed_sentences.append((sentence, str(err)))
        tqdm.tqdm.write(f"Skipping sentence with invalid model output: {err}")
    else:
        with open("result_blame.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(result_out.model_dump()) + "\n")
    torch.cuda.empty_cache()



Deepseek blame (GPU):   0%|          | 0/399018 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 1/399018 [00:12<1372:14:57, 12.38s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 2/399018 [00:35<2052:07:16, 18.51s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 3/399018 [00:47<1727:30:46, 15.59s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 4/399018 [00:57<1487:02:47, 13.42s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 5/399018 [01:13<1588:38:25, 14.33s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Deepseek blame (GPU):   0%|          | 6/399018 [01:23<1429:48:42, 12.90s/it]Setting `pad_token_id` to `eos_token

KeyboardInterrupt: 