In [1]:
import pandas as pd
import numpy as np
from huggingface_hub import InferenceClient
# https://huggingface.co/docs/huggingface_hub/guides/inference
import typing

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

## Hugging Face Inference Client settings

In [2]:
api_key = "INSERT"
model = "meta-llama/Meta-Llama-3-8B-Instruct"
temperature = 0.5 # optional
max_tokens = 1024
top_p = 0.7 # optional

## Formatting

In [3]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

## Functions to build prompts

In [4]:

def build_chat_claim_detection(system_instructions, text):
    
    chat_claim_detection = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"Here is the text: {text}"}
    ]
    
    return chat_claim_detection 


In [5]:

def build_chat_updated_claim_list(system_instructions, new_claims):
    
    chat_claim_list = [
    {"role": "system", "content": system_instructions},
    {"role": "user", "content": f"Here is the list of  claims: {new_claims}"} 
    ]
    
    return chat_claim_list


In [6]:

def build_chat_paragraph_from_claim(system_instructions, claim, text):
    
    chat_claim_paragraph = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"Here is the claim: ```{claim}``` and here is the text: ```{text}```"}
    ]
    
    return chat_claim_paragraph


In [7]:

def build_chat_extract_arguments(system_instructions, subject, paragraph):
    
    chat_argumentative_units = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"Here is the complaint: ```{subject}``` and here is the text: ```{paragraph}```"}
    ]
    
    return chat_argumentative_units
    

In [8]:

def build_chat_extract_arguments_from_original_text(system_instructions, complaints, text_window):
    
    chat_argumentative_units = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"Here is the list of complaints: ```{complaints}``` and here is the text: ```{text_window}```"}
    ]
    
    return chat_argumentative_units


In [9]:

def build_chat_extract_arguments_from_original_text_no_complaints(system_instructions, text_window):
    
    chat_argumentative_units = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"Here is the text: ```{text_window}```"}
    ]
    
    return chat_argumentative_units


## Helper Functions

In [10]:

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [11]:

def get_sentence_similarity(sentence1, sentence2):
    
    # Sentences we want sentence embeddings for
    sentences = [sentence1, sentence2]

    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    
    cos = torch.nn.CosineSimilarity(dim=0)
    sim = cos(sentence_embeddings[0], sentence_embeddings[1])
    
    return sim.item()


In [12]:

def get_text_window_with_overlap(text_windows, windows_idx, CHAR_COUNT, OVERLAP):
    
    window_whole = ''
    
    if ((len(windows_idx) == 1) and (windows_idx[0] < (len(text_windows)-1))):
        
        window_whole = text_windows[windows_idx[0]][:CHAR_COUNT - OVERLAP] + " "
        window_whole = window_whole + text_windows[windows_idx[0]+1]
        
    elif len(windows_idx) == 1:
        
        window_whole = text_windows[windows_idx[0]]
    
            
    elif (windows_idx[-1]) < (len(text_windows)-1):
        
        for idx in windows_idx:
            window_whole = window_whole + text_windows[idx][:CHAR_COUNT - OVERLAP] + " "
        
        window_whole = window_whole + text_windows[windows_idx[-1] + 1]
    
    else:
        
        for idx in windows_idx[:-1]:
            window_whole = window_whole + text_windows[idx][:CHAR_COUNT - OVERLAP] + " "
        
        window_whole = window_whole + text_windows[windows_idx[-1]]
        
    return window_whole
        

In [13]:

# function to check whether an output of the LLM is paraphrased or not
def is_exact_wording(text, output):
    
    text = text.replace("\'", "").replace(" ", "").lower()
    output = output.replace("\'", "").replace(" ", "").lower()
        
    if output in text:
        return True
    else:
        return False 
    

In [14]:

def extract_args_from_str_updated(arg_str):
    
    claim_str = '**Claim:** '

    idx_claim = arg_str.find(claim_str)
    idx_claim_end = idx_claim + arg_str[idx_claim:].find('\n')

    claim = arg_str[idx_claim+len(claim_str):+idx_claim_end]
    has_next_claim = arg_str[idx_claim+len(claim_str):].find(claim_str)
        
    claims_and_premises = []
    
    while True:
                    
        premises = []
        
        if has_next_claim == -1:
            claim_substr = arg_str[arg_str.find(claim) + len(claim) :]
        else:
            idx_next_claim = idx_claim +  len(claim_str) + arg_str[idx_claim+len(claim_str):].find(claim_str) 
            claim_substr = arg_str[arg_str.find(claim) + len(claim) : idx_next_claim]

        dash_str = '\n- '
        idx_premise = claim_substr.find(dash_str)
        has_next_premise = claim_substr[idx_premise:].find(dash_str)

        if has_next_premise == -1:
            pass
        else:
            idx_next_premise = idx_premise +  len(dash_str) + claim_substr[idx_premise:].find(dash_str)   

            while True:

                premise = claim_substr[idx_premise:idx_next_premise-len(dash_str)]
                premises.append(premise)

                idx_premise = idx_next_premise
                has_next_premise = claim_substr[idx_premise:].find(dash_str)
                if has_next_premise == -1:
                    premise = claim_substr[idx_premise : idx_premise + claim_substr[idx_premise:].find('\n')]
                    premises.append(premise)
                    break
                else:
                    idx_next_premise = idx_premise +  len(dash_str) + claim_substr[idx_premise:].find(dash_str)
        
        claim_premise_obj = {'claim': claim,
                             'premises': list(filter(None, premises))
                            }
        
        claims_and_premises.append(claim_premise_obj)
        
        if has_next_claim == -1:
            break
        else:
            idx_claim = idx_next_claim
            idx_claim_end = idx_claim + arg_str[idx_claim:].find('\n')

            claim = arg_str[idx_claim+len(claim_str):+idx_claim_end]
            has_next_claim = arg_str[idx_claim+len(claim_str):].find(claim_str)
                        
    return claims_and_premises  


## Config

In [15]:

# define config with input to pipeline
class ArgumentationConfig():
    
    def __init__(self):     
    
        self.system_instructions_claim_detection = """
            You are a legal expert working in the European Court of Human Rights (ECHR).
            You will be provided with a text, and your task is to identify all complaints made by the applicant.
            Guidelines for identifying complaints:
            - Complaints are statements or assertions made by the applicant.
            Formatting:
            - Provide all complaints you identify as a list, with each complaint preceded by a dash.
            - Provide the same wording as in the text and do not paragraphase.
        """

        # --> use examples from different text
        # -->  


        self.system_instructions_claim_postprocessing = """
            You are a legal expert working in the European Court of Human Rights (ECHR).
            You will be provided with a list of claims made by an applicant. Your task is to review the list and perform the following steps:
            Steps to Follow:
            - Identify Claims on the Same Topic: If multiple claims cover the same topic or express the same idea, group them into one.
            Combine the relevant claims into a single entry.
            - Remove Duplicate Claims: If you find duplicate claims (exactly the same claim repeated), keep only one instance of that claim.
            - Maintain Original Order: After updating the list, ensure that the claims are presented in their original order,
            with no changes to their sequence.
            Formatting:
            - Provide the updated list of claims using dashes as bullets.
        """

        self.system_instructions_paragraph_from_claim = """ 
            You are a legal expert working in the European Court of Human Rights (ECHR).   
            You will be provided with a text and a complaint made by an applicant, both enclosed in triple backticks. 
            Your task is to identify the **entire** paragraph of the text which contains context, explanation, attacks, support
            and decisions made by the Commission regarding the complaints.
            Provide the **exact** paragraph in the text that corresponds to the above criteria, **in its entirety**, 
            enclosed in triple backticks. Do **not** omit any part of the paragraph, and do **not** paraphrase or alter the sentences in any way. 
            The paragraph must be presented exactly as it appears in the text, without any additions, deletions, or modifications.
        """

        # --> maybe test examples

        self.system_instructions_extract_arguments = """
            You are a legal expert working at the European Court of Human Rights (ECHR).
            You will be provided with two texts:
                - A complaint made by an applicant.
                - An extract of a court decision treating this complaint.
            Your task is to identify all arguments within the text extract that are related to the complaint, no matter if they support or attack the complaint.
            An argument consists of:
                - A claim (the main point being made). \
                - One or more premises (assumptions or evidence supporting the claim).
            Guidelines for Identifying Arguments:
                - A premise provides evidence that supports the claim. It is an assumption that is presented as true to support the argument.
                - A claim is the main point or conclusion of the argument.
            Notes on Legal Text and References:
                - Legal references, such as Articles from legal texts, are often linked to the preceding sentence. Pay attention to this when reading.
                - If a reference (e.g., an Article) supports an argument, it should be grouped with the sentence that makes the reference.
                - Use the **exact** wording from the text for both premises and claims and do **not** paraphrase or alter the sentences in any way.
                - Premises and claims must be presented exactly as it appears in the text, without any additions, deletions, or modifications.
                - Premises and claims do not necessarily have to be entire sentences but can be part of sentences from the original text.
            Formatting:
                - For each claim, provide first the number of the argument, then 
                the claim and list the corresponding premises beneath the claim, each marked with a dash.
            Formatting Example:
                1. Argument
                **Claim:** {put claim here}
                **Premises:**
                - {premise 1}
                - {premise 2}
                ...
        """

        self.system_instructions_extract_arguments_from_original_text = """
            You are a legal expert working at the European Court of Human Rights (ECHR).
            You will be provided with:
                - A list of complaints made by an applicant.
                - An extract of a court decision treating these complaints.
            Your task is to identify all arguments within the text extract that are related to the complaints, no matter if they support or attack the complaints.
            An argument consists of:
                - A claim (the main point being made). \
                - One or more premises (assumptions or evidence supporting the claim).
            Guidelines for Identifying Arguments:
                - A premise provides evidence that supports the claim. It is an assumption that is presented as true to support the argument.
                - A claim is the main point or conclusion of the argument.
            Notes on Legal Text and References:
                - Legal references, such as Articles from legal texts, are often linked to the preceding sentence. Pay attention to this when reading.
                - If a reference (e.g., an Article) supports an argument, it should be grouped with the sentence that makes the reference.
                - Use the **exact** wording from the text for both premises and claims and do **not** paraphrase or alter the sentences in any way.
                - Premises and claims must be presented exactly as it appears in the text, without any additions, deletions, or modifications.
                - Premises and claims do not necessarily have to be entire sentences but can be part of sentences from the original text.
            Formatting:
                - For each argument, provide first the number of the argument, then 
                the claim and list the corresponding premises beneath the claim, each marked with a dash.
            Formatting Example:
                1. Argument
                **Claim:** {put claim here}
                **Premises:**
                - {premise 1}
                - {premise 2}
                ...
            """

        # example without complaint list
        self.system_instructions_extract_arguments_from_original_text_no_complaints = """
            You are a legal expert working at the European Court of Human Rights (ECHR).
            You will be provided with:
                - An extract of a court decision.
            Your task is to identify all arguments within the text.
            An argument consists of:
                - A claim (the main point being made). \
                - One or more premises (assumptions or evidence supporting the claim).
            Guidelines for Identifying Arguments:
                - A premise provides evidence that supports the claim. It is an assumption that is presented as true to support the argument.
                - A claim is the main point or conclusion of the argument.
            Notes on Legal Text and References:
                - Legal references, such as Articles from legal texts, are often linked to the preceding sentence. Pay attention to this when reading.
                - If a reference (e.g., an Article) supports an argument, it should be grouped with the sentence that makes the reference.
                - Use the **exact** wording from the text for both premises and claims and do **not** paraphrase or alter the sentences in any way.
                - Premises and claims must be presented exactly as it appears in the text, without any additions, deletions, or modifications.
                - Premises and claims do not necessarily have to be entire sentences but can be part of sentences from the original text.
            Formatting:
                - For each argument, provide first the number of the argument, then 
                the claim and list the corresponding premises beneath the claim, each marked with a dash.
            Formatting Example:
                1. Argument
                **Claim:** {put claim here}
                **Premises:**
                - {premise 1}
                - {premise 2}
                ...
            """


In [16]:

class ArgumentationConfigExamples(ArgumentationConfig):
    
    def __init__(self):
        ArgumentationConfig.__init__(self)   

        self.system_instructions_claim_detection = """
            Below are a few examples of complaints made by an applicant. Do not include these examples in your response, and answer only based on the given prompt.
            
            - The applicant contested the Government’s assertion that his video work was refused a certificate for distribution in order to "protect the right of citizens not to be offended in their religious feelings"
            - The first and the second applicant complain that the first applicant was prohibited from taking the second applicant home on his release from his pre-trial detention,
            - He complains in particular that the authorities refused to take into account the recommendations of various medical experts; as a result, his health seriously deteriorated.
            
            Now, based on the prompt below, provide an answer:
        """ + "\n" + self.system_instructions_claim_detection
        
        
        self.system_instructions_claim_postprocessing = """
            Below are a few examples of input and output. Do not include these examples in your response, and answer only based on the given prompt.
            
            Example: Duplicate complaints
            Input:
            Successful prosecutions against members of the security forces for the destruction of villages and the expulsion of villagers.
            The applicant has not been given information about successful prosecutions against members of the security forces for the destruction of villages and the expulsion of villagers.
            Output:
            The applicant has not been given information about successful prosecutions against members of the security forces for the destruction of villages and the expulsion of villagers.

            Example: Two complaints cover a similar idea
            Input:
            The applicant fears reprisal if they complain about acts of the security forces.
            The applicant's fear of reprisal is not wholly without foundation, given the vulnerability of dispossessed applicants under pressure from both the security forces and the terrorist activities of the PKK.
            Output:
            The applicant's fear of reprisal if they complaint about acts of the sucrity forces is not wholly without foundation, given the vulnerability of dispossessed applicants under pressure from both the security forces and the terrorist activities of the PKK.
        
            Now, based on the prompt below, provide an answer:   
        
        """ + "\n" + self.system_instructions_claim_postprocessing
                
        extracted_arguments_example = """ 
            Here are two examples of an extracted argument:
            
            Argument 1:
            ***Claim:***
            In these circumstances, the Commission finds that the application cannot be declared inadmissible for non-exhaustion of domestic remedies. 
            ***Premises:***
            - Consequently, the Commission finds that the remedy cannot realistically have been available to the applicants in this case. 
            - Further, the Commission finds that the applicants' choice of pursuing proceedings in the courts of Northern Ireland was not unreasonable or without basis in domestic law.
            - The applicants in those proceedings (in respect of which they were granted legal aid by the relevant domestic authorities) intended, inter alia, to argue that liability arose out of the actions of Crown agents based in the United  Kingdom and Northern Ireland.
            - The certificates issued by the Secretary of State were however conclusive of the fact that the alleged liability did not arise in respect of the Government in the United Kingdom or in Northern Ireland and effectively put an end to the proceedings. 
        
            Argument 2:
            ***Claim:***
            The Government submit that specific protection of life is provided by the Gibraltar Constitution and that this is in conformity with the standards imposed by Article 2 (Art. 2).
            ***Premises:***
            - They point out that the jury on consideration of the evidence found that the killings were lawful.
            - They contend that it is implicit from that finding that the jury considered that there had been no plot to kill the deceased terrorists, that the soldiers had honestly and reasonably believed that there was a car bomb which could be detonated by remote control and  that the degree of force used by the soldiers was proportionate to the  aim of protecting the lives of the people of Gibraltar. 
            - As regards the Inquest proceedings, the Government submit that it provided an exhaustive fact-finding investigation by an independent body.
            - In particular, they highlight the fact that 78 witnesses (including the S.A.S soldiers and Security Service personnel involved in the incident) were extensively examined and cross-examined by the legal representatives of the applicants, that the Inquest was held within six months of the incident, that the applicants' representatives made no challenge to the public immunity certificates, and that the Coroner exercised his discretion to allow certain questions, despite objection by the Crown on the basis of the certificates. 


        """
        
        self.system_instructions_extract_arguments = self.system_instructions_extract_arguments + "\n" + extracted_arguments_example
        self.system_instructions_extract_arguments_from_original_text = self.system_instructions_extract_arguments_from_original_text + "\n" + extracted_arguments_example


In [17]:

class ArgumentationConfigReworked(ArgumentationConfig):
    
    def __init__(self):
        ArgumentationConfig.__init__(self)   
    
        self.system_instructions_extract_arguments_from_original_text_no_complaints = """

            You are a legal expert working at the European Court of Human Rights (ECHR).
            You will be provided with an extract of a court decision. Your task is to identify all arguments within the text.

            Task:
            An argument consists of a claim (the main point being made) and one or more premises (assumptions or evidence supporting the claim).
            Important: An argument cannot consist of only a claim. It must include at least one premise.

            Guidelines for Identifying Arguments:
            A claim is the main point or conclusion being made in the argument.
            A premise provides evidence or assumptions that support the claim. It may present information that is taken to be true for the purposes of the argument.
            The claim and premises must be presented exactly as they appear in the text without any modifications, paraphrasing, or alterations.

            Additional Instructions:
            Referencing Legal Texts and Precedents:
            Legal references, such as Articles from legal texts or references to prior cases, can be used as part of the argument. They may serve as claims or premises within the argument.
            Article references or precedent cases should be considered carefully, as they could support or challenge the main claim or premise.

            Argument Structures:
            Arguments can have the following structures, which you can follow to generate your output:
            - Deductive arguments: Where a premise supports a claim. This is the typical structure but not the only one.
            - Counter-arguments: Where a premise attacks or challenges the claim. These arguments should also be identified and extracted.
            - Argument from position to know: Example: "X asserts P. X knows if P is true or not. X would assert what they know about P."
            - Argument from established rule: Example: "For all x, if doing A is the established rule for x, then x must do A (subject to exceptional cases)."
            - Argument from precedent case: Example: "Case C0 would set a precedent with respect to an existing rule R. Case C0 is similar to case C1. C1 needs to be treated equally with respect to rule R as case C0."

            Preferred Structure:
            Prefer one large argument over splitting it into multiple smaller ones, if both options would convey the same meaning. Group relevant premises and claims together as part of the same argument for clarity and coherence.

            Formatting:
            For each argument:
            - First, provide the argument number.
            - Then, the claim followed by its premises, each listed with a dash beneath the claim.
            - Ensure that claims and premises are taken exactly as they appear in the text.
            
            Formatting:
            - For each argument, provide first the number of the argument, then 
            - The claim and list the corresponding premises beneath the claim, each marked with a dash.
            Formatting Example:
                1. Argument
                **Claim:** {put claim here}
                **Premises:**
                - {premise 1}
                - {premise 2}
                ...

            Argument Examples:
            Argument 1: 
            Claim:
            In these circumstances, the Commission finds that the application cannot be declared inadmissible for non-exhaustion of domestic remedies.
            Premises:
            - Consequently, the Commission finds that the remedy cannot realistically have been available to the applicants in this case.
            - Further, the Commission finds that the applicants' choice of pursuing proceedings in the courts of Northern Ireland was not unreasonable or without basis in domestic law.
            - The applicants in those proceedings (in respect of which they were granted legal aid by the relevant domestic authorities) intended, inter alia, to argue that liability arose out of the actions of Crown agents based in the United Kingdom and Northern Ireland.
            - The certificates issued by the Secretary of State were, however, conclusive of the fact that the alleged liability did not arise in respect of the Government in the United Kingdom or in Northern Ireland and effectively put an end to the proceedings.

            Argument 2: 
            Claim:
            The Government submits that specific protection of life is provided by the Gibraltar Constitution and that this is in conformity with the standards imposed by Article 2 (Art. 2).
            Premises:
            - They point out that the jury, upon consideration of the evidence, found that the killings were lawful.
            - They contend that it is implicit from that finding that the jury considered that there had been no plot to kill the deceased terrorists, that the soldiers had honestly and reasonably believed that there was a car bomb which could be detonated by remote control, and that the degree of force used by the soldiers was proportionate to the aim of protecting the lives of the people of Gibraltar.
            - As regards the Inquest proceedings, the Government submits that it provided an exhaustive fact-finding investigation by an independent body.
            - In particular, they highlight the fact that 78 witnesses (including the SAS soldiers and Security Service personnel involved in the incident) were extensively examined and cross-examined by the legal representatives of the applicants, that the Inquest was held within six months of the incident, that the applicants' representatives made no challenge to the public immunity certificates, and that the Coroner exercised his discretion to allow certain questions, despite objections by the Crown on the basis of the certificates.

        """


In [18]:

# here define config

#config = ArgumentationConfig()
#config = ArgumentationConfigExamples()
config = ArgumentationConfigReworked()


## Text Preprocessing

In [65]:
text_filename = '29_short.txt'

In [66]:
with open(f'echr_corpus/plain_texts/{text_filename}', 'r') as file:
    text = file.read().replace('\n', '')

In [67]:

has_double_space = True
counter = 0

while has_double_space == True:
    if '  ' in text:
        text = text.replace('  ', ' ')
        counter += 1
        print(f'Iteration: {counter}')
    else:
        has_double_space == False
        break
        

In [68]:
CHAR_COUNT = 2500
OVERLAP = round(CHAR_COUNT * 0.2)
TOTAL_LENGTH = len(text)

In [69]:

text_windows = []
start = 0
end = CHAR_COUNT

while start < TOTAL_LENGTH:
    
    window = text[start:end]
    text_windows.append(window)
    
    start = round(end - OVERLAP)
    end = round(start + CHAR_COUNT)

print(f'The text was splitted into {len(text_windows)} text windows.')


The text was splitted into 11 text windows.


______

## Task 1: Claim Detection

In [24]:

def get_claim_detection_output():
    
    claim_detection_output = []

    windows_counter = 1
    for text_window in text_windows:

        print(f'CURRENT TEXT WINDOW: {windows_counter}')

        messages = build_chat_claim_detection(config.system_instructions_claim_detection, text_window)

        client = InferenceClient(api_key=api_key)

        stream = client.chat_completion(
            model=model, 
            messages = messages, 
            temperature = temperature, 
            max_tokens = max_tokens,
            top_p = top_p,
            stream = True
        )

        output = ''

        for chunk in stream:
            print(chunk.choices[0].delta.content, end="")
            output = output + chunk.choices[0].delta.content

        claim_detection_output.append(output)

        windows_counter += 1

        print('\n')
        print('---------------------------------------')
        
    return claim_detection_output
    

In [25]:

def get_claims_with_window(claim_detection_output):

    claims_from_window = []
    counter = 0

    for output in claim_detection_output:
        finding_claims = True
        claims_detected = output

        while finding_claims:

            idx_first_claim = claims_detected.find('\n-')

            # If no separator is found, treat the whole text as the first claim
            if idx_first_claim == -1:
                # If the claim_detected is not empty, we process it as a single claim
                if claims_detected.strip():  # Ensure it's not an empty string
                    claim_windows_object = {'window': counter, 'claim': claims_detected.strip()}
                    claims_from_window.append(claim_windows_object)
                finding_claims = False  # Stop searching since there's only one claim
                break

            # Otherwise, extract the first claim normally
            tmp_splitted = claims_detected[idx_first_claim:]
            idx_next_linebreak = idx_first_claim + len('\n-') + 1 + tmp_splitted[len('\n-') + 1:].find('\n')

            # Extract the first claim text
            first_claim = claims_detected[idx_first_claim:idx_next_linebreak]

            if first_claim[3:].strip():  # Ensure there is content after '- '
                claim_windows_object = {'window': counter, 'claim': first_claim[3:].strip()}
                claims_from_window.append(claim_windows_object)

            # Move to the next part of the string for further claims (if any)
            claims_detected = claims_detected[idx_next_linebreak:]

        counter += 1

    # Convert the list of claims into a DataFrame
    return pd.DataFrame(claims_from_window, columns=['window', 'claim'])
    

In [26]:

def extract_all_claims(claim_detection_output):
    
    claims_all = []

    for output in claim_detection_output:
        finding_claims = True
        claims_detected = output

        while finding_claims:
            # Try to find the first claim separator
            idx_first_claim = claims_detected.find('\n-')

            # If no separator is found, treat the entire remaining string as the first claim
            if idx_first_claim == -1:
                if claims_detected.strip():  # Ensure we don't add an empty claim
                    claims_all.append(claims_detected.strip())
                finding_claims = False  # Stop searching, as we only have one claim
                break

            # Otherwise, extract the first claim normally
            tmp_splitted = claims_detected[idx_first_claim:]
            idx_next_linebreak = idx_first_claim + len('\n-') + 1 + tmp_splitted[len('\n-') + 1:].find('\n')

            # Extract the first claim and trim any leading or trailing spaces
            first_claim = claims_detected[idx_first_claim:idx_next_linebreak].strip()

            if first_claim[3:].strip():  # Ensure the claim text after '- ' is not empty
                claims_all.append(first_claim[3:].strip())  # Remove the '- ' from the claim

            # Move to the next part of the string for further claims
            claims_detected = claims_detected[idx_next_linebreak:]

    # Get the length of the extracted claims
    print(len(claims_all))
    
    if len(claims_all) > 30:
        claims_all = claims_all[:30]
        print('ATTENTION: Shortened claims_all for articles with long claims for testing purposes.')
    
    return claims_all


In [27]:

def llm_claim_postprocessing(claims_all):
      
    messages = build_chat_updated_claim_list(config.system_instructions_claim_postprocessing, str(claims_all))

    client = InferenceClient(api_key=api_key)

    stream = client.chat_completion(
        model=model, 
        messages = messages, 
        temperature = temperature, 
        max_tokens = max_tokens,
        top_p = top_p,
        stream = True
    )

    output = ''

    for chunk in stream:
        print(chunk.choices[0].delta.content, end="")
        output = output + chunk.choices[0].delta.content
        
    return output


def get_all_claims_cleaned(output):
    
    claims_all_cleaned = []
    finding_claims = True
    claims_detected = output

    while finding_claims:

        idx_first_claim = claims_detected.find('\n-')

        if idx_first_claim == -1:

            finding_claims = False
            break

        tmp_splitted = claims_detected[idx_first_claim:]

        idx_next_linebreak = tmp_splitted.find('\n', len('\n-') + 1)
        if idx_next_linebreak == -1: 
            idx_next_linebreak = len(tmp_splitted) 

        first_claim = claims_detected[idx_first_claim:idx_first_claim + idx_next_linebreak]

        claims_all_cleaned.append(first_claim[3:])

        claims_detected = claims_detected[idx_first_claim + idx_next_linebreak:]

    return claims_all_cleaned


## Task 2: Identify relevant claim paragraph

In [28]:

def get_claim_windows(claims_all_cleaned):
    
    claim_windows_cleaned = []

    for claim_cleaned in claims_all_cleaned:

        windows = set()

        for index, row in df_claims_from_window.iterrows():

            cos_sim = get_sentence_similarity(claim_cleaned, row.claim)

            if cos_sim == 1:
                windows = set([row.window])
                break
            else:
                if cos_sim > 0.7:
                    windows.add(row.window)

        claim_windows_object = {'claim':claim_cleaned, 'windows':list(windows)}
        claim_windows_cleaned.append(claim_windows_object)
        
    return claim_windows_cleaned


### Identify claim paragraph

In [29]:

def get_claim_paragraph(claim_windows_cleaned):
    
    client = InferenceClient(api_key=api_key)

    claim_with_paragraph = []

    for claim_with_windows in claim_windows_cleaned:

        claim = claim_with_windows['claim']
        text_windows_with_overlap = get_text_window_with_overlap(text_windows, claim_with_windows['windows'], CHAR_COUNT, OVERLAP)

        messages = build_chat_paragraph_from_claim(config.system_instructions_paragraph_from_claim, 
                                                   claim, 
                                                   text_windows_with_overlap)

        stream = client.chat_completion(
            model=model, 
            messages = messages, 
            temperature = temperature, 
            max_tokens = max_tokens,
            top_p = top_p,
            stream = True
        )

        output = ''

        for chunk in stream:
            print(chunk.choices[0].delta.content, end="")
            output = output + chunk.choices[0].delta.content

        claim_paragraph_object = {'claim':claim, 'paragraph':output}
        claim_with_paragraph.append(claim_paragraph_object)

        print('\n')
        print('---------------------------------------')
        
    return claim_with_paragraph
    

In [30]:

def get_claim_paragraph_extracted(claim_with_paragraph):
    
    # extract paragraph in triple backticks
    # create new dict with claim and extracted paragraph
    # then use this to identify arguments

    claim_with_paragraph_extracted = [ ]

    for claim_paragraph in claim_with_paragraph:
    
        claim = claim_paragraph['claim']
        paragraph = claim_paragraph['paragraph']
    
        print(claim)
        print(paragraph)
    
        if paragraph == '':
            continue
        else:
            try:
                paragraph = paragraph.split("```")[1].split("```")[0]
            except:
                try:
                    paragraph = paragraph.split("`")[1].split("`")[0]
                except:
                    paragraph = paragraph.split('"')[1].split('"')[0]

        claim_paragraph_object = {'claim':claim, 'paragraph':paragraph[1:-1]}
        claim_with_paragraph_extracted.append(claim_paragraph_object)
        
    return claim_with_paragraph_extracted


## Task 3: Extract Arguments

In [31]:

def llm_get_arguments(claim_with_paragraph_extracted):
    
    client = InferenceClient(api_key=api_key)

    claim_with_argumentative_units = []

    for claim_paragraph in claim_with_paragraph_extracted:

        claim = claim_paragraph['claim']
        paragraph = claim_paragraph['paragraph']

        messages = build_chat_extract_arguments(config.system_instructions_extract_arguments, claim, paragraph)

        stream = client.chat_completion(
            model=model, 
            messages = messages, 
            temperature = temperature, 
            max_tokens = max_tokens,
            top_p = top_p,
            stream = True
        )

        output = ''

        for chunk in stream:
            print(chunk.choices[0].delta.content, end="")
            output = output + chunk.choices[0].delta.content

        claim_paragraph_object = {'claim':claim, 'paragraph':paragraph, 'argumentative_units':output}
        claim_with_argumentative_units.append(claim_paragraph_object)

        print('\n')
        print('---------------------------------------')
        
    return claim_with_argumentative_units
    



In [32]:

def extract_arguments(claim_with_argumentative_units):
    
    claims_with_arguments_extracted = []

    for entry in claim_with_argumentative_units:
    
        claim_arg_obj = {'complaint': entry['claim'], 
                     'arguments': extract_args_from_str_updated(entry['argumentative_units'])}
            
        claims_with_arguments_extracted.append(claim_arg_obj)
    
    return claims_with_arguments_extracted


In [33]:

def print_claims_with_arguments_extracted(claims_with_arguments_extracted):
    
    for i in range(0, len(claims_with_arguments_extracted)):

        print(color.BOLD + 'New Argument' + color.END)
        print('\n')

        entry = claims_with_arguments_extracted[i]
        complaint = entry['complaint']

        print(color.BOLD + 'Complaint: ' + color.END + complaint)
        print('\n')

        counter = 1
        for arg in entry['arguments']:
            print(color.BOLD + f'Argument {counter}' + color.END)

            claim = arg['claim']
            print(color.BOLD + 'Claim: ' + color.END + claim )

            counter_premises = 1
            for premise in arg['premises']:

                print(color.BOLD + f'Premise {counter_premises}:' + color.END + premise)
                counter_premises +=1

            print('\n')
            counter += 1

        print('_____________________________________________')
        print('\n')


# **Base for all Pipelines**

In [70]:
claim_detection_output = get_claim_detection_output()
df_claims_from_window = get_claims_with_window(claim_detection_output)

CURRENT TEXT WINDOW: 1
Here is the list of complaints made by the applicant:

- The making, contents, and publication of the Inspectors' report on the affairs of the fourth applicant, a company, allegedly breached the applicants' honour and reputation.
- The report allegedly determined their civil right to honour and reputation, in breach of Article 6 para. 1 of the Convention.
- The applicants were denied effective access to the civil courts in that they cannot bring a civil claim against the Inspectors or the Secretary of State for Trade and Industry in order to clear their reputations.
- The applicants felt obliged to withdraw libel proceedings against The Observer newspaper because of the publication of the report, which partly related to matters that were also the subject of the libel proceedings.

---------------------------------------
CURRENT TEXT WINDOW: 2
Here is the list of complaints made by the applicant:

- that the applicants cannot bring a civil claim against the Inspec

# **Pipeline 1: Modular with LLM tasks**

In [None]:

claims_all = extract_all_claims(claim_detection_output)

llm_claim_postprocessing_output = llm_claim_postprocessing(claims_all)
claims_all_cleaned = get_all_claims_cleaned(llm_claim_postprocessing_output)

claim_windows_cleaned = get_claim_windows(claims_all_cleaned)

claim_with_paragraph = get_claim_paragraph(claim_windows_cleaned)

claim_with_paragraph_extracted = get_claim_paragraph_extracted(claim_with_paragraph)

# check if prompt to use exact wording works
df = pd.DataFrame(claim_with_paragraph_extracted)
df['is_exact_wording'] = df['paragraph'].apply(lambda x: is_exact_wording(text, x))
print(df)

claim_with_argumentative_units = llm_get_arguments(claim_with_paragraph_extracted)
claims_with_arguments_extracted = extract_arguments(claim_with_argumentative_units)
print_claims_with_arguments_extracted(print_claims_with_arguments_extracted)


# **Pipeline 2: Direct from text with complaints**

In [55]:

def llm_extract_arguments_from_text_with_complaints(df_claims_from_window):
    
    client = InferenceClient(api_key=api_key)

    claims_with_arguments = []

    for window in list(set(df_claims_from_window['window'])):

        claim_list = list(df_claims_from_window.loc[df_claims_from_window['window'] == window]['claim'])

        text_window_with_overlap = get_text_window_with_overlap(text_windows, [window], CHAR_COUNT, OVERLAP)    

        messages = build_chat_extract_arguments_from_original_text(config.system_instructions_extract_arguments_from_original_text_examples, claim_list, text_window_with_overlap)
        
        stream = client.chat_completion(
            model=model, 
            messages = messages, 
            temperature = temperature, 
            max_tokens = max_tokens,
            top_p = top_p,
            stream = True
        )

        output = ''

        for chunk in stream:
            print(chunk.choices[0].delta.content, end="")
            output = output + chunk.choices[0].delta.content

        claim_paragraph_object = {'claim':claim_list, 'paragraph':text_window_with_overlap, 'argumentative_units':output}
        claims_with_arguments.append(claim_paragraph_object)

        print('\n')
        print('---------------------------------------')
        
    return claim_with_arguments




In [None]:
arguments_from_text_with_complaints = llm_extract_arguments_from_text_with_complaints(df_claims_from_window)
arguments_from_text_with_complaints_extracted = extract_arguments(arguments_from_text_with_complaints)
print_claims_with_arguments_extracted(arguments_from_text_with_complaints_extracted)

# **Pipeline 3: Direct from text no complaints**

In [48]:
config.system_instructions_extract_arguments_from_original_text_no_complaints = config.system_instructions_extract_arguments_from_original_text_no_complaints + '.'

In [35]:

# TODO just give claim windows
def llm_extract_arguments_from_text_no_complaints(df_claims_from_window):
    
    client = InferenceClient(api_key=api_key)

    claims_direct_no_complaints = []
    claim_list = '-'

    for window in list(set(df_claims_from_window['window'])):

        text_window_with_overlap = get_text_window_with_overlap(text_windows, [window], CHAR_COUNT, OVERLAP)    
        messages = build_chat_extract_arguments_from_original_text_no_complaints(config.system_instructions_extract_arguments_from_original_text_no_complaints, text_window_with_overlap)

        stream = client.chat_completion(
            model=model, 
            messages = messages, 
            temperature = temperature, 
            max_tokens = max_tokens,
            top_p = top_p,
            stream = True
        )

        output = ''

        for chunk in stream:
            print(chunk.choices[0].delta.content, end="")
            output = output + chunk.choices[0].delta.content
        
        # TODO change
        claim_paragraph_object = {'claim':claim_list, 'paragraph':text_window_with_overlap, 'argumentative_units':output}
        claims_direct_no_complaints.append(claim_paragraph_object)

        print('\n')
        print('---------------------------------------')
        
    return claims_direct_no_complaints


In [71]:
arguments_from_text_no_complaints = llm_extract_arguments_from_text_no_complaints(df_claims_from_window)
arguments_from_text_no_complaints_extracted = extract_arguments(arguments_from_text_no_complaints)
print_claims_with_arguments_extracted(arguments_from_text_no_complaints_extracted)

Here are the arguments extracted from the text:

1. Argument
**Claim:** The application is incompatible ratione personae with the provisions of the Convention and this aspect of the case must be rejected under Article 27 para. 2 (Art. 27-2) of the Convention.
**Premises:**
- Even though they own the fourth applicant company, no evidence of any criticism of the company or any prejudice to its reputation has been put forward.
- In these circumstances, the Commission concludes that the fourth applicant company cannot claim to be a victim of a violation of the Convention.

2. Argument
**Claim:** The complaint has two aspects: (a) that the applicants cannot bring a civil claim against the Inspectors or the Secretary of State for Trade and Industry in order to clear their reputations, allegedly sullied in the report; and (b) that the applicants felt obliged to withdraw libel proceedings against The Observer newspaper because, after the publication of the report, which partly related to matte

In [None]:

claims_with_arguments_extracted_alt = []

for entry in claim_with_argumentative_units_alt:
    
    claim_arg_obj = {'complaint': entry['claim'], 
                     'arguments': extract_args_from_str_updated(entry['argumentative_units'])}
            
    claims_with_arguments_extracted_alt.append(claim_arg_obj)



In [52]:
text_window_with_overlap = get_text_window_with_overlap(text_windows, [0], CHAR_COUNT, OVERLAP)    
messages = build_chat_extract_arguments_from_original_text_no_complaints(config.system_instructions_extract_arguments_from_original_text_no_complaints, text_window_with_overlap)
messages


[{'role': 'system',
  'content': '\n            You are a legal expert working at the European Court of Human Rights (ECHR).\n            You will be provided with:\n                - An extract of a court decision.\n            Your task is to identify all arguments within the text.\n            An argument consists of:\n                - A claim (the main point being made).                 - One or more premises (assumptions or evidence supporting the claim).\n            Guidelines for Identifying Arguments:\n                - A premise provides evidence that supports the claim. It is an assumption that is presented as true to support the argument.\n                - A claim is the main point or conclusion of the argument.\n            Notes on Legal Text and References:\n                - Legal references, such as Articles from legal texts, are often linked to the preceding sentence. Pay attention to this when reading.\n                - If a reference (e.g., an Article) supports a

In [52]:
text_window_with_overlap = get_text_window_with_overlap(text_windows, [0], CHAR_COUNT, OVERLAP)    
messages = build_chat_extract_arguments_from_original_text_no_complaints(config.system_instructions_extract_arguments_from_original_text_no_complaints, text_window_with_overlap)
messages


[{'role': 'system',
  'content': '\n            You are a legal expert working at the European Court of Human Rights (ECHR).\n            You will be provided with:\n                - An extract of a court decision.\n            Your task is to identify all arguments within the text.\n            An argument consists of:\n                - A claim (the main point being made).                 - One or more premises (assumptions or evidence supporting the claim).\n            Guidelines for Identifying Arguments:\n                - A premise provides evidence that supports the claim. It is an assumption that is presented as true to support the argument.\n                - A claim is the main point or conclusion of the argument.\n            Notes on Legal Text and References:\n                - Legal references, such as Articles from legal texts, are often linked to the preceding sentence. Pay attention to this when reading.\n                - If a reference (e.g., an Article) supports a

In [141]:

for i in range(0, len(claims_with_arguments_extracted_alt)):
    
    print(color.BOLD + 'New Argument' + color.END)
    print('\n')
    
    entry = claims_with_arguments_extracted_alt[i]
    complaint = entry['complaint']
          
    print(color.BOLD + 'Complaint: ' + color.END)
    print(complaint)
    print('\n')
    
    counter = 1
    for arg in entry['arguments']:
        print(color.BOLD + f'Argument {counter}' + color.END)
        
        claim = arg['claim']
        print(color.BOLD + 'Claim: ' + color.END + claim )
        
        counter_premises = 1
        for premise in arg['premises']:
            
            print(color.BOLD + f'Premise {counter_premises}:' + color.END + premise)
            counter_premises +=1
        
        print('\n')
        counter += 1
    
    print('_____________________________________________')
    print('\n')
    

[1mNew Argument[0m


[1mComplaint: [0m
['The applicant was of the opinion that the facts which had been invoked against him at the time of his arrest and during his continued detention could not, in the eyes of an objective observer, be construed as misappropriation of funds or as a breach of official duties aimed at facilitating the commission of such an offence.', 'The applicant stressed that the decisions leading to the charges against him and his being detained on remand had been taken collectively by the government at the time and in a manner which was consistent with the relevant law, including the then Bulgarian Constitution.']


[1mArgument 1[0m
[1mClaim: [0mhe arguments related to the complaints:

**Complaint about violation of Article 5 para. 1 of the Convention (art. 5-1)**

* Claim: The applicant's detention was not in accordance with Article 5 para. 1 of the Convention (art. 5-1).
* Premises:
	+ The applicant was not suspected of having committed an offence, as the