In [None]:
import os

# Expose only GPU index 0 to CUDA for this process (set before any model is loaded).
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
import json

# Label taxonomy used throughout the notebook. It is indexed as
# narratives[category] (the main narratives of a category) and
# narratives[category][main_narrative] (the sub-narratives of a main narrative).
# The encoding is given explicitly so decoding does not depend on the platform
# default locale (JSON files are UTF-8).
with open('Dataset/labels.json', 'r', encoding='utf-8') as json_file:
    narratives = json.load(json_file)

In [None]:
# ChatGPT

In [None]:
import os
import openai

# Credentials are taken from the environment so that no secret is ever stored in
# (or committed with) the notebook. Export OPENAI_API_KEY and, if needed,
# OPENAI_ORG_ID before starting the kernel. An unset variable yields None.
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.organization = os.environ.get("OPENAI_ORG_ID")

def GPT_call(prompt):
    """Send `prompt` as a single user message to the OpenAI chat API.

    The request uses the pinned "gpt-4o-mini-2024-07-18" model with
    temperature 0.2 and no system message.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",  # other options noted by the author: gpt-3.5-turbo, gpt-4o-mini
        messages=chat_messages,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [None]:
# Hugging Face pipeline

In [None]:
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import os
import transformers

# Build the text-generation pipeline a single time; llm_call() receives it as an argument.
_llm_pipe_options = dict(
    model="meta-llama/Llama-3.2-3B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",  # "auto" for multi-GPU placement
)
llm_pipe = pipeline("text-generation", **_llm_pipe_options)

# Query an already-loaded text-generation pipeline with a single user prompt
def llm_call(prompt, pipe):
    """Run `prompt` through `pipe` and return the reply text.

    Args:
        prompt: text sent as the only (user) message of the conversation.
        pipe: callable invoked as pipe(messages, max_new_tokens=512,
            do_sample=False); its first result must expose the conversation
            under "generated_text".

    Returns:
        The "content" of the last message in the returned conversation,
        stripped of surrounding whitespace.
    """
    conversation = [{"role": "user", "content": prompt}]
    # Sampling is disabled, so no temperature is passed.
    generation = pipe(conversation, max_new_tokens=512, do_sample=False)
    last_message = generation[0]["generated_text"][-1]
    return last_message["content"].strip()

In [None]:
# Unsloth

In [None]:
# Loader settings forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 6000  # maximum sequence length requested from the loader
dtype = None  # None lets the loader pick; the author's note: Float16 for Tesla T4/V100, Bfloat16 for Ampere+
load_in_4bit = False  # set True to load with 4-bit quantization and reduce memory usage

from unsloth import FastLanguageModel

# `model` and `tokenizer` are module-level objects used by unsloth_llm_call().
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    # model_name="training/UnslothTrain/lora_model",  # use this path for a locally fine-tuned model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

def unsloth_llm_call(prompt):
    """Generate a reply to `prompt` with the module-level unsloth `model`.

    The prompt is wrapped as a single user message, rendered with the
    tokenizer's chat template and decoded greedily (do_sample=False) for at
    most 500 new tokens.

    Args:
        prompt: text sent as the only (user) message.

    Returns:
        The generated continuation as text, with special tokens removed and
        surrounding whitespace stripped.
    """
    messages = [
        {"role": "user", "content": prompt},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    outputs = model.generate(input_ids = inputs,
                             max_new_tokens = 500,
                             use_cache = True,
                             do_sample=False,
                            )
    # generate() returns the prompt followed by the continuation, so slice off the
    # prompt tokens and decode only what the model produced. Dropping special tokens
    # here replaces the former fixed "[:-10]" cut, which assumed the text always ended
    # with an end-of-turn marker and removed real answer characters whenever generation
    # stopped at max_new_tokens instead.
    prompt_length = inputs.shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens = True).strip()
    return assistant_response

In [None]:
# Hierarchical Three-Step Prompting (H3Prompt) 

In [None]:
import json

# Explanation lookups used when building prompts: each maps a (sub-)narrative
# name to its explanation text. The encoding is given explicitly so decoding
# does not depend on the platform default locale (JSON files are UTF-8).
with open('Dataset/sub_narratives_with_explanations.json', 'r', encoding='utf-8') as json_file:
    sub_narratives_with_explanations = json.load(json_file)
with open('Dataset/main_narratives_with_explanations.json', 'r', encoding='utf-8') as json_file:
    main_narratives_with_explanations = json.load(json_file)

# Step 1: Classify the document into a category
def classify_category(document_text):
    """Ask the LLM which topic the document belongs to.

    The prompt offers "Ukraine-Russia War", "Climate Change" or "Other".

    Returns:
        The model's reply exactly as returned by unsloth_llm_call(); callers
        are responsible for any clean-up of quotes or punctuation.
    """
    prompt = f"""
    Given the following document text, classify it into one of the two categories: "Ukraine-Russia War" or "Climate Change". 

    Document Text: 
    {document_text}

    Determine the category that closely or partially fits the document. If neither category applies, return "Other". Return only the output, without any additional explanations or text.
    """

    # Backend choice: GPT_call(prompt) or llm_call(prompt, llm_pipe) can be swapped in here.
    return unsloth_llm_call(prompt)

# Step 2: Identify the main narratives
def classify_narratives(document_text, category):
    """Ask the LLM which main narratives of `category` apply to the document.

    Args:
        document_text: text of the document to classify.
        category: key of the module-level `narratives` mapping; a missing
            key raises KeyError.

    Returns:
        The model's raw reply; the prompt requests a hash-separated list of
        narrative names, or "Other".
    """
    # One "- name: explanation" bullet per main narrative of this category.
    bullet_lines = []
    for narrative in narratives[category]:
        explanation = main_narratives_with_explanations[narrative]
        bullet_lines.append(f'- {narrative}: {explanation}')
    narratives_list_with_explanations = "\n".join(bullet_lines)

    prompt = f"""
    The document text given below is related to "{category}". 
    Please classify the document text into the most relevant narratives. Below is a list of narratives along with their explanations:

    {narratives_list_with_explanations}

    Document Text: 
    {document_text}
    
    Return the most relevant narratives as a hash-separated string (eg. Narrative1#Narrative2..). If no specific narrative can be assigned, just return "Other" and nothing else. Return only the output, without any additional explanations or text.
    """

    # Backend choice: GPT_call(prompt) or llm_call(prompt, llm_pipe) can be swapped in here.
    return unsloth_llm_call(prompt)

# Step 3: Identify the sub-narratives based on main narratives
def classify_sub_narrative(document_text, category, main_narrative):
    """Ask the LLM which sub-narratives of `main_narrative` apply to the document.

    Args:
        document_text: text of the document to classify.
        category: key of the module-level `narratives` mapping.
        main_narrative: key of `narratives[category]`.

    Returns:
        "Other" immediately (no model call) for the main narrative
        "Hidden plots by secret schemes of powerful groups"; otherwise the
        model's raw reply, requested as a hash-separated list of sub-narrative
        names or "Other".
    """
    # This main narrative is special-cased: it is never broken down further.
    if main_narrative == "Hidden plots by secret schemes of powerful groups":
        return "Other"

    # One "- name: explanation" bullet per sub-narrative of this main narrative.
    bullet_lines = []
    for sub_narrative in narratives[category][main_narrative]:
        explanation = sub_narratives_with_explanations[sub_narrative]
        bullet_lines.append(f'- {sub_narrative}: {explanation}')
    sub_narratives_list_with_explanations = "\n".join(bullet_lines)

    prompt = f"""
    The document text given below is related to "{category}" and its main narrative is: "{main_narrative}".
    Please classify the document text into the most relevant sub-narratives. Below is a list of sub-narratives along with their explanations:

    {sub_narratives_list_with_explanations}

    Document Text:
    {document_text}

    Return the most relevant sub-narratives as a hash-separated string (e.g., Sub-narrative1#Sub-narrative2..). If no specific sub-narrative can be assigned, just return "Other" and nothing else. Return only the output, without any additional explanations or text.
    """

    # Backend choice: GPT_call(prompt) or llm_call(prompt, llm_pipe) can be swapped in here.
    return unsloth_llm_call(prompt)

# Function to classify the document and return labels
def classify_document(document_text):
    """Run the three prompting steps on one document.

    Returns:
        A two-element list [category, labels] where labels is a list of
        "main narrative : sub-narrative" strings. Whenever nothing usable is
        produced the labels are ['Other : Other']; if the model replies were
        all rejected after step 3 the category is reported as "Other" too.
    """
    # Step 1: category, with quotes and periods removed from the model reply
    raw_category = classify_category(document_text)
    category = raw_category.replace('"','').replace('.','').strip()

    # "Other" or a category missing from the taxonomy ends the pipeline here.
    if category == "Other" or category not in narratives:
        return [category, ["Other : Other"]]

    # Step 2: main narratives as one hash-separated string
    main_narratives = classify_narratives(document_text, category)
    if main_narratives == "Other":
        return [category, ["Other : Other"]]

    # Step 3: query sub-narratives for every recognised main narrative.
    # A dict is used so a main narrative repeated by the model is kept once.
    all_sub_narratives = {}
    for piece in main_narratives.split("#"):
        main_narrative = piece.strip("-").strip()
        if main_narrative not in narratives[category]:
            print("******* main_narrative not found *******")
            continue
        sub_narratives = classify_sub_narrative(document_text, category, main_narrative)
        all_sub_narratives[main_narrative] = [
            sub.strip("-").strip() for sub in sub_narratives.split("#")
        ]

    # Keep only sub-narratives that exist in the taxonomy (or the literal "Other").
    final_label = []
    for main, subs in all_sub_narratives.items():
        for sub in subs:
            if sub and (sub in sub_narratives_with_explanations or sub=="Other"):
                final_label.append(f"{main} : {sub}")
            else:
                print("******* sub_narrative not found *******", sub)

    if final_label:
        return [category, final_label]
    return ["Other", ['Other : Other']]

In [None]:
# Binary Prompt

In [None]:
# import json
# with open('Dataset/sub_narratives_with_explanations.json', 'r') as json_file:
#     sub_narratives_with_explanations = json.load(json_file)
# with open('Dataset/main_narratives_with_explanations.json', 'r') as json_file:
#     main_narratives_with_explanations = json.load(json_file)

# # Step 1: Identify the main narrative using binary classification
# def classify_main_narrative(document_text, category):
#     main_narratives = narratives[category]
#     relevant_main_narratives = []
    
#     for main_narrative in main_narratives:
#         prompt = f"""
#         Please classify whether the document supports the following narrative:

#         Narrative: "{main_narrative}"
#         Explanation: {main_narratives_with_explanations[main_narrative]}

#         Document Text: 
#         {document_text}

#         Respond with "Yes" if the narrative applies to the document, or "No" if it does not. Return only "Yes" or "No" without any additional explanations or text.
#         """
        
#         # Call the LLM to classify each main narrative
#         # generated_output = GPT_call(prompt).strip()
#         # generated_output = llm_call(prompt, llm_pipe).strip()
#         generated_output = unsloth_llm_call(prompt).strip()
        
#         if generated_output.lower() == "yes":
#             relevant_main_narratives.append(main_narrative)
    
#     return relevant_main_narratives

# # Step 2: Identify the sub-narratives using binary classification
# def classify_sub_narratives(document_text, category, main_narratives):
#     all_sub_narratives = {}
#     # print(category, main_narratives)
#     for main_narrative in main_narratives:
#         if main_narrative == "Hidden plots by secret schemes of powerful groups":
#             continue
        
#         relevant_sub_narratives = []
#         for sub_narrative in narratives[category][main_narrative]:
#             prompt = f"""
#             Please classify whether the document supports the following narrative:

#             Narrative: "{sub_narrative}"
#             Explanation: {sub_narratives_with_explanations[sub_narrative]}

#             Document Text:
#             {document_text}

#             Respond with "Yes" if the narrative applies to the document, or "No" if it does not. Return only "Yes" or "No" without any additional explanations or text.
#             """
            
#             # Call the LLM for each sub-narrative classification
#             # generated_output = GPT_call(prompt).strip()
#             # generated_output = llm_call(prompt, llm_pipe).strip()
#             generated_output = unsloth_llm_call(prompt).strip()
            
#             if generated_output.lower() == "yes":
#                 relevant_sub_narratives.append(sub_narrative)

#         if relevant_sub_narratives:
#             all_sub_narratives[main_narrative] = relevant_sub_narratives

#     return all_sub_narratives

# # Function to classify the document and return labels
# def classify_document(document_text):
#     final_label = []

#     # Step 1: Identify the relevant main narratives
#     relevant_main_narratives = []
#     for category in narratives:
#         main_narratives = classify_main_narrative(document_text, category)
#         relevant_main_narratives.extend(main_narratives)

#     if not relevant_main_narratives:
#         print("not relevant_main_narratives")
#         final_label.append("Other : Other")
#     else:
#         # Step 2: Identify the sub-narratives for each main narrative
#         all_sub_narratives = {}
#         for main_narrative in relevant_main_narratives:
#             category = "Ukraine-Russia War" if main_narrative in narratives["Ukraine-Russia War"] else "Climate Change"
#             sub_narratives = classify_sub_narratives(document_text, category, [main_narrative])
#             if sub_narratives:
#                 all_sub_narratives.update(sub_narratives)

#         # Collect the final labels
#         for main, subs in all_sub_narratives.items():
#             for sub in subs:
#                 if sub and sub in sub_narratives_with_explanations:
#                     final_label.append(f"{main} : {sub}")

#     if not final_label:
        
#         return ["Other", ['Other : Other']]
#     else:
#         return [category, final_label]


In [None]:
# get test set predictions 

In [None]:
import glob
import json
import os

# Directory containing the .txt files
test_lang = "EN"
directory = "Dataset/testdata_ST12/"+test_lang+"/subtask-2-documents" # download data from task website

# Category name -> short code used as the label prefix in the prediction lines
cat_mapping = {"Ukraine-Russia War":"URW", "Climate Change":"CC", "Other":"Other"}

# One tab-separated line per document: filename, narratives, sub-narratives
output_file = []

# Use glob to find all .txt files in the directory
for file_path in glob.glob(f"{directory}/*.txt"):
    # Extract the filename from the path (portable across path separators)
    filename = os.path.basename(file_path)

    # Read the content, then release the file before the (slow) model calls
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    category, predicted = classify_document(file_content)

    # The model may wrap the category in extra words; map it back by keyword.
    if "climate" in category.lower():
        category = "Climate Change"
    if "ukraine" in category.lower() or "russia" in category.lower():
        category = "Ukraine-Russia War"
    # An unrecognised category always arrives with the "Other : Other" fallback from
    # classify_document, so the code is unused in that case; .get() avoids aborting
    # the whole run with a KeyError on an unexpected model reply.
    cat_code = cat_mapping.get(category, "Other")

    if len(predicted) == 1 and predicted[0] == "Other : Other":
        narr_final_joined = "Other"
        subnarr_final_joined = "Other"
    else:
        narr_final = []
        subnarr_final = []
        for label in predicted:
            narr, subnarr = label.split(" : ")
            narr_final.append(cat_code + ": " + narr)
            subnarr_final.append(cat_code + ": " + narr + ": " + subnarr)
        narr_final_joined = ";".join(narr_final)
        subnarr_final_joined = ";".join(subnarr_final)

    output_file.append(f"{filename}\t{narr_final_joined}\t{subnarr_final_joined}")

    print(f"{filename}\t{narr_final_joined}\t{subnarr_final_joined}")

    print("-------------------------")

In [None]:
# Writing to the file
# Create the output folder first so a missing directory cannot make the write
# fail after the whole inference run has completed.
os.makedirs("test_set_predictions", exist_ok=True)
with open("test_set_predictions/GATENLP_"+test_lang+"_1.txt", "w") as file:
    for line in output_file:
        file.write(line + "\n")

In [None]:
# ensemble union

In [None]:
# Read the three run files; for every line keep the labels of its last
# tab-separated column, split on ";"
run_files = (
    "test_set_predictions/GATENLP_"+test_lang+"_1.txt",
    "test_set_predictions/GATENLP_"+test_lang+"_3.txt",
    "test_set_predictions/GATENLP_"+test_lang+"_5.txt",
)

run_outputs = []
for run_path in run_files:
    with open(run_path, "r") as file:
        run_outputs.append([line.split("\t")[-1].strip().split(";") for line in file])

# Individual names kept for the runs, in the same order as run_files.
lines2, lines3, lines4 = run_outputs

def ensemble_union(run_outputs):
    """Merge the label lists of several runs, instance by instance.

    Args:
        run_outputs: sequence of runs; each run is a list holding one list of
            labels per instance, with runs aligned by position.

    Returns:
        A list with one entry per instance: the distinct labels predicted by
        any run, in order of first appearance (run order, then position within
        the run). Instances beyond the shortest run are dropped, as zip() stops
        at the shortest input.
    """
    ensembled_results = []
    for instance_preds in zip(*run_outputs):  # Group predictions for each instance
        # dict keys preserve insertion order, so the union is reproducible; iterating
        # a set of strings can yield a different order in every interpreter session.
        union_labels = dict.fromkeys(
            label for run_pred in instance_preds for label in run_pred
        )
        ensembled_results.append(list(union_labels))
    return ensembled_results

# Perform union ensembling
new_predictions = ensemble_union(run_outputs)

# Take the document names from the first run's prediction file (first tab-separated
# column). The label lists were read from the same lines, so name and labels are
# guaranteed to belong together; re-listing the directory with glob would pair them
# by an arbitrary, filesystem-dependent order.
with open("test_set_predictions/GATENLP_"+test_lang+"_1.txt", "r") as file:
    filenames = [line.split("\t")[0] for line in file]

# ensemble_union() stops at the shortest run; refuse to emit a partial file.
if len(filenames) != len(new_predictions):
    raise ValueError(
        f"Prediction runs are not aligned: {len(filenames)} documents in run 1 "
        f"but {len(new_predictions)} ensembled predictions"
    )

output_file = []
for filename, labels in zip(filenames, new_predictions):
    # "Other" only counts when no run predicted a real label for this document.
    kept_labels = [label for label in labels if label != "Other"]

    if not kept_labels:
        narr_final_joined = "Other"
        subnarr_final_joined = "Other"
    else:
        # A label looks like "<code>: <narrative>: <sub-narrative>"; the narrative
        # column keeps the first two parts, the sub-narrative column the full label.
        narr_final = [": ".join(label.split(": ")[:2]) for label in kept_labels]
        narr_final_joined = ";".join(narr_final)
        subnarr_final_joined = ";".join(kept_labels)

    output_file.append(f"{filename}\t{narr_final_joined}\t{subnarr_final_joined}")

    print(f"{filename}\t{narr_final_joined}\t{subnarr_final_joined}")

    print("-------------------------")

In [None]:
# Save the ensembled prediction lines, one per row
with open("test_set_predictions/GATENLP_"+test_lang+"_7.txt", "w") as file:
    file.writelines(line + "\n" for line in output_file)