In [None]:
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import os
import transformers
import json
import re

# Expose only the GPU with index 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Checkpoint identifier shared by the tokenizer and the causal language model.
prebuilt_model_id = "lmsys/vicuna-7b-v1.5"  # lmsys/vicuna-13b-v1.5 is also an option

# Fast tokenizer for the checkpoint. NOTE(review): trust_remote_code=True lets
# the loader execute code shipped with the checkpoint repository -- confirm it
# is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(prebuilt_model_id, trust_remote_code=True, use_fast=True)

# Causal LM placed on the CUDA device; low_cpu_mem_usage is requested to keep
# host-memory use down while the weights are loaded.
model = AutoModelForCausalLM.from_pretrained(prebuilt_model_id, device_map="cuda", low_cpu_mem_usage=True)

# Load data
# JSON files are UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).

# labels.json: top-level keys are topics; the selected topic is later iterated
# as a mapping of main narrative -> collection of sub-narratives.
with open('../Dataset/labels.json', 'r', encoding='utf-8') as json_file:
    narratives_all = json.load(json_file)
narratives = narratives_all['Climate Change']  # Replace with 'Ukraine-Russia War' as needed

# sub_narratives_with_explanations.json: looked up by sub-narrative name to
# obtain the explanatory context that is inserted into the generation prompt.
with open('../Dataset/sub_narratives_with_explanations.json', 'r', encoding='utf-8') as json_file:
    sub_narrative_contexts = json.load(json_file)

# Matches one "Article N:" header and lazily captures everything up to the next
# header (or the end of the text). Compiled once at import time.
_ARTICLE_PATTERN = re.compile(r"Article \d+:(.*?)(?=Article \d+:|$)", re.DOTALL)


# Function to parse articles from the generated text
def parse_articles(text):
    """Split generated text into the bodies of its "Article N:" sections.

    Args:
        text: Raw model output expected to contain sections introduced by
            headers of the form ``Article <number>:``. May be empty or None.

    Returns:
        A list of article bodies in order of appearance. Each body has
        surrounding whitespace and leading/trailing ``#`` characters removed.
        Sections that are empty after cleaning (e.g. an echoed
        ``Article 1: / Article 2:`` template) are dropped so that callers do
        not count blank entries as articles.
    """
    if not text:
        return []

    articles = []
    for body in _ARTICLE_PATTERN.findall(text):
        # Strip whitespace, then markdown-style '#' decoration, then the
        # whitespace that the '#' removal may have exposed.
        cleaned = body.strip().strip("#").strip()
        if cleaned:
            articles.append(cleaned)
    return articles

# Initialize data storage
gen_data = []

# Generation settings. These never change, so they are defined once instead of
# being re-created on every loop iteration.
ARTICLES_PER_SUB_NARRATIVE = 200  # Target number of articles per sub-narrative
ARTICLES_PER_TEMPERATURE = 20  # Move to the next temperature after this many articles
MAX_EMPTY_ATTEMPTS = 5  # Consecutive generations without a usable article before giving up
temp_values = [1.0, 1.3, 1.5]  # List of temperatures for diversity

# Chat scaffolding wrapped around the user request.
DEFAULT_INSTRUCTIONS = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."""
USER_NAME = "USER"
BOT_NAME = "ASSISTANT"

# Upper bound on prompt + generated tokens. Taken from the model config;
# 4096 is only a fallback for configs that do not expose the field.
context_window = getattr(model.config, "max_position_embeddings", 4096)

# Generate articles
for main_narr, sub_narratives in narratives.items():
    for sub_narrative in sub_narratives:
        print(f"Processing sub-narrative: {sub_narrative}")

        context = sub_narrative_contexts[sub_narrative]
        articles_count = 0  # Track the number of articles generated
        empty_attempts = 0  # Consecutive generations that produced no usable article

        # Construct the prompt. It depends only on the sub-narrative, so it is
        # built and tokenized once rather than on every generation attempt.
        # The leading spaces inside the literal are part of the prompt text.
        prompt = f"""
            You are an AI news curator. Generate 5 different news articles related to the following topic on Climate Change.

            Topic: {sub_narrative}
            Context: {context}

            Each article should be between 400-500 words and explore a unique aspect, perspective, or event related to this topic. Focus on delivering informative, coherent, and engaging articles that reflect diverse points of view or angles on the given topic. Avoid redundancy by ensuring that each article highlights a different aspect or argument related to the context provided.

            The output format should look like this:

            Article 1: 
            Article 2: 
            Article 3:
            Article 4:
            Article 5:
            """
        prompt = f"{DEFAULT_INSTRUCTIONS}\n{USER_NAME}: {prompt}\n{BOT_NAME}:"

        # Tokenize on the same device as the model.
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
        prompt_length = input_ids.shape[1]

        # Budget for new tokens: whatever the context window leaves after the
        # prompt. (max_length would also count the prompt tokens.)
        max_new_tokens = context_window - prompt_length
        if max_new_tokens <= 0:
            print(f"Skipping sub-narrative: {sub_narrative} (prompt of {prompt_length} tokens fills the context window)")
            continue

        while articles_count < ARTICLES_PER_SUB_NARRATIVE:
            # Cycle through temp_values, switching every ARTICLES_PER_TEMPERATURE articles
            temperature = temp_values[articles_count // ARTICLES_PER_TEMPERATURE % len(temp_values)]
            generation_output = model.generate(
                input_ids=input_ids,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                do_sample=True,
            )

            # The output sequence starts with the prompt tokens; decode only
            # what was generated after them so the prompt's own
            # "Article 1: ... Article 5:" template is never parsed as articles.
            new_tokens = generation_output[0][prompt_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)

            # Ignore blank sections so they do not count toward the quota.
            articles = [article for article in parse_articles(response) if article.strip()]

            # Append articles to gen_data and count them, never exceeding the quota
            remaining = ARTICLES_PER_SUB_NARRATIVE - articles_count
            accepted = articles[:remaining]
            for article in accepted:
                gen_data.append({
                    "file_Content": article,
                    "narratives_list": [main_narr],
                    "subnarratives_list": [sub_narrative]
                })
            articles_count += len(accepted)

            print(f"Generated {articles_count}/{ARTICLES_PER_SUB_NARRATIVE} articles for sub-narrative: {sub_narrative} (Temp: {temperature})")

            # Guard against looping forever when the model keeps returning
            # output that contains no parseable article.
            if accepted:
                empty_attempts = 0
            else:
                empty_attempts += 1
                if empty_attempts >= MAX_EMPTY_ATTEMPTS:
                    print(f"Giving up on sub-narrative: {sub_narrative} after {empty_attempts} consecutive generations without a usable article")
                    break


In [None]:
import json
# Serialize all generated records (a list of dictionaries) with 4-space
# indentation and write the result to the output JSON file.
serialized_records = json.dumps(gen_data, indent=4)
with open('CC_gen_data.json', 'w') as out_file:
    out_file.write(serialized_records)