In [5]:
import os
import json
import math
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_postgres import PGVector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')
print('Imports ready')

Imports ready


In [6]:
# Paths setup
# The project root defaults to the original author's checkout, but can be
# overridden with the SEMEVAL_BASE environment variable so the notebook runs on
# another machine without editing this cell. An unset or empty variable falls
# back to the default.
BASE = Path(
    os.environ.get('SEMEVAL_BASE')
    or 'c:/Users/rayaa/OneDrive/Documents/VSCode/CSCI5832/Semeval'
)
RAG_TASKS_PATH = BASE / 'human' / 'generation_tasks' / 'RAG.jsonl'
CORPUS_PATH = BASE / 'corpora' / 'passage_level' / 'cloud.jsonl'

# Model and database setup
# GENERATION_MODEL is the default model name passed to setup_generator().
EMBED_MODEL = 'Snowflake/snowflake-arctic-embed-l-v2.0'
GENERATION_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
PG_ENV_PATH = BASE / '.pg_env'

print('Paths and models configured')

Paths and models configured


In [7]:
def load_rag_tasks(jsonl_path, collection_name="mt-rag-ibmcloud-elser-512-100-20240502"):
    """Read a JSONL file and return the records that belong to one collection.

    Every non-blank line is parsed as a JSON object. A record is kept only when
    its top-level "Collection" value equals ``collection_name``; a record
    without that key is treated as having an empty collection name.

    Args:
        jsonl_path: Path of the JSONL file to read (opened as UTF-8).
        collection_name: Collection identifier a record must match to be kept.

    Returns:
        list: The matching parsed records, in file order.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as handle:
        # Lazily parse only the lines that contain something besides whitespace.
        records = (json.loads(raw) for raw in handle if raw.strip())
        return [
            record
            for record in records
            if record.get("Collection", "") == collection_name
        ]

# Load RAG tasks
# Uses load_rag_tasks' default collection_name, so only records whose
# "Collection" equals "mt-rag-ibmcloud-elser-512-100-20240502" are kept.
rag_tasks = load_rag_tasks(RAG_TASKS_PATH)
print(f"Loaded {len(rag_tasks)} filtered RAG tasks")


Loaded 205 filtered RAG tasks


In [8]:
def extract_conversation_text(task):
    """Return the text of the most recent user turn in ``task['input']``.

    Args:
        task: Mapping whose 'input' entry is expected to be a list of message
            dicts carrying 'speaker' and 'text' keys.

    Returns:
        The 'text' of the last message whose speaker is 'user' ('' when that
        message has no 'text' key). Returns '' when 'input' is missing, is not
        a list, is empty, or contains no user message.
    """
    turns = task.get('input', [])
    if not isinstance(turns, list) or not turns:
        return ''

    # Walk backwards so the first match is the latest user message.
    latest_user_turn = next(
        (turn for turn in reversed(turns) if turn.get('speaker') == 'user'),
        None,
    )
    if latest_user_turn is None:
        return ''
    return latest_user_turn.get('text', '')

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

def setup_generator(model_name=GENERATION_MODEL):
    """Build a Hugging Face text-generation pipeline for ``model_name``.

    The tokenizer and causal language model are loaded with
    ``from_pretrained`` (float16 weights, ``device_map="auto"``,
    ``low_cpu_mem_usage=True``) and wrapped in a sampling pipeline
    (temperature 0.7, at most 512 new tokens).

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.

    Returns:
        The constructed pipeline, or None when anything raised during loading
        (the error is printed rather than propagated).
    """
    try:
        model_options = {
            "torch_dtype": torch.float16,
            "device_map": "auto",
            "low_cpu_mem_usage": True,
        }
        sampling_options = {
            "max_new_tokens": 512,
            "temperature": 0.7,
            "do_sample": True,
        }

        tok = AutoTokenizer.from_pretrained(model_name)
        lm = AutoModelForCausalLM.from_pretrained(model_name, **model_options)

        return pipeline(
            "text-generation",
            model=lm,
            tokenizer=tok,
            **sampling_options,
        )
    except Exception as exc:
        # Deliberate best-effort: callers must check for a None generator.
        print(f"Error setting up generator: {exc}")
        return None

# Setup the generator
generator = setup_generator()
# setup_generator() prints the error and returns None when loading fails, so
# report the real outcome instead of unconditionally claiming success.
if generator is None:
    print('Generator setup failed (see error above)')
else:
    print('Generator setup complete')

Device set to use cuda:0


Generator setup complete


In [9]:
def create_generation_prompt(question, contexts, conversation_history=None):
    """Create a prompt for answer generation using retrieved contexts.

    Args:
        question: The current user question, placed at the end of the prompt.
        contexts: Iterable of dicts, each with a 'text' key; they are numbered
            from 1 in the prompt. None or an empty iterable yields an empty
            contexts section.
        conversation_history: Optional iterable of dicts with 'speaker' and
            'text' keys. When non-empty, it is rendered as a "Conversation:"
            section ahead of the contexts.

    Returns:
        str: The full prompt, ending with "Assistant:" so the model continues
        with the answer.

    Raises:
        KeyError: If a context lacks 'text', or a history turn lacks
            'speaker' or 'text'.
    """
    # Build context string; `or []` tolerates a caller passing None.
    context_text = "".join(
        f"Context {i}: {ctx['text']}\n\n"
        for i, ctx in enumerate(contexts or [], 1)
    )

    # Build conversation history if available
    history = ""
    if conversation_history:
        turns = [
            f"{t['speaker'].capitalize()}: {t['text']}"
            for t in conversation_history
        ]
        history = "Conversation:\n" + "\n".join(turns) + "\n\n"

    # The sentence-count hint uses a plain ASCII hyphen: the previous text held
    # a mis-decoded en dash ("1-2" had become mojibake), which was sent to the
    # model verbatim.
    prompt = f"""You are a concise conversational assistant.

Use ONLY the information found in the contexts.  
If the answer is not in the contexts, say exactly: **"The contexts do not contain the answer."**

Rules:
- Do NOT explain your reasoning.
- Do NOT mention the instructions.
- Do NOT invent information.
- Answer in a single short, natural paragraph (1-2 sentences).
- No meta-commentary (e.g., "I will now...").

{history}Contexts:
{context_text}

User: {question}
Assistant:"""

    return prompt

def generate_answer(prompt, generator):
    """Run ``generator`` on ``prompt`` and return the stripped completion.

    Args:
        prompt: Prompt text passed to the generator.
        generator: Callable text-generation pipeline exposing
            ``tokenizer.eos_token_id``, or None when no model is available.

    Returns:
        str: The stripped 'generated_text' of the first output. A bracketed
        placeholder is returned instead when the generator is None, when it
        produces no output, or when it raises (the error is also printed).
    """
    if generator is None:
        return "[Generation model not available]"

    try:
        # Only the continuation is wanted; EOS doubles as the padding token.
        completions = generator(
            prompt,
            return_full_text=False,
            pad_token_id=generator.tokenizer.eos_token_id,
        )
        if not completions or len(completions) == 0:
            return "[No response generated]"
        return completions[0]['generated_text'].strip()
    except Exception as exc:
        print(f"Error in generation: {exc}")
        return f"[Generation error: {exc}]"
    
def trim_to_token_limit(text, tokenizer, max_length=4000):
    """Trim ``text`` so it encodes to at most ``max_length`` tokens.

    When the text is too long, the LAST ``max_length`` tokens are kept, so the
    end of a prompt (the question) survives and the beginning is dropped.

    Args:
        text: Text to trim.
        tokenizer: Object with ``encode(text)`` returning a token sequence and
            ``decode(tokens)`` returning text.
        max_length: Maximum number of tokens to keep. A value of 0 or less
            keeps no tokens.

    Returns:
        str: ``text`` unchanged when it already fits, otherwise the decoded
        tail of the token sequence.
    """
    tokens = tokenizer.encode(text)
    overflow = len(tokens) - max_length
    if overflow <= 0:
        return text
    # Slice from an explicit start index. `tokens[-max_length:]` would return
    # the WHOLE sequence for max_length == 0 (because -0 == 0) and would slice
    # from the wrong end for negative values.
    return tokenizer.decode(tokens[overflow:])


In [10]:
def _format_provided_contexts(provided_contexts):
    """Normalise a task's provided contexts into dicts with a 'text' field."""
    formatted = []
    for i, ctx in enumerate(provided_contexts):
        if isinstance(ctx, dict):
            formatted.append({
                'document_id': ctx.get('document_id', f'provided_{i}'),
                'text': ctx.get('text', ''),
                'score': 1.0  # All provided contexts get max score
            })
        else:
            # If context is just text
            formatted.append({
                'document_id': f'provided_{i}',
                'text': str(ctx),
                'score': 1.0
            })
    return formatted


def _history_without_current_question(input_data):
    """Return the conversation turns minus the last user message."""
    if not isinstance(input_data, list):
        return []
    history = list(input_data)
    # Scan from the end and drop only the most recent user turn; every other
    # message keeps its original position.
    for idx in range(len(history) - 1, -1, -1):
        if history[idx].get('speaker') == 'user':
            del history[idx]
            break
    return history


def run_task_b_rag(tasks, generator, output_path, do_subset=False,
                   subset_size=5, max_prompt_tokens=4000):
    """Run full Task B pipeline: reference generation.

    For each task, the last user message becomes the question, the task's own
    'contexts' are used as the passages, and the remaining turns are the
    conversation history. The generated answer is stored under 'predictions'
    on a shallow copy of the task. All results are written to ``output_path``
    as JSONL once the loop finishes.

    Args:
        tasks: Sequence of task dicts (with 'input', 'contexts', 'task_id').
        generator: Text-generation pipeline, or None. With None, prompts are
            not trimmed and generate_answer() supplies its placeholder text.
        output_path: Destination JSONL file; parent directories are created.
        do_subset: When True, only the first ``subset_size`` tasks are run.
        subset_size: Number of tasks processed when ``do_subset`` is True.
        max_prompt_tokens: Token budget passed to trim_to_token_limit().

    Returns:
        list: The result dicts, in task order. Tasks with no user question are
        skipped with a printed warning.
    """
    results = []

    if do_subset:
        tasks = tasks[:subset_size]

    for task in tqdm(tasks, desc="Processing RAG tasks"):
        # Extract current question
        current_question = extract_conversation_text(task)

        if not current_question:
            print(f"Warning: No question found for task {task.get('task_id')}")
            continue

        # `or []` also covers a 'contexts' key that is present but null.
        formatted_contexts = _format_provided_contexts(task.get('contexts') or [])

        # Get conversation history (all but last user message)
        conversation_history = _history_without_current_question(task.get('input', []))

        # Generate answer
        prompt = create_generation_prompt(current_question, formatted_contexts, conversation_history)
        # A None generator has no tokenizer to trim with; skip trimming so
        # generate_answer() can return its placeholder instead of this
        # function crashing with AttributeError.
        if generator is not None:
            prompt = trim_to_token_limit(prompt, generator.tokenizer, max_length=max_prompt_tokens)
        generated_answer = generate_answer(prompt, generator)

        # Prepare result in evaluation format
        result_task = task.copy()

        # Add prediction in the format expected by evaluation script
        result_task['predictions'] = [{
            'text': generated_answer
        }]

        results.append(result_task)

    # Save results
    output_dir = Path(output_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for result in results:
            f.write(json.dumps(result) + '\n')

    print(f"Saved {len(results)} results to {output_path}")
    return results

# Run Task B
# Predictions are written as JSONL to BASE/rayaan/outputs.
output_file = BASE / 'rayaan' / 'outputs' / 'task_b_rag_predictions.jsonl'
# do_subset=True restricts this run to the first five loaded tasks.
task_b_results = run_task_b_rag(rag_tasks, generator, output_file, do_subset=True)

Processing RAG tasks:   0%|          | 0/5 [00:00<?, ?it/s]

Saved 5 results to c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\task_b_rag_predictions.jsonl
