In [None]:
# ========================================
# 1. Install Dependencies
# ========================================
!pip install transformers datasets scikit-learn newspaper3k PyMuPDF



In [None]:
# ========================================
# 2. Setup & Initialization
# ========================================
import pandas as pd
from datasets import load_dataset
from datasets import Dataset
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import Trainer
from transformers import TrainingArguments
import torch
import newspaper
import fitz
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# ========================================
# 3. Data Preparation
# ========================================
def load_and_prepare_data():
    # Load the LIAR dataset
    dataset = load_dataset("liar")

    # Convert to DataFrames
    df_train = pd.DataFrame(dataset['train'])
    df_val = pd.DataFrame(dataset['validation'])
    df_test = pd.DataFrame(dataset['test'])

    # Define robust label mapping
    def map_labels(label):
        if isinstance(label, str):
            label = int(label)
        # Original LIAR labels:
        # 0: pants-fire, 1: false, 2: barely-true,
        # 3: half-true, 4: mostly-true, 5: true
        # Map to binary: 0 = real (4,5), 1 = fake (0,1,2,3)
        return 0 if label in [4, 5] else 1

    # Apply label mapping and convert to integers
    for df in [df_train, df_val, df_test]:
        df['label'] = df['label'].apply(map_labels).astype(int)
        df = df.dropna(subset=['label', 'statement'])  # Remove any NA values

    # Convert back to HuggingFace datasets
    return {
        'train': Dataset.from_pandas(df_train),
        'validation': Dataset.from_pandas(df_val),
        'test': Dataset.from_pandas(df_test)
    }

dataset = load_and_prepare_data()

In [None]:
# ========================================
# 4. Tokenization
# ========================================
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(batch):
    return roberta_tokenizer(batch['statement'], truncation=True, padding='max_length', max_length=128)

# Apply tokenization to all splits
for split in dataset:
    dataset[split] = dataset[split].map(tokenize, batched=True)
    dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

In [None]:
# ========================================
# 5. Model Setup & Fine-Tuning
# ========================================
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    problem_type="single_label_classification"
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

In [37]:
# Metrics computation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [38]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics
)

In [39]:
# Fine-tune the model
print("Starting fine-tuning...")
trainer.train()

Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3889,0.667457,0.720405,0.828476,0.746127,0.931257
2,0.4523,0.677116,0.719626,0.825919,0.751099,0.917293


TrainOutput(global_step=1284, training_loss=0.4344906799518431, metrics={'train_runtime': 493.0904, 'train_samples_per_second': 41.652, 'train_steps_per_second': 2.604, 'total_flos': 1350943713745920.0, 'train_loss': 0.4344906799518431, 'epoch': 2.0})

In [40]:
# Evaluate on test set
print("\nEvaluating on test set...")
test_results = trainer.evaluate(dataset['test'])
print("\nTest set results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")


Evaluating on test set...



Test set results:
eval_loss: 0.5921
eval_accuracy: 0.7568
eval_f1: 0.8562
eval_precision: 0.7787
eval_recall: 0.9509
eval_runtime: 7.8524
eval_samples_per_second: 163.3900
eval_steps_per_second: 10.3150
epoch: 2.0000


In [None]:
# ========================================
# 6. Input Parser (Text | URL | PDF)
# ========================================
def extract_text(input_type, value):
    """Extract text from various input types"""
    try:
        if input_type == 'url':
            article = newspaper.Article(value)
            article.download()
            article.parse()
            return article.text
        elif input_type == 'pdf':
            with fitz.open(value) as doc:
                return "\n".join([page.get_text() for page in doc])
        elif input_type == 'text':
            return value
        else:
            return "Invalid input type"
    except Exception as e:
        return f"Error processing input: {str(e)}"

In [None]:
# ========================================
# 7. Prediction & Explanation Pipeline
# ========================================
# Initialize FLAN-T5 for explanation
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

def classify_and_explain(text):
    """Classify text and generate explanation"""
    # Ensure models are on the same device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    flan_model.to(device)

    # Classification with fine-tuned RoBERTa
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    confidence = probs[0][pred].item()
    label = "Real" if pred == 0 else "Fake"

    # Explanation with FLAN-T5
    prompt = f"Explain why this news might be {label.lower()} in one sentence: {text}"
    explanation_inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    explanation_ids = flan_model.generate(
        explanation_inputs['input_ids'],
        max_new_tokens=100,
        num_beams=5,
        early_stopping=True
    )
    explanation = flan_tokenizer.decode(explanation_ids[0], skip_special_tokens=True)

    return label, confidence, explanation

In [None]:
# ========================================
# 8. Full Inference Pipeline
# ========================================
def analyze_news(input_type, value):
    """Complete pipeline from input to prediction"""
    print(f"\nProcessing {input_type} input...")

    # Step 1: Extract text
    text = extract_text(input_type, value)
    if text.startswith("Error") or text == "Invalid input type":
        print(f"Error: {text}")
        return

    print(f"\nExtracted text:\n{text[:500]}...\n")

    # Step 2: Classify and explain
    label, confidence, explanation = classify_and_explain(text)

    # Step 3: Display results
    print(f"\nPrediction Results:")
    print(f"Label: {label}")
    print(f"Confidence: {confidence:.2f}")
    print(f"Explanation: {explanation}")

    return label, confidence, explanation

In [None]:
# ========================================
# 9. Example Usage
# ========================================
# Example 1: Text input
analyze_news(
    input_type='text',
    value="Scientists confirm that eating chocolate daily improves longevity by 20 years."
)

# Example 2: Fake news example
analyze_news(
    input_type='text',
    value="The moon landing was filmed in a Hollywood studio."
)


Processing text input...

Extracted text:
Scientists confirm that eating chocolate daily improves longevity by 20 years....


🔍 Prediction Results:
🏷️ Label: Fake
📊 Confidence: 0.96
💡 Explanation: chocolate is a good source of Vitamin E.

Processing text input...

Extracted text:
The moon landing was filmed in a Hollywood studio....


🔍 Prediction Results:
🏷️ Label: Fake
📊 Confidence: 0.91
💡 Explanation: The moon landing was filmed in a Hollywood studio.


('Fake',
 0.9064982533454895,
 'The moon landing was filmed in a Hollywood studio.')

In [None]:
# ========================================
# 10. Save Model
# ========================================
model.save_pretrained("./fine_tuned_liar_detector")
roberta_tokenizer.save_pretrained("./fine_tuned_liar_detector")
print("\nModel saved to ./fine_tuned_liar_detector")


Model saved to ./fine_tuned_liar_detector


In [46]:
from google.colab import files
import shutil

# Zip the folder
shutil.make_archive("fine_tuned_liar_detector", 'zip', "./fine_tuned_liar_detector")

# Download the zip file
files.download("fine_tuned_liar_detector.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Multi Agent**

In [1]:
# ========================================
# 1. Imports & Setup
# ========================================
import torch
import fitz
import newspaper
from duckduckgo_search import DDGS
from langgraph.graph import StateGraph
from langgraph.graph import END
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ========================================
# 2. Load Models
# ========================================
roberta_tokenizer = RobertaTokenizer.from_pretrained('../fine_tuned_liar_detector')
roberta_model = RobertaForSequenceClassification.from_pretrained('../fine_tuned_liar_detector')

In [3]:
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_model.to(device)
flan_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [4]:
# ========================================
# 3. Define Agents
# ========================================

# Initial state

# InputHandlerAgent
def InputHandlerAgent(state):
    input_type = state["input_type"]
    value = state["value"]

    try:
        if input_type == 'url':
            article = newspaper.Article(value)
            article.download()
            article.parse()
            text = article.text
        elif input_type == 'pdf':
            with fitz.open(value) as doc:
                text = "\n".join([page.get_text() for page in doc])
        elif input_type == 'text':
            text = value
        else:
            text = ""

        return {**state, "text": text}
    except Exception as e:
        return {**state, "error": str(e), "text": ""}

In [5]:
# PlannerAgent
def PlannerAgent(state):
    text = state.get("text", "")
    if len(text.strip()) < 50:
        return {**state, "next": "FallbackSearchAgent"}
    return {**state, "next": "ToolRouterAgent"}

In [6]:
# FallbackSearchAgent
def FallbackSearchAgent(state):
    value = state.get("value")
    query = value if isinstance(value, str) else "latest news fake"
    results = DDGS().text(query)
    try:
        top_result = next(results)
        summary = top_result['body']
    except Exception:
        summary = "No fallback results found."

    return {**state, "text": summary, "fallback_used": True}

In [7]:
# ToolRouterAgent
def ToolRouterAgent(state):
    return {**state, "next": "ExecutorAgent"}

In [8]:
# ClassifierAgent
def ClassifierAgent(state):
    inputs = roberta_tokenizer(state['text'], return_tensors="pt", truncation=True, padding='max_length', max_length=128).to(device)
    with torch.no_grad():
        logits = roberta_model(**inputs).logits
    probs = torch.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    label = "Real" if pred == 0 else "Fake"
    confidence = probs[0][pred].item()

    return {**state, "label": label, "confidence": confidence}

In [9]:
# ExplanationAgent
def ExplanationAgent(state):
    label = state.get("label", "Fake")
    prompt = f"Explain why this news might be {label.lower()} in one sentence: {state['text']}"
    inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    output_ids = flan_model.generate(inputs['input_ids'], max_new_tokens=100)
    explanation = flan_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return {**state, "explanation": explanation}

In [10]:
# ExecutorAgent
def ExecutorAgent(state):
    state = ClassifierAgent(state)
    state = ExplanationAgent(state)
    return state

In [None]:
# ========================================
# 4. Build LangGraph
# ========================================
from typing import TypedDict
from typing import Optional
from typing import Literal
from langgraph.graph import StateGraph

# Define the state type
class AgentState(TypedDict):
    input_type: Literal["url", "pdf", "text"]
    value: str
    text: Optional[str]
    error: Optional[str]
    next: Optional[str]
    label: Optional[str]
    confidence: Optional[float]
    explanation: Optional[str]
    fallback_used: Optional[bool]

In [12]:
# Initialize the graph with the state schema
graph = StateGraph(AgentState)

graph.add_node("InputHandlerAgent", InputHandlerAgent)
graph.add_node("PlannerAgent", PlannerAgent)
graph.add_node("FallbackSearchAgent", FallbackSearchAgent)
graph.add_node("ToolRouterAgent", ToolRouterAgent)
graph.add_node("ExecutorAgent", ExecutorAgent)

graph.set_entry_point("InputHandlerAgent")

graph.add_edge("InputHandlerAgent", "PlannerAgent")
graph.add_conditional_edges("PlannerAgent", lambda s: s["next"], {
    "FallbackSearchAgent": "FallbackSearchAgent",
    "ToolRouterAgent": "ToolRouterAgent"
})
graph.add_edge("FallbackSearchAgent", "ToolRouterAgent")
graph.add_edge("ToolRouterAgent", "ExecutorAgent")
graph.add_edge("ExecutorAgent", END)

runnable = graph.compile()

In [13]:
# ========================================
# 5. Example Usage
# ========================================
result = runnable.invoke({
    "input_type": "text",
    "value": "The moon landing was staged in a Hollywood studio."
})

# Final output
print("\nPrediction Results")
print(f"Label: {result['label']}")
print(f"Confidence: {result['confidence']:.2f}")
print(f"Explanation: {result['explanation']}")
if result.get("fallback_used"):
    print("Used fallback source (DuckDuckGo/Wikipedia)")

  return forward_call(*args, **kwargs)



Prediction Results
Label: Fake
Confidence: 0.60
Explanation: The moon landing was staged in a Hollywood studio.


In [14]:
# ========================================
# 5. Hardcoded Example Usage
# ========================================

# Example 1: Text input
text_result = runnable.invoke({
    "input_type": "text",
    "value": "The moon landing was staged in a Hollywood studio."
})

# Example 2: URL input 
url_result = runnable.invoke({
    "input_type": "url",
    "value": "https://www.bbc.com/news/articles/cr5rdl1y8ndo"  # Replace with actual URL
})

# Example 3: PDF input
pdf_result = runnable.invoke({
    "input_type": "pdf",
    "value": "../news/news.pdf"  # Replace with actual PDF path
})

# Print results in your original format
def print_results(result, input_type):
    print(f"\n{input_type.capitalize()} Prediction Results")
    print(f"Label: {result['label']}")
    print(f"Confidence: {result['confidence']:.2f}")
    print(f"Explanation: {result['explanation']}")
    if result.get("fallback_used"):
        print("Used fallback source (DuckDuckGo/Wikipedia)")

# Display all results
print_results(text_result, "text input")
print_results(url_result, "url input") 
print_results(pdf_result, "pdf input")

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



Text input Prediction Results
Label: Fake
Confidence: 0.60
Explanation: The moon landing was staged in a Hollywood studio.

Url input Prediction Results
Label: Fake
Confidence: 0.65
Explanation: Ukraine's president

Pdf input Prediction Results
Label: Fake
Confidence: 0.64
Explanation: US envoy Steve Witkoff meets Vladimir Putin as Trump's ceasefire deadline looms
