In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The analysis cells below read their input JSONL from a path under
# /content/drive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install --upgrade --force-reinstall "torch==2.3.0" "torchvision" "torchaudio"

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.9 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Collecting filelock (from torch==2.3.0)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.3.0)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sympy (from torch==2.3.0)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch==2.3.0)
  Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jinja2 (from torch==2.3.0)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch==2.3.0)
  Downloading fsspec-2025.12.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.

In [None]:
#!/usr/bin/env python3
"""
integrated_mnli_inference_flipped_subset.py

Run MNLI-based NLI with BART and RoBERTa on a *small subset* of Reddit
propositions, in BOTH directions:

  (1) Original: premise = proposition, hypothesis = fixed agency sentence.
  (2) Flipped:  premise = fixed agency sentence, hypothesis = proposition.

Export:

- Per-model NLI probabilities in both directions:
    * P(ENTAILMENT), P(CONTRADICTION)
      (P(NEUTRAL) is computed as well, but is not written to the compact CSV)
- Per-model agency scores in both directions:
    * s_m(p) = P_m(ENT) - P_m(CON)
- Mean agency scores (original vs flipped).

NOTE:
No ±0.3 thresholding or winner-takes-strongest here; this script is just for
sanity-checking / probing the effect of premise–hypothesis flipping.
"""

import os
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Configuration

# Column holding the text to score, plus input and output locations.
PROPOSITION_COLUMN = "proposition"
DATA_FILENAME   = "/content/drive/MyDrive/NLP /artificial_filtered_output.jsonl"
OUTPUT_FILENAME = "ai_human_agency_mnli_subset_flipped.csv"

# Number of propositions drawn for the flip experiment (adjust freely).
SUBSET_N = 100

# Hugging Face checkpoints to evaluate, keyed by a readable label.
MODELS = dict(
    BART_MNLI="facebook/bart-large-mnli",
    ROBERTA_MNLI="roberta-large-mnli",
)

# The fixed agency sentence paired with every proposition.
FIXED_HYPOTHESIS_AGENCY = (
    "The proposition refers to the ability of humans to make choices, "
    "exert control, or take responsibility for the actions and outcomes of AI."
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# 2. Data loading (and subsetting)

# Fail early with a clear message when the input file is absent.
if not os.path.exists(DATA_FILENAME):
    raise FileNotFoundError(f"Input file not found: {DATA_FILENAME}")

# The input is JSON Lines: one record per line.
df_full = pd.read_json(DATA_FILENAME, lines=True)

if PROPOSITION_COLUMN not in df_full.columns:
    raise KeyError(
        f"Column '{PROPOSITION_COLUMN}' not found. "
        f"Available columns: {list(df_full.columns)}"
    )

# Missing propositions become empty strings; every value is coerced to str.
df_full[PROPOSITION_COLUMN] = df_full[PROPOSITION_COLUMN].fillna("").astype(str)

# Keep everything when no subset size is set or the data is not larger than it;
# otherwise draw a reproducible random sample (fixed random_state).
if SUBSET_N is None or not (SUBSET_N < len(df_full)):
    df = df_full.copy().reset_index(drop=True)
    print(f"Using full dataset of size {len(df)} (no subsetting).")
else:
    df = df_full.sample(n=SUBSET_N, random_state=42).reset_index(drop=True)
    print(f"Sampled subset of size {len(df)} from {len(df_full)} total rows.")

sentences = df[PROPOSITION_COLUMN].to_list()

# 3. General MNLI runner for arbitrary premise/hypothesis pairing

def run_mnli_pairs(model_id, premises, hypotheses, batch_size=None, suffix=""):
    """
    Score (premise, hypothesis) pairs with an MNLI sequence-classification model.

    The tokenizer and model for ``model_id`` are loaded on every call, moved to
    the module-level ``device`` and run in eval mode without gradients.

    Args:
        model_id: Model identifier passed to ``from_pretrained``. The part after
            the last "/" becomes the column-name prefix.
        premises: Sequence of premise strings.
        hypotheses: Sequence of hypothesis strings, aligned with ``premises``.
        batch_size: Pairs per forward pass. ``None`` selects 32 on CUDA and 8
            otherwise.
        suffix: Text appended verbatim to every output column name.

    Returns:
        A DataFrame with one row per pair and the columns

            {model_short}_entailment{suffix}
            {model_short}_neutral{suffix}
            {model_short}_contradiction{suffix}

    Raises:
        AssertionError: If ``premises`` and ``hypotheses`` differ in length.
        ValueError: If the model's ``id2label`` does not contain all of
            ENTAILMENT, NEUTRAL and CONTRADICTION (case-insensitive).
    """
    # Materialise as lists so batch slices are plain lists of strings,
    # whatever sequence type the caller handed in.
    premises = list(premises)
    hypotheses = list(hypotheses)
    assert len(premises) == len(hypotheses), "Premises and hypotheses must align."

    print(f"\n--- Running MNLI NLI with model: {model_id} ({suffix or 'orig'}) ---")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)
    model.eval()

    if batch_size is None:
        batch_size = 32 if device.type == "cuda" else 8

    # Normalize labels to uppercase for robust mapping
    id2label = {i: lbl.upper() for i, lbl in model.config.id2label.items()}
    print(f"Model label mapping: {id2label}")

    # Resolve the logit column of each NLI class once. A checkpoint without
    # these labels used to yield silent 0.0 probabilities for every row; fail
    # loudly instead, because the resulting scores would be meaningless.
    label_to_index = {lbl: int(i) for i, lbl in id2label.items()}
    missing = [
        lbl for lbl in ("ENTAILMENT", "NEUTRAL", "CONTRADICTION")
        if lbl not in label_to_index
    ]
    if missing:
        raise ValueError(
            f"Model '{model_id}' has no {missing} label(s); "
            f"found label mapping: {id2label}"
        )
    ent_idx = label_to_index["ENTAILMENT"]
    neu_idx = label_to_index["NEUTRAL"]
    con_idx = label_to_index["CONTRADICTION"]

    ent_scores = []
    neu_scores = []
    con_scores = []

    for i in tqdm(range(0, len(premises), batch_size),
                  desc=f"NLI {model_id.split('/')[-1]}{suffix}"):
        prem_batch = premises[i:i + batch_size]
        hyp_batch  = hypotheses[i:i + batch_size]

        enc = tokenizer(
            prem_batch,
            hyp_batch,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            logits = model(**enc).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()

        # Slice whole columns instead of rebuilding a label dict per row;
        # .tolist() yields plain Python floats.
        ent_scores.extend(probs[:, ent_idx].tolist())
        neu_scores.extend(probs[:, neu_idx].tolist())
        con_scores.extend(probs[:, con_idx].tolist())

    model_short = model_id.split("/")[-1]
    return pd.DataFrame({
        f"{model_short}_entailment{suffix}": ent_scores,
        f"{model_short}_neutral{suffix}": neu_scores,
        f"{model_short}_contradiction{suffix}": con_scores,
    })

# 4. Run both models: original vs flipped

# Original direction: the proposition is the premise.
prem_orig = sentences
hypo_orig = [FIXED_HYPOTHESIS_AGENCY] * len(sentences)

# Flipped direction: the fixed agency sentence is the premise.
prem_flip = [FIXED_HYPOTHESIS_AGENCY] * len(sentences)
hypo_flip = sentences

# Gather every per-model frame first and join them column-wise in one go.
# Resulting column order: input columns, then (orig, flip) for each model.
result_frames = [df.copy()]

for model_id in MODELS.values():
    df_orig = run_mnli_pairs(
        model_id, premises=prem_orig, hypotheses=hypo_orig, suffix="_orig"
    )
    df_flip = run_mnli_pairs(
        model_id, premises=prem_flip, hypotheses=hypo_flip, suffix="_flip"
    )
    result_frames += [df_orig, df_flip]

df_results = pd.concat(result_frames, axis=1)

# 5. Compute agency scores (orig vs flip) and mean scores

def safe_col(df_, name: str) -> pd.Series:
    """
    Fetch column ``name`` from ``df_`` cast to float.

    Args:
        df_: DataFrame to read from.
        name: Column to fetch.

    Returns:
        The column as a float Series.

    Raises:
        KeyError: If ``df_`` has no column called ``name``.
    """
    if name in df_.columns:
        return df_[name].astype(float)
    raise KeyError(f"Expected column missing: {name}")

# BART: agency score = P(entailment) - P(contradiction), per direction
bart_ent_orig = safe_col(df_results, "bart-large-mnli_entailment_orig")
bart_con_orig = safe_col(df_results, "bart-large-mnli_contradiction_orig")
bart_ent_flip = safe_col(df_results, "bart-large-mnli_entailment_flip")
bart_con_flip = safe_col(df_results, "bart-large-mnli_contradiction_flip")

df_results["bart-large-mnli_agency_score_orig"] = bart_ent_orig.sub(bart_con_orig)
df_results["bart-large-mnli_agency_score_flip"] = bart_ent_flip.sub(bart_con_flip)

# RoBERTa: same score, same two directions
rob_ent_orig = safe_col(df_results, "roberta-large-mnli_entailment_orig")
rob_con_orig = safe_col(df_results, "roberta-large-mnli_contradiction_orig")
rob_ent_flip = safe_col(df_results, "roberta-large-mnli_entailment_flip")
rob_con_flip = safe_col(df_results, "roberta-large-mnli_contradiction_flip")

df_results["roberta-large-mnli_agency_score_orig"] = rob_ent_orig.sub(rob_con_orig)
df_results["roberta-large-mnli_agency_score_flip"] = rob_ent_flip.sub(rob_con_flip)

# Mean agency scores: plain average of the two models' scores
df_results["mean_agency_score_orig"] = (
    df_results["bart-large-mnli_agency_score_orig"]
    .add(df_results["roberta-large-mnli_agency_score_orig"])
    .div(2.0)
)

df_results["mean_agency_score_flip"] = (
    df_results["bart-large-mnli_agency_score_flip"]
    .add(df_results["roberta-large-mnli_agency_score_flip"])
    .div(2.0)
)

# 6. Export a compact comparison CSV

# Columns written to the CSV, in output order.
output_columns = [
    PROPOSITION_COLUMN,
    # BART original / flipped
    "bart-large-mnli_entailment_orig",
    "bart-large-mnli_contradiction_orig",
    "bart-large-mnli_agency_score_orig",
    "bart-large-mnli_entailment_flip",
    "bart-large-mnli_contradiction_flip",
    "bart-large-mnli_agency_score_flip",
    # RoBERTa original / flipped
    "roberta-large-mnli_entailment_orig",
    "roberta-large-mnli_contradiction_orig",
    "roberta-large-mnli_agency_score_orig",
    "roberta-large-mnli_entailment_flip",
    "roberta-large-mnli_contradiction_flip",
    "roberta-large-mnli_agency_score_flip",
    # Mean agency
    "mean_agency_score_orig",
    "mean_agency_score_flip",
]

# Keep only the requested columns that are actually present, in listed order.
existing_output_columns = [
    column for column in output_columns if column in df_results.columns
]
df_output = df_results.loc[:, existing_output_columns].copy()

df_output.to_csv(OUTPUT_FILENAME, index=False)

print("\n--- MNLI Flip Experiment Complete ---")
print(f"Results saved to: {OUTPUT_FILENAME}")
print(df_output.head())
print("-" * 40)

In [3]:
#!/usr/bin/env python3
"""
mnli_flip_sanity_check.py

Goal:
- Take a small subset of your Reddit propositions.
- Run MNLI NLI with BART in two directions:
    1) ORIGINAL:   premise = proposition, hypothesis = FIXED_HYPOTHESIS_AGENCY
    2) FLIPPED:    premise = FIXED_HYPOTHESIS_AGENCY, hypothesis = proposition
- For each direction, extract:
    P(ENTAILMENT), P(NEUTRAL), P(CONTRADICTION)
    agency_score = P(ENTAILMENT) - P(CONTRADICTION)
- Save a comparison CSV for manual inspection.
"""

import os
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Config

# Column holding the text to score, plus input and output locations.
PROPOSITION_COLUMN = "proposition"
DATA_FILENAME   = "/content/drive/MyDrive/NLP /artificial_filtered_output.jsonl"
OUTPUT_FILENAME = "mnli_flip_compare_sample.csv"

# Checkpoint used for both directions of the check.
MODEL_ID = "facebook/bart-large-mnli"

# Number of propositions to test (raise or lower as needed).
N_SAMPLE = 200

# The fixed agency sentence paired with every proposition.
FIXED_HYPOTHESIS_AGENCY = (
    "The proposition refers to the ability of humans to make choices, "
    "exert control, or take responsibility for the actions and outcomes of AI."
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# 2. Load data and pick subset

# Fail early with a clear message when the input file is absent.
if not os.path.exists(DATA_FILENAME):
    raise FileNotFoundError(f"Input file not found: {DATA_FILENAME}")

# The input is JSON Lines: one record per line.
df = pd.read_json(DATA_FILENAME, lines=True)

if PROPOSITION_COLUMN not in df.columns:
    raise KeyError(
        f"Column '{PROPOSITION_COLUMN}' not found. "
        f"Available columns: {list(df.columns)}"
    )

# Missing propositions become empty strings; every value is coerced to str.
df[PROPOSITION_COLUMN] = df[PROPOSITION_COLUMN].fillna("").astype(str)

# Deterministic subset: the first N_SAMPLE rows, copied and re-indexed from 0.
# (.sample(N_SAMPLE, random_state=42) would give a random but repeatable one.)
df_subset = df.head(N_SAMPLE).copy()
df_subset = df_subset.reset_index(drop=True)
sentences = df_subset[PROPOSITION_COLUMN].to_list()

print(f"Loaded {len(df_subset)} propositions for flip sanity check.")

# 3. Helper: run MNLI NLI for arbitrary (premise, hypothesis) pairing

def run_mnli_nli_direction(model_id, premises, hypotheses, suffix, batch_size=None):
    """
    Run MNLI NLI with:
        premise    = premises[i]
        hypothesis = hypotheses[i]

    The tokenizer and model for ``model_id`` are loaded on every call, moved to
    the module-level ``device`` and run in eval mode without gradients.

    Args:
        model_id: Model identifier passed to ``from_pretrained``. The part after
            the last "/" becomes the column-name prefix.
        premises: Sequence of premise strings.
        hypotheses: Sequence of hypothesis strings, aligned with ``premises``.
        suffix: Direction tag; appended to every column name after an
            underscore.
        batch_size: Pairs per forward pass. ``None`` selects 32 on CUDA and 8
            otherwise.

    Returns:
        A DataFrame with one row per pair and the columns
            {model_short}_entailment_{suffix}
            {model_short}_neutral_{suffix}
            {model_short}_contradiction_{suffix}

    Raises:
        ValueError: If ``premises`` and ``hypotheses`` differ in length, or if
            the model's ``id2label`` does not contain all of ENTAILMENT,
            NEUTRAL and CONTRADICTION (case-insensitive).
    """
    # Materialise as lists so batch slices are plain lists of strings,
    # whatever sequence type the caller handed in.
    premises = list(premises)
    hypotheses = list(hypotheses)
    if len(premises) != len(hypotheses):
        raise ValueError(
            f"premises ({len(premises)}) and hypotheses ({len(hypotheses)}) "
            "must have the same length."
        )

    print(f"\n--- Running MNLI with {model_id} [{suffix}] ---")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)
    model.eval()

    if batch_size is None:
        batch_size = 32 if device.type == "cuda" else 8

    # Normalize labels to uppercase for robust mapping
    id2label = {i: lbl.upper() for i, lbl in model.config.id2label.items()}
    print(f"Label mapping: {id2label}")

    # Resolve the logit column of each NLI class once. A checkpoint without
    # these labels used to yield silent 0.0 probabilities for every row; fail
    # loudly instead, because the resulting scores would be meaningless.
    label_to_index = {lbl: int(i) for i, lbl in id2label.items()}
    missing = [
        lbl for lbl in ("ENTAILMENT", "NEUTRAL", "CONTRADICTION")
        if lbl not in label_to_index
    ]
    if missing:
        raise ValueError(
            f"Model '{model_id}' has no {missing} label(s); "
            f"found label mapping: {id2label}"
        )
    ent_idx = label_to_index["ENTAILMENT"]
    neu_idx = label_to_index["NEUTRAL"]
    con_idx = label_to_index["CONTRADICTION"]

    ent_scores = []
    neu_scores = []
    con_scores = []

    for i in tqdm(range(0, len(premises), batch_size),
                  desc=f"NLI {suffix}"):
        prem_batch = premises[i:i + batch_size]
        hyp_batch  = hypotheses[i:i + batch_size]

        enc = tokenizer(
            prem_batch,
            hyp_batch,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            logits = model(**enc).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()

        # Slice whole columns instead of rebuilding a label dict per row.
        # Elements stay NumPy scalars, as they were before this change.
        ent_scores.extend(probs[:, ent_idx])
        neu_scores.extend(probs[:, neu_idx])
        con_scores.extend(probs[:, con_idx])

    model_short = model_id.split("/")[-1]
    out_df = pd.DataFrame({
        f"{model_short}_entailment_{suffix}": ent_scores,
        f"{model_short}_neutral_{suffix}": neu_scores,
        f"{model_short}_contradiction_{suffix}": con_scores,
    })

    return out_df


# 4. Run ORIGINAL and FLIPPED directions

# ORIGINAL: premise = proposition, hypothesis = fixed hypothesis
orig_premises   = sentences
orig_hypotheses = [FIXED_HYPOTHESIS_AGENCY for _ in sentences]

df_orig = run_mnli_nli_direction(
    model_id=MODEL_ID,
    premises=orig_premises,
    hypotheses=orig_hypotheses,
    suffix="orig",
)

# FLIPPED: premise = fixed hypothesis, hypothesis = proposition
flip_premises   = [FIXED_HYPOTHESIS_AGENCY for _ in sentences]
flip_hypotheses = sentences

df_flip = run_mnli_nli_direction(
    model_id=MODEL_ID,
    premises=flip_premises,
    hypotheses=flip_hypotheses,
    suffix="flip",
)

# Place both score frames next to the subset rows (column-wise join).
df_results = pd.concat([df_subset, df_orig, df_flip], axis=1)

# 5. Compute agency scores for orig vs flip

# Entailment / contradiction probabilities, cast to float
bart_ent_orig = df_results["bart-large-mnli_entailment_orig"].astype(float)
bart_con_orig = df_results["bart-large-mnli_contradiction_orig"].astype(float)
bart_ent_flip = df_results["bart-large-mnli_entailment_flip"].astype(float)
bart_con_flip = df_results["bart-large-mnli_contradiction_flip"].astype(float)

# agency score = P(entailment) - P(contradiction), once per direction
df_results["bart-large-mnli_agency_score_orig"] = bart_ent_orig.sub(bart_con_orig)
df_results["bart-large-mnli_agency_score_flip"] = bart_ent_flip.sub(bart_con_flip)

# Optional: orig minus flip, to eyeball how asymmetric the two directions are
df_results["bart-large-mnli_agency_score_delta"] = df_results[
    "bart-large-mnli_agency_score_orig"
].sub(df_results["bart-large-mnli_agency_score_flip"])


# 6. Save comparison CSV

# Columns written to the CSV, in output order.
cols_for_export = [
    PROPOSITION_COLUMN,
    "bart-large-mnli_entailment_orig",
    "bart-large-mnli_contradiction_orig",
    "bart-large-mnli_agency_score_orig",
    "bart-large-mnli_entailment_flip",
    "bart-large-mnli_contradiction_flip",
    "bart-large-mnli_agency_score_flip",
    "bart-large-mnli_agency_score_delta",
]

df_results.loc[:, cols_for_export].to_csv(OUTPUT_FILENAME, index=False)
print(f"\nWrote flip comparison file: {OUTPUT_FILENAME}")

# Optional: the rows where the two directions disagree most, ranked by the
# absolute value of the delta (largest first).
print("\nTop 10 propositions by |orig - flip| in agency score:\n")
print(
    df_results
    .sort_values(by="bart-large-mnli_agency_score_delta", key=abs, ascending=False)
    .head(10)
    .loc[:, [
        PROPOSITION_COLUMN,
        "bart-large-mnli_agency_score_orig",
        "bart-large-mnli_agency_score_flip",
        "bart-large-mnli_agency_score_delta",
    ]]
    .to_string(index=False)
)


Using device: cpu
Loaded 200 propositions for flip sanity check.

--- Running MNLI with facebook/bart-large-mnli [orig] ---
Label mapping: {0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'}


NLI orig:   0%|          | 0/25 [00:00<?, ?it/s]


--- Running MNLI with facebook/bart-large-mnli [flip] ---
Label mapping: {0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'}


NLI flip:   0%|          | 0/25 [00:00<?, ?it/s]


Wrote flip comparison file: mnli_flip_compare_sample.csv

Top 10 propositions by |orig - flip| in agency score:

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      