# VIBE — Headline Frame Game (Final)

Full pipeline + classifier stub.

Run cells top-to-bottom. Install packages in the first cell if needed.

In [None]:
# Install dependencies (run once)
!pip install --quiet feedparser requests diskcache beautifulsoup4 nltk spacy vaderSentiment scikit-learn matplotlib wordcloud ipywidgets plotly
# Optional heavy packages for HF & BERTopic (uncomment if you want these features):
# !pip install --quiet transformers sentence-transformers umap-learn hdbscan bertopic
# After installing spaCy model run:
# !python -m spacy download en_core_web_sm


In [None]:
import os, time, re, json, pickle, hashlib, logging
from datetime import datetime, timedelta
from typing import List, Dict, Any, Tuple, Optional

import feedparser
from bs4 import BeautifulSoup
import unicodedata

# spaCy (optional)
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
except Exception as e:
    print('spaCy not available or model missing:', e)
    nlp = None

# VADER sentiment
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    vader = SentimentIntensityAnalyzer()
except Exception:
    vader = None

# sklearn utilities
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Optional heavy libraries (loaded later if installed)
try:
    from transformers import pipeline
    HF_AVAILABLE = True
except Exception:
    HF_AVAILABLE = False
    pipeline = None

try:
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    BERTOPIC_AVAILABLE = True
except Exception:
    BERTOPIC_AVAILABLE = False
    SentenceTransformer = None
    BERTopic = None

# Visualization & widgets
import matplotlib.pyplot as plt
from wordcloud import WordCloud
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output
except Exception:
    widgets = None

import pandas as pd
import numpy as np

# Notebook-wide logger; INFO level so cache hits and fetch attempts are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('VIBE_Final')

# On-disk pickle cache used by cache_get / cache_set.
CACHE_DIR = './vibe_cache'
# Cached values older than this many hours are treated as missing.
CACHE_EXPIRE_HOURS = 6

# Source name -> RSS feed URL. The source name is attached to every article.
DEFAULT_RSS_FEEDS = {
    'NDTV': 'https://feeds.feedburner.com/ndtvnews-latest',
    'TheTimesOfIndia': 'https://timesofindia.indiatimes.com/rssfeedstopstories.cms',
    'HindustanTimes': 'https://www.hindustantimes.com/feeds/rss/topnews/rssfeed.xml',
}

# Frame name -> trigger keywords. The frame names also populate the labeling dropdown.
# Keywords must be lower-case: detect_frames lowercases the article text before
# looking for them, so an upper-case keyword could never match.
FRAME_KEYWORDS = {
    'conflict': ['clash', 'attack', 'slams', 'condemns', 'fight', 'protest', 'violence', 'conflict'],
    'human_interest': ['family', 'children', 'stories', 'personal', 'meet', 'recounts'],
    'economic': ['economy', 'inflation', 'jobs', 'market', 'business', 'trade', 'gdp'],
    'responsibility': ['responsible', 'fail', 'blame', 'accountability', 'investigate'],
    'morality': ['moral', 'immoral', 'ethical', 'virtue', 'sin'],
}

os.makedirs(CACHE_DIR, exist_ok=True)


In [None]:
# Caching helpers
def cache_get(key: str):
    """Return the cached value for ``key``, or None when absent, expired or unreadable.

    The value lives in ``CACHE_DIR`` in a file named after the SHA-1 of ``key``
    and is stored as a pickled ``(value, timestamp)`` pair. Entries older than
    ``CACHE_EXPIRE_HOURS`` are ignored (the file itself is left in place).
    """
    path = os.path.join(CACHE_DIR, hashlib.sha1(key.encode()).hexdigest() + '.pkl')
    if not os.path.exists(path):
        return None
    try:
        # NOTE: unpickling can execute arbitrary code; this is only acceptable
        # because the cache directory is written by this notebook itself.
        with open(path, 'rb') as fh:  # context manager closes the handle
            data, ts = pickle.load(fh)
        if datetime.utcnow() - ts < timedelta(hours=CACHE_EXPIRE_HOURS):
            return data
    except Exception:
        # A corrupt or incompatible cache file is treated as a cache miss.
        return None
    return None

def cache_set(key: str, value: Any):
    """Store ``value`` under ``key`` in the on-disk cache with the current UTC time.

    The pair ``(value, timestamp)`` is pickled to a temporary file and then
    moved into place, so an interrupted or failed write never leaves a
    truncated cache entry behind. Pickling errors propagate to the caller.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)  # survive a cache dir deleted mid-session
    path = os.path.join(CACHE_DIR, hashlib.sha1(key.encode()).hexdigest() + '.pkl')
    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'wb') as fh:
            pickle.dump((value, datetime.utcnow()), fh)
        os.replace(tmp_path, path)
    except Exception:
        # Do not leave half-written temp files around when pickling fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# RSS fetching
def fetch_rss(url: str, retries: int = 2, backoff: float = 1.0) -> Any:
    """Fetch and parse an RSS feed, using the on-disk cache when possible.

    Args:
        url: Feed URL handed to ``feedparser.parse``.
        retries: Extra attempts after the first one (negative values mean 0).
        backoff: Base delay in seconds; attempt ``i`` waits ``backoff * 2**i``
            before the next try.

    Returns:
        The parsed feed object (cached copy on a cache hit).

    Raises:
        The last error seen once every attempt has failed.
    """
    key = f"rss::{url}"
    cached = cache_get(key)
    if cached is not None:
        logger.info(f"Cache hit for {url}")
        return cached
    last_exc = None
    attempts = max(0, retries) + 1
    for i in range(attempts):
        try:
            logger.info(f"Fetching {url} (attempt {i+1})")
            feed = feedparser.parse(url)
            # feedparser does not raise on network/parse problems; it flags them
            # via `bozo`. Treat a flagged feed with no entries as a failed
            # attempt so it is retried and never cached.
            if feed.get('bozo') and not feed.get('entries'):
                raise RuntimeError(f"feed could not be parsed: {feed.get('bozo_exception')}")
        except Exception as e:
            last_exc = e
            logger.warning(f"Error fetching {url}: {e}")
            if i < attempts - 1:  # no point sleeping after the final attempt
                time.sleep(backoff * (2**i))
            continue
        try:
            cache_set(key, feed)
        except Exception as e:
            # Caching is best-effort: a feed that cannot be pickled is still usable.
            logger.warning(f"Could not cache {url}: {e}")
        return feed
    raise last_exc

def fetch_all_feeds(feed_dict: Dict[str,str]) -> Dict[str, List[Dict]]:
    """Download every feed in ``feed_dict`` and flatten its entries to plain dicts.

    Each entry becomes a dict with the keys ``title``, ``link``, ``published``
    (falling back to the entry's ``updated`` value), ``summary`` and ``source``.
    A feed that fails is logged and mapped to an empty list. A per-source
    health report is written to the cache under the key ``'feed_health'``.

    Returns:
        Mapping of source name to its list of entry dicts.
    """
    collected = {}
    status = {}
    for source_name, feed_url in feed_dict.items():
        try:
            parsed = fetch_rss(feed_url)
            items = [
                {
                    'title': entry.get('title',''),
                    'link': entry.get('link',''),
                    'published': entry.get('published', entry.get('updated','')),
                    'summary': entry.get('summary',''),
                    'source': source_name,
                }
                for entry in parsed.entries
            ]
        except Exception as err:
            logger.error(f"Feed {source_name} failed: {err}")
            collected[source_name] = []
            status[source_name] = {'ok': False, 'error': str(err)}
        else:
            collected[source_name] = items
            status[source_name] = {'ok': True, 'count': len(items)}
    report = {'timestamp': datetime.utcnow().isoformat(), 'health': status}
    cache_set('feed_health', report)
    return collected


In [None]:
# Text cleaning, dedupe, lemmatize
def normalize_text(t: str) -> str:
    """Apply Unicode NFKD normalisation and squeeze whitespace.

    Every run of whitespace becomes one space and the ends are trimmed.
    Falsy input (``None`` or ``''``) yields an empty string.
    """
    if t:
        decomposed = unicodedata.normalize('NFKD', t)
        return re.sub(r'\s+', ' ', decomposed).strip()
    return ''

def clean_html(text: str) -> str:
    """Strip HTML markup from ``text`` and return normalised plain text.

    Text nodes are joined with single spaces before normalisation. Falsy
    input yields an empty string without invoking the HTML parser.
    """
    if text:
        plain = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
        return normalize_text(plain)
    return ''

def dedupe_entries(all_entries: List[Dict], threshold: float = 0.92) -> List[Dict]:
    """Remove duplicate articles, first exactly and then by title similarity.

    Steps:
      1. Every entry gets a ``title_norm`` key (normalised, lower-cased title);
         the input dicts are mutated in place.
      2. Entries sharing a link (or, when there is no link, a normalised title)
         are collapsed to their first occurrence.
      3. Remaining titles are compared with TF-IDF cosine similarity; for each
         pair above ``threshold`` the entry with the shorter title is dropped.

    Args:
        all_entries: Article dicts with at least ``title`` and optionally ``link``.
        threshold: Cosine similarity above which two titles count as duplicates.

    Returns:
        The surviving entries in their original order.
    """
    for e in all_entries:
        e['title_norm'] = normalize_text(e.get('title','')).lower()
    unique = []
    seen = set()
    for e in all_entries:
        key = e.get('link') or e['title_norm']
        if key in seen:
            continue
        seen.add(key)
        unique.append(e)
    if len(unique) < 2:
        return unique

    try:
        vec = TfidfVectorizer(stop_words='english', max_df=0.85)
        X = vec.fit_transform([u['title_norm'] for u in unique])
    except ValueError as exc:
        # Raised when no vocabulary is left, e.g. all titles are empty/stop words
        # or (for tiny inputs) identical titles whose shared terms are pruned by
        # max_df. Fall back to exact title matching instead of crashing.
        logger.warning(f"TF-IDF dedupe skipped ({exc}); using exact title match")
        kept, seen_titles = [], set()
        for u in unique:
            title = u['title_norm']
            if title and title in seen_titles:
                continue
            seen_titles.add(title)
            kept.append(u)
        return kept

    sim = cosine_similarity(X)
    to_drop = set()
    n = len(unique)
    for i in range(n):
        if i in to_drop:
            # An entry that is already gone must not eliminate further entries.
            continue
        for j in range(i+1, n):
            if j in to_drop or sim[i,j] <= threshold:
                continue
            if len(unique[i]['title']) >= len(unique[j]['title']):
                to_drop.add(j)
            else:
                to_drop.add(i)
                break  # i lost; stop comparing it against later entries
    return [u for idx,u in enumerate(unique) if idx not in to_drop]

def lemmatize_text(text: str) -> str:
    """Return the space-joined lemmas of ``text`` without stop words and punctuation.

    When the module-level ``nlp`` object is ``None`` the text is returned unchanged.
    """
    if nlp is not None:
        kept = []
        for tok in nlp(text):
            if tok.is_stop or tok.is_punct:
                continue
            kept.append(tok.lemma_)
        return ' '.join(kept)
    return text


In [None]:
# Sentiment, entities, frames
def sentiment_vader(text: str) -> Dict:
    """Return the polarity scores of ``text`` from the module-level ``vader`` analyzer.

    When ``vader`` is ``None`` a dict with the keys ``neg``, ``neu``, ``pos`` and
    ``compound`` is returned with every value set to ``None``.
    """
    if vader is not None:
        return vader.polarity_scores(text)
    return dict.fromkeys(('neg', 'neu', 'pos', 'compound'))

def extract_entities(text: str) -> List[Tuple[str,str]]:
    """Return ``(entity text, entity label)`` pairs found in ``text``.

    Uses the module-level ``nlp`` object; yields an empty list when it is ``None``.
    """
    found = []
    if nlp is not None:
        for span in nlp(text).ents:
            found.append((span.text, span.label_))
    return found

# Optional endings accepted after a keyword so that simple inflections
# ("clashes", "failed", "meeting", "protesters") still count as a hit.
_FRAME_KW_SUFFIX = r'(?:s|es|d|ed|ing|ers?)?'

def _frame_keyword_hit(keyword: str, lowered_text: str) -> bool:
    """True when ``keyword`` occurs in ``lowered_text`` as a whole word (plus simple inflections)."""
    pattern = r'\b' + re.escape(keyword.lower()) + _FRAME_KW_SUFFIX + r'\b'
    return re.search(pattern, lowered_text) is not None

def detect_frames(text: str, keywords: Optional[Dict[str, List[str]]] = None) -> Dict[str,int]:
    """Score ``text`` against each frame's keyword list.

    Matching is case-insensitive and anchored on word boundaries, so a keyword
    such as ``'sin'`` no longer fires inside "business" or "Singh", and an
    upper-case keyword such as ``'GDP'`` can match.

    Args:
        text: Headline/summary text; ``None`` or ``''`` scores zero everywhere.
        keywords: Optional frame -> keyword list mapping. Defaults to the
            module-level ``FRAME_KEYWORDS``.

    Returns:
        Mapping of frame name to the number of its keywords found in the text
        (each keyword counts at most once).
    """
    lowered = (text or '').lower()
    frame_keywords = FRAME_KEYWORDS if keywords is None else keywords
    scores = {}
    for frame, kws in frame_keywords.items():
        scores[frame] = sum(1 for kw in kws if _frame_keyword_hit(kw, lowered))
    return scores


In [None]:
# Compute metrics (optionally using HF pipeline if passed)
def _run_hf_sentiment(texts, hf_pipeline, batch_size):
    """Run ``hf_pipeline`` over ``texts`` and return exactly one result per text."""
    results = []
    step = max(1, int(batch_size))  # a non-positive batch size would break range()
    for start in range(0, len(texts), step):
        batch = texts[start:start+step]
        try:
            batch_out = list(hf_pipeline(batch))
            if len(batch_out) != len(batch):
                raise ValueError('pipeline returned a different number of results than inputs')
            results.extend(batch_out)
        except Exception:
            # Batch failed: retry item by item on truncated text so one bad
            # article cannot take the whole batch down.
            for t in batch:
                try:
                    single = hf_pipeline(t[:512])
                    results.append(single[0] if isinstance(single, (list, tuple)) else single)
                except Exception:
                    results.append({'label': None, 'score': None})
    return results

def _hf_signed_score(hf_result):
    """Map an HF result dict to a signed score, or None when there is no usable result."""
    if not isinstance(hf_result, dict) or hf_result.get('label') is None:
        return None
    label = str(hf_result['label']).lower()
    score = float(hf_result.get('score') or 0.0)
    if 'pos' in label:
        return score
    if 'neg' in label:
        return -score
    return 0.0  # any other label counts as neutral

def compute_bias_metrics(entries: List[Dict], use_hf: bool=False, hf_pipeline=None, hf_batch_size: int=32):
    """Annotate every entry and aggregate sentiment, frame and entity statistics per source.

    Each entry dict is mutated in place and gains the keys ``title_clean``,
    ``summary_clean``, ``title_lem``, ``summary_lem``, ``sentiment_vader``,
    ``sentiment_hf``, ``entities`` and ``frames``.

    Args:
        entries: Article dicts with ``title``, ``summary`` and ``source``.
        use_hf: Run ``hf_pipeline`` for an additional sentiment score.
        hf_pipeline: Callable taking a string or a list of strings and returning
            result dicts with ``label`` and ``score``; ignored when ``None``.
        hf_batch_size: Number of texts sent to ``hf_pipeline`` per call.

    Returns:
        Mapping of source name to a dict with ``count``, ``vader_compound_sum``,
        ``hf_scores``, ``frames``, ``entities``, ``avg_vader_compound`` and
        ``avg_hf_score`` (``None`` when no HF score is available).
    """
    # Clean each field once and reuse it for both HF and the rule-based metrics.
    combined = []
    for e in entries:
        e['title_clean'] = clean_html(e.get('title',''))
        e['summary_clean'] = clean_html(e.get('summary',''))
        combined.append(e['title_clean'] + ' ' + e['summary_clean'])

    if use_hf and hf_pipeline is not None:
        hf_results = _run_hf_sentiment([c.strip() for c in combined], hf_pipeline, hf_batch_size)
    else:
        hf_results = [None] * len(entries)

    for idx, (e, text) in enumerate(zip(entries, combined)):
        e['title_lem'] = lemmatize_text(e['title_clean'])
        e['summary_lem'] = lemmatize_text(e['summary_clean'])
        e['sentiment_vader'] = sentiment_vader(text)
        e['sentiment_hf'] = hf_results[idx] if idx < len(hf_results) else None
        e['entities'] = extract_entities(text)
        e['frames'] = detect_frames(text)

    agg = {}
    for e in entries:
        bucket = agg.setdefault(
            e['source'],
            {'count': 0, 'vader_compound_sum': 0.0, 'hf_scores': [], 'frames': {}, 'entities': {}},
        )
        bucket['count'] += 1
        vader_scores = e['sentiment_vader']
        if isinstance(vader_scores, dict) and vader_scores.get('compound') is not None:
            bucket['vader_compound_sum'] += vader_scores['compound']
        hf_val = _hf_signed_score(e['sentiment_hf'])
        if hf_val is not None:
            # Failed inferences (label None) are left out instead of being
            # averaged in as a neutral 0.0.
            bucket['hf_scores'].append(hf_val)
        for f,v in e['frames'].items():
            bucket['frames'][f] = bucket['frames'].get(f,0) + v
        for ent,_ in e['entities']:
            bucket['entities'][ent] = bucket['entities'].get(ent,0) + 1

    for v in agg.values():
        v['avg_vader_compound'] = v['vader_compound_sum'] / max(1, v['count'])
        v['avg_hf_score'] = (sum(v['hf_scores']) / len(v['hf_scores'])) if v['hf_scores'] else None
    return agg


In [None]:
# Pipeline runner
def run_pipeline(feeds: Dict[str,str]=DEFAULT_RSS_FEEDS, use_hf: bool=False, hf_model: Optional[str]=None):
    """Fetch the feeds, dedupe the articles, compute per-source metrics and plot them.

    Args:
        feeds: Mapping of source name to RSS URL (not modified).
        use_hf: Also score sentiment with a Hugging Face pipeline. Only takes
            effect when ``transformers`` is importable and ``hf_model`` is given;
            otherwise a warning is logged and only VADER is used.
        hf_model: Model name passed to the ``sentiment-analysis`` pipeline.

    Returns:
        Dict with ``raw`` (entries per source), ``unique`` (deduplicated,
        annotated entries) and ``agg`` (per-source aggregates).
    """
    hf_pipe = None
    if use_hf:
        if not HF_AVAILABLE:
            logger.warning('use_hf=True but transformers is not installed; using VADER only.')
        elif hf_model is None:
            logger.warning('use_hf=True but no hf_model was given; using VADER only.')
        else:
            try:
                import torch  # only needed to pick the device for the HF pipeline
                device = 0 if torch.cuda.is_available() else -1
                hf_pipe = pipeline('sentiment-analysis', model=hf_model, device=device, truncation=True)
            except Exception as e:
                print('Failed to init HF pipeline:', e)
                hf_pipe = None
    raw = fetch_all_feeds(feeds)
    all_entries = [it for items in raw.values() for it in items]
    print(f'Fetched {len(all_entries)} articles')
    unique = dedupe_entries(all_entries)
    print(f'After dedupe: {len(unique)} articles')
    agg = compute_bias_metrics(unique, use_hf=(use_hf and hf_pipe is not None), hf_pipeline=hf_pipe)
    try:
        plt.figure(figsize=(8,3))
        sources = list(agg.keys())
        vals = [agg[s]['avg_vader_compound'] for s in sources]
        plt.bar(sources, vals)
        plt.title('Avg VADER compound sentiment by source')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        # The chart is a convenience; report the problem but still return the data.
        logger.warning(f'Could not draw the sentiment chart: {e}')
    return {'raw': raw, 'unique': unique, 'agg': agg}


## Frame Classifier — labeling UI + training

Label headlines with frames, train a TF-IDF + LogisticRegression model, and evaluate.

In [None]:
# Labeling UI
# JSON file (relative to the working directory) holding the manual frame labels.
labeled_path = 'frame_labels.json'

def save_labels(labels):
    """Write the ``labels`` mapping to ``labeled_path`` as pretty-printed UTF-8 JSON."""
    with open(labeled_path, 'w', encoding='utf-8') as f:
        json.dump(labels, f, ensure_ascii=False, indent=2)
    print('Saved', labeled_path)

def load_labels():
    """Return the labels stored in ``labeled_path``, or an empty dict when the file is missing."""
    if os.path.exists(labeled_path):
        with open(labeled_path, 'r', encoding='utf-8') as f:  # closes the handle after reading
            return json.load(f)
    return {}

# Labels are keyed by the article's position in results['unique'] (as a string).
labels = load_labels()

if 'results' not in globals():
    print('Run the pipeline cell first: results = run_pipeline()')
elif widgets is None:
    # The import cell sets widgets = None when ipywidgets could not be imported.
    print('ipywidgets is not available; install it to use the labeling UI.')
else:
    unique = results['unique']
    if len(unique) == 0:
        print('No articles in results to label.')
    else:
        label_options = ['none'] + list(FRAME_KEYWORDS.keys())

        def show_item(i):
            """Print the article at position ``i`` together with its current label."""
            it = unique[i]
            print(f"Index: {i} | Source: {it.get('source')} | Published: {it.get('published')}")
            print(it.get('title'))
            print('Summary:', it.get('summary'))
            cur = labels.get(str(i), None)
            print('Current label:', cur)

        idx_widget = widgets.IntText(value=0, description='Index')
        prev_btn = widgets.Button(description='Prev')
        next_btn = widgets.Button(description='Next')
        save_btn = widgets.Button(description='Save label')
        label_dropdown = widgets.Dropdown(options=label_options, description='Label')
        out = widgets.Output()

        def update_display(change=None):
            """Redraw the current article; also used as the observer of the index box."""
            i = int(idx_widget.value)
            clamped = min(max(i, 0), len(unique) - 1)
            if clamped != i:
                # Assigning the corrected value re-triggers this observer.
                idx_widget.value = clamped
                return
            with out:
                clear_output()
                show_item(clamped)
            stored = labels.get(str(clamped), 'none')
            # A label saved under an older frame list may no longer be an option.
            label_dropdown.value = stored if stored in label_options else 'none'

        def on_prev(b):
            idx_widget.value = max(0, idx_widget.value-1)

        def on_next(b):
            idx_widget.value = min(len(unique)-1, idx_widget.value+1)

        def on_save(b):
            labels[str(idx_widget.value)] = label_dropdown.value
            update_display()
            with out:  # keep the confirmation visible next to the article
                save_labels(labels)
                print('Saved label for', idx_widget.value)

        prev_btn.on_click(on_prev)
        next_btn.on_click(on_next)
        save_btn.on_click(on_save)
        # Refresh on every index change: buttons and typing into the box alike.
        idx_widget.observe(update_display, names='value')

        display(widgets.HBox([prev_btn, next_btn, idx_widget, label_dropdown, save_btn]))
        display(out)
        update_display()


In [None]:
# Training the TF-IDF + LogisticRegression classifier
if 'results' not in globals():
    print('Run the pipeline cell first: results = run_pipeline()')
else:
    labels = load_labels()
    articles = results['unique']
    rows = []
    skipped = 0
    # Labels are keyed by position in results['unique'], so they are only
    # meaningful for the pipeline run they were created against.
    for k,v in labels.items():
        if v == 'none':
            continue
        try:
            idx = int(k)
        except (TypeError, ValueError):
            skipped += 1
            continue
        if not 0 <= idx < len(articles):
            skipped += 1  # label points past the current article list
            continue
        it = articles[idx]
        text = (clean_html(it.get('title','')) + ' ' + clean_html(it.get('summary',''))).strip()
        rows.append({'text': text, 'label': v})
    df_labels = pd.DataFrame(rows, columns=['text', 'label'])
    print('Total labeled examples:', len(df_labels))
    if skipped:
        print(f'Skipped {skipped} label(s) that do not match an article in results.')
    if len(df_labels) < 10:
        print('Label more examples before training (>=10 recommended).')
    elif df_labels['label'].nunique() < 2:
        print('Need at least two different frame labels to train a classifier.')
    else:
        X = df_labels['text'].values
        y = df_labels['label'].values
        # Split the raw texts first so the vectorizer never sees the test set.
        try:
            split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        except ValueError as exc:
            # Stratification needs enough examples of every class.
            print(f'Stratified split not possible ({exc}); using a plain random split.')
            split = train_test_split(X, y, test_size=0.2, random_state=42)
        text_train, text_test, ytrain, ytest = split
        if len(set(ytrain)) < 2:
            print('Training split contains a single frame; label more examples of other frames.')
        else:
            tf = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=1)
            Xtrain = tf.fit_transform(text_train)
            Xtest = tf.transform(text_test)
            clf = LogisticRegression(max_iter=1000)
            clf.fit(Xtrain, ytrain)
            yp = clf.predict(Xtest)
            print('Accuracy:', accuracy_score(ytest, yp))
            print(classification_report(ytest, yp, zero_division=0))
            with open('frame_classifier.pkl', 'wb') as fh:
                pickle.dump({'vectorizer': tf, 'model': clf}, fh)
            print('Saved frame_classifier.pkl')


In [None]:
# Evaluate / Predict on all articles using trained classifier
from collections import Counter

if 'results' not in globals():
    print('Run the pipeline cell first: results = run_pipeline()')
elif os.path.exists('frame_classifier.pkl'):
    # NOTE: unpickling can execute arbitrary code; only load a file that was
    # written by the training cell of this notebook.
    with open('frame_classifier.pkl', 'rb') as fh:
        obj = pickle.load(fh)
    tf = obj['vectorizer']
    clf = obj['model']
    texts = [(clean_html(it.get('title','')) + ' ' + clean_html(it.get('summary',''))).strip() for it in results['unique']]
    if not texts:
        print('No articles in results to classify.')
    else:
        X = tf.transform(texts)
        # Plain str keeps the stored predictions and the printed counts free of numpy types.
        preds = [str(p) for p in clf.predict(X)]
        for it, p in zip(results['unique'], preds):
            it['pred_frame'] = p
        print('Predicted frame counts:', Counter(preds))
else:
    print('Train classifier first (run the training cell)')

## BERTopic (optional)

Uncomment and run the BERTopic cell if you installed `bertopic` and `sentence-transformers`.

In [None]:
# BERTopic example (uncomment to run after installing heavy deps)
# from sentence_transformers import SentenceTransformer
# from bertopic import BERTopic
# sbert = SentenceTransformer('all-MiniLM-L6-v2')
# docs = [(it.get('title_clean','') + ' ' + it.get('summary_clean','')).strip() for it in results['unique']]
# embeddings = sbert.encode(docs, show_progress_bar=True, convert_to_numpy=True)
# topic_model = BERTopic(min_topic_size=8)
# topics, probs = topic_model.fit_transform(docs, embeddings)
# topic_model.get_topic_info()


In [None]:
# Save processed results to file (including predictions if available)
if 'results' not in globals():
    print('Nothing to save. Run the pipeline cell first: results = run_pipeline()')
else:
    # Serialise before opening the file so a failure cannot leave a truncated
    # JSON file behind; default=str covers values json cannot encode natively.
    payload = json.dumps(results, ensure_ascii=False, indent=2, default=str)
    with open('vibe_results_full.json','w',encoding='utf-8') as f:
        f.write(payload)
    print('Saved vibe_results_full.json')

### Notebook created and saved to /mnt/data/VIBE_Headline_Frame_Game_Final.ipynb

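Workflow: run `results = run_pipeline()` in its own cell after the pipeline definition, label some headlines in the labeling UI, train the classifier, and then run the prediction cell. After the prediction cell has run, each article in `results['unique']` carries its predicted frame under the `pred_frame` key.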