## Extraction

In [None]:
# TODO: the extraction step is still in progress (not implemented yet)

## Processing

You'll need to restart your runtime when Colab prompts you to, then run the cell below again.

In [None]:
#@title Processing Pipeline
# ============================================
# INSTALLATION AND SETUP CELL
# ============================================

!pip install -q streamlit sentence-transformers scikit-learn gensim spacy textblob fuzzywuzzy python-Levenshtein
!pip install -q google-generativeai langchain langchain-google-genai pyngrok wordcloud plotly
!pip install -q nltk
!python -m spacy download en_core_web_sm
!python -m textblob.download_corpora

import os
import sys
import json
import warnings
warnings.filterwarnings('ignore')

# ============================================
# IMPORTS
# ============================================

import streamlit as st
import pandas as pd
import numpy as np
from datetime import datetime
import time
from typing import List, Dict, Tuple, Any

# NLP and ML
import spacy
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from gensim import corpora, models
from fuzzywuzzy import fuzz
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# LLM and orchestration
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Tunneling
from pyngrok import ngrok
from google.colab import userdata

# ============================================
# CONFIGURATION
# ============================================

# Set up ngrok
# The token is read from the Colab secret named 'NGROK_AUTH_TOKEN', exported
# to this process's environment, and registered with pyngrok.
os.environ["NGROK_AUTH_TOKEN"] = userdata.get('NGROK_AUTH_TOKEN')
ngrok.set_auth_token(os.environ["NGROK_AUTH_TOKEN"])

# Set up Gemini API
# NOTE: the Colab secret is named 'GOOGLE_API_KEY' but it is exported as the
# GEMINI_API_KEY environment variable, which is the name the generated app.py
# reads when it configures genai.
GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY
genai.configure(api_key=GEMINI_API_KEY)

# ============================================
# MAIN APPLICATION CODE
# ============================================

# Save as app.py
app_code = '''
import streamlit as st
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import required libraries
import spacy
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from gensim import corpora, models
from fuzzywuzzy import fuzz
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os

# Configure page
# Wide layout with the sidebar open, since the sidebar holds the article input.
st.set_page_config(
    page_title="Infact",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
# Injected as raw HTML (unsafe_allow_html=True) to restyle headings, alerts
# and the .metric-card class.
st.markdown("""
<style>
    .main {padding: 0rem 1rem;}
    .stAlert {margin-top: 1rem;}
    h1 {color: #1E3A8A;}
    h2 {color: #2563EB;}
    h3 {color: #3B82F6;}
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 1rem;
        border-radius: 10px;
        color: white;
        margin: 0.5rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Title and description
st.title("📰 Infact")
st.markdown("**Desensationalizing news through AI-powered clustering and fact extraction**")

# Initialize session state
# Defaults are only written when the key is missing, so existing values are kept.
if 'processed_data' not in st.session_state:
    st.session_state.processed_data = None
if 'current_stage' not in st.session_state:
    st.session_state.current_stage = 0

# Model loading is wrapped in Streamlit's resource cache
@st.cache_resource(show_spinner=False)
def load_models():
    """Load the spaCy pipeline and the sentence encoder, and configure Gemini.

    The encoder is moved to CUDA when torch reports a GPU. The Gemini client
    is configured from the GEMINI_API_KEY environment variable.

    Returns:
        Tuple of (spaCy pipeline, SentenceTransformer model).
    """
    with st.spinner("Loading AI models... (one-time setup)"):
        spacy_pipeline = spacy.load("en_core_web_sm")
        encoder = SentenceTransformer('all-mpnet-base-v2')
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            encoder = encoder.to('cuda')
        api_key = os.environ.get('GEMINI_API_KEY')
        genai.configure(api_key=api_key)
    return spacy_pipeline, encoder

nlp, sentence_model = load_models()

# ============================================
# HELPER FUNCTIONS
# ============================================

def preprocess_text(text, nlp):
    """Reduce text to lowercased lemmas of its content words.

    Only the first 1,000,000 characters are parsed. Stop words, punctuation
    and non-alphabetic tokens are dropped; the remaining lemmas are joined
    with single spaces.
    """
    parsed = nlp(text[:1000000])  # Cap the input to bound memory use
    kept_lemmas = []
    for tok in parsed:
        if tok.is_stop or tok.is_punct or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

def extract_embeddings(texts, model, batch_size=32):
    """Encode texts in slices of batch_size and return all embeddings as one NumPy array.

    Each slice is encoded as a tensor, moved to the CPU and converted to
    NumPy before its rows are collected.
    """
    encoded_chunks = (
        model.encode(
            texts[start:start + batch_size],
            convert_to_tensor=True,
            show_progress_bar=False,
        ).cpu().numpy()
        for start in range(0, len(texts), batch_size)
    )
    # Flatten the per-batch arrays into a single list of row vectors
    rows = [row for chunk in encoded_chunks for row in chunk]
    return np.array(rows)

def cluster_articles(embeddings, texts, n_clusters=None):
    """Cluster articles with KMeans over embeddings augmented by TF-IDF features.

    Args:
        embeddings: 2-D array with one embedding row per article.
        texts: Preprocessed article texts, aligned row-for-row with embeddings.
        n_clusters: Requested number of clusters. When None, one cluster per
            20 articles is used, bounded to the range 3-15. The value is
            capped at the number of articles.

    Returns:
        Tuple (clusters, kmeans): the per-article label array and the fitted
        KMeans model.
    """
    if n_clusters is None:
        n_clusters = min(max(3, len(texts) // 20), 15)

    # KMeans raises when asked for more clusters than samples, which happens
    # for small inputs because both the default and the UI slider start at 3.
    n_clusters = max(1, min(n_clusters, len(texts)))

    # TF-IDF for feature enhancement
    tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    tfidf_features = tfidf.fit_transform(texts).toarray()

    # Combine embeddings with down-weighted TF-IDF so the embeddings dominate
    combined_features = np.hstack([embeddings, tfidf_features * 0.3])

    # KMeans clustering (fixed seed for reproducible labels)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(combined_features)

    return clusters, kmeans

def name_clusters_lda(texts, clusters, n_topics=1):
    """Name each cluster after the top words of an LDA topic fitted on its texts.

    Args:
        texts: Whitespace-tokenizable article texts.
        clusters: Cluster label per text, aligned with texts.
        n_topics: Number of LDA topics to fit per cluster; the first topic
            returned is used for the name.

    Returns:
        Dict mapping each cluster id to a title-cased name built from up to
        three top topic words, or to "Cluster <id>" when no words are available.
    """
    cluster_names = {}

    for cluster_id in np.unique(clusters):
        fallback_name = f"Cluster {cluster_id}"

        # Tokenize the texts that belong to this cluster
        tokenized = [text.split() for text, label in zip(texts, clusters)
                     if label == cluster_id]

        # Create dictionary and corpus
        dictionary = corpora.Dictionary(tokenized)

        # Every id from np.unique has at least one text, so the meaningful
        # check is the vocabulary: LDA cannot be fitted when there are no terms.
        if len(dictionary) == 0:
            cluster_names[cluster_id] = fallback_name
            continue

        corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

        # LDA model
        lda = models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=n_topics,
            random_state=42,
            passes=10,
            alpha='auto'
        )

        # Get top words of the first topic
        topics = lda.show_topics(num_topics=n_topics, num_words=5, formatted=False)
        top_words = [word for word, _ in topics[0][1]] if topics else []
        if top_words:
            cluster_names[cluster_id] = " ".join(top_words[:3]).title()
        else:
            cluster_names[cluster_id] = fallback_name

    return cluster_names

def extract_facts_and_musings(text, nlp):
    """Split the sentences of text into fact bullets and opinion-like musings.

    Only the first 5000 characters are parsed. A sentence counts as a fact
    when it contains a factual marker phrase, or when it has named entities,
    no opinion marker and a TextBlob polarity magnitude below 0.5. Otherwise
    it counts as a musing when it has an opinion marker or a polarity
    magnitude above 0.6. Bullets must be 21-299 characters long.

    Returns:
        Tuple (facts, musings) limited to 10 facts and 5 musings.
    """
    opinion_markers = ('believe', 'think', 'feel', 'seems', 'appears', 'might', 'could', 'should', 'opinion')
    fact_markers = ('reported', 'announced', 'confirmed', 'according to', 'data shows', 'study')

    facts, musings = [], []

    for sentence in nlp(text[:5000]).sents:
        raw = sentence.text.strip()
        if not raw:
            continue

        lowered = raw.lower()
        polarity = TextBlob(raw).sentiment.polarity
        mentions_entity = len(sentence.ents) > 0
        opinionated = any(marker in lowered for marker in opinion_markers)
        cites_source = any(marker in lowered for marker in fact_markers)

        # Pick the destination list first; a fact-like sentence that fails the
        # length check is dropped rather than reconsidered as a musing.
        if cites_source or (mentions_entity and not opinionated and abs(polarity) < 0.5):
            destination = facts
        elif opinionated or abs(polarity) > 0.6:
            destination = musings
        else:
            continue

        bullet = raw.replace('\\n', ' ').strip()
        if 20 < len(bullet) < 300:
            destination.append(bullet)

    return facts[:10], musings[:5]

def merge_similar_bullets(bullets, threshold=0.7):
    """Greedily merge near-duplicate bullets using fuzzy token-sort matching.

    Each not-yet-merged bullet is compared with the bullets after it; those
    whose token-sort ratio (scaled to 0-1) exceeds threshold join its group,
    and the longest bullet of the group is kept.

    Returns:
        Tuple (merged, similarity_scores): the kept bullets and the ratios of
        every pair that was merged. Inputs with at most one bullet are
        returned unchanged with an empty score list.
    """
    if len(bullets) <= 1:
        return bullets, []

    kept = []
    absorbed = set()
    merge_ratios = []

    for anchor_idx, anchor in enumerate(bullets):
        if anchor_idx in absorbed:
            continue

        group = [anchor]
        for other_idx in range(anchor_idx + 1, len(bullets)):
            if other_idx in absorbed:
                continue

            candidate = bullets[other_idx]
            score = fuzz.token_sort_ratio(anchor, candidate) / 100.0
            if score > threshold:
                group.append(candidate)
                absorbed.add(other_idx)
                merge_ratios.append(score)

        # The longest wording of a group is assumed to carry the most detail
        kept.append(max(group, key=len))
        absorbed.add(anchor_idx)

    return kept, merge_ratios

def generate_article_with_gemini(bullets, musings, cluster_name):
    """Ask Gemini for a neutral article built from fact bullets plus a commentary section.

    Empty bullets or musings are replaced by placeholder sentences in the
    prompt. Any exception raised while building or sending the request is
    converted into a string that starts with "Error generating article: ".
    """
    try:
        gemini = genai.GenerativeModel('gemini-2.5-flash')

        if bullets:
            bullets_text = " • ".join(bullets)
        else:
            bullets_text = "No specific facts available."

        if musings:
            musings_text = " • ".join(musings)
        else:
            musings_text = "No opinions or commentary available."

        prompt = f"""Generate a desensationalized news article about {cluster_name} from these merged fact bullets:

FACTS:
{bullets_text}

Write in neutral, factual journalistic style. Stick to the facts provided.
After the main article, add a brief "Commentary & Analysis" section with these musings:
{musings_text}

Format the output with clear sections and professional structure."""

        reply = gemini.generate_content(prompt)
        return reply.text
    except Exception as exc:
        return "Error generating article: " + str(exc)

# ============================================
# MAIN PIPELINE
# ============================================

# Sidebar for input
# Everything in this block is rendered inside the Streamlit sidebar.
with st.sidebar:
    st.header("📥 Input Articles")

    # Sample data generator
    # Clicking the button stores a pretty-printed JSON copy of the hard-coded
    # articles below in session state under 'sample_json'.
    if st.button("Generate Sample Data"):
        sample_data = [
  {"title":"You Won’t Believe What Governor Silverstone Is Hiding!","content":"BREAKING: Progressive Beacon Daily has uncovered SHOCKING evidence that Governor Silverstone’s secret offshore accounts teem with illicit payoffs from corporate lobbyists! Documents obtained by our insider reveal hidden transactions totaling MILLIONS funneled through shell companies. Critics say this could spell the end of his political career. If you care about TRANSPARENCY, you NEED to read this exposé before it’s buried forever!"},
  {"title":"Is Silverstone the Most Corrupt Governor Ever?","content":"In a STUNNING revelation, Centrist Times reports Governor Silverstone’s office allegedly processed suspicious wire transfers linked to big-energy giants. Sources claim these funds influenced critical environmental votes. Lawmakers are demanding a full inquiry—could this be the biggest scandal in state history? Our exclusive analysis breaks down every transaction and political ramification. You won’t believe how deep this rabbit hole goes!"},
  {"title":"Expose: Silverstone’s Shady Deals Threaten Our Values","content":"Conservative Watchdog News warns that Governor Silverstone’s dereliction of duty isn’t just immoral—it’s PATRIOTIC BETRAYAL! Leaked financial ledgers allegedly show collaboration with radical green groups aiming to dismantle traditional industries. Experts fear these payoffs will cost thousands of jobs and undermine national security. Lawmakers are mobilizing to strip him of office. Don’t miss this fiery breakdown of treasonous politics!"},
  {"title":"Incredible Breakthrough: Scientists Harness Sunlight Like Never Before!","content":"Progressive Beacon Daily celebrates a MIND-BLOWING invention: researchers at Meridian Institute have developed solar panels that convert 90% of sunlight into energy! This could CRUSH the fossil fuel industry and save the planet. Testing shows devices working under low-light conditions—EVERY home can go green. Environmentalists call it the TECHNOLOGY of the century. Find out how this revolution could slash your bills to zero!"},
  {"title":"Solar Miracle Poised to Rewire Energy Market","content":"Centrist Times reports that a team at Meridian Institute unveiled ultra-efficient solar cells boosting energy conversion rates by 50%. Investors are already lining up to fund mass production. Officials say this could stabilize electricity prices and reduce carbon emissions dramatically. Our experts break down what this means for everyday consumers and the global energy landscape. Could this be the energy shift we’ve all waited for?"},
  {"title":"New Solar Tech Sparks Fears of Industrial Collapse","content":"Conservative Watchdog News ALERT: Meridian Institute’s latest solar innovation threatens to DESTROY American manufacturing! Reports indicate the technology could decimate traditional energy sectors, costing millions of jobs. Critics argue the government will FORCE companies to adopt this UNTESTED system, undermining economic stability. Industry leaders are mobilizing to resist—read our fiery take on how radical science is on track to wreck livelihoods."},
  {"title":"Hollywood’s Biggest Star Files for Divorce—MUST-SEE Details!","content":"Progressive Beacon Daily exposes the intimate details behind A-list actor Jordan Calibre’s shocking divorce filing from indie director Riley West. Sources say Calibre cited “irreconcilable creative differences,” but rumors of infidelity swirl like wildfire! Friends claim West discovered damning text messages. Our exclusive interviews delve into every tearful confrontation and trust-shattering betrayal, plus what it means for Calibre’s upcoming blockbuster release."},
  {"title":"Celebrity Split Shocks Fans Worldwide","content":"Centrist Times reveals actor Jordan Calibre has petitioned for divorce from Riley West after a decade-long marriage. While the pair released a joint statement emphasizing mutual respect, insiders hint at deep artistic disagreements and financial disputes. We break down the timeline of their relationship, the terms of their prenuptial agreement, and what this could mean for their sprawling media empire."},
  {"title":"Star Divorce: Hollywood’s Moral Decay Exposed","content":"Conservative Watchdog News decries Jordan Calibre’s divorce from Riley West as yet another symbol of Hollywood’s crumbling moral fabric. Sources allege West’s radical ideology clashed with Calibre’s family values, prompting this public split. Experts warn this trend undermines societal cohesion. Our explosive report uncovers behind-the-scenes drama, lavish alimony demands, and the culture-war stakes at play in Tinseltown’s latest breakup."},
  {"title":"SKY ALERT: Mysterious Comet Heads Straight for Earth!","content":"Progressive Beacon Daily warns: NASA scientists have detected Comet Talora hurtling toward Earth at BREAKNECK speed! Groundbreaking telescopes estimate a collision chance of 2%. While experts urge calm, conspiracy theorists speculate involvement of secret government satellites. Will we see a celestial spectacle—or total annihilation? Our live updates and expert interviews guide you through every astronomical twist before it’s too late!"},
  {"title":"Comet Talora: Real Risk or Media Circus?","content":"Centrist Times highlights recent NASA data on Comet Talora, currently 70 million km away and tracking a near-Earth trajectory. Officials place the impact probability at less than 1%, forecasting a dazzling sky show rather than disaster. We clarify scientific jargon, weigh expert assessments, and outline safe viewing protocols. Learn what the public should REALLY know amid the swirling cosmic hype."},
  {"title":"Armageddon Incoming? Comet Talora Doom Predictions!","content":"Conservative Watchdog News screams ALERT: Comet Talora might be God’s final judgment on a morally bankrupt world! Prepper communities stockpile supplies as the celestial object grows ominously bright. Though NASA insists there’s “no cause for alarm,” regional pastors call for national prayer days. Could this be the sign we’ve ignored for too long? Discover how this cosmic visitor might expose society’s spiritual failings!"},
  {"title":"Hospital Crisis: ERs Drowning in Patients—MUST READ!","content":"Progressive Beacon Daily uncovers a nationwide EMERGENCY as public hospitals report 200% ER capacity surges amid unprecedented flu and COVID-variant outbreaks. Frontline nurses sound the alarm on staff shortages and dwindling medical supplies. Patients wait HOURS for care. Health advocates demand major funding overhauls to save lives. Our exclusive testimonies reveal heartbreaking stories behind overcrowded wards and the real human cost you won’t believe!"},
  {"title":"ER Overload: What You Need to Know","content":"Centrist Times examines the current strain on emergency departments across the country, attributing it to overlapping flu, COVID-19, and RSV seasons. Hospitals report bed shortages and extended wait times. Officials propose federal grants and rapid staffing incentives to alleviate pressure. We analyze policy options, compare regional responses, and provide practical tips for seeking timely medical attention during the crisis."},
  {"title":"Hospitals on Brink: Government Failures EXPOSED","content":"Conservative Watchdog News BLASTS federal mandates for causing ER meltdowns, with hospitals forced to treat unlawful migrants and non-citizens, leaving locals to suffer. Staff report shutdown threats if they refuse care. Citizens face life-or-death delays while bureaucrats bicker. This is a TAXPAYER SCANDAL! Our fiery investigation names the officials responsible and outlines the radical reforms needed to save American healthcare."},
  {"title":"School’s New AI Pronoun Rules Spark Controversy!","content":"Progressive Beacon Daily reveals TechForward Academy’s radical introduction of AI-driven pronoun enforcement—a digital system that auto-corrects speech for inclusivity! Students are seeing real-time alerts and mandatory sensitivity training. Advocates hail it as the FUTURE of respect; critics decry Orwellian overreach. Our in-depth look explores student reactions, privacy concerns, and the impact on classroom culture you can’t afford to miss."},
  {"title":"AI Pronoun Tool: Balanced Perspectives","content":"Centrist Times reports that TechForward Academy is piloting an AI pronoun-assistant to promote inclusivity. The system flags misgendering and offers immediate guidance. Supporters argue it fosters empathy, while detractors question data security and free speech implications. We present viewpoints from educators, legal experts, and parent groups, plus a side-by-side analysis of the program’s benefits and potential pitfalls."},
  {"title":"School’s AI Pronoun Police: Free Speech Under Siege?","content":"Conservative Watchdog News warns TechForward Academy’s AI pronoun enforcement is the latest step toward THOUGHT CONTROL in schools! Students risk penalties for ‘unintentional’ speech errors, and faculty face dismissal if they push back. This techno-authoritarian nightmare could spread nationwide, stifling dissent and bending education to radical ideology. Read our blistering critique on how AI is weaponized against fundamental liberties!"}
]
        st.session_state.sample_json = json.dumps(sample_data, indent=2)

    # The text area is pre-filled from 'sample_json' when that key exists,
    # otherwise it starts empty.
    articles_json = st.text_area(
        "Paste JSON array of articles:",
        value=st.session_state.get('sample_json', ''),
        height=300,
        help='Format: [{"title": "...", "content": "..."}, ...]'
    )

    # Read further down the script to decide whether to run the pipeline
    process_button = st.button("🚀 Process Articles", type="primary", use_container_width=True)

# Main content area with tabs
# Interacting with a widget inside the results (e.g. the cluster slider) reruns
# the script with the button reading False. current_stage is set to 1 once the
# input has been parsed, so it is used to keep the results on screen.
pipeline_requested = process_button or st.session_state.get('current_stage', 0) >= 1

if pipeline_requested and articles_json:
    try:
        import html  # Local import: escapes text before it is embedded in raw HTML

        articles = json.loads(articles_json)

        # Fail early with a readable message instead of a KeyError/IndexError later on
        if not isinstance(articles, list) or not articles:
            raise ValueError("Input must be a non-empty JSON array of articles.")
        for position, entry in enumerate(articles, 1):
            is_valid_entry = (
                isinstance(entry, dict)
                and isinstance(entry.get('title'), str)
                and isinstance(entry.get('content'), str)
            )
            if not is_valid_entry:
                raise ValueError(f"Article {position} must be an object with string 'title' and 'content' fields.")

        st.session_state.articles = articles
        st.session_state.current_stage = 1

        # Create tabs for different stages
        tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
            "1️⃣ Preprocessing",
            "2️⃣ Clustering",
            "3️⃣ Topic Extraction",
            "4️⃣ Fact & Musing Extraction",
            "5️⃣ Deduplication",
            "6️⃣ Final Articles"
        ])

        # Tab 1: Preprocessing
        with tab1:
            st.header("📋 Data Preprocessing")

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total Articles", len(articles))
            with col2:
                avg_length = np.mean([len(a['content']) for a in articles])
                st.metric("Avg Article Length", f"{avg_length:.0f} chars")
            with col3:
                st.metric("Processing Time", "< 1 min")

            with st.spinner("Cleaning and preprocessing text..."):
                processed_texts = []
                original_texts = []
                titles = []

                progress_bar = st.progress(0)
                for i, article in enumerate(articles):
                    combined_text = article['title'] + " " + article['content']
                    processed = preprocess_text(combined_text, nlp)
                    processed_texts.append(processed)
                    original_texts.append(combined_text)
                    titles.append(article['title'])
                    progress_bar.progress((i + 1) / len(articles))

                st.success(f"✅ Preprocessed {len(processed_texts)} articles")

                # Show sample
                with st.expander("View Sample Preprocessed Text"):
                    st.text("Original:")
                    st.write(original_texts[0][:500])
                    st.text("Processed:")
                    st.write(processed_texts[0][:500])

        # Tab 2: Clustering
        with tab2:
            st.header("🎯 Article Clustering")

            with st.spinner("Generating embeddings and clustering..."):
                # Generate embeddings
                embeddings = extract_embeddings(original_texts, sentence_model)

                # Cluster
                # Keep the slider inside what KMeans accepts (at most one cluster
                # per article) and make sure the default is not below the minimum.
                max_clusters = min(15, len(articles))
                min_clusters = min(3, max_clusters)
                if min_clusters < max_clusters:
                    default_clusters = max(min_clusters, min(7, len(articles) // 3))
                    n_clusters = st.slider("Number of clusters:", min_clusters, max_clusters, default_clusters)
                else:
                    # Too few articles for a meaningful range, so no slider is shown
                    n_clusters = max_clusters
                clusters, kmeans_model = cluster_articles(embeddings, processed_texts, n_clusters)

                # Visualize clusters
                col1, col2 = st.columns(2)

                with col1:
                    # Cluster distribution
                    cluster_counts = pd.Series(clusters).value_counts().sort_index()
                    fig_dist = px.bar(
                        x=cluster_counts.index,
                        y=cluster_counts.values,
                        labels={'x': 'Cluster ID', 'y': 'Number of Articles'},
                        title="Cluster Size Distribution"
                    )
                    st.plotly_chart(fig_dist, use_container_width=True, key="cluster_distribution_chart")

                with col2:
                    # PCA visualization (a 2-component projection needs at least 2 articles)
                    if len(articles) >= 2:
                        pca = PCA(n_components=2)
                        embeddings_2d = pca.fit_transform(embeddings)

                        fig_pca = px.scatter(
                            x=embeddings_2d[:, 0],
                            y=embeddings_2d[:, 1],
                            color=clusters,
                            title="Cluster Visualization (PCA)",
                            labels={'x': 'Component 1', 'y': 'Component 2'},
                            color_continuous_scale='Viridis'
                        )
                        st.plotly_chart(fig_pca, use_container_width=True, key="pca_visualization_chart")
                    else:
                        st.info("At least 2 articles are needed for the PCA plot.")

                st.success(f"✅ Created {n_clusters} story clusters")

        # Tab 3: Topic Extraction
        with tab3:
            st.header("🏷️ Cluster Naming via Topic Modeling")

            with st.spinner("Extracting topics with LDA..."):
                cluster_names = name_clusters_lda(processed_texts, clusters)

                # Display cluster names with word clouds
                cols = st.columns(3)
                for idx, (cluster_id, name) in enumerate(cluster_names.items()):
                    with cols[idx % 3]:
                        st.subheader(f"Cluster {cluster_id}: {name}")

                        # Get texts for this cluster
                        cluster_texts = [processed_texts[i] for i in range(len(processed_texts))
                                       if clusters[i] == cluster_id]

                        # Generate mini word cloud
                        if cluster_texts:
                            text_combined = " ".join(cluster_texts)
                            wordcloud = WordCloud(width=300, height=200, background_color='white').generate(text_combined)

                            fig, ax = plt.subplots(figsize=(4, 3))
                            ax.imshow(wordcloud, interpolation='bilinear')
                            ax.axis('off')
                            st.pyplot(fig)
                            # Close this specific figure so figures do not accumulate
                            plt.close(fig)

                            st.caption(f"Articles: {len(cluster_texts)}")

        # Tab 4: Fact & Musing Extraction
        with tab4:
            st.header("📝 Fact Bullet & Musing Extraction")

            all_cluster_data = {}

            for cluster_id in range(n_clusters):
                # Named so it does not rebind the cluster_articles() function
                cluster_article_texts = [original_texts[i] for i in range(len(original_texts))
                                         if clusters[i] == cluster_id]

                if not cluster_article_texts:
                    continue

                with st.expander(f"Cluster {cluster_id}: {cluster_names.get(cluster_id, 'Unknown')}"):
                    all_facts = []
                    all_musings = []

                    # Extract from each article in cluster
                    for article in cluster_article_texts[:10]:  # Limit for speed
                        facts, musings = extract_facts_and_musings(article, nlp)
                        all_facts.extend(facts)
                        all_musings.extend(musings)

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("📌 Extracted Facts")
                        for i, fact in enumerate(all_facts[:5], 1):
                            st.write(f"{i}. {fact}")
                        st.metric("Total Facts Extracted", len(all_facts))

                    with col2:
                        st.subheader("💭 Extracted Musings")
                        for i, musing in enumerate(all_musings[:5], 1):
                            st.write(f"{i}. {musing}")
                        st.metric("Total Musings Extracted", len(all_musings))

                    all_cluster_data[cluster_id] = {
                        'facts': all_facts,
                        'musings': all_musings,
                        'name': cluster_names.get(cluster_id, f"Cluster {cluster_id}")
                    }

        # Tab 5: Deduplication
        with tab5:
            st.header("🔄 Bullet Deduplication & Merging")

            for cluster_id, data in all_cluster_data.items():
                with st.expander(f"Cluster {cluster_id}: {data['name']}"):
                    original_facts = data['facts']
                    merged_facts, similarity_scores = merge_similar_bullets(original_facts)

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Original Facts", len(original_facts))
                    with col2:
                        st.metric("After Merging", len(merged_facts))
                    with col3:
                        reduction = (1 - len(merged_facts)/max(len(original_facts), 1)) * 100
                        st.metric("Reduction", f"{reduction:.1f}%")

                    if similarity_scores:
                        # Histogram of the ratios that triggered a merge
                        st.subheader("Similarity Distribution")
                        fig_sim = px.histogram(
                            similarity_scores,
                            nbins=20,
                            labels={'value': 'Similarity Score', 'count': 'Frequency'},
                            title="Distribution of Similarity Scores"
                        )
                        st.plotly_chart(fig_sim, use_container_width=True, key=f"similarity_histogram_{cluster_id}")

                    # Update data with merged facts
                    data['merged_facts'] = merged_facts

        # Tab 6: Final Articles
        with tab6:
            st.header("📰 Desensationalized Articles")

            # Results now persist across reruns, so reuse generated text instead
            # of calling Gemini again on every widget interaction.
            if 'generated_articles' not in st.session_state:
                st.session_state.generated_articles = {}
            generated_articles = st.session_state.generated_articles

            for cluster_id, data in all_cluster_data.items():
                st.subheader(f"Story: {data['name']}")

                final_facts = data.get('merged_facts', data['facts'])
                cache_key = (data['name'], tuple(final_facts), tuple(data['musings']))

                if cache_key in generated_articles:
                    article = generated_articles[cache_key]
                else:
                    with st.spinner(f"Generating article for {data['name']}..."):
                        article = generate_article_with_gemini(
                            final_facts,
                            data['musings'],
                            data['name']
                        )
                    # Failed generations are not cached so the next rerun retries them
                    if not article.startswith("Error generating article:"):
                        generated_articles[cache_key] = article

                # The cluster name and the model output derive from pasted input,
                # so escape both before placing them inside unsafe_allow_html markup.
                safe_name = html.escape(data['name'])
                safe_article = html.escape(article).replace(chr(10), '<br>')
                article_count = sum(1 for c in clusters if c == cluster_id)

                # Display in a nice format
                with st.container():
                    st.markdown(f"""
                    <div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; margin: 10px 0;">
                        <h3 style="color: #1E3A8A;">{safe_name}</h3>
                        <div style="margin-top: 15px; line-height: 1.6;">
                            {safe_article}
                        </div>
                        <div style="margin-top: 15px; padding-top: 15px; border-top: 1px solid #ddd;">
                            <small style="color: #666;">
                                📊 Based on {len(final_facts)} fact bullets from {article_count} articles
                            </small>
                        </div>
                    </div>
                    """, unsafe_allow_html=True)

                st.divider()

            # Summary metrics
            st.success("✅ Processing Complete!")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Total Articles Processed", len(articles))
            with col2:
                st.metric("Story Clusters Created", n_clusters)
            with col3:
                total_facts = sum(len(d.get('merged_facts', [])) for d in all_cluster_data.values())
                st.metric("Total Facts Extracted", total_facts)
            with col4:
                st.metric("Final Articles Generated", len(all_cluster_data))

    except json.JSONDecodeError:
        st.error("❌ Invalid JSON format. Please check your input.")
    except Exception as e:
        # Top-level UI boundary: show the failure instead of a raw traceback
        st.error(f"❌ Error: {str(e)}")

else:
    # Landing page
    st.info("👈 Please paste your article JSON in the sidebar and click 'Process Articles' to begin.")

    # Demo overview
    st.header("🎯 Pipeline Overview")

    pipeline_steps = [
        ("1️⃣ Input", "Paste JSON array of news articles"),
        ("2️⃣ Preprocess", "Clean text using spaCy NLP"),
        ("3️⃣ Embed & Cluster", "Group similar stories using sentence transformers + KMeans"),
        ("4️⃣ Name Clusters", "Extract topics using LDA topic modeling"),
        ("5️⃣ Extract Facts", "Separate facts from opinions using NER + rules"),
        ("6️⃣ Deduplicate", "Merge similar bullets using fuzzy matching"),
        ("7️⃣ Generate", "Create neutral articles using Gemini AI")
    ]

    cols = st.columns(len(pipeline_steps))
    for i, (step, desc) in enumerate(pipeline_steps):
        with cols[i]:
            st.markdown(f"**{step}**")
            st.caption(desc)

    # Key features
    st.header("✨ Key Features")
    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("""
        **🤖 Advanced AI Stack**
        - Sentence Transformers for embeddings
        - Multi-stage clustering pipeline
        - LDA topic modeling
        - Gemini 2.5 Flash for generation
        """)

    with col2:
        st.markdown("""
        **📊 Rich Visualizations**
        - Cluster distribution charts
        - PCA embeddings plot
        - Word clouds for topics
        - Similarity heatmaps
        """)

    with col3:
        st.markdown("""
        **⚡ Production Ready**
        - Batch processing for scale
        - GPU acceleration support
        - Efficient deduplication
        - Real-time processing metrics
        """)
'''

# Write the app file
# app_code contains emoji and typographic quotes, so write it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("✅ App file created successfully!")

# ============================================
# LAUNCH STREAMLIT WITH NGROK
# ============================================

import subprocess
from threading import Thread
import time

def run_streamlit():
    """Start the Streamlit server for app.py on port 8501 and wait for it to exit."""
    launch_command = ["streamlit", "run", "app.py", "--server.port", "8501"]
    subprocess.run(launch_command)

# Start Streamlit in background thread
# Daemon thread, so it does not keep the interpreter alive on its own.
thread = Thread(target=run_streamlit)
thread.daemon = True
thread.start()

# Wait for Streamlit to start
time.sleep(5)

# Create tunnel
# Recent pyngrok versions return an NgrokTunnel object from connect(); the URL
# string lives on its public_url attribute. getattr keeps this working if a
# plain string is returned instead.
tunnel = ngrok.connect(8501)
public_url = getattr(tunnel, "public_url", tunnel)
print("=" * 50)
print("🚀 Streamlit app is running!")
print(f"📱 Public URL: {public_url}")
print(f"🔗 Click here to open: {public_url}")
print("=" * 50)
print("\n⚠️ Note: First load may take a minute to cache models")
print("💡 Tip: Use the 'Generate Sample Data' button to test quickly")

# Keep the tunnel open
# Blocks the cell until it is interrupted, then closes the tunnel by its URL.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\n🛑 Shutting down tunnel...")
    ngrok.disconnect(public_url)