In [1]:
!pip install -U sentence-transformers==2.2.2 transformers==4.30.2

Collecting sentence-transformers==2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━

In [2]:
!pip install -q gradio==3.50.2 

In [3]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.6.1

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Collecting scikit-learn==1.6.1
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m103.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.6.1


In [4]:
import gradio as gr
import joblib
import requests
import re
import html
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import plotly.graph_objects as go


# NOTE: joblib.load is pickle-based and can execute code while loading;
# only point these paths at artifacts you trust.

# --- TF-IDF route: a single joblib bundle indexed by "tfidf", "selector", "knn".
TFIDF_PIPELINE_PATH = "/kaggle/input/knn-tf-idf-complete/other/default/1/complete_pipeline.joblib"
tfidf_bundle = joblib.load(TFIDF_PIPELINE_PATH)
tfidf_vectorizer, chi2_selector, knn_tfidf = (
    tfidf_bundle["tfidf"],
    tfidf_bundle["selector"],
    tfidf_bundle["knn"],
)

# --- Embedding route: three separately saved artifacts plus the sentence encoder.
BERT_DIR = "/kaggle/input/knn-bert/other/default/1"
knn_bert, scaler_bert, selector_bert = (
    joblib.load(f"{BERT_DIR}/knn_model.pkl"),
    joblib.load(f"{BERT_DIR}/scaler.pkl"),
    joblib.load(f"{BERT_DIR}/selector.pkl"),
)
bert_model = SentenceTransformer("all-MiniLM-L6-v2")


# Display names for the predicted class ids; ids are 1-based and the dict is
# indexed with the classifier's prediction in classify().
LABEL_MAP = dict(
    enumerate(
        ("World", "Sports", "Business", "Science / Technology"),
        start=1,
    )
)


def clean_text(text: str) -> str:
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order: decode HTML entities, drop ``<...>`` tags, drop
    ``http(s)://`` and ``www.`` URLs, turn every character that is not an
    ASCII letter or whitespace into a space, collapse whitespace runs, then
    strip and lowercase.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = html.unescape(text)
    # (pattern, replacement) pairs; the order matters because tags and URLs
    # must be removed before punctuation is blanked out.
    substitutions = (
        (r"<.*?>", ""),
        (r"https?://\S+|www\.\S+", ""),
        (r"[^a-zA-Z\s]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

def build_text(title, desc):
    """Join title and description with one space and run ``clean_text`` on it.

    Args:
        title: Article title.
        desc: Article description.

    Returns:
        The cleaned, combined text.
    """
    combined = f"{title} {desc}"
    return clean_text(combined)

def extract_article(url):
    """Fetch a web page and pull out its title and description.

    Args:
        url: Address of the article page; passed to ``requests.get``.

    Returns:
        A ``(title, description)`` tuple of stripped strings. Either element
        is ``""`` when the page does not provide it.

    Raises:
        requests.RequestException: If the request fails or times out, or the
            server answers with an error status (via ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Tag.string is None when <title> is empty or contains nested markup, so
    # calling .strip() on it would raise; get_text() always returns a str.
    title = soup.title.get_text().strip() if soup.title else ""

    # Prefer the standard description tag, then fall back to Open Graph.
    # .get() avoids a KeyError on <meta> tags that lack a content attribute.
    desc = ""
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        meta = soup.find("meta", attrs=attrs)
        content = meta.get("content") if meta else None
        if content and content.strip():
            desc = content.strip()
            break

    return title, desc


def toggle_inputs(mode):
    """Build visibility updates for three components based on the input mode.

    Args:
        mode: Selected input mode; ``"URL"`` shows the first component and
            hides the other two, any other value does the opposite.

    Returns:
        A 3-tuple of ``gr.update`` results.
    """
    url_mode = mode == "URL"
    manual_mode = not url_mode
    return (
        gr.update(visible=url_mode),
        gr.update(visible=manual_mode),
        gr.update(visible=manual_mode),
    )


def get_top_tfidf_words(text, tfidf_vectorizer, chi2_selector, top_n=10):
    """Return the highest-weighted selected terms for one document.

    Args:
        text: Document text, passed to ``tfidf_vectorizer.transform`` as a
            one-element list.
        tfidf_vectorizer: Object providing ``transform`` and
            ``get_feature_names_out``.
        chi2_selector: Object providing ``transform`` and ``get_support``.
        top_n: Maximum number of terms to return.

    Returns:
        A list of ``(term, weight)`` pairs in descending weight order,
        containing only terms whose weight is greater than zero.
    """
    tfidf_row = tfidf_vectorizer.transform([text])
    weights = chi2_selector.transform(tfidf_row).toarray().flatten()

    # Names of the features kept by the selector, aligned with `weights`.
    all_terms = tfidf_vectorizer.get_feature_names_out()
    kept_terms = all_terms[chi2_selector.get_support()]

    ranked_positions = weights.argsort()[::-1][:top_n]
    top_words = []
    for pos in ranked_positions:
        if weights[pos] > 0:
            top_words.append((kept_terms[pos], weights[pos]))
    return top_words

def tfidf_word_importance_plot(top_words):
    """Draw a bar chart of term weights.

    Args:
        top_words: Sequence of ``(word, score)`` pairs.

    Returns:
        A Plotly figure with one bar per word, each labelled with its score
        to two decimals; an empty figure when ``top_words`` is empty.
    """
    if top_words:
        labels = tuple(word for word, _ in top_words)
        heights = tuple(score for _, score in top_words)
        bars = go.Bar(
            x=labels,
            y=heights,
            text=[f"{v:.2f}" for v in heights],
            textposition="auto",
        )
        figure = go.Figure(bars)
        figure.update_layout(
            title="Top contributing words (TF-IDF)",
            margin=dict(t=50, b=10, l=10, r=10),
        )
        return figure
    return go.Figure()

def classify(input_mode, url, title, desc, method):
    """Classify an article and return the values for the five output widgets.

    Args:
        input_mode: ``"URL"`` fetches title/description from ``url``; any
            other value uses the ``title`` and ``desc`` arguments as given.
        url: Article address (used only in URL mode).
        title: Article title (used only outside URL mode).
        desc: Article description (used only outside URL mode).
        method: ``"TF-IDF + KNN"`` selects the TF-IDF pipeline; any other
            value selects the sentence-embedding pipeline.

    Returns:
        A 5-tuple ``(label_or_message, title, description, top_words_text,
        figure)``. On a validation problem the first element is a prompt for
        the user and the figure is ``None``; on an unexpected exception the
        first element is ``"Error: ..."`` and the figure is empty.
    """
    try:
        if input_mode == "URL":
            # `or ""` guards against a None value from the UI component.
            if not (url or "").strip():
                return "Please enter a URL", "", "", "", None
            title, desc = extract_article(url.strip())
        else:
            if not (title or "").strip() or not (desc or "").strip():
                return "Please enter Title and Description", "", "", "", None

        text = build_text(title, desc)
        if not text:
            # Nothing survived cleaning (page without title/description, or
            # input with no ASCII letters); predicting on an empty document
            # would return an arbitrary label, so report it instead.
            return (
                "No usable text found in the title/description",
                title,
                desc,
                "",
                None,
            )

        if method == "TF-IDF + KNN":
            X = tfidf_vectorizer.transform([text])
            X = chi2_selector.transform(X)
            pred = knn_tfidf.predict(X)[0]
            top_words = get_top_tfidf_words(text, tfidf_vectorizer, chi2_selector, top_n=10)
            top_words_str = ", ".join([f"{w}:{v:.2f}" for w, v in top_words])
            fig = tfidf_word_importance_plot(top_words)
        else:
            emb = bert_model.encode([text])
            emb = scaler_bert.transform(emb)
            emb = selector_bert.transform(emb)
            pred = knn_bert.predict(emb)[0]
            # No per-word explanation is produced for the embedding route.
            top_words_str = ""
            fig = go.Figure()

        return LABEL_MAP[pred], title, desc, top_words_str, fig

    except Exception as e:
        # Deliberately broad: any failure (network, parsing, model) is shown
        # in the label box instead of breaking the UI callback.
        return f"Error: {str(e)}", "", "", "", go.Figure()


def toggle_tfidf_outputs(method):
    """Build visibility updates for the two TF-IDF-only output components.

    Args:
        method: Selected method; both components are shown only when it
            equals ``"TF-IDF + KNN"``.

    Returns:
        A 2-tuple of ``gr.update`` results with the same visibility flag.
    """
    show_tfidf_details = method == "TF-IDF + KNN"
    return (
        gr.update(visible=show_tfidf_details),
        gr.update(visible=show_tfidf_details),
    )

# -----------------------------
# Gradio UI
# -----------------------------
# Build the two-column demo page: inputs on the left, results on the right.
with gr.Blocks(title="AG News Classification Demo") as demo:
    gr.Markdown("## AG News Classification Demo")
    gr.Markdown("Predict using **TF-IDF + KNN** or **BERT + KNN**. Visualize top contributing words for TF-IDF.")

    with gr.Row():
        # Left column: input mode, article inputs, method choice, submit button.
        with gr.Column(scale=1):
            input_mode = gr.Radio(["URL", "Manual"], value="URL", label="Input Mode")
            # Initial visibility matches the default "URL" mode: the URL box
            # is shown, the manual title/description boxes start hidden.
            url = gr.Textbox(label="Article URL", visible=True)
            title = gr.Textbox(label="Title", visible=False)
            desc  = gr.Textbox(label="Description", lines=5, visible=False)
            method = gr.Radio(["TF-IDF + KNN", "BERT + KNN"], value="TF-IDF + KNN", label="Feature Extraction Method")
            btn = gr.Button("Classify")

        # Right column: the five outputs filled by classify(), in return order.
        with gr.Column(scale=1):
            out_label = gr.Textbox(label="Predicted Category")
            out_title = gr.Textbox(label="Title")
            out_desc  = gr.Textbox(label="Description", lines=5)
            out_words = gr.Textbox(label="Top contributing words (TF-IDF)")
            out_plot = gr.Plot(label="Word Importance Visualization")

    # Switch which input boxes are visible when the input mode changes.
    input_mode.change(toggle_inputs, input_mode, [url, title, desc])
    # Show the word list and plot only while the TF-IDF method is selected.
    method.change(toggle_tfidf_outputs, method, [out_words, out_plot])
    # Run the classifier; outputs map one-to-one onto classify()'s 5-tuple.
    btn.click(classify, [input_mode, url, title, desc, method],
              [out_label, out_title, out_desc, out_words, out_plot])

# Start the app.
demo.launch()


.gitattributes: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512_vnni.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

openvino_model.xml: 0.00B [00:00, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

openvino_model_qint8_quantized.xml: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Running on local URL:  http://127.0.0.1:7860
IMPORTANT: You are using gradio version 3.50.2, however version 4.44.1 is available, please upgrade.
--------
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://aa2d34973eb6b90134.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


