<a href="https://colab.research.google.com/github/JeevithaR3/Online_News/blob/main/BDA_Mini_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q pyspark==3.5.1


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m317.0/317.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("NewsClassification") \
    .master("local[*]") \
    .config("spark.sql.catalogImplementation", "in-memory") \
    .getOrCreate()

spark


In [5]:
df = spark.read.json("/content/News_Category_Dataset_v3.json")
df.printSchema()
df.show(5)


root
 |-- authors: string (nullable = true)
 |-- category: string (nullable = true)
 |-- date: string (nullable = true)
 |-- headline: string (nullable = true)
 |-- link: string (nullable = true)
 |-- short_description: string (nullable = true)

+--------------------+---------+----------+--------------------+--------------------+--------------------+
|             authors| category|      date|            headline|                link|   short_description|
+--------------------+---------+----------+--------------------+--------------------+--------------------+
|Carla K. Johnson, AP|U.S. NEWS|2022-09-23|Over 4 Million Am...|https://www.huffp...|Health experts sa...|
|      Mary Papenfuss|U.S. NEWS|2022-09-23|American Airlines...|https://www.huffp...|He was subdued by...|
|       Elyse Wanshel|   COMEDY|2022-09-23|23 Of The Funnies...|https://www.huffp...|"Until you have a...|
|    Caroline Bologna|PARENTING|2022-09-23|The Funniest Twee...|https://www.huffp...|"Accidentally put...|
|    

In [6]:
from pyspark.sql import functions as F

df = df.withColumn(
    "text",
    F.concat_ws(" ", F.col("headline"), F.col("short_description"))
).select("text", "category")

df.show(5, truncate=80)


+--------------------------------------------------------------------------------+---------+
|                                                                            text| category|
+--------------------------------------------------------------------------------+---------+
|Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters ...|U.S. NEWS|
|American Airlines Flyer Charged, Banned For Life After Punching Flight Attend...|U.S. NEWS|
|23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23) "Until ...|   COMEDY|
|The Funniest Tweets From Parents This Week (Sept. 17-23) "Accidentally put gr...|PARENTING|
|Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer...|U.S. NEWS|
+--------------------------------------------------------------------------------+---------+
only showing top 5 rows



In [7]:
# count categories
category_counts = df.groupBy("category").count().orderBy("count")
category_counts.show(200)

num_classes = category_counts.count()
target_per_class = int(70000 / num_classes)
print("Rows per class:", target_per_class)

from pyspark.sql import DataFrame

balanced_list = []

for cat in category_counts.select("category").toPandas()["category"]:
    sample_df = df.filter(F.col("category") == cat).limit(target_per_class)
    balanced_list.append(sample_df)

balanced_df = balanced_list[0]
for d in balanced_list[1:]:
    balanced_df = balanced_df.union(d)

balanced_df.count()


+--------------+-----+
|      category|count|
+--------------+-----+
|      WEDDINGS|    2|
|       COLLEGE|   12|
| THE WORLDPOST|   19|
|CULTURE & ARTS|   44|
|         MONEY|   49|
|       SCIENCE|   88|
|ARTS & CULTURE|   89|
|     EDUCATION|  112|
|     PARENTING|  114|
|  FOOD & DRINK|  114|
|      WELLNESS|  118|
|          TECH|  119|
|   ENVIRONMENT|  121|
| HOME & LIVING|  125|
|        TRAVEL|  152|
|         TASTE|  155|
|STYLE & BEAUTY|  165|
|      RELIGION|  176|
| LATINO VOICES|  193|
|         GREEN|  224|
|        IMPACT|  249|
|      BUSINESS|  253|
|         STYLE|  260|
|       PARENTS|  360|
|HEALTHY LIVING|  392|
|         CRIME|  466|
|    WEIRD NEWS|  467|
|         WOMEN|  608|
|         MEDIA|  648|
|        SPORTS|  680|
|  BLACK VOICES|  717|
|  QUEER VOICES|  915|
|        COMEDY| 1009|
|     U.S. NEWS| 1377|
|    WORLD NEWS| 2380|
| ENTERTAINMENT| 4076|
|      POLITICS| 8939|
+--------------+-----+

Rows per class: 1891


16265

In [8]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")
stop_remove = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tf = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
label_indexer = StringIndexer(inputCol="category", outputCol="label")

lr = LogisticRegression(maxIter=20, regParam=0.05)

pipeline = Pipeline(stages=[tokenizer, stop_remove, tf, idf, label_indexer, lr])


In [9]:
train, test = balanced_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)
print("Training complete!")


Training complete!


In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

pred = model.transform(test)

acc = MulticlassClassificationEvaluator(metricName="accuracy").evaluate(pred)
f1  = MulticlassClassificationEvaluator(metricName="f1").evaluate(pred)

print("Accuracy:", acc)
print("F1 Score:", f1)


Accuracy: 0.5124847746650426
F1 Score: 0.48936706911452155


In [11]:
stream_df = balanced_df.orderBy(F.rand()).limit(20)
stream_pred = model.transform(stream_df)

stream_pred.select("text", "category", "prediction").show(truncate=100)


+----------------------------------------------------------------------------------------------------+-------------+----------+
|                                                                                                text|     category|prediction|
+----------------------------------------------------------------------------------------------------+-------------+----------+
|Landmark California Pet Store Ban Treats Animals As Pets, Not Products Pets aren‚Äôt appliances. Th...|       IMPACT|      15.0|
|GOP Senators Refuse To Rule Out Supporting Donald Trump Again ‚Äî Even If He's Indicted With the ex...|     POLITICS|       2.0|
|Robin Roberts Chokes Up In Tribute To 'GMA' Cameraman Who Died From Coronavirus Studio camera ope...|ENTERTAINMENT|       0.0|
|Here's All The Pumpkin-Flavored Alcohol We'll Actually Be Buying This Fall Yes, pumpkin wine is a...|        TASTE|      20.0|
|Why Isla Fisher Steers Away From Offering Parenting Advice The actress and children's book author..

In [12]:
model.write().overwrite().save("/content/news_model")


In [13]:
from pyspark.ml.pipeline import PipelineModel
model_loaded = PipelineModel.load("/content/news_model")


In [14]:
!pip install gradio




In [15]:
import gradio as gr
from pyspark.sql import Row

# function to classify user input
def predict_news(text):
    if not text.strip():
        return "Please enter some text.", {}

    # convert text to spark df
    pdf = spark.createDataFrame([Row(text=text)])

    # run through trained model
    pred = model.transform(pdf).select("prediction", "probability").collect()[0]

    # get label mapping
    labels = model.stages[-2].labels
    predicted_label = labels[int(pred.prediction)]

    # extract probability vector
    probs = {labels[i]: float(pred.probability[i]) for i in range(len(labels))}
    sorted_probs = dict(sorted(probs.items(), key=lambda x: x[1], reverse=True)[:5])

    return predicted_label, sorted_probs


# build gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# üì∞ Real-Time News Category Classifier (PySpark + TF-IDF + LR)")
    gr.Markdown("Enter a headline or full article text, and the model will classify it.")

    text_input = gr.Textbox(label="Enter News Text", placeholder="Example: NASA announces new moon mission...")
    output_label = gr.Label(label="Predicted Category")
    output_probs = gr.JSON(label="Top Confidence Scores")

    predict_btn = gr.Button("Classify")

    predict_btn.click(fn=predict_news,
                      inputs=text_input,
                      outputs=[output_label, output_probs])

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dec4ffca6cd85dc197.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


