

```
1. Formatting the csv file
```



In [1]:
import pandas as pd

# Load the semicolon-separated data
df = pd.read_csv('labelled_newscatcher_dataset.csv', sep=';')

# Rename the columns
df.columns = ['topic', 'link', 'domain', 'published_date', 'title', 'lang']

# Save to a new CSV
df.to_csv('cleaned_data.csv', index=False)

```
2. Import libraries
```

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, trim, lower
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
)
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

```
3. Text Processing and Cleaning
```

In [3]:
# 1. Initialize Spark session
spark = SparkSession.builder.appName("NewsClassificationModels").getOrCreate()

# 2. Load data
df = spark.read.csv("cleaned_data.csv", header=True, inferSchema=True).fillna({"title": ""})

# 3. Clean the text
df_clean = df.withColumn("title", regexp_replace(col("title"), r"[^a-zA-Z\s]", ""))
df_clean = df_clean.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
df_clean = df_clean.withColumn("title", trim(col("title")))
df_clean = df_clean.withColumn("title", lower(col("title")))

In [4]:
# Showing sample rows
print("Sample rows:")
df.show(5, truncate=False)

Sample rows:
+-------+---------------------------------------------------------------------------------------------------------------------------+--------------+-------------------+----------------------------------------------------------------------------------------------------+----+
|topic  |link                                                                                                                       |domain        |published_date     |title                                                                                               |lang|
+-------+---------------------------------------------------------------------------------------------------------------------------+--------------+-------------------+----------------------------------------------------------------------------------------------------+----+
|SCIENCE|https://www.eurekalert.org/pub_releases/2020-08/dbnl-acl080620.php                                                         |eurekalert.org|2020-08-06 13:

In [5]:
# Printing row count
print(f"Total rows: {df.count()}")

# Assuming 'Category' is the label column - printing distinct categories and their counts
print("Category counts:")
df.groupBy("topic").count().show()

Total rows: 78731
Category counts:
+-------------+-----+
|        topic|count|
+-------------+-----+
|       SPORTS|10135|
|ENTERTAINMENT|10680|
|     BUSINESS| 9673|
|       HEALTH|11853|
|        WORLD|11616|
|   TECHNOLOGY|12103|
|       NATION| 9681|
|      SCIENCE| 2990|
+-------------+-----+



```
4. Checking Different Models
```

In [6]:
from pyspark.ml.classification import RandomForestClassifier, LinearSVC, OneVsRest
import time

# 4. Preprocessing stages
label_indexer = StringIndexer(inputCol="topic", outputCol="label")
tokenizer = Tokenizer(inputCol="title", outputCol="words_token")
remover = StopWordsRemover(inputCol="words_token", outputCol="words_clean")
hashingTF = HashingTF(inputCol="words_clean", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")

# 5. Split data
train_data, test_data = df_clean.randomSplit([0.8, 0.2], seed=42)

# 6. Define evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# 7. Logistic Regression pipeline and evaluation
print("\n🔍 Training Logistic Regression...")
start = time.time()

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lr_pipeline = Pipeline(stages=[
    label_indexer,
    tokenizer,
    remover,
    hashingTF,
    idf,
    lr
])

lr_model = lr_pipeline.fit(train_data)
lr_predictions = lr_model.transform(test_data)
lr_accuracy = evaluator.evaluate(lr_predictions)
elapsed_lr = time.time() - start

print(f"✅ Logistic Regression Accuracy: {lr_accuracy:.4f}")
print(f"⏱️ Time taken: {elapsed_lr:.2f} seconds")

# 8. Naive Bayes pipeline and evaluation
print("\n🔍 Training Naive Bayes...")
start = time.time()

nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")
nb_pipeline = Pipeline(stages=[
    label_indexer,
    tokenizer,
    remover,
    hashingTF,
    idf,
    nb
])

nb_model = nb_pipeline.fit(train_data)
nb_predictions = nb_model.transform(test_data)
nb_accuracy = evaluator.evaluate(nb_predictions)
elapsed_nb = time.time() - start

print(f"✅ Naive Bayes Accuracy: {nb_accuracy:.4f}")
print(f"⏱️ Time taken: {elapsed_nb:.2f} seconds")

# 9. Random Forest pipeline and evaluation
print("\n🌲 Training Random Forest Classifier...")
start = time.time()

rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
rf_pipeline = Pipeline(stages=[
    label_indexer,
    tokenizer,
    remover,
    hashingTF,
    idf,
    rf
])

rf_model = rf_pipeline.fit(train_data)
rf_predictions = rf_model.transform(test_data)
rf_accuracy = evaluator.evaluate(rf_predictions)
elapsed_rf = time.time() - start

print(f"✅ Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"⏱️ Time taken: {elapsed_rf:.2f} seconds")

# 10. Linear SVC pipeline and evaluation
print("\n⚡ Training Linear SVC (One-vs-Rest)...")
start = time.time()

svc = LinearSVC(maxIter=100, regParam=0.1)
ovr = OneVsRest(classifier=svc, featuresCol="features", labelCol="label")
svc_pipeline = Pipeline(stages=[
    label_indexer,
    tokenizer,
    remover,
    hashingTF,
    idf,
    ovr
])

svc_model = svc_pipeline.fit(train_data)
svc_predictions = svc_model.transform(test_data)
svc_accuracy = evaluator.evaluate(svc_predictions)
elapsed_svc = time.time() - start

print(f"✅ Linear SVC Accuracy: {svc_accuracy:.4f}")
print(f"⏱️ Time taken: {elapsed_svc:.2f} seconds")



🔍 Training Logistic Regression...
✅ Logistic Regression Accuracy: 0.7379
⏱️ Time taken: 51.47 seconds

🔍 Training Naive Bayes...
✅ Naive Bayes Accuracy: 0.7293
⏱️ Time taken: 17.32 seconds

🌲 Training Random Forest Classifier...
✅ Random Forest Accuracy: 0.3879
⏱️ Time taken: 116.19 seconds

⚡ Training Linear SVC (One-vs-Rest)...
✅ Linear SVC Accuracy: 0.7371
⏱️ Time taken: 220.64 seconds


```
5. Logistic Regression Model Building and Training
```

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, trim, lower, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF,
    StringIndexer, IndexToString
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

# 1. Initialize Spark session
spark = SparkSession.builder.appName("NewsClassification_LogisticRegression").getOrCreate()

# 2. Load and clean data
df = spark.read.csv("cleaned_data.csv", header=True, inferSchema=True).fillna({"title": ""})
df_clean = df.withColumn("title", regexp_replace(col("title"), r"[^a-zA-Z\s]", " "))
df_clean = df_clean.withColumn("title", regexp_replace(col("title"), r"\s+", " "))
df_clean = df_clean.withColumn("title", trim(lower(col("title"))))

# 3. Define pipeline stages
label_indexer = StringIndexer(inputCol="topic", outputCol="label")
tokenizer = Tokenizer(inputCol="title", outputCol="words_token")
remover = StopWordsRemover(inputCol="words_token", outputCol="words_clean")

# Increase HashingTF numFeatures to 50000
hashingTF = HashingTF(inputCol="words_clean", outputCol="raw_features", numFeatures=100000)

idf = IDF(inputCol="raw_features", outputCol="tfidf_features")

# Logistic Regression
lr = LogisticRegression(featuresCol="tfidf_features", labelCol="label", maxIter=20, regParam=0.3, elasticNetParam=0.0)

# Convert prediction index back to topic label
label_converter = IndexToString(inputCol="prediction", outputCol="predicted_topic", labels=label_indexer.fit(df_clean).labels)

# 4. Train-test split
train_data, val_data = df_clean.randomSplit([0.8, 0.2], seed=42)

# 5. Build the pipeline
pipeline = Pipeline(stages=[
    label_indexer,
    tokenizer,
    remover,
    hashingTF,
    idf,
    lr,
    label_converter
])

# 6. Train and evaluate
print("\n🔍 Training model: Logistic Regression + HashingTF")
start = time.time()

model = pipeline.fit(train_data)
predictions = model.transform(val_data)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
elapsed = time.time() - start

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"⏱️ Time taken: {elapsed:.2f} seconds")


🔍 Training model: Logistic Regression + HashingTF
✅ Accuracy: 0.7964
⏱️ Time taken: 61.98 seconds


```
6. Model Prediction with Custom News Headline
```

In [8]:
from pyspark.sql import Row
from pyspark.sql.functions import lit

new_text = "5 symptoms of colon cancer that should not be ignored"
new_df = spark.createDataFrame([Row(Text=new_text)])

# 2. Clean the text (same steps as training data)
new_df = new_df.withColumn("title", regexp_replace(col("Text"), r"[^a-zA-Z\s]", ""))
new_df = new_df.withColumn("title", regexp_replace(col("Text"), r"\s+", " "))

# 3. Transform using the trained model pipeline
prediction_result = model.transform(new_df)

# 4. Show prediction
prediction_result.select("Text", "prediction").show()

# Get the original labels from the label indexer (it's the first stage in the pipeline)
labels = model.stages[0].labels

# Get the prediction value
predicted_index = prediction_result.select("prediction").first()["prediction"]

# Map index to actual category
predicted_category = labels[int(predicted_index)]

print(f"Predicted category: {predicted_category}")

+--------------------+----------+
|                Text|prediction|
+--------------------+----------+
|5 symptoms of col...|       1.0|
+--------------------+----------+

Predicted category: HEALTH


In [9]:
from pyspark.sql import Row
from pyspark.sql.functions import lit

new_text = "The championship final was an intense battle as the underdog team clinched the title in the last minute of the game. Fans celebrated wildly after a stunning comeback that showcased exceptional teamwork and determination. Experts say this victory could redefine the team’s future in the league."
new_df = spark.createDataFrame([Row(Text=new_text)])

# 2. Clean the text (same steps as training data)
new_df = new_df.withColumn("title", regexp_replace(col("Text"), r"[^a-zA-Z\s]", ""))
new_df = new_df.withColumn("title", regexp_replace(col("Text"), r"\s+", " "))

# 3. Transform using the trained model pipeline
prediction_result = model.transform(new_df)

# 4. Show prediction
prediction_result.select("Text", "prediction").show()

# Get the original labels from the label indexer (it's the first stage in the pipeline)
labels = model.stages[0].labels

# Get the prediction value
predicted_index = prediction_result.select("prediction").first()["prediction"]

# Map index to actual category
predicted_category = labels[int(predicted_index)]

print(f"Predicted category: {predicted_category}")

+--------------------+----------+
|                Text|prediction|
+--------------------+----------+
|The championship ...|       4.0|
+--------------------+----------+

Predicted category: SPORTS


```
7. Gradio Libraries Installation
```

In [10]:
# Install required libraries
!pip install gradio --quiet
!pip install deep-translator --quiet

# Imports
import gradio as gr
from pyspark.sql.functions import regexp_replace, trim, lower, col
from pyspark.sql import Row
from deep_translator import GoogleTranslator

# Translation function
def translate_to_english(text: str) -> str:
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as e:
        print("Translation failed:", e)
        return text

# News category prediction function
def predict_category(headline: str):
    translated = translate_to_english(headline)

    new_df = spark.createDataFrame([Row(Text=translated)])
    new_df = new_df.withColumn("title", regexp_replace(col("Text"), r"[^a-zA-Z\s]", " "))
    new_df = new_df.withColumn("title", regexp_replace(col("Text"), r"\s+", " "))
    new_df = new_df.withColumn("title", trim(lower(col("Text"))))

    prediction_result = model.transform(new_df)
    labels = model.stages[0].labels
    predicted_index = prediction_result.select("prediction").first()["prediction"]
    predicted_category = labels[int(predicted_index)]

    return translated, predicted_category


# Clear input/output
def clear_fields():
    return "", "", ""

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

```
8. Web Interface Designing
```

In [11]:
with gr.Blocks(css="""
    #root {
        background-color: #e0e0e0 !important;
        display: flex;
        justify-content: center;
        align-items: flex-start;
        height: 100vh;
        flex-direction: column;
        padding-top: 100px;
    }

    #center-box {
        max-width: 700px;
        margin-left: auto;
        margin-right: auto;
    }

    .wide-textbox textarea {
        min-height: 100px !important;
        font-size: 24px;
    }

    .center-bold-text textarea,
    .center-bold-text input {
        text-align: center !important;
        font-weight: bold !important;
        font-size: 20px;
    }

    .large-font textarea {
        font-size: 24px !important;
    }
""", theme="soft") as demo:


    gr.Markdown(
    """
    <div style='text-align: center'>
        <h1 style='font-size: 24px; font-weight: bold;'>🧠 Multilingual News Topic Classifier</h1>
        <p style='font-size: 20px;'>Enter a news headline in any language and get its translated version and predicted topic.</p>
    </div>
    """
    )

    with gr.Column(elem_id="center-box"):
        # Input
        headline_input = gr.Textbox(
            lines=2,
            max_lines=5,
            placeholder="Type a news headline in any language...",
            label="📰 News Headline",
            elem_classes=["spaced", "wide-textbox"]
        )

        with gr.Row():
            submit_button = gr.Button("🔍 Predict", elem_classes=["spaced"])
            clear_button = gr.Button("🧹 Clear", elem_classes=["spaced"])

        # Output boxes
        translated_output = gr.Textbox(
            label="🗣️ Translated Headline",
            lines=2,
            interactive=False,
             elem_classes=["spaced","large-font"]
        )

        category_output = gr.Textbox(
            label="📌 Predicted Category",
            lines=1,
            interactive=False,
            elem_classes=["center-bold-text"]
        )

    # Actions
    submit_button.click(fn=predict_category, inputs=headline_input, outputs=[translated_output, category_output])
    clear_button.click(fn=clear_fields, inputs=[], outputs=[headline_input, translated_output, category_output])

demo.launch(share=True, inline=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://05a77222a757f12da7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


