In [None]:
# Colab-specific upload helper; this import only resolves inside Google Colab.
from google.colab import files
# NOTE(review): presumably this is where "news_dataset_large.xlsx" (read in the
# next cell) gets uploaded - confirm. `uploaded` itself is not referenced again
# in the visible cells.
uploaded = files.upload()


In [None]:
import pandas as pd
# Workbook holding the raw news articles; the path is relative to the working directory.
DATASET_PATH = "news_dataset_large.xlsx"
df = pd.read_excel(DATASET_PATH)


In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) before preprocessing.
df.info()

In [None]:
# Show the column labels; later cells rely on "title", "text" and "category" being present.
df.columns

In [None]:
import re
import string
# Combine headline and body into one text field.
# A missing title or text is NaN in pandas, and NaN + str stays NaN, which would
# later crash clean_text on `.lower()`. Treat missing parts as empty strings and
# coerce to str so non-text cells cannot break the concatenation.
df["content"] = (
    df["title"].fillna("").astype(str)
    + " "
    + df["text"].fillna("").astype(str)
)
def clean_text(text):
    """Normalize raw article text before vectorization.

    Steps: lowercase, drop digit runs, drop ASCII punctuation, collapse all
    whitespace runs to a single space, and trim the ends.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents a missing
            cell) are treated as empty text instead of raising.

    Returns:
        The normalized string; ``""`` for missing or empty input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", text).strip()

# Normalize every combined article element-wise; the result feeds the vectorizer.
df["cleaned"] = df["content"].map(clean_text)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each category name to an integer id; `le` is kept so predictions can be
# decoded back to names later.
le = LabelEncoder()
encoded_categories = le.fit_transform(df["category"])
df["label"] = encoded_categories


In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned article text; targets are the encoded category ids.
X, y = df["cleaned"], df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
tfidf_options = {"stop_words": "english", "max_features": 5000}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary/IDF weights on the training split only, then reuse them
# unchanged for the test split so no test information leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_vec, y_train)
model


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict encoded category ids for the held-out articles.
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))

# Pass every encoded label explicitly. Without `labels`, classification_report
# infers the classes from y_test/y_pred and raises ValueError when a category is
# absent from the split, because the count no longer matches `target_names`.
all_labels = list(range(len(le.classes_)))
report = classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=le.classes_,
    zero_division=0,  # classes with no samples score 0 without a warning
)

# The report is a multi-line table; print it on its own lines so the header row
# stays aligned with the columns.
print("Classification Report:")
print(report)


In [None]:
import pickle
# Persist everything inference needs: the classifier, the fitted vectorizer, and
# the label encoder used to decode predictions.
artifacts = {
    "model.pkl": model,
    "vectorizer.pkl": vectorizer,
    "label_encoder.pkl": le,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

In [None]:
!pip install gradio

In [None]:
%%writefile app.py
import pickle
import re, string
import gradio as gr

In [None]:
# Reload the persisted artifacts. Context managers close each file handle
# deterministically instead of leaking the handle returned by a bare open().
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this notebook or another trusted source.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

In [None]:
def clean_text(text):
    """Normalize user-supplied text the same way the training data was cleaned.

    Steps: lowercase, drop digit runs, drop ASCII punctuation, collapse all
    whitespace runs to a single space, and trim the ends.

    Args:
        text: The raw text. ``None`` and NaN are treated as empty text.

    Returns:
        The normalized string; ``""`` for missing or empty input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Inside a raw string a single backslash is required: r"\\d+" would match a
    # literal backslash followed by "d", leaving digits in place and making
    # inference preprocessing diverge from training.
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", text).strip()

In [None]:
def predict_category(news_text):
    """Predict the news category for a piece of article text.

    Args:
        news_text: Raw article text as entered by the user.

    Returns:
        ``"Predicted Category: <label>"`` for usable input, or a short prompt
        asking for text when the input is missing, blank, or contains nothing
        but digits/punctuation.
    """
    empty_message = "Please enter some news text to classify."

    # Nothing to classify: avoid vectorizing an empty document, which would
    # yield an all-zero vector and a prediction that ignores the input.
    if news_text is None or not news_text.strip():
        return empty_message

    cleaned = clean_text(news_text)
    if not cleaned:
        # Input consisted solely of characters that cleaning strips out.
        return empty_message

    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)
    # Decode the integer class id back to its category name.
    label = le.inverse_transform(prediction)[0]
    return f"Predicted Category: {label}"

In [None]:
import gradio as gr

# Single-textbox UI wired to predict_category; the result is shown as plain text.
interface = gr.Interface(
    fn=predict_category,
    inputs=gr.Textbox(lines=10, placeholder="Enter your news article here..."),
    outputs="text",
    # The title previously held mojibake (UTF-8 bytes of the newspaper emoji
    # decoded with the wrong codec). The escape keeps the source ASCII-only so
    # it cannot be re-garbled; it renders as the U+1F4F0 newspaper emoji.
    title="\U0001F4F0 News Category Predictor",
    description="Paste any news article and get the predicted category!"
)

In [None]:
# Start the Gradio app for the interface built above.
# NOTE(review): share=True asks Gradio for a publicly reachable link - confirm
# that exposing this demo outside the notebook is intended.
interface.launch(share=True)

In [None]:
The film industry saw major developments today as new releases dominated the box office.

In [None]:
Fans were thrilled as teams battled for the championship in a high-stakes final.

The government announced several new measures to tackle ongoing economic challenges.