In [1]:
# !unzip /content/News_Category_Dataset_v3.json.zip

Archive:  /content/News_Category_Dataset_v3.json.zip
  inflating: News_Category_Dataset_v3.json  


In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

In [None]:
# Load dataset (text, category)
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [None]:
df.shape

(209527, 6)

In [None]:
len(df.category.unique())

42

In [None]:
df['category'].value_counts().max()

35602

In [None]:
df['category'].value_counts().min()

1014

In [None]:
df.isnull().sum()

Unnamed: 0,0
link,0
headline,0
category,0
short_description,0
authors,0
date,0


In [None]:
df.category.unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [None]:
df['category'].max()

'WORLDPOST'

In [None]:
df['category'].min()

'ARTS'

In [None]:
df['category'].value_counts().max()

35602

In [None]:
allowed = ['SCIENCE','SPORTS','TECH','BUSINESS','EDUCATION']
df = df[df['category'].isin(allowed)]

In [None]:
df['category'].value_counts().max()

5992

In [None]:
df['category'].value_counts().min()

1014

In [None]:
df = (
    df.sort_values('category')  # optional, for clean order
      .groupby('category', group_keys=False)
      .head(df['category'].value_counts().min())
      .reset_index(drop=True)
)

In [None]:
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,1014
EDUCATION,1014
SCIENCE,1014
SPORTS,1014
TECH,1014


In [None]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffingtonpost.com/entry/outdoor-w...,Outdoor Workspace: The Next Workplace Frontier,BUSINESS,If you could take a vacation to anywhere in th...,"Amanda Schneider, ContributorI write about the...",2015-05-19
1,https://www.huffingtonpost.com/entry/open-carr...,Pro-Gun Group Backs Down After Chipotle Rally ...,BUSINESS,,Ben Hallman,2014-05-23
2,https://www.huffingtonpost.com/entry/volkswage...,Volkswagen's Emissions Scandal Just Got So Muc...,BUSINESS,The auto giant admitted to a cheating on a CO2...,"David Rising, AP",2015-11-04
3,https://www.huffingtonpost.com/entry/gm-recall...,Mother Finds Out Years Later That GM May Be Re...,BUSINESS,,Kira Brekke,2014-05-23
4,https://www.huffingtonpost.com/entry/abercromb...,Abercrombie To Get Slightly Less Obnoxious,BUSINESS,,,2014-05-22


In [None]:
df = df.drop(columns=['link', 'authors', 'date'])

In [None]:
df.head(2)

Unnamed: 0,headline,category,short_description
0,Outdoor Workspace: The Next Workplace Frontier,BUSINESS,If you could take a vacation to anywhere in th...
1,Pro-Gun Group Backs Down After Chipotle Rally ...,BUSINESS,


**Joining Data**

In [None]:
df['text'] = df['headline'] + " " + df['short_description']

**Encode labels**

In [None]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

**Splitting the dataset**

In [None]:
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

**Load TF Hub embedding model**

In [None]:
embedding_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(embedding_url)  # Load raw TF function

**Convert text → vectors**

In [None]:
def text_to_vector(texts):
    # Convert to list of strings and embed
    vectors = embed(tf.constant(texts))
    return vectors.numpy()

X_train_vec = text_to_vector(X_train_text.tolist())
X_test_vec = text_to_vector(X_test_text.tolist())

**Showing vector**

In [None]:
text_to_vector(["Nahid Hasan"]).shape

(1, 512)

**Preprocess vectors**

In [None]:
scaler = StandardScaler()
X_train_vec = scaler.fit_transform(X_train_vec)
X_test_vec = scaler.transform(X_test_vec)

**Shwing the classes**

In [None]:
label_encoder.classes_

array(['BUSINESS', 'EDUCATION', 'SCIENCE', 'SPORTS', 'TECH'], dtype=object)

 Build ANN model

In [None]:
model = tf.keras.Sequential([

    tf.keras.layers.Input(shape=(X_train_vec.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

**Train**

In [None]:
history = model.fit(
    X_train_vec, y_train,
    validation_data=(X_test_vec, y_test),
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9713 - loss: 0.1079 - val_accuracy: 0.8629 - val_loss: 0.5396
Epoch 2/10
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9777 - loss: 0.0804 - val_accuracy: 0.8580 - val_loss: 0.5609
Epoch 3/10
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9841 - loss: 0.0669 - val_accuracy: 0.8550 - val_loss: 0.5964
Epoch 4/10
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9867 - loss: 0.0617 - val_accuracy: 0.8570 - val_loss: 0.6217
Epoch 5/10
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9893 - loss: 0.0504 - val_accuracy: 0.8580 - val_loss: 0.6312
Epoch 6/10
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9890 - loss: 0.0478 - val_accuracy: 0.8570 - val_loss: 0.6629
Epoch 7/10
[1m127/127[0m 

**Evaluate**

In [None]:
loss, acc = model.evaluate(X_test_vec, y_test)
print(f"Test Accuracy: {acc:.2f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8333 - loss: 0.8184
Test Accuracy: 0.85


**Predict function**

In [None]:
def predict_category(text):
    vec = text_to_vector([text])
    vec = scaler.transform(vec)
    pred = model.predict(vec)
    return label_encoder.inverse_transform([np.argmax(pred)])[0]

print(predict_category("Bitcoin price surges after new regulations"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
BUSINESS


**Testing the model**

In [None]:
print(predict_category("Brewers win ninth straight with Collins’ walk-off homer; Verlander strikes out 3,500th batter; USC and Penn State defenders named to Bednarik Award watch list."))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
SPORTS


**Save the model**

In [None]:
# model.save("./news_model.h5")



**Save classes to JSON**

In [None]:
# import json

# # Save classes to JSON
# with open('label_classes.json', 'w') as f:
#     json.dump(label_encoder.classes_.tolist(), f)