In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
df = pd.read_json("google_news_dataset.json")

In [4]:
df.head(5)

Unnamed: 0,Source,Headline,Category
0,Rediff.com,Be Wary Of These Afghans!,sports
1,Cricbuzz,"Bangladesh's top-order a big concern, admits H...",sports
2,ESPNcricinfo,Shahidi on India's spin challenge: 'We play be...,sports
3,News18,"Shubman Gill to Fly to Ahmedabad, Recovery on ...",sports
4,BollywoodShaadis.com,Shubman Gill Received A 'Get Well Soon' Tweet ...,sports


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1517 entries, 0 to 1516
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    1517 non-null   object
 1   Headline  1517 non-null   object
 2   Category  1517 non-null   object
dtypes: object(3)
memory usage: 47.4+ KB


In [6]:
frequency = []

for x in list(df.groupby('Source')):
    
    frequency.append([x[0],len(x[1])])

frequency.sort(key=lambda x: x[1])
frequency[-5:]

[['The Indian Express', 34],
 ['India Today', 37],
 ['Sportskeeda', 42],
 ['Mint', 44],
 ['Hindustan Times', 58]]

In [7]:
df = df.sample(frac=1)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(df[['Headline']], df['Category'], test_size=0.3, random_state=42)

In [9]:
le = LabelEncoder()
enc = OneHotEncoder(sparse_output=False)
le.fit(df['Category'])
enc.fit(le.transform(y_train).reshape(-1, 1))
y_train = enc.transform(le.transform(y_train).reshape(-1, 1))
y_val = enc.transform(le.transform(y_val).reshape(-1, 1))
le.classes_

array(['business', 'entertainment', 'environment', 'health', 'politics',
       'space', 'sports', 'technology'], dtype=object)

In [10]:
bert_tokenizer = hub.KerasLayer("https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-12-h-768-a-12/versions/2",trainable=True)
bert_preprocessor = hub.KerasLayer("https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3")

In [11]:
text_input = tf.keras.layers.Input(shape=(),dtype=tf.string)

text_preprocessed = bert_preprocessor(text_input)

outputs = bert_tokenizer(text_preprocessed)

hidden1 = tf.keras.layers.Dense(units=768,activation='relu')(outputs['sequence_output'])

hidden2 = tf.keras.layers.LSTM(units=768,activation='tanh')(hidden1)

hidden3 = tf.keras.layers.Dropout(0.3)(hidden2)

hidden4 = tf.keras.layers.Dense(len(df['Category'].unique()), activation='softmax')(hidden3)

model = tf.keras.Model(inputs=[text_input],outputs=[hidden4])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_mask': (Non  0           ['input_1[0][0]']                
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [12]:
model.compile(
  optimizer='adam',
  loss=tf.keras.losses.categorical_crossentropy,
  metrics=['accuracy'],
)

In [14]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=2,verbose=1)

In [16]:
with tf.device('/GPU:0'):
    model.fit(x=X_train, y=y_train,epochs=20,validation_data=(X_val,y_val), batch_size=16, callbacks = [early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 7: early stopping


In [17]:
model.save('news_headlines.keras')

In [18]:
X = df.sample(100)
X_test = X[['Headline']]
y_test = enc.transform(le.transform(X['Category']).reshape(-1, 1)) 

In [19]:
model.evaluate(X_test,y_test)



[0.21275611221790314, 0.9599999785423279]

In [20]:
prediction = model.predict(['MEA sets up 24-hour control room in view of Israel-Palestine conflict'])



In [21]:
for p in prediction:
    
    print(le.classes_[p.argmax()])

politics
