In [1]:
import pandas as pd

# Only the topic label and the headline are used by this notebook, so ask the
# parser for just those two columns instead of loading the whole CSV and
# discarding the rest afterwards.
# `usecols` ignores element order, hence the explicit re-selection to pin the
# column order to (topic, title).
df = pd.read_csv(
    '/kaggle/input/news-classification/withtext.csv',
    usecols=['topic', 'title'],
)[['topic', 'title']]
df.head()

Unnamed: 0,topic,title
0,SCIENCE,A closer look at water-splitting's solar fuel ...
1,SCIENCE,"An irresistible scent makes locusts swarm, stu..."
2,SCIENCE,Artificial intelligence warning: AI will know ...
3,SCIENCE,Glaciers Could Have Sculpted Mars Valleys: Study
4,SCIENCE,Perseid meteor shower 2020: What time and how ...


In [2]:
!pip install -U "tensorflow==2.8.*"
!pip install -U "tensorflow-text==2.8.*"

Collecting tensorflow==2.8.*
  Downloading tensorflow-2.8.4-cp37-cp37m-manylinux2010_x86_64.whl (497.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.9/497.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorboard<2.9,>=2.8
  Downloading tensorboard-2.8.0-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.29.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.9,>=2.8.0rc0
  Downloading keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-estimator<2.9,>=2.8
  Do

In [3]:
# Topic name -> integer class id, numbered in the order listed here.
labels_dict = {
    name: class_id
    for class_id, name in enumerate((
        'SCIENCE',
        'TECHNOLOGY',
        'HEALTH',
        'WORLD',
        'ENTERTAINMENT',
        'SPORTS',
        'BUSINESS',
        'NATION',
    ))
}

# Replace the topic names with their class ids in place. The lookup is a plain
# dict access, so a topic missing from `labels_dict` raises KeyError.
df['topic'] = df['topic'].map(labels_dict.__getitem__)
df

Unnamed: 0,topic,title
0,0,A closer look at water-splitting's solar fuel ...
1,0,"An irresistible scent makes locusts swarm, stu..."
2,0,Artificial intelligence warning: AI will know ...
3,0,Glaciers Could Have Sculpted Mars Valleys: Study
4,0,Perseid meteor shower 2020: What time and how ...
...,...,...
108769,7,PDP governors’ forum urges security agencies t...
108770,6,"In Q2-20, Apple Dominated the Premium Smartpho..."
108771,2,Coronavirus Northern Ireland: Full breakdown s...
108772,4,Paul McCartney details post-Beatles distress a...


In [4]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out half of all rows as the test set.
train_val_df, test_df = train_test_split(
    df,
    random_state=12,
    test_size=0.5,
)

# Stage 2: carve 20% of the remaining half off as validation data
# (overall split: 40% train / 10% validation / 50% test).
train_df, val_df = train_test_split(
    train_val_df,
    random_state=12,
    test_size=0.2,
)

In [5]:
import tensorflow as tf

# Wrap each split as a tf.data.Dataset of (title, topic) pairs.
train_ds, val_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices((split.title, split.topic))
    for split in (train_df, val_df, test_df)
)

In [6]:
batch_size = 64

# Group every split into batches of `batch_size` examples.
# The batched datasets are bound to the same names as the unbatched ones, so
# running this cell a second time batches the already-batched datasets again.
train_ds, val_ds, test_ds = (
    dataset.batch(batch_size) for dataset in (train_ds, val_ds, test_ds)
)

In [7]:
import tensorflow as tf
import tensorflow_hub as hub
# `text` is never referenced by name in this notebook.
# NOTE(review): presumably imported for its side effect of registering the ops
# the preprocessing model needs — confirm before removing.
import tensorflow_text as text

tf.random.set_seed(16)

# TF-Hub layer that turns raw strings into BERT inputs.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
)

# TF-Hub BERT encoder; trainable=True so its weights are updated during fit().
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=True,
)

In [8]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

# Classifier graph: raw title strings -> BERT preprocessing -> BERT encoder
# -> dropout on the pooled output -> 8-way softmax.
inputs = keras.Input(shape=(), dtype=tf.string, name='sentences')
preprocessed_text = bert_encoder(bert_preprocess(inputs))
x = layers.Dropout(0.1, name="dropout")(preprocessed_text['pooled_output'])
outputs = layers.Dense(8, activation='softmax', name="output")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

# NOTE(review): in tf.keras 2.8 the `decay` argument appears to be the legacy
# inverse-time learning-rate decay, not AdamW-style weight decay — confirm that
# shrinking the learning rate every step is what is intended here.
optimizer = Adam(learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)

# Labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sentences (InputLayer)         [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['sentences[0][0]']              
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [9]:
# Checkpoint to "bert.keras"; with save_best_only=True a save only happens when
# the monitored quantity improves, so the file holds the best epoch seen.
callbacks = [
    keras.callbacks.ModelCheckpoint(filepath="bert.keras", save_best_only=True),
]

# Fine-tune for 5 epochs, validating on val_ds after each one.
model.fit(
    train_ds,
    epochs=5,
    validation_data=val_ds,
    callbacks=callbacks,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff38c071390>

In [10]:
# Restore the checkpoint written during training; the TF-Hub layer class has to
# be supplied through custom_objects so the saved layers can be rebuilt.
model = keras.models.load_model(
    "bert.keras",
    custom_objects={"KerasLayer": hub.KerasLayer},
)

# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# accuracy.
test_metrics = model.evaluate(test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

Test acc: 0.821


In [11]:
# Reload the saved checkpoint from disk (the same load call as in the
# evaluation cell), supplying the TF-Hub layer class via custom_objects.
model = keras.models.load_model(
    "bert.keras",
    custom_objects={"KerasLayer": hub.KerasLayer},
)

In [12]:
from keras.models import Model

# Build a sub-model that maps raw titles to the BERT encoder's output dict.
# The encoder is located by the 'pooled_output' key in its output rather than
# by a positional index into model.layers, so this keeps working if layers are
# added, removed or reordered in the classifier.
encoder_outputs = [
    layer.output
    for layer in model.layers
    if isinstance(layer.output, dict) and 'pooled_output' in layer.output
]
if len(encoder_outputs) != 1:
    raise ValueError(
        "Expected exactly one layer exposing 'pooled_output', "
        f"found {len(encoder_outputs)}"
    )

XX = model.input
YY = encoder_outputs[0]
new_model = Model(XX, YY)

In [13]:
# Collect the pooled BERT embedding and the topic id of every test example.
# `embeddings` ends up as a list of per-example float lists and `topics` as a
# parallel list of integer class ids.
embeddings = []
topics = []
for titles, labels in test_ds:
    # Each iteration handles exactly one already-batched tensor, so use
    # predict_on_batch instead of predict(): predict() is meant for whole
    # datasets and repeats its per-call setup on every loop iteration.
    out = new_model.predict_on_batch(titles)
    embeddings.extend(out['pooled_output'].tolist())
    topics.extend(labels.numpy().tolist())

In [14]:
import pandas as pd

# One row per test example: the embedding values as integer-named columns,
# followed by a `topic` column holding the class id.
# Note that this rebinds `df`, which held the raw news table earlier on.
df = pd.DataFrame(embeddings).assign(topic=pd.Series(topics))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,topic
0,0.359535,-0.020203,0.841414,-0.186985,-0.459859,0.235641,-0.672958,0.465106,0.750093,-0.630278,...,0.298468,-0.899714,0.546387,-0.104769,0.902654,-0.072205,0.830608,-0.485194,0.269382,5
1,-0.209741,-0.683858,-0.500741,-0.001076,0.288752,-0.797180,0.594927,0.598423,-0.451868,-0.279323,...,0.263089,0.494455,-0.385653,-0.076397,-0.574596,-0.632067,-0.748754,-0.360298,0.389563,0
2,-0.625053,-0.047094,-0.771551,0.800215,0.232586,0.096233,0.732364,-0.090437,-0.544119,-0.998514,...,0.292976,-0.070838,0.632511,0.655152,0.091996,0.745086,-0.353510,-0.511464,0.870694,2
3,0.300871,-0.175521,0.300276,-0.127452,-0.091772,-0.165417,-0.320583,-0.234205,0.194621,0.338064,...,0.393926,0.057548,0.010258,0.576192,0.603740,0.527238,-0.266152,-0.072170,-0.275284,1
4,-0.363481,-0.155224,0.575043,0.072568,-0.238160,0.209949,-0.276912,0.587217,0.664331,-0.905945,...,-0.301591,-0.455775,0.790288,-0.116791,0.902496,-0.024990,0.114266,-0.515814,-0.049383,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54382,-0.116247,0.012665,-0.489994,0.348309,-0.028251,-0.306829,0.343133,-0.106614,-0.702484,-0.596462,...,0.910769,-0.168222,-0.324227,0.540493,-0.785821,0.655120,-0.252576,0.156391,0.142753,2
54383,0.389106,0.059344,-0.914865,0.264858,0.002697,0.450763,-0.469463,-0.249388,-0.560708,-0.973164,...,0.082738,0.148392,-0.145902,0.491189,-0.540107,0.660281,-0.765161,-0.087621,0.266228,3
54384,-0.332815,-0.179381,-0.698057,0.461850,0.089714,-0.027683,-0.087749,0.228028,-0.274518,-0.995950,...,0.725849,-0.408068,0.458943,0.839425,-0.058604,0.652260,-0.215079,-0.584427,0.634412,6
54385,-0.301674,-0.073891,-0.727883,0.468309,0.135035,-0.069433,0.119486,-0.341903,-0.146112,-0.988066,...,0.610428,-0.040717,0.158780,0.846544,-0.387610,0.717744,-0.750833,-0.060274,0.559922,7


In [15]:
# Write the embeddings table to the working directory, omitting the row index.
df.to_csv(path_or_buf='news_embeddings.csv', index=False)

In [16]:
#%cd /kaggle/working
#from IPython.display import FileLink 
#FileLink(r'news_embeddings.csv')