In [64]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

In [110]:
import plotly.express as px

In [65]:
tf.random.set_seed(42)
np.random.seed(42)

In [66]:
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [67]:
fig = px.histogram(df['labels'], x='labels')
fig.show()

In [68]:
df['target'] = df['labels'].astype('category').cat.codes

In [69]:
df.sample(10)

Unnamed: 0,text,labels,target
414,UK house prices dip in November\n\nUK house pr...,business,0
420,LSE 'sets date for takeover deal'\n\nThe Londo...,business,0
1644,Harinordoquy suffers France axe\n\nNumber eigh...,sport,3
416,Barclays shares up on merger talk\n\nShares in...,business,0
1232,Campaign 'cold calls' questioned\n\nLabour and...,politics,2
1544,Wolves appoint Hoddle as manager\n\nGlenn Hodd...,sport,3
1748,Hantuchova in Dubai last eight\n\nDaniela Hant...,sport,3
1264,BAA support ahead of court battle\n\nUK airpor...,politics,2
629,'My memories of Marley...'\n\nTo mark the 60th...,entertainment,1
1043,Labour trio 'had vote-rig factory'\n\nThree La...,politics,2


In [70]:
df_train, df_test = train_test_split(df, test_size=0.3)

In [71]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df_train['text'])
Xtrain = tokenizer.texts_to_sequences(df_train['text'])
Xtest = tokenizer.texts_to_sequences(df_test['text'])

In [72]:
word2idx = tokenizer.word_index
V = len(word2idx) + 1

In [73]:
Xtrain = pad_sequences(Xtrain)
dim = Xtrain.shape[1]
Xtest = pad_sequences(Xtest, maxlen=dim)

In [88]:
input = Input(shape=(dim,))
x = Embedding(V, 50)(input)
x = Conv1D(32, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
output = Dense(5)(x)

In [89]:
model = Model(input, output, name='Text_CNN')

In [90]:
model.summary()

Model: "Text_CNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 4178)]            0         
                                                                 
 embedding_14 (Embedding)    (None, 4178, 50)          1387450   
                                                                 
 conv1d_14 (Conv1D)          (None, 4176, 32)          4832      
                                                                 
 global_max_pooling1d_14 (G  (None, 32)                0         
 lobalMaxPooling1D)                                              
                                                                 
 dense_14 (Dense)            (None, 5)                 165       
                                                                 
Total params: 1392447 (5.31 MB)
Trainable params: 1392447 (5.31 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [93]:
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [94]:
result = model.fit(
    Xtrain, df_train['target'],
    validation_data=(Xtest, df_test['target']),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [95]:
fig = px.line(result.history, x=range(1, 11), y=['loss', 'val_loss'], labels={'x': 'Epoch', 'value': 'Loss'})
fig.show()

In [96]:
fig = px.line(result.history, x=range(1, 11), y=['accuracy', 'val_accuracy'], labels={'x': 'Epoch', 'value': 'Accuracy'})
fig.show()

In [99]:
text = 'India will win the Cricket T20 World Cup'
pred = model.predict(pad_sequences(tokenizer.texts_to_sequences([text]), maxlen = dim))
np.argmax(pred)



3

In [105]:
label_mapping = dict(enumerate(df['labels'].astype('category').cat.categories))
print("Label Mapping: ", label_mapping)

Label Mapping:  {0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}


In [106]:
def predict(text):
    pred = model.predict(pad_sequences(tokenizer.texts_to_sequences([text]), maxlen = dim))
    prediction = np.argmax(pred)
    predicted_label = label_mapping[prediction]
    return predicted_label

In [107]:
predict(text)



'sport'

In [108]:
predict('Elliott takes $1.9 billion stake in Southwest, seeks to oust CEO and chair')



'business'

In [109]:
predict('Fans get emotional seeing Ranbir Kapoor\'s with Raha')



'entertainment'