In [1]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

In [2]:
# Load the BBC news corpus into a DataFrame (pandas is imported in the first cell)
# and display it as the cell output.
csv_path = 'bbc-text.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [3]:
# Use the first 80% of the rows for training and the remaining 20% for testing.
# The split is purely positional: rows are taken in file order, with no shuffling.
train_size = int(len(data) * .80)
print(len(data),train_size)

# Slice the frame once, then pick the text and label columns from each part.
train_rows = data.iloc[:train_size]
test_rows = data.iloc[train_size:]
train_posts, train_tags = train_rows['text'], train_rows['category']
test_posts, test_tags = test_rows['text'], test_rows['category']

2225 1780


In [4]:
# The number of distinct categories in the full dataset sets the output-layer width.
categories = set(data['category'])
num_labels = len(categories)
print(categories,len(categories))

# Hyper-parameters: vocabulary cap for the tokenizer, and the batch size that is
# later passed to model.fit and model.evaluate.
vocab_size = 15000
batch_size = 100

# Build the vocabulary from the training posts only, then turn every post into a
# TF-IDF row vector with vocab_size columns.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)
x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
print(x_train.shape)
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

# One-hot encode the category names; the encoder is fitted on the training tags
# and then reused unchanged for the test tags.
encoder = LabelBinarizer()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)
y_train


{'sport', 'tech', 'business', 'politics', 'entertainment'} 5
(1780, 15000)


array([[0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0]])

In [None]:

# Feed-forward classifier over the TF-IDF vectors: three hidden stages of
# Dense -> ReLU -> Dropout, followed by a softmax output layer.
hidden_units = (512, 256, 128)
dropout_rate = 0.3

model = Sequential()
for depth, units in enumerate(hidden_units):
    if depth == 0:
        # Only the first layer declares the input width (one feature per vocabulary slot).
        model.add(Dense(units, input_shape=(vocab_size,)))
    else:
        model.add(Dense(units))
    model.add(Activation('relu'))     # rectifier: negative pre-activations become 0
    model.add(Dropout(dropout_rate))  # randomly zeroes this fraction of units during training

# Output layer: one unit per class; softmax turns the scores into class probabilities.
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()

# Categorical cross-entropy pairs with the one-hot targets and the softmax output;
# Adam is the optimizer and accuracy is the metric reported during training.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the encoded training set.
fit_options = dict(
    batch_size=batch_size,
    epochs=90,
    verbose=1,             # print a progress bar for every epoch
    validation_split=0.1,  # fraction of the *training* data held out for validation
)
history = model.fit(x_train, y_train, **fit_options)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               7680512   
                                                                 
 activation (Activation)     (None, 512)               0         
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 activation_1 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               3

In [None]:
# Evaluate on the held-out test set. The model was compiled with
# metrics=['accuracy'], so evaluate returns [loss, accuracy].
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])  # index 0 is the loss, index 1 is the accuracy metric

# Class names in the column order used by the label encoder, so the argmax of a
# prediction row can be mapped back to a category name.
text_labels = encoder.classes_

print("These are categories",text_labels)

# Spot-check a window of test posts. The whole window is predicted in a single
# call rather than one model.predict call per row, and slicing keeps the window
# inside the test set when it holds fewer than `last` rows.
first, last = 5, 20
window_predictions = model.predict(x_test[first:last])
for offset, row in enumerate(window_predictions):
    i = first + offset
    prediction = row[np.newaxis, :]  # shape (1, num_labels), the same shape a single-row predict returns
    predicted_label = text_labels[np.argmax(prediction[0])]
    print(test_posts.iloc[i])
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)
    print(prediction)

In [None]:
# Show the raw text of the article that will be classified. The context manager
# closes the file even if reading raises.
with open('news.txt', encoding="utf8") as file:
    c = file.read()
print(c)

In [None]:


 
# Class names indexed the same way as the model's output columns. They come from
# the fitted encoder instead of a hand-typed list, so the order cannot drift from
# the one used to build y_train.
labels = np.array(encoder.classes_)

test_files = ['news.txt']

# Read every file explicitly as UTF-8 rather than with the platform default encoding.
x_data = [Path(t_f).read_text(encoding="utf8") for t_f in test_files]
x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

# zip keeps each feature row paired with the file it came from, and the
# percentages are printed per file. After the loop, `prediction` holds the
# output for the last file in test_files.
for t_f, x_t in zip(test_files, x_tokenized):
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    print("File ->", t_f, "Predicted label: " + predicted_label)
    print("Prediction in percentage = \n",prediction*100)

In [None]:
# Prediction Visualization
# Bar chart of the class probabilities (in percent) held in `prediction`,
# with one bar per entry of `labels`.
import plotly.graph_objects as go
classes = labels.tolist()
pred = (prediction[0] * 100).tolist()
fig = go.Figure([go.Bar(x=classes, y=pred)])
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="Predicted class probabilities",
    xaxis_title="Category",
    yaxis_title="Probability (%)",
)
fig.show()


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
# classification report
# Collapse the one-hot targets and the predicted probabilities to class indices.
y_test_arg = np.argmax(y_test, axis=1)
Y_pred = np.argmax(model.predict(x_test), axis=1)

print('Classification Report')
# Index k corresponds to encoder.classes_[k]. Passing labels together with
# target_names makes the report show category names instead of 0..n-1, and
# keeps the two lists aligned even if a class is absent from this test split.
class_indices = list(range(len(encoder.classes_)))
report = classification_report(y_test_arg, Y_pred,
                               labels=class_indices,
                               target_names=encoder.classes_)
cm = report  # original variable name, kept as an alias; it holds the text report, not a confusion matrix
print(report)