In [36]:
import pickle
import pandas as pd
import numpy as np
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import tensorflow as tf
from tensorflow.keras import preprocessing as kprocessing

In [2]:
word2vec_model = api.load("word2vec-google-news-300")

# Read Dataset

In [3]:
df = pd.read_csv('Datasets/ag_news_csv/ag_news_merged.csv', usecols = ['Topic', 'Description'])

In [4]:
df.head()

Unnamed: 0,Topic,Description
0,Business news,Reuters - Private investment firm Carlyle Grou...
1,Business news,Reuters - Soaring crude prices plus worries\ab...
2,Business news,Reuters - Authorities have halted oil export\f...
3,Business news,"AFP - Tearaway world oil prices, toppling reco..."
4,Business news,Reuters - Stocks ended slightly higher on Frid...


# Dataset Summary

In [5]:
print('Number of rows: ' + str(len(df)))
N_Topics = len(df["Topic"].unique())
print("Number of Topics: " + str(N_Topics))
print('Topics:')
print(df["Topic"].value_counts())

Number of rows: 127598
Number of Topics: 4
Topics:
Science and technology news    31900
Sports news                    31900
World news                     31900
Business news                  31898
Name: Topic, dtype: int64


In [6]:
Descriptions = df['Description']
Descriptions

0         Reuters - Private investment firm Carlyle Grou...
1         Reuters - Soaring crude prices plus worries\ab...
2         Reuters - Authorities have halted oil export\f...
3         AFP - Tearaway world oil prices, toppling reco...
4         Reuters - Stocks ended slightly higher on Frid...
                                ...                        
127593    Ukrainian presidential candidate Viktor Yushch...
127594    With the supply of attractive pitching options...
127595    Like Roger Clemens did almost exactly eight ye...
127596    SINGAPORE : Doctors in the United States have ...
127597    EBay plans to buy the apartment and home renta...
Name: Description, Length: 127598, dtype: object

In [7]:
Topics = df['Topic']
Topics

0         Business news
1         Business news
2         Business news
3         Business news
4         Business news
              ...      
127593       World news
127594      Sports news
127595      Sports news
127596    Business news
127597    Business news
Name: Topic, Length: 127598, dtype: object

In [8]:
Str2Bin = LabelBinarizer()
Str2Bin.fit(Topics)
Topic_Bin = Str2Bin.transform(Topics)
Topic_Bin

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])

In [9]:
Desc_train, Desc_test, Topic_train, Topic_test = train_test_split(Descriptions, Topic_Bin, test_size = 0.2, random_state = 42, stratify = Topics)
Desc_train, Desc_valid, Topic_train, Topic_valid = train_test_split(Desc_train, Topic_train, test_size = 0.2, random_state = 42, stratify = Topic_train)

In [10]:
len(Desc_train), len(Desc_valid), len(Desc_test)

(81662, 20416, 25520)

In [11]:
tokenizer = kprocessing.text.Tokenizer(lower = True, split = ' ',  oov_token = "<pad>", filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(Desc_train)
desc_vocab = tokenizer.word_index

In [12]:
desc_vocab

{'<pad>': 1,
 'the': 2,
 'a': 3,
 'to': 4,
 'of': 5,
 'in': 6,
 'and': 7,
 'on': 8,
 'for': 9,
 '39': 10,
 's': 11,
 'that': 12,
 'with': 13,
 'as': 14,
 'its': 15,
 'at': 16,
 'said': 17,
 'is': 18,
 'by': 19,
 'it': 20,
 'has': 21,
 'new': 22,
 'an': 23,
 'from': 24,
 'reuters': 25,
 'his': 26,
 'will': 27,
 'was': 28,
 'after': 29,
 'have': 30,
 'their': 31,
 'be': 32,
 'two': 33,
 'are': 34,
 'quot': 35,
 'us': 36,
 'over': 37,
 'year': 38,
 'first': 39,
 'ap': 40,
 'he': 41,
 'but': 42,
 'gt': 43,
 'lt': 44,
 'this': 45,
 'more': 46,
 'monday': 47,
 'tuesday': 48,
 'one': 49,
 'wednesday': 50,
 'up': 51,
 'thursday': 52,
 'inc': 53,
 'company': 54,
 'friday': 55,
 'world': 56,
 'than': 57,
 '1': 58,
 'u': 59,
 'last': 60,
 'they': 61,
 'against': 62,
 'yesterday': 63,
 'who': 64,
 'york': 65,
 'about': 66,
 'were': 67,
 'not': 68,
 'into': 69,
 'out': 70,
 'three': 71,
 'been': 72,
 '2': 73,
 'president': 74,
 'had': 75,
 'million': 76,
 'corp': 77,
 'when': 78,
 'oil': 79,
 'week

In [13]:
max_text_length = 200
Desc_train_seq = kprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(Desc_train), maxlen = max_text_length)
Desc_valid_seq = kprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(Desc_valid), maxlen = max_text_length)
Desc_test_seq = kprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(Desc_test), maxlen = max_text_length)
print('Shape: ',Desc_train_seq.shape)
print('Shape: ',Desc_valid_seq.shape)
print('Shape: ',Desc_test_seq.shape)

Shape:  (81662, 200)
Shape:  (20416, 200)
Shape:  (25520, 200)


In [14]:
list_of_keys = list(desc_vocab.keys())
vector_matrix = np.zeros((len(desc_vocab) + 1, 300))
for i in range(len(desc_vocab)):
    word = list_of_keys[i]
    if word in word2vec_model:
        vector_matrix[i + 1,:] = word2vec_model[word]

In [15]:
vector_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.08007812,  0.10498047,  0.04980469, ...,  0.00366211,
         0.04760742, -0.06884766],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00604248,  0.34179688, -0.05102539, ..., -0.30273438,
         0.11962891,  0.02600098]])

In [16]:
model = tf.keras.Sequential([
      tf.keras.layers.Input(shape = Desc_train_seq[0,:].shape),
      tf.keras.layers.Embedding(len(desc_vocab) + 1, vector_matrix.shape[1], weights=[vector_matrix], trainable=False),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout = 0.4)),
      tf.keras.layers.Dropout(0.4),
      tf.keras.layers.Dense(32, activation='relu'),
      tf.keras.layers.Dense(N_Topics,activation='softmax')
  ])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.01), loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [17]:
history = model.fit(Desc_train_seq, Topic_train, batch_size = 32, shuffle = True, epochs = 5, validation_data = (Desc_valid_seq, Topic_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
print('Accuracy Training data: ' + str(history.history['accuracy'][-1] * 100) + '%')
print('Accuracy Valid data: ' + str(history.history['val_accuracy'][-1] * 100) + '%')

Accuracy Training data: 88.33116888999939%
Accuracy Valid data: 89.20944333076477%


In [19]:
Loss, Accuracy = model.evaluate(Desc_test_seq, Topic_test)
print("Loss = " + str(Loss))
print("Accuracy = " + str(Accuracy))

Loss = 0.3185107707977295
Accuracy = 0.8939655423164368


In [24]:
pred = model.predict(Desc_test_seq)



In [25]:
pred

array([[9.5093894e-01, 2.2049632e-02, 2.0817362e-03, 2.4929633e-02],
       [5.2450038e-04, 8.1892213e-04, 9.8236644e-01, 1.6290031e-02],
       [2.1706985e-02, 2.1987328e-02, 9.1062272e-01, 4.5682963e-02],
       ...,
       [7.5586528e-01, 1.6558369e-01, 1.8599737e-02, 5.9951369e-02],
       [1.4144930e-01, 4.6943486e-02, 7.3523810e-03, 8.0425483e-01],
       [9.8374933e-02, 8.8084257e-01, 1.8764505e-03, 1.8906098e-02]],
      dtype=float32)

In [26]:
Predicted_Classes = Str2Bin.inverse_transform(pred)
Predicted_Classes[0]

'Business news'

In [27]:
Expected_Classes = Str2Bin.inverse_transform(Topic_test)
Expected_Classes[0]

'Business news'

In [42]:
string = ["France companies are losing their profits in the market"]

In [43]:
seq = kprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(string))
seq

array([[ 608,  205,   34, 1145,   31,  764,    6,    2,  132]])

In [44]:
prediction = model.predict(seq)
confidence_score = np.max(prediction)
prediction



array([[0.6424319 , 0.13838248, 0.04805578, 0.17112987]], dtype=float32)

In [45]:
result = Str2Bin.inverse_transform(prediction)
result, confidence_score

(array(['Business news'], dtype='<U27'), 0.6424319)

In [38]:
with open('AG_News_Str2Bin.pickle', 'wb') as handle:
    pickle.dump(Str2Bin, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
with open('AG_News_Tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [35]:
model.save('Text_Classification_Model.h5')

In [41]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 300)          16545000  
                                                                 
 bidirectional (Bidirectiona  (None, 64)               85248     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 4)                 132       
                                                                 
Total params: 16,632,460
Trainable params: 87,460
Non-trainable params: 16,545,000
_______________________________________