In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM

from nltk.corpus import stopwords
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, classification_report


from tensorflow.keras.layers import Dense, Embedding, Input, InputLayer, RNN, SimpleRNN, LSTM, Bidirectional, TimeDistributed, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the news dataset; the path is relative to the notebook's working directory.
# The preview below shows the columns used later: `Category` (label) and `Article` (text).
df = pd.read_csv("flipitnews-data.csv")

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Category,Article
0,Technology,tv future in the hands of viewers with home th...
1,Business,worldcom boss left books alone former worldc...
2,Sports,tigers wary of farrell gamble leicester say ...
3,Sports,yeading face newcastle in fa cup premiership s...
4,Entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# NLTK English stop-word list, extended with every ASCII punctuation character
# (`extend` iterates the `punctuation` string, adding one entry per character).
stopword = stopwords.words('english')
stopword.extend(punctuation)
def get_text_preprocessed(words):
    """Normalise one raw article string.

    Steps: lower-case the text, delete every character that is not an ASCII
    letter or whitespace, tokenise with NLTK, drop tokens found in the
    module-level `stopword` list, and lemmatise what remains.

    Parameters
    ----------
    words : str
        Raw article text.

    Returns
    -------
    str
        The surviving lemmatised tokens joined by single spaces.
    """
    # Build these once per call instead of once per token: a single
    # lemmatizer instance is reused, and set membership avoids scanning the
    # stop-word list for every token. The set is rebuilt on each call so any
    # later change to `stopword` is still picked up.
    lemmatizer = WordNetLemmatizer()
    stop_set = set(stopword)

    words = words.lower()
    words = re.sub(r'[^a-zA-Z\s]', '', words)
    tokens = word_tokenize(words)
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_set]
    return " ".join(kept)

In [5]:
# Clean every article with get_text_preprocessed; the result is stored next to the raw text.
df['processd_article'] = df['Article'].apply(get_text_preprocessed)

In [6]:
# Encode category names as integer ids. The fitted encoder is kept (instead of
# being thrown away) so that `label_encoder.classes_` / `inverse_transform`
# can map predicted ids back to category names later.
# LabelEncoder is already imported in the notebook's import cell.
label_encoder = LabelEncoder()
df['Category_cat'] = label_encoder.fit_transform(df['Category'])

In [7]:
# Character count of each cleaned article.
df['length'] = df['processd_article'].apply(len)

In [8]:
# Number of distinct encoded categories (this is also the size of the model's output layer).
len(set(df['Category_cat']))

5

In [9]:
# Preview again to confirm the derived columns (processd_article, Category_cat, length) were added.
df.head()

Unnamed: 0,Category,Article,processd_article,Category_cat,length
0,Technology,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...,4,2801
1,Business,worldcom boss left books alone former worldc...,worldcom bos left book alone former worldcom b...,0,1320
2,Sports,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester say rushed...,3,850
3,Sports,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...,3,1789
4,Entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...,1,1130


# Corpus

In [10]:
# Plain Python list of the cleaned articles, in DataFrame row order.
corpus = list(df['processd_article'])

In [11]:
# Longest cleaned article, measured in characters (`length` is len() of the joined string).
# NOTE(review): the padding cell further down uses a fixed length of 100 tokens rather than this value.
max_len = df['length'].max()
max_len

15827

In [12]:
def get_rare_words(text_col, thresh=5):
    """Measure how much of a text collection's vocabulary consists of rare words.

    A fresh Keras ``Tokenizer`` is fitted on ``text_col`` and every word whose
    total count is below ``thresh`` is treated as rare. The percentage of rare
    words is printed.

    Parameters
    ----------
    text_col : iterable of str
        Texts to analyse (the notebook passes the training split).
    thresh : int, default 5
        Words occurring fewer than ``thresh`` times are counted as rare.

    Returns
    -------
    tuple of (int, int)
        ``(cnt, tot_cnt)``: number of rare words and total vocabulary size.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(list(text_col))

    word_counts = text_tokenizer.word_counts
    tot_cnt = len(word_counts)
    cnt = sum(1 for count in word_counts.values() if count < thresh)

    # An empty vocabulary (no texts, or nothing left after filtering) would
    # otherwise raise ZeroDivisionError; report 0% in that case.
    rare_pct = (cnt / tot_cnt) * 100 if tot_cnt else 0.0
    print("% of rare words in vocabulary:", rare_pct)

    return cnt, tot_cnt

In [13]:
# Hold out 20% of the articles for validation; the fixed seed keeps the
# shuffled split identical between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    np.array(corpus),
    np.array(df["Category_cat"]),
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

tuple(split.shape for split in (x_train, x_valid, y_train, y_valid))

((1780,), (445,), (1780,), (445,))

In [14]:
# Rare-word statistics are computed on the training split only; both counts
# are used below to size the tokenizer vocabulary.
x_train_cnt, x_train_tot_cnt = get_rare_words(text_col=x_train)

% of rare words in vocabulary: 66.02579604997985


In [29]:
# Fixed sequence length (in tokens) of every padded article.
maxlen = 100

# Fit a tokenizer on the training split only, limiting the vocabulary to the
# words that are not rare (total words minus rare words).
x_tokenizer = Tokenizer(num_words=x_train_tot_cnt - x_train_cnt)
x_tokenizer.fit_on_texts(list(x_train))

# Convert text sequences to integer sequences.
# TODO: persist `x_tokenizer` together with the trained model so inference
# uses the same word index.
x_tr_seq = x_tokenizer.texts_to_sequences(x_train)
x_val_seq = x_tokenizer.texts_to_sequences(x_valid)

# Zero-pad at the end up to `maxlen`. Longer articles are truncated by
# pad_sequences' default `truncating='pre'`, i.e. only their last `maxlen`
# tokens are kept.
x_tr = pad_sequences(x_tr_seq, maxlen=maxlen, padding='post')
x_val = pad_sequences(x_val_seq, maxlen=maxlen, padding='post')

# Size handed to the Embedding layer: an upper bound on the token ids the
# tokenizer can emit (id 0 is reserved for padding).
x_voc = x_tokenizer.num_words + 1

print("Size of vocabulary in X = {}".format(x_voc))

Size of vocabulary in X = 8430


In [30]:
# Sanity check: (number of training articles, padded sequence length).
x_tr.shape

(1780, 100)

In [31]:
# Seed TensorFlow before building the model so weight initialisation and
# dropout are repeatable between runs (as far as the backend allows).
tf.random.set_seed(0)

Embedding_vector_size = 500
num_classes = len(set(df['Category_cat']))

# Embedding -> two stacked LSTMs -> dropout -> softmax over the categories.
# `input_length` reuses `maxlen` from the padding cell so the model always
# matches the padded sequence length.
model = Sequential([
    Embedding(x_voc, Embedding_vector_size, input_length=maxlen, trainable=True),
    LSTM(15, return_sequences=True, dropout=0.4),   # emits the full sequence for the next LSTM
    LSTM(10, return_sequences=False, dropout=0.6),  # emits only the final state
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

In [32]:
# NOTE(review): the saved output below lists a Dropout layer between the two
# LSTM layers, which does not match the layer order in the model definition
# above -- it appears to come from an earlier run. Re-run with
# Restart Kernel -> Run All to refresh it.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 500)          4215000   
                                                                 
 lstm_4 (LSTM)               (None, 100, 15)           30960     
                                                                 
 dropout_2 (Dropout)         (None, 100, 15)           0         
                                                                 
 lstm_5 (LSTM)               (None, 10)                1040      
                                                                 
 dense_2 (Dense)             (None, 5)                 55        
                                                                 
Total params: 4,247,055
Trainable params: 4,247,055
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Labels are integer class ids, hence the sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [34]:
# Stop once validation loss has not improved for 10 epochs, and roll the model
# back to the weights of the best epoch. Without `restore_best_weights=True`
# the model would keep the weights from the last (worse) epoch.
call_backs = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [35]:
# Train with early stopping; `callbacks` is documented as a list of callbacks,
# so the single EarlyStopping instance is wrapped in one.
hist = model.fit(
    x_tr,
    y_train,
    validation_data=(x_val, y_valid),
    batch_size=32,
    epochs=50,
    verbose=1,
    callbacks=[call_backs],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
