In [10]:
from google.colab import files
from google.colab import drive

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
nltk.download('punkt')

from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM, 
                          Embedding, 
                          BatchNormalization,
                          Dense, 
                          TimeDistributed, 
                          Dropout, 
                          Bidirectional,
                          Flatten, 
                          GlobalMaxPool1D)

from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

import pandas as pd
import numpy as np

from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report,
    accuracy_score
)

import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
filename_EN = "/content/drive/My Drive/550FinalProject/EN_data.csv"
glove_EN = "/content/drive/MyDrive/550FinalProject/glove.6B.100d.txt"

In [13]:
def read_file(filepath):
    data=pd.read_csv(filepath)
    return data
df = read_file(filename_EN)

In [14]:
df['news'] = df['Title']+ df['Description']

In [15]:
def split_dataset(X,y, test_size = 0.2):
    """
    Prepare the training/testing dataset

    Params:

        test_size {float} -- precentage of testing dataset
    Return:
        X_train,X_test,y_train,y_test {tuple} -- split datasets

    """
    
    return train_test_split(X, y, test_size=test_size, random_state=42)

In [None]:
def remove_pun_get_token(text: str) -> list:
    """
    Remove punctuation and tokenized strings

    Params:
        text {str} -- string of text

    Return:
        {list} -- tokenized string without punctuation

    """
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(text)

In [None]:
def remove_stopword(seq: list) -> list:
    """
    Remove stopwords

    Params:
        tokens {list} -- list of tokens from the sentence

    Return:
        {list} -- list of tokens without stopwords
    """
    stop_words = set(stopwords.words('english'))
    token = []
    for word in seq:
        if word not in stop_words:
            token.append(word)
    return token

In [None]:
def preprocessing(lst: list,stopwords: bool = False):
    """
    Preprocessing the list of sentences

    Params:
        seq {list} -- list of sentences from text
    Return:
        {list} -- return the sentences after the preprocessing
    """
    preprocess = []
    for sent in lst:
        seq = remove_pun_get_token(sent)
        
        
        if stopwords == True:
            seq = remove_stopword(seq)
        
            
        preprocess.append(" ".join(seq))
        
    return preprocess

In [None]:
X = df['news']
y = df['Class Index']

In [None]:
X1 = preprocessing(X, True)

In [None]:
X = pd.DataFrame(X1)
X = X[0]
X

0         Wall St Bears Claw Back Into Black Reuters Reu...
1         Carlyle Looks Toward Commercial Aerospace Reut...
2         Oil Economy Cloud Stocks Outlook Reuters Reute...
3         Iraq Halts Oil Exports Main Southern Pipeline ...
4         Oil prices soar time record posing new menace ...
                                ...                        
127595    Around worldUkrainian presidential candidate V...
127596    Void filled ClementWith supply attractive pitc...
127597    Martinez leaves bitterLike Roger Clemens almos...
127598    5 arthritis patients Singapore take Bextra Cel...
127599    EBay gets rentalsEBay plans buy apartment home...
Name: 0, Length: 127600, dtype: object

In [None]:
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)

In [None]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype="uint8")[y]

In [None]:
X

0         Wall St Bears Claw Back Into Black Reuters Reu...
1         Carlyle Looks Toward Commercial Aerospace Reut...
2         Oil Economy Cloud Stocks Outlook Reuters Reute...
3         Iraq Halts Oil Exports Main Southern Pipeline ...
4         Oil prices soar time record posing new menace ...
                                ...                        
127595    Around worldUkrainian presidential candidate V...
127596    Void filled ClementWith supply attractive pitc...
127597    Martinez leaves bitterLike Roger Clemens almos...
127598    5 arthritis patients Singapore take Bextra Cel...
127599    EBay gets rentalsEBay plans buy apartment home...
Name: 0, Length: 127600, dtype: object

In [None]:

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X)
vocab_length = len(word_tokenizer.word_index) + 1
longest_train = max(X, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))
padded_sentences = pad_sequences(embed(X), length_long_sentence, padding='post')



In [None]:
embeddings_dictionary = {}
embedding_dim = 100
glove_file = open(glove_EN, encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions

glove_file.close()

In [None]:
def embedding_mat():
  
  word_tokenizer = Tokenizer()
  word_tokenizer.fit_on_texts(X)

  embedding_matrix = np.zeros((vocab_length, embedding_dim))
  for word, index in word_tokenizer.word_index.items():
      if index >= vocab_length:
          continue
      embedding_vector = embeddings_dictionary.get(word)
      if embedding_vector is not None:
          embedding_matrix[index] = embedding_vector
  return embedding_matrix


In [None]:
embedding_matrix = embedding_mat()

In [None]:
def BLSTM():
    
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0], 
                        output_dim=embedding_matrix.shape[1], 
                        weights = [embedding_matrix], 
                        input_length=length_long_sentence,
                        trainable=False))
    
    model.add(Bidirectional(LSTM(length_long_sentence, return_sequences = True, recurrent_dropout=0.2)))
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(length_long_sentence, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(length_long_sentence, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation = 'sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
y = to_categorical(y, 5)
y= y[:,1:]


In [None]:
X_train, X_test, y_train, y_test = split_dataset(padded_sentences,y,0.2)

In [None]:
y_test

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       ...,
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0]], dtype=uint8)

In [None]:
X_train1, X_test1, y_train1, y_test1 = split_dataset(padded_sentences,y,0.05)

In [None]:
X_train2, X_test2, y_train2, y_test2 = split_dataset(padded_sentences,y,0.5)

In [None]:
X_train3, X_test3, y_train3, y_test3 = split_dataset(padded_sentences,y,0.1)

In [None]:
#Running this might take over ten hours
if __name__ == "__main__":
  print("AG's News [Training: 80%, Testing： 20%]")
  model = BLSTM()
  model.fit(X_train,
              y_train,
              batch_size=16,
              epochs=5,
              validation_data=[X_test, y_test],
              verbose=1)
  
  print("========================================")
  print("AG's News [Training: 95%, Testing： 5%]")
  model = BLSTM()
  model.fit(X_train1,
              y_train1,
              batch_size=16,
              epochs=5,
              validation_data=[X_test1, y_test1],
              verbose=1)
  
  print("========================================")
  print("AG's News [Training: 50%, Testing： 50%]")
  model = BLSTM()
  model.fit(X_train2,
              y_train2,
              batch_size=16,
              epochs=5,
              validation_data=[X_test2, y_test2],
              verbose=1)
  print("========================================")
  print("AG's News [Training: 90%, Testing： 10%]")
  model = BLSTM()
  model.fit(X_train3,
              y_train3,
              batch_size=16,
              epochs=5,
              validation_data=[X_test3, y_test3],
              verbose=1)

AG's News [Training: 80%, Testing： 20%]
Epoch 1/5
Epoch 2/5
1377/6380 [=====>........................] - ETA: 36:12 - loss: 0.2130 - accuracy: 0.8750

In [None]:
print("========================================")
print("AG's News [Training: 95%, Testing： 5%]")
model = BLSTM()
model.fit(X_train1,
              y_train1,
              batch_size=16,
              epochs=5,
              validation_data=[X_test1, y_test1],
              verbose=1)

AG's News [Training: 95%, Testing： 5%]
Epoch 1/5
  13/7577 [..............................] - ETA: 1:21:44 - loss: 0.7800 - accuracy: 0.3750

KeyboardInterrupt: ignored

In [None]:
print("AG's News [Training: 80%, Testing： 20%]")
model = BLSTM()
model.fit(X_train,
              y_train,
              batch_size=16,
              epochs=5,
              validation_data=[X_test, y_test],
              verbose=1)

In [None]:
print("========================================")
print("AG's News [Training: 50%, Testing： 50%]")
model = BLSTM()
model.fit(X_train2,
              y_train2,
              batch_size=16,
              epochs=5,
              validation_data=[X_test2, y_test2],
              verbose=1)

AG's News [Training: 50%, Testing： 50%]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3f3c4a2250>

In [None]:
print("========================================")
print("AG's News [Training: 90%, Testing： 10%]")
from keras.utils.layer_utils import count_params

print("THUCNews [Training: 80%, Testing： 20%]")
model = BLSTM()
print(model.summary())
model.fit(X_train3,
              y_train3,
              batch_size=16,
              epochs=5,
              validation_data=[X_test3, y_test3],
              verbose=1)

AG's News [Training: 90%, Testing： 10%]
THUCNews [Training: 80%, Testing： 20%]
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 142, 100)          13563100  
                                                                 
 bidirectional_7 (Bidirectio  (None, 142, 284)         276048    
 nal)                                                            
                                                                 
 global_max_pooling1d_7 (Glo  (None, 284)              0         
 balMaxPooling1D)                                                
                                                                 
 batch_normalization_7 (Batc  (None, 284)              1136      
 hNormalization)                                                 
                                                                 
 dropout_21 (Dropout)        (None, 284) 