### Packages

In [81]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import importlib, import_ipynb
import data_clean_order_text as data
import tensorflow as tf
from tensorflow.data import experimental
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split, cross_val_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# model building imports
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.layers import Conv1D, SimpleRNN, Bidirectional, MaxPooling1D, GlobalMaxPool1D, LSTM, GRU
from keras.models import Sequential
from keras.regularizers import L1L2

%matplotlib inline

# matplotlib defaults
plt.style.use("ggplot")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

import warnings 
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

### Loading ordered data from data_clean_order_text.ipynb

In [22]:
%store -r ordered_super_alpha_text
%store -r ordered_class_alpha_text
%store -r ordered_sem_clusters_desc_text
%store -r ordered_sem_clusters_asc_text
%store -r ordered_sem_clusters_shuffled_per_superclass_text
%store -r test_df

In [62]:
# Experimental data orderings on headlines TRAIN EXAMPLES
X1_train = ordered_super_alpha_text['cleaned_headline']
X2_train = ordered_class_alpha_text['cleaned_headline']
X3_train = ordered_sem_clusters_desc_text['cleaned_headline']
X4_train = ordered_sem_clusters_asc_text['cleaned_headline']
X5_train = ordered_sem_clusters_shuffled_per_superclass_text['cleaned_headline']

# Experimental data orderings on short_description #TODO

# TRAIN LABELS
Y1_train = ordered_super_alpha_text['class']
Y2_train = ordered_class_alpha_text['class']
Y3_train = ordered_sem_clusters_desc_text['class']
Y4_train = ordered_sem_clusters_asc_text['class']
Y5_train = ordered_sem_clusters_shuffled_per_superclass_text['class']

# TEST EXAMPLES
X_test = test_df['cleaned_headline']
Y_test = test_df['class']

In [63]:
Y1_train[:100]

0      QUEER VOICES
1             WOMEN
2      BLACK VOICES
3     LATINO VOICES
4      QUEER VOICES
          ...      
95     QUEER VOICES
96     QUEER VOICES
97     QUEER VOICES
98     QUEER VOICES
99     BLACK VOICES
Name: class, Length: 100, dtype: object

In [64]:
Y2_train[:100]

0     ARTS
1     ARTS
2     ARTS
3     ARTS
4     ARTS
      ... 
95    ARTS
96    ARTS
97    ARTS
98    ARTS
99    ARTS
Name: class, Length: 100, dtype: object

In [65]:
Y3_train[:100]

44360     GENERAL POLITICS
51103     GENERAL POLITICS
21539     GENERAL POLITICS
49127     GENERAL POLITICS
32623     GENERAL POLITICS
                ...       
1901      GENERAL POLITICS
11229     GENERAL POLITICS
49088     GENERAL POLITICS
35907     GENERAL POLITICS
117407    GENERAL POLITICS
Name: class, Length: 100, dtype: object

In [66]:
Y4_train[:100]

182268    CULTURE & ARTS
140009    CULTURE & ARTS
181708    CULTURE & ARTS
196465    CULTURE & ARTS
137307    CULTURE & ARTS
               ...      
669       CULTURE & ARTS
146767    CULTURE & ARTS
153814    CULTURE & ARTS
182259    CULTURE & ARTS
165747    CULTURE & ARTS
Name: class, Length: 100, dtype: object

In [67]:
Y5_train[:100]

14822    WORLD NEWS
25771    WORLD NEWS
5363     WORLD NEWS
1254     WORLD NEWS
8672     WORLD NEWS
            ...    
30909    WORLD NEWS
26292    WORLD NEWS
23705    WORLD NEWS
30575    WORLD NEWS
6264     WORLD NEWS
Name: class, Length: 100, dtype: object

In [68]:
Y_test[:100]

102504    GENERAL POLITICS
137099        FOOD & DRINK
162103            WEDDINGS
201412           PARENTING
172974        BLACK VOICES
                ...       
148578         ENVIRONMENT
86540                GREEN
130890               MEDIA
38289             RELIGION
161439            WEDDINGS
Name: class, Length: 100, dtype: object

## Tokenization and Vectorization


### One-hot encoding and indexing of train and test data

In [69]:
# TEST DATA

# one hot encoding using keras tokenizer and pad sequencing
encoder = LabelEncoder()
Y_test = encoder.fit_transform(Y_test)
print("shape of input data: ", X_test.shape)
print("shape of target variable: ", Y_test.shape)

tokenizer = Tokenizer(num_words=100000, oov_token='<00V>') 
tokenizer.fit_on_texts(X_test) # build the word index
# padding X_test text input data
test_seq = tokenizer.texts_to_sequences(X_test) # converts strinfs into integer lists
test_padseq = pad_sequences(test_seq, maxlen=20) # pads the integer lists to 2D integer tensor 

word_index = tokenizer.word_index
max_words = 150000  # total number of words to consider in embedding layer
total_words = len(word_index)
maxlen = 20 # max length of sequence 
Y_test = to_categorical(Y_test, num_classes=42)
print("Length of word index:", total_words)

shape of input data:  (41905,)
shape of target variable:  (41905,)
Length of word index: 27359


# EXPERIMENT ORDER 1

In [70]:
# ORDER 1 TRAIN DATA
X_train = X1_train
Y_train = Y1_train

# one hot encoding using keras tokenizer and pad sequencing
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("shape of input data: ", X_train.shape)
print("shape of target variable: ", Y_train.shape)

tokenizer = Tokenizer(num_words=100000, oov_token='<00V>') 
tokenizer.fit_on_texts(X_train) # build the word index
# padding X_train text input data
train_seq = tokenizer.texts_to_sequences(X_train) # converts strinfs into integer lists
train_padseq = pad_sequences(train_seq, maxlen=20) # pads the integer lists to 2D integer tensor 

word_index = tokenizer.word_index
max_words = 150000  # total number of words to consider in embedding layer
total_words = len(word_index)
maxlen = 20 # max length of sequence 
Y_train = to_categorical(Y_train, num_classes=42)
print("Length of word index:", total_words)

shape of input data:  (167616,)
shape of target variable:  (167616,)
Length of word index: 52548


### Model training using embedding layer and RNN (Baseline)

In [71]:
# basline model using embedding layers and simpleRNN
model = Sequential()
model.add(Embedding(total_words, 70, input_length=maxlen))
model.add(Bidirectional(SimpleRNN(64, dropout=0.1, recurrent_dropout=0.20, activation='tanh', return_sequences=True)))
model.add(Bidirectional(SimpleRNN(64, dropout=0.1, recurrent_dropout=0.30, activation='tanh', return_sequences=True)))
model.add(SimpleRNN(32, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(42, activation='softmax'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 20, 70)            3678360   
                                                                 
 bidirectional_8 (Bidirectio  (None, 20, 128)          17280     
 nal)                                                            
                                                                 
 bidirectional_9 (Bidirectio  (None, 20, 128)          24704     
 nal)                                                            
                                                                 
 simple_rnn_14 (SimpleRNN)   (None, 32)                5152      
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 42)               

In [89]:
model.compile(optimizer='rmsprop',
            loss='categorical_crossentropy',
            metrics=['accuracy']
            )
#SETUP A EARLY STOPPING CALL and model check point API
earlystopping = keras.callbacks.EarlyStopping(monitor='accuracy',
                                             patience=5,
                                              verbose=1,
                                              mode='min'
                                             )
checkpointer = ModelCheckpoint(filepath='bestvalue',moniter='val_loss', verbose=0, save_best_only=True)
callback_list = [checkpointer, earlystopping]
callback_list = [earlystopping]

In [90]:
# fit model to the data
history = model.fit(train_padseq, Y_train, 
                   batch_size=128, 
                    epochs=15 ,
                    validation_split=0.2
                   )

Epoch 1/15


AttributeError: in user code:

    File "c:\Users\natal\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natal\anaconda3\lib\site-packages\keras\engine\training.py", line 870, in step_function  **
        write_scalar_summaries(outputs, step=model._train_counter)  # pylint: disable=protected-access
    File "c:\Users\natal\anaconda3\lib\site-packages\keras\engine\training.py", line 3051, in write_scalar_summaries
        tf.summary.scalar('batch_' + name, value, step=step)
    File "c:\Users\natal\anaconda3\lib\site-packages\tensorboard\plugins\scalar\summary_v2.py", line 84, in scalar
        getattr(tf.summary.experimental, "summary_scope", None)

    AttributeError: module 'tensorboard.summary._tf.summary' has no attribute 'experimental'


In [87]:
# evalute the model
test_loss, test_acc = model.evaluate(test_padseq, Y_test, verbose=0)
print("test loss and accuracy:", test_loss, test_acc)

Epoch 1/15


AttributeError: in user code:

    File "c:\Users\natal\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natal\anaconda3\lib\site-packages\keras\engine\training.py", line 870, in step_function  **
        write_scalar_summaries(outputs, step=model._train_counter)  # pylint: disable=protected-access
    File "c:\Users\natal\anaconda3\lib\site-packages\keras\engine\training.py", line 3051, in write_scalar_summaries
        tf.summary.scalar('batch_' + name, value, step=step)
    File "c:\Users\natal\anaconda3\lib\site-packages\tensorboard\plugins\scalar\summary_v2.py", line 84, in scalar
        getattr(tf.summary.experimental, "summary_scope", None)

    AttributeError: module 'tensorboard.summary._tf.summary' has no attribute 'experimental'
