In [2]:
import tensorflow as tf
# Display the installed TensorFlow version; the outputs recorded in this notebook come from 2.2.0.
tf.__version__

'2.2.0'

In [3]:
# tf.test.is_gpu_available() is deprecated; TensorFlow's own warning recommends
# tf.config.list_physical_devices('GPU'). bool() keeps the True/False display of the old call.
bool(tf.config.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [4]:
import pandas as pd
import numpy as np

# fix random seed for reproducibility
# The same seed is applied to NumPy's global RNG (used for the random rows of the
# embedding matrix) and to TensorFlow's global RNG.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [5]:
# Load the pre-processed presidential speeches; the path is resolved relative to the
# kernel's working directory.
text_data = pd.read_csv('../speeches_millercenter_preprocessed.csv')

In [6]:
# Peek at one row to check the column layout (Transcript, Affiliation and Label are used below).
text_data.head(1)

Unnamed: 0,Transcript,Summary,President,Source,Date,URL,Word Count,Affiliation,Label
0,34th time speak oval office year soon time wan...,"In this broadcast from the Oval Office, Presid...",Ronald Reagan,National Archives,"January 11, 1989",https://millercenter.org/the-presidency/presid...,3289,Republican,0


In [23]:
# train test split
from sklearn.model_selection import train_test_split

# Transcripts, numeric labels and party names are split together so the three
# stay row-aligned; 20% of the speeches are held out as the test set.
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(
    text_data['Transcript'],
    np.array(text_data['Label']),
    np.array(text_data['Affiliation']),
    test_size=0.2,
    random_state=42,
)
train_corpus.shape, test_corpus.shape

((312,), (79,))

In [25]:
# validation and train split
# A quarter of the remaining training speeches becomes the validation set;
# the "*_final_*" names hold what is actually left for fitting.
(train_final_corpus, validation_corpus,
 train_final_label_nums, validation_label_nums,
 train_final_label_names, validation_label_names) = train_test_split(
    train_corpus,
    train_label_nums,
    train_label_names,
    test_size=0.25,
    random_state=42,
)
train_final_corpus.shape, validation_corpus.shape, test_corpus.shape

((234,), (78,), (79,))

In [26]:
# Tokenize training speeches
# The vocabulary is learned from the final training split only; '<UNK>' is the
# out-of-vocabulary token.
t = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
t.fit_on_texts(train_final_corpus)
# Register the padding token under index 0 so it is counted in the vocabulary size.
t.word_index.update({'<PAD>': 0})

In [27]:
# Sanity check of the vocabulary: entry with the highest index, entry with the
# lowest index, and the index assigned to the out-of-vocabulary token.
print(max(t.word_index.items(), key=lambda item: item[1]),
      min(t.word_index.items(), key=lambda item: item[1]),
      t.word_index['<UNK>'])

('heterosexual', 14929) ('<PAD>', 0) 1


In [33]:
# Convert each split into lists of token ids using the vocabulary fitted on the training split.
train_sequences, val_sequences, test_sequences = (
    t.texts_to_sequences(corpus)
    for corpus in (train_final_corpus, validation_corpus, test_corpus)
)

In [31]:
# Report the vocabulary size (including '<PAD>') and the number of documents the tokenizer was fitted on.
print(
    "Vocabulary size={}".format(len(t.word_index)),
    "Number of Documents={}".format(t.document_count),
    sep="\n",
)

Vocabulary size=14930
Number of Documents=234


In [32]:
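# What does the below do? pad_sequences turns every speech into exactly MAX_SEQUENCE_LENGTH token ids:
# shorter speeches are padded with 0 (the '<PAD>' index) and longer ones are truncated
# (by default Keras pads and truncates at the start of the sequence).
# Why do we need it? The network takes fixed-size input, so every row must have the same length.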

In [34]:
MAX_SEQUENCE_LENGTH = 2000
# Pad/truncate every speech to MAX_SEQUENCE_LENGTH token ids so each split becomes a 2-D array.
X_train, X_val, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (train_sequences, val_sequences, test_sequences)
)
X_train.shape, X_val.shape, X_test.shape

((234, 2000), (78, 2000), (79, 2000))

In [35]:
# Hyper-parameters shared by the embedding matrix, the model definition and training.
VOCAB_SIZE = len(t.word_index)  # counts the manually added '<PAD>' entry as well
EMBED_SIZE = 300  # must equal the dimension of the pretrained vectors (the file name says 300d)
EPOCHS=100  # upper bound passed to model.fit
BATCH_SIZE=128

In [16]:
# Alias of the tokenizer's word -> index mapping (same dict object, not a copy).
word2idx = t.word_index
# Word vectors downloaded from https://fasttext.cc/docs/en/english-vectors.html
FASTTEXT_INIT_EMBEDDINGS_FILE = '../wiki-news-300d-1M-subword.vec'


def load_pretrained_embeddings(word_to_index, max_features, embedding_size, embedding_file_path):
    """Build an embedding matrix for a vocabulary from a text-format word-vector file.

    Each usable line of the file is ``<word> <v1> <v2> ... <vN>`` separated by single
    spaces. Lines of 100 characters or fewer (such as the ``<count> <dim>`` header of
    fastText ``.vec`` files) are ignored.

    Args:
        word_to_index: mapping from word to its integer row index in the matrix.
        max_features: upper bound on the number of rows of the matrix.
        embedding_size: number of columns; must equal the vector dimension in the file.
        embedding_file_path: path of the word-vector text file.

    Returns:
        ``numpy.ndarray`` of shape ``(min(max_features, len(word_to_index)), embedding_size)``.
        Rows of words found in the file hold their pretrained vector. All other rows
        are drawn from ``N(mean, std)`` of the pretrained values using NumPy's global
        RNG, so seed it beforehand for reproducible results.

    Raises:
        ValueError: if the file contains no usable vector line, or if its vector
            dimension differs from ``embedding_size``.
    """
    embeddings_index = {}
    # `with` closes the (potentially very large) file even if parsing fails.
    with open(embedding_file_path, encoding="utf8", errors='ignore') as embedding_file:
        for row in embedding_file:
            if len(row) <= 100:
                continue
            # rstrip() drops the newline and any trailing blank so the last field parses as a float.
            word, *coefs = row.rstrip().split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError(
            "No embedding vectors could be read from {!r}".format(embedding_file_path))

    # np.stack needs a real sequence; passing a dict view is deprecated and rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    file_embed_size = all_embs.shape[1]
    if file_embed_size != embedding_size:
        raise ValueError(
            "Embedding file has {}-dimensional vectors but embedding_size={}".format(
                file_embed_size, embedding_size))

    nb_words = min(max_features, len(word_to_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))

    for word, idx in word_to_index.items():
        # Compare against the real number of rows, not max_features, so an index
        # beyond the matrix is skipped instead of raising IndexError.
        if idx >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [17]:
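# What exactly does the below do? It does not vectorise the corpus itself. It builds a
# (VOCAB_SIZE, EMBED_SIZE) lookup matrix whose row i is the pretrained fastText vector of the
# word with tokenizer index i (random values for words that fastText does not contain).
# The matrix is then used as the initial weights of the model's Embedding layer.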

In [18]:
# Build the initial embedding matrix for the current vocabulary: one row per word index,
# EMBED_SIZE columns.
ft_embeddings = load_pretrained_embeddings(word_to_index=word2idx, 
                                           max_features=VOCAB_SIZE, 
                                           embedding_size=EMBED_SIZE, 
                                           embedding_file_path=FASTTEXT_INIT_EMBEDDINGS_FILE)
ft_embeddings.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(14930, 300)

In [19]:
# CNN text classifier: fastText-initialised (and fine-tuned) embeddings, three
# Conv1D + MaxPooling1D stages, then two dropout-regularised dense layers and a
# single sigmoid unit for the binary label.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBED_SIZE,
                              weights=[ft_embeddings],
                              trainable=True,
                              input_length=MAX_SEQUENCE_LENGTH),

    tf.keras.layers.Conv1D(filters=256, kernel_size=5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=5),

    tf.keras.layers.Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=5),

    tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=5),

    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2000, 300)         4479000   
_________________________________________________________________
conv1d (Conv1D)              (None, 2000, 256)         384256    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 400, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 400, 128)          163968    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 80, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 80, 64)            41024     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 16, 64)            0

In [20]:
# Training the model below with a validation set to stop the training when it starts to overfit

In [21]:
# Fit the model
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=3,
                                      restore_best_weights=True,
                                      verbose=1)

model.fit(X_train, train_label_nums, 
          validation_data=(X_val, validation_label_nums),
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE, 
          shuffle=True,
          callbacks=[es],
          verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping


<tensorflow.python.keras.callbacks.History at 0x1a36c9f790>

In [22]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Sequential.predict_classes is deprecated (and removed in later TensorFlow releases).
# For a single sigmoid output the equivalent is thresholding the predicted probability at 0.5.
predictions = (model.predict(X_test, batch_size=2048, verbose=0) > 0.5).astype("int32").ravel()

print("Accuracy: %.2f%%" % (accuracy_score(test_label_nums.astype(int), predictions)*100))
print(classification_report(test_label_nums, predictions))
pd.DataFrame(confusion_matrix(test_label_nums, predictions))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Accuracy: 50.63%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        39
           1       0.51      1.00      0.67        40

    accuracy                           0.51        79
   macro avg       0.25      0.50      0.34        79
weighted avg       0.26      0.51      0.34        79



  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1
0,0,39
1,0,40


In [None]:
# Now training without the validation set. We might need to do this because
# we might not have enough data to split it, we might need all the data we have for training.

In [37]:
# Tokenize training speeches
# This run fits the vocabulary on the whole training split (no validation hold-out)
# and rebinds `t`, replacing the tokenizer fitted earlier.
t = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
t.fit_on_texts(train_corpus)
# Register the padding token under index 0 so it is counted in the vocabulary size.
t.word_index.update({'<PAD>': 0})

# Sanity check: highest-index entry, lowest-index entry, index of the OOV token.
print(max(t.word_index.items(), key=lambda item: item[1]),
      min(t.word_index.items(), key=lambda item: item[1]),
      t.word_index['<UNK>'])


('spellman', 16710) ('<PAD>', 0) 1


In [46]:
# Re-encode the full training split and the test split with the refitted tokenizer.
train_sequences, test_sequences = (
    t.texts_to_sequences(corpus) for corpus in (train_corpus, test_corpus)
)

print(
    "Vocabulary size={}".format(len(t.word_index)),
    "Number of Documents={}".format(t.document_count),
    sep="\n",
)

Vocabulary size=16711
Number of Documents=312


In [47]:
# Recompute the vocabulary size for the tokenizer refitted on the full training split;
# the remaining hyper-parameters repeat the values used in the first run.
VOCAB_SIZE = len(t.word_index)
EMBED_SIZE = 300
EPOCHS=100
BATCH_SIZE=128

In [48]:
# Rebuild the embedding matrix: the refitted tokenizer assigns different indices,
# so the matrix from the first run no longer lines up with the vocabulary.
word2idx = t.word_index
ft_embeddings = load_pretrained_embeddings(word_to_index=word2idx, 
                                           max_features=VOCAB_SIZE, 
                                           embedding_size=EMBED_SIZE, 
                                           embedding_file_path=FASTTEXT_INIT_EMBEDDINGS_FILE)
ft_embeddings.shape

(16711, 300)

In [49]:
# Fresh, untrained copy of the same CNN architecture, sized for the new vocabulary
# and initialised from the rebuilt embedding matrix.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBED_SIZE,
                              weights=[ft_embeddings],
                              trainable=True,
                              input_length=MAX_SEQUENCE_LENGTH),

    tf.keras.layers.Conv1D(filters=256, kernel_size=5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=5),

    tf.keras.layers.Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=5),

    tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=5),

    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2000, 300)         5013300   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 2000, 256)         384256    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 400, 256)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 400, 128)          163968    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 80, 128)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 80, 64)            41024     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 16, 64)           

In [50]:
MAX_SEQUENCE_LENGTH = 2000
# Pad/truncate every speech to MAX_SEQUENCE_LENGTH token ids (train and test only in this run).
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (train_sequences, test_sequences)
)
X_train.shape, X_test.shape

((312, 2000), (79, 2000))

In [51]:
# Fit the model
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=3,
                                      restore_best_weights=True,
                                      verbose=1)

model.fit(X_train, train_label_nums, 
#           validation_data=(X_val, validation_label_nums),
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE, 
          shuffle=True,
          callbacks=[es],
          verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100


Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1a36ca0790>

In [52]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Sequential.predict_classes is deprecated (and removed in later TensorFlow releases).
# For a single sigmoid output the equivalent is thresholding the predicted probability at 0.5.
predictions = (model.predict(X_test, batch_size=2048, verbose=0) > 0.5).astype("int32").ravel()

print("Accuracy: %.2f%%" % (accuracy_score(test_label_nums.astype(int), predictions)*100))
print(classification_report(test_label_nums, predictions))
pd.DataFrame(confusion_matrix(test_label_nums, predictions))

Accuracy: 75.95%
              precision    recall  f1-score   support

           0       0.95      0.54      0.69        39
           1       0.68      0.97      0.80        40

    accuracy                           0.76        79
   macro avg       0.82      0.76      0.75        79
weighted avg       0.82      0.76      0.75        79



Unnamed: 0,0,1
0,21,18
1,1,39


In [None]:
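# The performance improves without the validation dataset, probably because we have more training data
# to train the model on. Note that the two runs also differ in training length (early stopping after
# 5 epochs vs. the full 100 epochs), so the gain cannot be attributed to the extra data alone.
# The second model does overfit, though, so it would be best to get more data and train again with a validation dataset.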