# Install TensorFlow Hub

In [0]:
!pip install tensorflow-hub
!pip install seaborn

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

In [0]:
# TF-Hub handle of the Universal Sentence Encoder, version 2.
# NOTE(review): the following cell assigns `module_url` again, so when both
# cells are run in order the later value is the one that gets loaded.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" 


In [0]:
# TF-Hub handle of the "large" Universal Sentence Encoder, version 3.
# Running this cell replaces any `module_url` assigned earlier; the later
# `hub.Module(module_url)` call loads whichever value was assigned last.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 

# Build train, validation and test datasets

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/testall.csv

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/toevaluate2.csv

In [0]:
!wget https://github.com/Dragonet95/utils/raw/master/politifactfull.csv

In [0]:
# ---- Training frame: testall.csv ------------------------------------------
# The file is parsed without a header, so its first line arrives as data row 0
# and is sliced off once the labels have been binarised.
_COL_NAMES = ['line_number', 'speaker', 'text', 'label']
dataset = pd.read_csv('testall.csv', sep='\t', names=_COL_NAMES, header=None, index_col=None)

# Binary target: 1 only for the exact string '1', 0 for everything else.
dataset['label'] = [1 if raw_label == '1' else 0 for raw_label in dataset['label'].values]

dataset = dataset.iloc[1:]
dataset.info()
reviews = dataset['text'].values
sentiments = dataset['label'].values

# Label/text pairs and the matching speaker column, both cut at row 16418.
testing = dataset.loc[:, ['label', 'text']]
df_train = testing.iloc[:16418]
df_train.info()

speaker = dataset.loc[:, ['speaker']]
df_train2 = speaker.iloc[:16418]
df_train2.info()

# ---- Evaluation frame: toevaluate2.csv -------------------------------------
# Same column layout; the label column is left exactly as read.
dataset = pd.read_csv('toevaluate2.csv', sep='\t', names=_COL_NAMES, header=None, index_col=None)
dataset.info()

dataset = dataset.iloc[1:]

reviews = dataset['text'].values

testing = dataset.loc[:, ['label', 'text']]

speaker = dataset.loc[:, ['speaker']]
df_test2 = speaker.iloc[:16418]

df_test = testing
df_test.info()
df_test2.info()

In [0]:
# Load the PolitiFact statements: tab-separated, the file's own first row is
# skipped and the columns are named from _COL_NAMES.
_COL_NAMES = ['id','statement', 'label']
dataset = pd.read_csv('politifactfull.csv',  index_col=None,names=_COL_NAMES, sep='\t',skiprows=1)
dataset.info()

# Rating string -> integer class id. Digit strings are accepted as well, and
# anything not listed here falls back to class 0.
# This replaces chained assignments of the form `dataset.label[mask] = ...`,
# which write through an intermediate Series: pandas warns about them and,
# with copy-on-write enabled, the write never reaches `dataset`.
_LABEL_TO_CLASS = {
    'false': 0,
    'pants-fire': 1,
    'half-true': 2,
    'barely-true': 3,
    'mostly-true': 4,
    'true': 5,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5,
}
dataset['label'] = [_LABEL_TO_CLASS.get(raw_label, 0) for raw_label in dataset['label'].values]


dataset.info()
dataset = dataset[['statement','label']]
print(dataset)

# First 10000 rows are used for training, the remainder for testing.
df_train = dataset.iloc[:10000]
df_test = dataset.iloc[10000:]

df_train.info()
df_test.info()

In [0]:
# Keep only the 'speaker' column and split it at row 15000 into train/test parts.
# NOTE(review): the PolitiFact cell above narrows `dataset` to
# ['statement', 'label'], and only the testall.csv / toevaluate2.csv frames are
# read with a 'speaker' column, so this lookup presumably raises KeyError when
# run right after that cell — confirm which dataset this cell belongs to.
testing = dataset[['speaker']]
df_train2 = testing[:15000]
df_test2 = testing[15000:]

# Basic Text Wrangling

In [0]:
!pip install contractions
!pip install beautifulsoup4

In [0]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re


def strip_html_tags(text):
    """Return the text content of an HTML fragment.

    ``<iframe>`` and ``<script>`` elements are removed from the parsed tree
    before the text is extracted, and every run of carriage returns / line
    feeds in the result is collapsed into a single newline.

    Args:
        text: HTML (or plain text) to clean.

    Returns:
        The extracted text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Plain loop: extract() is called for its side effect only, so there is no
    # reason to build a throw-away list.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Inside a character class '|' is a literal, not an alternation: the former
    # pattern [\r|\n|\r\n]+ therefore also turned pipe characters into newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_accented_chars(text):
    """Reduce ``text`` to its ASCII characters.

    The string is NFKD-normalised, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')


def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded



def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not a letter, a digit or whitespace.

    Args:
        text: String to filter.
        remove_digits: When true, digits are deleted as well.

    Returns:
        The filtered string.
    """
    if remove_digits:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.sub(unwanted, '', text)


def pre_process_document(document):
    """Clean one raw document and return it as a lower-case, letters-only string.

    Steps, in order: strip HTML, lower-case, turn newlines/tabs/carriage
    returns into spaces, drop accents, expand contractions, pad selected
    punctuation with spaces, delete everything that is not a letter or
    whitespace, collapse repeated spaces and trim both ends.
    """
    # HTML out first, then case-fold
    text = strip_html_tags(document).lower()

    # each newline, tab and carriage return becomes one space
    text = text.translate(str.maketrans("\n\t\r", "   "))

    # accents off, then contractions expanded
    text = expand_contractions(remove_accented_chars(text))

    # Pad the listed punctuation with spaces so the words on either side stay
    # separate once the punctuation itself is deleted below.
    # NOTE(review): inside the character class, `(-)` is a range from '(' to
    # ')' and does not match '-', so hyphens are not padded — confirm intent.
    text = re.sub(r'([{.(-)!}])', " \\1 ", text)
    text = remove_special_characters(text, remove_digits=True)

    # squeeze runs of spaces and trim the ends
    return re.sub(' +', ' ', text).strip()


# np.vectorize wrapper around pre_process_document, so an array of documents
# can be cleaned with a single call.
pre_process_corpus = np.vectorize(pre_process_document)

In [0]:
# Run the vectorised pre-processor over each split, replacing the raw arrays.
# NOTE(review): train_reviews / val_reviews / test_reviews are not assigned in
# any of the cells shown here — presumably left over from an earlier version of
# the notebook; confirm before running on a fresh kernel.
train_reviews = pre_process_corpus(train_reviews)
val_reviews = pre_process_corpus(val_reviews)
test_reviews = pre_process_corpus(test_reviews)

# Build Data Ingestion Functions

In [0]:
def UniversalEmbedding(x):
    """Embed a batch of sentences with the module-level ``embed`` hub module.

    Args:
        x: Tensor of sentences; the Keras model in this notebook feeds it from
            an ``Input(shape=(1,), dtype=tf.string)`` layer.

    Returns:
        The "default" output of the module's "default" signature.
    """
    # Flatten to a 1-D batch of strings. The previous tf.squeeze() call had no
    # axis, so a batch holding a single sample lost its batch dimension as well
    # and reached the module as a scalar.
    sentences = tf.reshape(tf.cast(x, tf.string), [-1])
    return embed(sentences, signature="default", as_dict=True)["default"]

In [0]:
# Load the TF-Hub module named by `module_url`; UniversalEmbedding() looks this
# object up by name when it is called.
embed = hub.Module(module_url)
# Size of the second dimension of the module's 'default' output shape.
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
# Number of units in the softmax output layer of the Keras model built below.
category_counts = 6

In [0]:
from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

# Single-input classifier:
#   string input -> UniversalEmbedding (Lambda) -> Dense(256, relu) -> Dense(category_counts, softmax)
# The commented-out lines are the two- and three-input variants of the same
# network; they are kept because other cells pass lists of inputs to fit/predict.
input_text = layers.Input(shape=(1,), dtype=tf.string)
#input_text2 = layers.Input(shape=(1,), dtype=tf.string)
#input_text3 = layers.Input(shape=(1,), dtype=tf.string)
#embedding1 = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)
#embedding2 = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text2)
#embedding3 = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text3)

#embedding = concatenate([x.output, y.output])
#embedding = layers.Add()([embedding1,embedding2])
#embedding = layers.Add()([embedding1,embedding2,embedding3])
# Sentence embedding of the single text input; output_shape is declared
# explicitly as (embed_size,).
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)

#conv = Conv1D(32, kernel_size=3, activation='elu', padding='same')(embedding)
dense = layers.Dense(256, activation='relu')(embedding)

# One softmax unit per class.
pred = layers.Dense(category_counts, activation='softmax')(dense)

model = Model(inputs=input_text, outputs=pred)
#model = Model(inputs=[input_text,input_text2], outputs=pred)
#model = Model(inputs=[input_text,input_text2, input_text3], outputs=pred)
#model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the softmax output is trained with mean squared error; the
# categorical cross-entropy alternative is the commented-out line above —
# confirm that MSE is the intended loss.
model.compile(loss='mean_squared_error', optimizer='adam',metrics=['accuracy'])
model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0611 10:01:26.019312 140399681935232 saver.py:1483] Saver not created because there are no variables in the graph to restore


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 1542      
Total params: 132,870
Trainable params: 132,870
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Training statements as an (n_samples, 1) object array: one string per row.
train_text = np.array(df_train['statement'].tolist(), dtype=object)[:, np.newaxis]

# One-hot targets that are always `category_counts` columns wide.
# get_dummies() on the raw labels only creates a column for each class that
# occurs in this slice, so a class missing from the slice would shrink the
# matrix and misalign its columns with the softmax units.
_train_classes = pd.Categorical(df_train.label, categories=list(range(category_counts)))
train_label = np.asarray(pd.get_dummies(_train_classes), dtype = np.int8)

In [0]:
# Second model input: the training speakers, one per row in an (n, 1) object array.
train_text2 = np.array(df_train2['speaker'].tolist(), dtype=object)[:, np.newaxis]

In [0]:
# Second model input: for each training row, the text of the row before it
# (the first row gets an empty string). Overwrites any earlier `train_text2`.
train_text2 = [""] + df_train['text'].tolist()[:-1]
train_text2 = np.array(train_text2, dtype=object)[:, np.newaxis]

In [0]:
# Third model input: the training texts shifted down by two rows, with empty
# strings filling the vacated leading positions.
train_text3 = df_train['text'].tolist()
for _shift in range(2):
    train_text3 = [""] + train_text3[:-1]

train_text3 = np.array(train_text3, dtype=object)[:, np.newaxis]

In [0]:
# Third model input: for each training row, the text of the row after it
# (the last row gets an empty string). Overwrites any earlier `train_text3`.
train_text3 = df_train['text'].tolist()[1:] + [""]

train_text3 = np.array(train_text3, dtype=object)[:, np.newaxis]

In [0]:
# Test statements as an (n_samples, 1) object array: one string per row.
test_text = np.array(df_test['statement'].tolist(), dtype=object)[:, np.newaxis]
# One-hot targets that are always `category_counts` columns wide.
# get_dummies() on the raw labels only creates a column for each class that
# occurs in this slice, so a class missing from the test rows would shrink the
# matrix and misalign its columns with the softmax units.
_test_classes = pd.Categorical(df_test.label, categories=list(range(category_counts)))
test_label = np.asarray(pd.get_dummies(_test_classes), dtype = np.int8)

In [0]:
# Second model input for the test rows: speakers, one per row in an (n, 1) object array.
test_text2 = np.array(df_test2['speaker'].tolist(), dtype=object)[:, np.newaxis]

In [0]:
# Second model input for the test rows: the text of the preceding row
# (empty string for the first row). Overwrites any earlier `test_text2`.
test_text2 = [""] + df_test['text'].tolist()[:-1]
test_text2 = np.array(test_text2, dtype=object)[:, np.newaxis]


In [0]:
# Third model input for the test rows: the texts shifted down by two rows,
# with empty strings filling the vacated leading positions.
test_text3 = df_test['text'].tolist()
for _shift in range(2):
    test_text3 = [""] + test_text3[:-1]
test_text3 = np.array(test_text3, dtype=object)[:, np.newaxis]

In [0]:
# For each test row, the text of the row after it (empty string for the last
# row), stored as an (n, 1) object array.
# NOTE(review): the training-side cell that builds this "next row" feature
# assigns it to `train_text3`, whereas this cell overwrites `test_text2` —
# looks like a copy-paste slip; confirm whether `test_text3` was intended.
test_text2 = df_test['text'].tolist()
test_text2 = test_text2[1:]+ [""]
test_text2 = np.array(test_text2, dtype=object)[:, np.newaxis]

In [0]:
# Train with two text inputs for 5 epochs and save the weights to ./model.h5.
# NOTE(review): the model compiled in the model-building cell takes a single
# input (its two-input `Model(...)` line is commented out), so passing
# [train_text, train_text2] presumably fails unless that variant is re-enabled.
with tf.Session() as session:
  # Make Keras use this session, then initialise variables and lookup tables
  # in it before training starts.
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  history = model.fit( [train_text, train_text2],
            train_label,
            epochs=5,
            batch_size=32)
  model.save_weights('./model.h5')

In [0]:
# Train the single-input model for 10 epochs and save the weights to ./model.h5.
with tf.Session() as session:
  # Make Keras use this session, then initialise variables and lookup tables
  # in it before training starts.
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  # NOTE(review): validation_data is the test split, i.e. the same rows that
  # the evaluation cells score later — confirm this is intended.
  history = model.fit( train_text,
            train_label,
            validation_data=(test_text,test_label),
            epochs=10,
            batch_size=32)
  model.save_weights('./model.h5')

Train on 10000 samples, validate on 2791 samples
Epoch 1/10
  416/10000 [>.............................] - ETA: 18s - loss: 0.1381 - acc: 0.2067

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fb113669a20>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.




Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fb1175ad2b0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
!ls -alh | grep model.h5

-rw-r--r-- 1 root root 530K May  6 13:30 model.h5


# Build Deep Learning Model with Universal Sentence Encoder

In [0]:
# Text-embedding feature column for the estimator below, read from the
# 'sentence' feature key and created with trainable=False.
# NOTE(review): the module URL is hard-coded to the v2 encoder here rather than
# taken from `module_url` — confirm that is intended.
embedding_feature = hub.text_embedding_column(
    key='sentence', 
    module_spec="https://tfhub.dev/google/universal-sentence-encoder/2",
    trainable=False)

In [0]:
# Two-class DNNClassifier on top of the embedding column: hidden layers of 512
# and 128 ReLU units, dropout 0.1, Adagrad with learning rate 0.005.
# NOTE(review): n_classes=2 here while the Keras model in this notebook uses
# category_counts = 6 — presumably this estimator targets the binary dataset.
dnn = tf.estimator.DNNClassifier(
          hidden_units=[512, 128],
          feature_columns=[embedding_feature],
          n_classes=2,
          activation_fn=tf.nn.relu,
          dropout=0.1,
          optimizer=tf.train.AdagradOptimizer(learning_rate=0.005))

INFO:tensorflow:Using default config.


I0419 18:19:28.325523 140148631795584 estimator.py:1739] Using default config.




W0419 18:19:28.335888 140148631795584 estimator.py:1760] Using temporary folder as model directory: /tmp/tmp6fafmxrs


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp6fafmxrs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f767e3c6c18>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0419 18:19:28.338926 140148631795584 estimator.py:201] Using config: {'_model_dir': '/tmp/tmp6fafmxrs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f767e3c6c18>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Train for approx 12 epochs

# Model Training

In [0]:
# Train the estimator in rounds of STEP_SIZE steps, timing each round and
# printing train/validation metrics after it.
tf.logging.set_verbosity(tf.logging.ERROR)
import time

TOTAL_STEPS = 1500
STEP_SIZE = 100
# range() includes TOTAL_STEPS itself, so the body runs 16 times.
for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
    print()
    print('-'*100)
    print('Training for step =', step)

    start_time = time.time()
    dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
    elapsed_time = time.time() - start_time
    print('Train Time (s):', elapsed_time)

    train_metrics = dnn.evaluate(input_fn=predict_train_input_fn)
    print('Eval Metrics (Train):', train_metrics)
    validation_metrics = dnn.evaluate(input_fn=predict_val_input_fn)
    print('Eval Metrics (Validation):', validation_metrics)


----------------------------------------------------------------------------------------------------
Training for step = 0
Train Time (s): 50.60619783401489
Eval Metrics (Train): {'accuracy': 0.5685652, 'accuracy_baseline': 0.5502174, 'auc': 0.9439395, 'auc_precision_recall': 0.93231857, 'average_loss': 0.92657685, 'label/mean': 0.4497826, 'loss': 118.39593, 'precision': 1.0, 'prediction/mean': 0.09654394, 'recall': 0.040792655, 'global_step': 100}
Eval Metrics (Validation): {'accuracy': 0.956, 'accuracy_baseline': 0.956, 'auc': 0.67050207, 'auc_precision_recall': 0.07705936, 'average_loss': 0.19451128, 'label/mean': 0.044, 'loss': 24.31391, 'precision': 0.0, 'prediction/mean': 0.021401588, 'recall': 0.0, 'global_step': 100}

----------------------------------------------------------------------------------------------------
Training for step = 100
Train Time (s): 46.15957498550415
Eval Metrics (Train): {'accuracy': 0.6458261, 'accuracy_baseline': 0.5502174, 'auc': 0.94975805, 'auc_pr

# Model Evaluation

In [0]:

# Predict on the test rows with two text inputs, using the weights saved in ./model.h5.
# NOTE(review): the compiled model takes a single input, so the two-element
# input list presumably only works with the commented-out two-input variant.
with tf.Session() as session:
  K.set_session(session)
  # Initialisers run first; the saved weights are loaded afterwards.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights('./model.h5')  
  predicts = model.predict([test_text,test_text2], batch_size=32)

In [0]:

# Predict on the test statements with the single-input model, using the
# weights saved in ./model.h5; the result is stored in `predicts`.
with tf.Session() as session:
  K.set_session(session)
  # Initialisers run first; the saved weights are loaded afterwards.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights('./model.h5')  
  predicts = model.predict(test_text, batch_size=32)

In [0]:

# Evaluate the single-input model on the test split with the weights saved in
# ./model.h5. The model is compiled with one loss and metrics=['accuracy'], so
# `a` receives the loss and `b` the accuracy.
with tf.Session() as session:
  K.set_session(session)
  # Initialisers run first; the saved weights are loaded afterwards.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights('./model.h5')  
  a,b = model.evaluate(test_text,test_label, batch_size=32)

 576/2791 [=====>........................] - ETA: 2s

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fb1166e18d0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.




In [0]:
import csv
import os

folder = "mse3"
np.set_printoptions(suppress=True)
# exist_ok keeps the cell re-runnable; os.mkdir() raised once the folder existed.
os.makedirs(folder, exist_ok=True)

# Score of each row = its second value, read numerically.
# The earlier version recovered it by slicing str(row) between the first space
# and ']'. That only parses for rows of exactly two values (where it yields
# this same element) and it rounds the score to numpy's print precision before
# it is written out with 15 decimals.
predictions = [float(row[1]) for row in predicts]

# (output file, first row, one-past-last row): consecutive slices of `predictions`.
_SEGMENTS = [
  ('primary-20151219_3_dem.tsv', 0, 1388),
  ('primary-20160129_7_gop.tsv', 1388, 2868),
  ('primary-20160311_12_gop.tsv', 2868, 4586),
  ('primary-20180131_state_union.tsv', 4586, 5106),
  ('primary-20181015_60_min.tsv', 5106, 5718),
  ('primary-20190205_trump_state.tsv', 5718, 6222),
  ('primary-20190215_trump_emergency.tsv', 6222, 7080),
]

# One TSV per segment: a 1-based line number and the score with 15 decimals;
# the same line is echoed to the cell output.
for filename, first, last in _SEGMENTS:
  # newline='' is what the csv module asks for on files it writes to.
  with open(folder+'/'+filename, 'w', newline='') as f:
    writer=csv.writer(f, delimiter='\t')
    for i,j in enumerate(predictions[first:last],start=1):
      print("{}\t{:.15f}".format(i, j))
      writer.writerow([i,"{:.15f}".format(j)])
    
!zip -r mse3.zip mse3

In [0]:
# Scores for predicts[-316:-71], printed with a 1-based counter.
# Each score is the second value of its row, read numerically. Slicing it out
# of str(row) only worked for two-value rows and rounded the number to numpy's
# print precision.
newlist = [float(row[1]) for row in predicts[-316:-71]]

cont=0
for cont, x in enumerate(newlist, start=1):
  print('%s\t%.15f' %(cont, x) )

In [0]:
# Scores for predicts[-762:-316], printed with a 1-based counter.
# Each score is the second value of its row, read numerically. Slicing it out
# of str(row) only worked for two-value rows and rounded the number to numpy's
# print precision.
newlist = [float(row[1]) for row in predicts[-762:-316]]

cont=0
for cont, x in enumerate(newlist, start=1):
  print('%s\t%.15f' %(cont, x) )

In [0]:
def get_predictions(estimator, input_fn):
    """Return the first "probabilities" entry of every prediction.

    Args:
        estimator: Object whose ``predict(input_fn=...)`` yields dict-like
            predictions that carry a "probabilities" sequence.
        input_fn: Passed through to ``estimator.predict``.

    Returns:
        A list with one value per prediction.
    """
    first_probabilities = []
    for prediction in estimator.predict(input_fn=input_fn):
        first_probabilities.append(prediction["probabilities"][0])
    return first_probabilities
# Query the estimator and print, for predictions[-316:-71], a 1-based counter
# and 1 minus the first probability of each row. Overwrites `predictions`.
# NOTE(review): predict_test_input_fn is not defined in the cells shown here —
# confirm where it comes from before running on a fresh kernel.
predictions = get_predictions(estimator=dnn, input_fn=predict_test_input_fn)
for i,j in enumerate(predictions[-316:-71],start=1):
  print("{}\t{:.15f}".format(i, 1-j))

In [0]:
def get_predictions(estimator, input_fn):
    """Return the first "probabilities" entry of every prediction.

    Args:
        estimator: Object whose ``predict(input_fn=...)`` yields dict-like
            predictions that carry a "probabilities" sequence.
        input_fn: Passed through to ``estimator.predict``.

    Returns:
        A list with one value per prediction.
    """
    first_probabilities = []
    for prediction in estimator.predict(input_fn=input_fn):
        first_probabilities.append(prediction["probabilities"][0])
    return first_probabilities
# Query the estimator and print, for predictions[-762:-316], a 1-based counter
# and 1 minus the first probability of each row. Overwrites `predictions`.
# NOTE(review): predict_test_input_fn is not defined in the cells shown here —
# confirm where it comes from before running on a fresh kernel.
predictions = get_predictions(estimator=dnn, input_fn=predict_test_input_fn)
for i,j in enumerate(predictions[-762:-316],start=1):
  print("{}\t{:.15f}".format(i, 1-j))