# Metadata and Text concatenated network

## Imports

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import string
import re
import emoji
import random
import keras
import warnings

warnings.filterwarnings('ignore')

import sys
sys.path.append("../") # make it possible to import functions from different files that are in folders a level up

import os

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from keras.models import Model
from data.functions.string_tools import print_split_shapes

from transformers import AutoTokenizer
from text.tools.bert_model import get_model, MAX_LEN

# SEEDS
random_state = 111

os.environ['PYTHONHASHSEED'] = str(random_state)
random.seed(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)

epochs = 8
batch_size= 128
txt_size= 16

Some layers from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## Check if a GPU is available

In [2]:
if tf.test.gpu_device_name():
  print('GPU found')
else:
  print("No GPU found")

GPU found


## Load the data in

In [3]:
folder_path = '../data/selected_data/all_data_selected.csv'

df = pd.read_csv(folder_path)

df.drop('Unnamed: 0', axis=1, inplace=True)

df.head()

Unnamed: 0,full_text,retweet_count,user_description,user_followers_count,user_friends_count,user_favourites_count,user_statuses_count,user_media_count,hashtags_count,username,...,part_of_thread,tweet_sentiment,user_creation_tweet_diff,tweeted_in_daypart_day,tweeted_in_daypart_evening,tweeted_in_daypart_morning,tweeted_in_daypart_night,user_created_in_daypart_day,user_created_in_daypart_evening,real_fake_grade
0,Our daily update is published. States reported...,171,We try to provide the most comprehensive state...,468030,13,85,2594,1364,0,The COVID Tracking Project,...,0,0,16384932,0,1,0,0,0,0,1.0
1,President Trump Asked What He Would Do If He W...,0,"Spoof news, political satire, parody and more!...",803,97,1,57502,3,2,The Spoof,...,0,0,293776787,1,0,0,0,0,0,-1.0
2,States reported 630 deaths. We are still seein...,71,We try to provide the most comprehensive state...,468030,13,85,2594,1364,0,The COVID Tracking Project,...,1,0,9039963,0,1,0,0,0,0,1.0
3,Low #vitaminD was an independent predictor of ...,40,Medscape provides breaking medical news and ex...,215969,39457,2206,49892,16563,1,Medscape,...,0,1,375950159,0,0,0,1,0,1,1.0
4,A common question: why are the cumulative outc...,0,We try to provide the most comprehensive state...,468030,13,85,2594,1364,1,The COVID Tracking Project,...,1,2,2470004,0,1,0,0,0,0,1.0


## Prepare the data

### Drop duplicated and not set fields

In [4]:
df['user_description'] = df['user_description'].apply(lambda x: x if pd.notna(x) else '')
print(df.shape)
df = df.dropna(subset=['full_text']).drop_duplicates(subset=['full_text'])
df.shape

(7905, 44)


(7816, 44)

### Combine text fields

In [5]:
df['text'] = df['full_text']
df['user_info'] = df['username'] + ' ' + df['user_description']

df.drop(['username', 'user_description'], axis=1, inplace=True)

df.head()

Unnamed: 0,full_text,retweet_count,user_followers_count,user_friends_count,user_favourites_count,user_statuses_count,user_media_count,hashtags_count,has_user_url,text_length,...,user_creation_tweet_diff,tweeted_in_daypart_day,tweeted_in_daypart_evening,tweeted_in_daypart_morning,tweeted_in_daypart_night,user_created_in_daypart_day,user_created_in_daypart_evening,real_fake_grade,text,user_info
0,Our daily update is published. States reported...,171,468030,13,85,2594,1364,0,1,163,...,16384932,0,1,0,0,0,0,1.0,Our daily update is published. States reported...,The COVID Tracking Project We try to provide t...
1,President Trump Asked What He Would Do If He W...,0,803,97,1,57502,3,2,1,125,...,293776787,1,0,0,0,0,0,-1.0,President Trump Asked What He Would Do If He W...,"The Spoof Spoof news, political satire, parody..."
2,States reported 630 deaths. We are still seein...,71,468030,13,85,2594,1364,0,1,245,...,9039963,0,1,0,0,0,0,1.0,States reported 630 deaths. We are still seein...,The COVID Tracking Project We try to provide t...
3,Low #vitaminD was an independent predictor of ...,40,215969,39457,2206,49892,16563,1,1,112,...,375950159,0,0,0,1,0,1,1.0,Low #vitaminD was an independent predictor of ...,Medscape Medscape provides breaking medical ne...
4,A common question: why are the cumulative outc...,0,468030,13,85,2594,1364,1,1,277,...,2470004,0,1,0,0,0,0,1.0,A common question: why are the cumulative outc...,The COVID Tracking Project We try to provide t...


In [6]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
def clean_text(text):
    text = text.lower()

    # vervang alle urls
    text = re.sub('https?://\S+|www\.\S+', 'HTTPADDR', text)
    
    # verwijder punctuation
    text = text.translate(str.maketrans('','', string.punctuation))
    # vervang emojis door  'EMOJI'
    text = emoji.demojize(text)
    text = re.sub(r"\:(.*?)\:", ' EMOJI ',text)
    # vervang getallen door 'NUMMER'
    text = re.sub(r"\b[\d.]+\b", " NUMMER ", text)
    # vervang opeenvolgende spaties en tabs door een enkele spatie
    text = re.sub(r"\s+", " ", text)

    # Removing the stopwords from text
    new_text = []

    tokenized = word_tokenize(text)

    for word in tokenized:
        if word not in stop_words:
            word = word.strip() # haal spaties aan uiteinde weg
            word = lemmatizer.lemmatize(word) # rocks -> rock, better -> good, running -> run
            new_text.append(word)

    
    
    text = ' '.join(lemmatizer.lemmatize(word) for word in tokenized if word not in stop_words)
    

    # verwijder onnodige spaties aan begin en eind
    text = text.strip()

    return text


In [8]:
print('before:')
print(df['text'][74])

print('\nafter:')
print(clean_text(df['text'][74]))

before:
@globaltimesnews It doesn’t effect randians coz they have CowUrine for cure🐄💦💁🏿‍♂️ after all they have bad smell to tackle COVID-19 with Cow-dung

after:
globaltimesnews ’ effect randians coz cowurine cure EMOJI EMOJI EMOJI bad smell tackle covid19 cowdung


In [9]:
df['text'] = df['text'].apply(clean_text)
df['full_text'] = df['full_text'].apply(clean_text)
df['user_info'] = df['user_info'].apply(clean_text)

In [10]:
x_text = df[['text', 'user_info']]

x_meta_data = df
# x_meta_data= df.drop(axis=1, columns=['real_fake_grade', 'text', 'user_info', 'full_text'])

y = df['real_fake_grade']

print('x_text: ', x_text.shape, '\nx_meta_data', x_meta_data.shape, '\ny', y.shape)

x_text:  (7816, 2) 
x_meta_data (7816, 44) 
y (7816,)


### Split the data into metadata and text sets

In [11]:
x_train_meta_data, x_test_meta_data, x_train_text, x_test_text = train_test_split(x_meta_data, x_text, test_size=.2, random_state=random_state)

x_train_user_info = x_train_text['user_info']
x_test_user_info = x_test_text['user_info']

x_train_tweet = x_train_text['text']
x_test_tweet = x_test_text['text']

y_train = x_train_meta_data['real_fake_grade'].values
y_test = x_test_meta_data['real_fake_grade'].values

x_train_meta_data = x_train_meta_data.drop(axis=1, columns=['real_fake_grade', 'text', 'user_info', 'full_text'])
x_test_meta_data = x_test_meta_data.drop(axis=1, columns=['real_fake_grade', 'text', 'user_info', 'full_text'])

In [12]:
def print_shapes():
  print('Meta data shapes:')
  print_split_shapes(x_train_meta_data, y_train, x_test_meta_data, y_test)

  print('Tweet data shapes:')
  print_split_shapes(x_train_text, y_train, x_test_text, y_test)

  print('User info data shapes:')
  print_split_shapes(x_train_user_info, y_train, x_test_user_info, y_test)

print_shapes()

Meta data shapes:
Train shapes
	X:(6252, 40)
	y:(6252,)
Test shapes
	X:(1564, 40)
	y:(1564,)
Tweet data shapes:
Train shapes
	X:(6252, 2)
	y:(6252,)
Test shapes
	X:(1564, 2)
	y:(1564,)
User info data shapes:
Train shapes
	X:(6252,)
	y:(6252,)
Test shapes
	X:(1564,)
	y:(1564,)


### Transform the data into better (machine) readable sets

In [13]:
# scale the metadata
scaler = StandardScaler()

x_train_meta_data = scaler.fit_transform(x_train_meta_data)
x_test_meta_data = scaler.transform(x_test_meta_data)

# the y labels contains 3 possible values, -1, 0 and 1. Negative labels are not accepted by a neural net so these must be transformed
encoder = LabelEncoder()

y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

print_shapes()

Meta data shapes:
Train shapes
	X:(6252, 40)
	y:(6252,)
Test shapes
	X:(1564, 40)
	y:(1564,)
Tweet data shapes:
Train shapes
	X:(6252, 2)
	y:(6252,)
Test shapes
	X:(1564, 2)
	y:(1564,)
User info data shapes:
Train shapes
	X:(6252,)
	y:(6252,)
Test shapes
	X:(1564,)
	y:(1564,)


### Tokenize the text

In [14]:
tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')

def tokenize(X):
  return tokenizer(
    text=X.tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True
  )

# Tokenize the input (takes some time)
x_train_tweet_tokenized = tokenize(x_train_tweet)
x_test_tweet_tokenized = tokenize(x_test_tweet)

x_train_user_text_tokenized = tokenize(x_train_user_info)
x_test_user_text_tokenized = tokenize(x_test_user_info)

bert_train_tweet_input = { 'input_ids': x_train_tweet_tokenized['input_ids'], 'attention_mask': x_train_tweet_tokenized['attention_mask'] }
bert_test_tweet_input = { 'input_ids': x_test_tweet_tokenized['input_ids'], 'attention_mask': x_test_tweet_tokenized['attention_mask'] }

bert_train_user_text_input = { 'input_ids': x_train_user_text_tokenized['input_ids'], 'attention_mask': x_train_user_text_tokenized['attention_mask'] }
bert_test_user_text_input = { 'input_ids': x_test_user_text_tokenized['input_ids'], 'attention_mask': x_test_user_text_tokenized['attention_mask'] }

## Model building
For the model building we are going to use three different combinations for concatenation

- metadata (nn) + user_info bert
- metadata (nn) + tweet bert
- metadata (nn) + user_info bert + tweet

### Concatenate the two networks into one

#### Prepare the models for concatenation

The two pre-trained models (nn and bert) are functional models, meaning that they have an output layer ready for classification and all the parameters have been trained.<br>
For the concatenation process we do not require an output layer for each network, so we pop it off. And we also set all the parameters to not trainable since they have already been trained.

### Used functions

In [15]:
def predict(network: Model, x_test: list, name: str) -> pd.DataFrame:
  predictions = network.predict(x_test)

  return pd.DataFrame({
    'Name': [name],
    'Predictions': [predictions]
  })

In [16]:
bert_user_info = get_model()

bert_user_info.load_weights('./text/bert-text-metadata-weights.h5')

bert_user_info.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  335141888   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [17]:
bert_pred = predict(bert_user_info, bert_test_user_text_input, 'BERT')

bert_pred

Unnamed: 0,Name,Predictions
0,BERT,"[[0.006881526, 0.0077838516, 0.98533463], [0.2..."


In [18]:
nn_model = keras.models.load_model('./metadata/nn_model.h5')

nn_model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_72 (Dense)            (None, 512)               20992     
                                                                 
 dropout_60 (Dropout)        (None, 512)               0         
                                                                 
 dense_73 (Dense)            (None, 256)               131328    
                                                                 
 dropout_61 (Dropout)        (None, 256)               0         
                                                                 
 dense_74 (Dense)            (None, 128)               32896     
                                                                 
 dropout_62 (Dropout)        (None, 128)               0         
                                                                 
 dense_75 (Dense)            (None, 64)              

In [19]:
nn_pred = predict(nn_model, x_test_meta_data, 'Metadata')

nn_pred

Unnamed: 0,Name,Predictions
0,Metadata,"[[0.99999976, 9.321629e-08, 1.17567716e-07], [..."


In [20]:
def softmax_pred(entry):
    """returns the softmax index of the entry for accuracy score"""
    softmax = max(entry)
    list = entry.tolist()
    index = list.index(softmax)
    return index + 1

In [21]:
def convert(values):
  converted = []
  for pred in values:
    converted.append(np.argmax(pred))

  return converted

nn_converted = convert(nn_pred['Predictions'].values[0])

nn_converted[:5]

[0, 0, 2, 2, 0]

In [22]:
bert_converted = convert(bert_pred['Predictions'].values[0])

bert_converted[:5]

[2, 1, 1, 1, 2]

In [23]:
nn_converted = [pred - 1 for pred in nn_converted]

nn_converted[:5]

[-1, -1, 1, 1, -1]

In [24]:
bert_converted = [-1 if pred == 2 else pred for pred in bert_converted]

bert_converted[:5]

[-1, 1, 1, 1, -1]

In [25]:
test_values = y_test

test_values = test_values - 1

test_values = list(test_values)

test_values[:5]

[-1, -1, 1, 1, -1]

In [26]:
len(nn_converted)

1564

In [27]:
len(bert_converted)

1564

In [28]:
real_indexes = []
fake_indexes = []
neutral_indexes = []

for index in range(len(test_values)):
  val = test_values[index]

  if val == 1:
    real_indexes.append(index)
  elif val == 0:
    neutral_indexes.append(index)
  else:
    fake_indexes.append(index)

In [29]:
real_indexes[:5]

[2, 3, 6, 7, 10]

In [30]:
fake_indexes[:5]

[0, 1, 4, 5, 11]

In [31]:
neutral_indexes[:5]

[8, 9, 12, 14, 17]

In [32]:
fake_both_correct = 0
fake_metadata_correct_data_incorrect = 0
fake_metadata_incorrect_correct_data_correct = 0
fake_both_incorrect = 0

neutral_both_correct = 0
neutral_metadata_correct_data_incorrect = 0
neutral_metadata_incorrect_correct_data_correct = 0
neutral_both_incorrect = 0

true_both_correct = 0
true_metadata_correct_data_incorrect = 0
true_metadata_incorrect_correct_data_correct = 0
true_both_incorrect = 0

# index:
# 0 == both true
# 1 == bert true, nn false
# 2 == nn true, bert false
# 3 == both false

for true_value, nn, bert in zip(test_values, nn_converted, bert_converted):
  if true_value == -1:
    if nn == true_value and bert == true_value:
      fake_both_correct += 1
    elif bert == true_value and not nn == true_value:
      fake_metadata_incorrect_correct_data_correct += 1 
    elif nn == true_value and not bert == true_value:
      fake_metadata_correct_data_incorrect += 1
    else:
      fake_both_incorrect += 1
  if true_value == 0:
    if nn == true_value and bert == true_value:
      neutral_both_correct += 1
    elif bert == true_value and not nn == true_value:
      neutral_metadata_incorrect_correct_data_correct += 1 
    elif nn == true_value and not bert == true_value:
      neutral_metadata_correct_data_incorrect += 1
    else:
      neutral_both_incorrect += 1
  if true_value == 1:
    if nn == true_value and bert == true_value:
      true_both_correct += 1
    elif bert == true_value and not nn == true_value:
      true_metadata_incorrect_correct_data_correct += 1 
    elif nn == true_value and not bert == true_value:
      true_metadata_correct_data_incorrect += 1
    else:
      true_both_incorrect += 1

print(f'both incc: {fake_both_incorrect}\nboth corr: {fake_both_correct}')

both incc: 25
both corr: 296


In [33]:
values = [None, None, None]
test = [-1,0,1]

for i in test:
  arr = [0,0,0,0]
  values[i] = []

  for true_value, nn, bert in zip(test_values, nn_converted, bert_converted):
    if true_value == i:
      if nn == true_value and bert == true_value:
        arr[0] += 1
      elif bert == true_value and not nn == true_value:
        arr[1] += 1
      elif nn == true_value and not bert == true_value:
        arr[2] += 1
      else:
        arr[3] += 1

  values[i] = np.array(arr)

values

[array([  2,   1,  46, 143]),
 array([757,  98,  59,  24]),
 array([296,  26,  87,  25])]

In [34]:
real_count = len(real_indexes)

real_count

938

In [35]:
fake_count = len(fake_indexes)
fake_count

434

In [36]:
neutral_count = len(neutral_indexes)
neutral_count

192

In [37]:
counts = [
  np.array([
    fake_both_correct,
    fake_metadata_correct_data_incorrect,
    fake_metadata_incorrect_correct_data_correct,
    fake_both_incorrect]
  ),
  np.array([
    neutral_both_correct,
    neutral_metadata_correct_data_incorrect,
    neutral_metadata_incorrect_correct_data_correct,
    neutral_both_incorrect]
  ),
  np.array([
    true_both_correct,
    true_metadata_correct_data_incorrect,
    true_metadata_incorrect_correct_data_correct,
    true_both_incorrect]
  )
]

In [38]:
f = []
n = []
r = []
for index, t in enumerate(counts):
  if index == 0:
    # fake
    f = np.round(t / fake_count * 100)
  elif index == 1:
    # neutral
    n = np.round(t / neutral_count * 100)
  else:
    # true
    r = np.round(t / real_count * 100)

Oplaan in onderzoeksrapport

In [39]:
f

array([68., 20.,  6.,  6.])

In [40]:
n

array([ 1., 24.,  1., 74.])

In [41]:
r

array([81.,  6., 10.,  3.])