In [None]:
import json
import string
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
import re
import nltk
import pip
from nltk import collections
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm, metrics
from sklearn.metrics import precision_recall_fscore_support
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from keras.models import Sequential
from keras import layers
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM
from keras import backend as K
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def process_data(read_data_filepath, read_label_filepath):
  """Load source tweets (and, optionally, their labels) into a DataFrame.

  Every non-blank line of the data file is parsed as a JSON list; only the
  first element's ``id_str`` and ``text`` fields are kept.  The label file,
  when given, is parsed as one JSON object mapping tweet id -> label string.

  Labels are attached by looking each tweet id up in that mapping, so the
  two files do not have to list the ids in the same order or have the same
  number of entries.

  Args:
    read_data_filepath: path of the JSON-lines data file.
    read_label_filepath: path of the JSON label file, or None for
      unlabelled data.

  Returns:
    With a label file: a DataFrame with columns ``id``, ``text`` and
    ``label`` (1 for "rumour", 0 for any other label string, None when the
    id has no entry in the label file).  Without one: a DataFrame with only
    ``id`` and ``text``.
  """
  source_id, text = [], []
  with open(read_data_filepath, 'r', encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank / trailing lines
      event = json.loads(line)
      source_id.append(event[0]['id_str'])
      text.append(event[0]['text'])

  if read_label_filepath is None:
    return pd.DataFrame({'id': source_id, 'text': text})

  with open(read_label_filepath, 'r', encoding='utf-8') as f:
    raw_labels = json.load(f)
  label_by_id = {key: (1 if val == "rumour" else 0) for key, val in raw_labels.items()}

  # dtype=object keeps the column as plain Python ints / None instead of
  # letting pandas upcast to float when an id has no label.
  labels = pd.Series([label_by_id.get(sid) for sid in source_id], dtype=object)
  return pd.DataFrame({'id': source_id, 'text': text, 'label': labels})

# Read the three splits from Drive; only train and dev come with a label file.
df_train = process_data('/content/drive/My Drive/data/train.data.jsonl', '/content/drive/My Drive/data/train.label.json')
df_dev = process_data('/content/drive/My Drive/data/dev.data.jsonl', '/content/drive/My Drive/data/dev.label.json')
df_test = process_data('/content/drive/My Drive/data/test.data.jsonl', None)

# One row per source tweet in each split.
n_train, n_dev, n_test = (len(frame["text"]) for frame in (df_train, df_dev, df_test))
print("Number of train data =", n_train)
print("Number of dev data =", n_dev)
print("Number of test data =", n_test)

Number of train data = 4641
Number of dev data = 580
Number of test data = 581


In [None]:
# Fetch the NLTK corpora used by the cleaning step below.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared text-cleaning tools.
tt = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
eng_stemmer = nltk.stem.SnowballStemmer('english')

# Stopword sets: NLTK's English list, plus a few Twitter/URL fragments.
default_stopwords = set(nltk.corpus.stopwords.words('english'))
custom_stopwords = {"http://", "rt", "co", "https://", "www", "@"}
all_stopwords = default_stopwords.union(custom_stopwords)


def preprocess_data(df):
    """Clean the ``text`` column of ``df`` and return it.

    Each tweet goes through, in order: URL removal, @-mention removal,
    tweet tokenisation, dropping one-character tokens, lower-casing,
    dropping tokens that are not purely alphabetic, WordNet lemmatisation,
    and removal of tokens found in ``default_stopwords``.  The surviving
    tokens are joined back together with single spaces.

    The cleaned strings replace ``df['text']`` (``df`` is modified) in one
    column assignment rather than element-by-element chained indexing, so
    the update also takes effect under pandas copy-on-write and does not
    depend on ``df`` having a 0..n-1 index.

    NOTE(review): ``all_stopwords`` is built at module level but this
    function filters with ``default_stopwords`` only - confirm which set is
    intended.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The cleaned ``text`` column of ``df``.
    """
    print("Started preprocessing!")

    def _clean(raw):
        """Return the cleaned, space-joined token string for one tweet."""
        raw = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', raw)
        raw = re.sub(r'@[^\s]+', '', raw)
        tokens = tt.tokenize(raw)
        # drop single-character tokens (length checked before lower-casing), then lower-case
        tokens = [tok.lower() for tok in tokens if len(tok) > 1]
        # keep purely alphabetic tokens (removes numbers, punctuation, hashtags)
        tokens = [tok for tok in tokens if tok.isalpha()]
        # lemmatize (not stem) each token
        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
        # remove stopwords
        return " ".join(tok for tok in tokens if tok not in default_stopwords)

    df['text'] = df['text'].apply(str).apply(_clean)
    return df['text']

# Clean every split with the same pipeline (train first, then dev, then test).
train_texts, dev_texts, test_texts = (
    preprocess_data(frame) for frame in (df_train, df_dev, df_test)
)


In [None]:
# Spot-check one cleaned test tweet; 580 is the last row given the 581 test rows reported above.
test_texts[580]

'terrible news ottawa today thought prayer everyone involved'

In [None]:
## Get Train data and Validation data for model 1
# Features are the cleaned tweet texts; targets are the labels cast to int arrays.
x_train, x_dev, x_test = train_texts, dev_texts, test_texts
y_train = np.array(df_train["label"].map(int))
y_dev = np.array(df_dev["label"].map(int))

In [None]:
def convert_label_tofile(label):
    """Map a numeric class to its label string.

    Args:
        label: any value ``int()`` accepts (e.g. 1, 0.0, "1").

    Returns:
        "rumour" when ``int(label) == 1``, otherwise "non-rumour".
    """
    return "rumour" if int(label) == 1 else "non-rumour"


def write_pred(id, labels, filename):
    """Write an ``{id: label string}`` JSON file for a set of predictions.

    ``labels`` is read only: the label strings are built on the fly instead
    of being written back into the caller's list, so the same predictions
    can be passed to this function (or used numerically) again afterwards.

    Args:
        id: sequence of ids, one per prediction, in prediction order.
        labels: sequence of rows whose first element is the numeric class
            (e.g. ``[[1.0], [0.0], ...]``).
        filename: path of the JSON file to write.

    Raises:
        ValueError: if there are fewer label rows than ids.
    """
    if len(labels) < len(id):
        raise ValueError(
            "got {} label rows for {} ids".format(len(labels), len(id)))
    dic = collections.OrderedDict()
    # zip pairs by position; any label rows beyond len(id) are ignored.
    for tweet_id, row in zip(id, labels):
        dic[tweet_id] = convert_label_tofile(row[0])
    with open(filename, 'w') as f:
        json.dump(dic, f)
    print("save finished")


def convert_prob(y_pred):
    """Threshold probabilities at 0.5, in place.

    Every entry strictly greater than 0.5 is overwritten with 1 and every
    other entry with 0.  The same container that was passed in is returned.
    """
    for idx, prob in enumerate(y_pred):
        y_pred[idx] = 1 if prob > 0.5 else 0
    return y_pred


def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Counts rounded, [0, 1]-clipped ``y_true * y_pred`` as true positives and
    rounded, clipped ``y_true`` as all positives; ``K.epsilon()`` in the
    denominator guards against division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())


def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    Counts rounded, [0, 1]-clipped ``y_true * y_pred`` as true positives and
    rounded, clipped ``y_pred`` as predicted positives; ``K.epsilon()`` in
    the denominator guards against division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())


def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is defined
    when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio


In [None]:
"""
BOW - NN
"""
# Fit the vocabulary on the training texts only; unseen words map to <UNK>.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(x_train)

bow_x_train = tokenizer.texts_to_matrix(x_train, mode="count")  # BOW representation
bow_x_dev = tokenizer.texts_to_matrix(x_dev, mode="count")  # BOW representation
bow_x_test = tokenizer.texts_to_matrix(x_test, mode="count")  # BOW representation
vocab_size = bow_x_train.shape[1]  # width of the BOW matrix; reused by the sequence model below
print(vocab_size)

# model definition
# model construction
# embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1"
# hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)
model1 = Sequential(name="feedforward-bow-input")
# model1.add(hub_layer)
# The input width is declared on the FIRST layer: that is the layer that
# receives the vocab_size-wide BOW rows.  (It used to sit on the second
# layer, whose input is really the 20-unit output of the first.)
model1.add(layers.Dense(20, input_dim=vocab_size, activation='relu'))
model1.add(layers.Dense(10, activation='relu'))
model1.add(layers.Dense(1, activation='sigmoid'))

# since it's a binary classification problem, we use a binary cross entropy loss here
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', f1_m, precision_m, recall_m])
# model1.summary()

# training
model1.fit(bow_x_train, y_train, epochs=20, verbose=True, validation_data=(bow_x_dev, y_dev), batch_size=100)
# evaluate() returns the loss followed by the metrics in compile() order
loss, accuracy, f1_score, precision, recall = model1.evaluate(bow_x_dev, y_dev, verbose=False)
print("\nTesting BOW - NN f1_score:  {:.4f}".format(f1_score))

# predict test set
y_pred_test1 = model1.predict(bow_x_test)
y_pred_test1 = convert_prob(y_pred_test1)  # probabilities -> 0/1
y_pred_test1 = np.array(y_pred_test1).tolist()
write_pred(df_test["id"], y_pred_test1, "test-output-bow.json")


5086
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Testing BOW - NN f1_score:  0.7679
save finished


In [None]:
"""
word sequence
"""
maxlen = 100
embedding_dim = 10

# Integer-encode each split with the already-fitted tokenizer, then post-pad to maxlen.
xseq_train, xseq_dev, xseq_test = (
    pad_sequences(tokenizer.texts_to_sequences(split), padding='post', maxlen=maxlen)
    for split in (x_train, x_dev, x_test)
)

# word order preserved with this architecture
model2 = Sequential(
    [
        layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         input_length=maxlen),
        layers.Flatten(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ],
    name="feedforward-sequence-input",
)
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', f1_m, precision_m, recall_m],
)
model2.summary()

# training
model2.fit(
    xseq_train,
    y_train,
    epochs=8,
    verbose=True,
    validation_data=(xseq_dev, y_dev),
    batch_size=10,
)
# evaluate() returns the loss followed by the metrics in compile() order
loss, accuracy, f1_score, precision, recall = model2.evaluate(xseq_dev, y_dev, verbose=False)
print("\nTesting word sequence f1_score:  {:.4f}".format(f1_score))

Model: "feedforward-sequence-input"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 10)           50860     
_________________________________________________________________
flatten_4 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 10)                10010     
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 60,881
Trainable params: 60,881
Non-trainable params: 0
_________________________________________________________________
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

Testing word sequence f1_score:  0.7522


In [None]:
# Score the test split with the sequence model, threshold to 0/1, and save.
test_probs2 = model2.predict(xseq_test)
y_pred_test2 = np.array(convert_prob(test_probs2)).tolist()
write_pred(df_test["id"], y_pred_test2, "test-output-seq.json")

save finished
