In [1]:
import pandas as pd
import numpy as np
import os
import json
import spacy

# Data Loading


In [2]:
def read_json_file(path):
    f = open(path)
    json_content = json.load(f)
    f.close()
    return json_content

In [3]:
training_dataset_directory = 'datasets/rumoureval-2019-training-data/twitter-english'
test_dataset_directory = 'datasets/rumoureval-2019-test-data/twitter-en-test-data'

training_dataset_reddit_directory = 'datasets/rumoureval-2019-training-data/reddit-training-data'
test_dataset_reddit_directory = 'datasets/rumoureval-2019-test-data/reddit-test-data'

training_labels_json = 'datasets/rumoureval-2019-training-data/train-key.json'
training_labels_json_2 = 'datasets/rumoureval-2019-training-data/dev-key.json'
test_labels_json = 'datasets/final-eval-key.json'

training_labels_dict = read_json_file(training_labels_json)['subtaskaenglish']
training_labels_dict.update(read_json_file(training_labels_json_2)['subtaskaenglish'])
test_labels_dict = read_json_file(test_labels_json)['subtaskaenglish']

In [4]:
class Tweet:
    def __init__(self, post_content, post_id, parent_post_id=None, external_urls_count=0):
        self.post_content = post_content
        self.post_id = post_id
        self.category = None
        self.parent_post_id = parent_post_id
        self.external_urls = external_urls_count > 0
        self.user_metadata = None
        
    def add_category(self, category):
        self.category = category

In [5]:
class SourceTweet:
    def __init__(self, tweet: Tweet):
        self.tweet = tweet
        self.replies = []
        
    def add_reply(self, reply: Tweet):
        self.replies.append(reply)

In [6]:
def read_tweets_dataset(dataset_dir_path, labels_dict):
    topic_directories = [f.path for f in os.scandir(dataset_dir_path) if f.is_dir()]
    topic_to_tweets_map = {}  # {topic_name: [SourceTweet, ...]}

    for topic_dir in topic_directories:
        topic_name = topic_dir.split('\\')[1]
        source_tweets = []
        
        tweets_paths = [f.path for f in os.scandir(topic_dir) if f.is_dir()]
        for tweet_dir in tweets_paths:
            source_tweet_path = [f.path for f in os.scandir(tweet_dir + '/source-tweet')][0]
            source_tweet_json = read_json_file(source_tweet_path)
            
            tweet = Tweet(source_tweet_json['text'], source_tweet_json['id'],
                              source_tweet_json['in_reply_to_status_id'],
                              len(source_tweet_json['entities']['urls']))
            
            source_tweet = SourceTweet(tweet)
            source_tweets.append(source_tweet)
            tweet.add_category("support")
            source_tweet.add_reply(tweet)

            
            reply_tweets_paths = [f.path for f in os.scandir(tweet_dir + '/replies')]
            for reply_tweet_path in reply_tweets_paths:
                reply_tweet_json = read_json_file(reply_tweet_path)
                
                reply_tweet = Tweet(reply_tweet_json['text'], reply_tweet_json['id'],
                                        source_tweet.tweet.post_id, len(reply_tweet_json['entities']['urls']))
                reply_tweet.add_category(labels_dict[str(reply_tweet_json['id'])])
                source_tweet.add_reply(reply_tweet)
        
        topic_to_tweets_map[topic_name] = source_tweets
        
    return topic_to_tweets_map
       

In [7]:
def read_reddit_dataset(dataset_dir_path, labels_dict):
    topic_directories = [f.path for f in os.scandir(dataset_dir_path) if f.is_dir()]
    topic_to_tweets_map = {}  # {topic_name: [SourceTweet, ...]}

    for topic_dir in topic_directories:
        topic_name = topic_dir.split('\\')[1]
        source_tweets = []
        
        source_tweet_path = [f.path for f in os.scandir(topic_dir + '/source-tweet')][0]
        source_tweet_json = read_json_file(source_tweet_path)

        content = source_tweet_json['data']['children'][0]['data']['title'] + ' ' + source_tweet_json['data']['children'][0]['data']['selftext']
        tweet = Tweet(content, source_tweet_json['data']['children'][0]['data']['id'], None, content.count("http"))

        source_tweet = SourceTweet(tweet)
        source_tweets.append(source_tweet)
        tweet.add_category("support")
        source_tweet.add_reply(tweet)

        reply_tweets_paths = [f.path for f in os.scandir(topic_dir + '/replies')]
        for reply_tweet_path in reply_tweets_paths:
            reply_tweet_json = read_json_file(reply_tweet_path)
            
            if 'body' in reply_tweet_json['data']:
                reply_tweet = Tweet(reply_tweet_json['data']['body'], reply_tweet_json['data']['id'],
                                        source_tweet.tweet.post_id, reply_tweet_json['data']['body'].count('http'))
                reply_tweet.add_category(labels_dict[str(reply_tweet.post_id)])
                source_tweet.add_reply(reply_tweet)
                
        topic_to_tweets_map[topic_name] = source_tweets
        
    return topic_to_tweets_map

In [8]:
# Twitter
training_topic_to_tweets_map = read_tweets_dataset(training_dataset_directory, training_labels_dict)
test_topic_to_tweets_map = read_tweets_dataset(test_dataset_directory, test_labels_dict)

# Reddit
training_topic_to_reddit_map = read_reddit_dataset(training_dataset_reddit_directory, training_labels_dict)
test_topic_to_reddit_map = read_reddit_dataset(test_dataset_reddit_directory, test_labels_dict)

# Preprocessing

In [9]:
import string
import re

nlp = spacy.load("en_core_web_sm")
punctuation = string.punctuation.replace("!", "")
punctuation = string.punctuation.replace("?", "")

In [10]:
def preprocessing(sentence):
    lemmatizer = nlp.get_pipe("lemmatizer")        
    doc = nlp(sentence)
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        elif token.pos_ == "NUM":
            lemmas.append('#')
        elif token.pos_ == "SYM":
            continue
        elif token.text in punctuation:
            continue
        elif re.search(r"[http.*]", token.text):
            continue
        else:
            lemmas.append(token.lemma_.lower())
    return lemmas

In [11]:
def create_df(topic_map):
    rows = []
    for topic, source_tweets in topic_map.items():
        for source_tweet in source_tweets:
            tokenized_source_tweet = preprocessing(source_tweet.tweet.post_content)
            for reply in source_tweet.replies:
                tokenized_reply = preprocessing(reply.post_content)
                rows.append((topic, source_tweet.tweet.post_content, reply.post_content, tokenized_source_tweet, tokenized_reply, reply.external_urls, reply.category))
    return pd.DataFrame(rows, columns=['topic', 'original_source_tweet', 'original_reply', 'source_tweet', 'reply', 'external_urls', 'category'])

## CountVectorizer and TfidfVectorizer feature extractions

In [12]:
training_tweets_df = create_df(training_topic_to_tweets_map)
test_tweets_df = create_df(test_topic_to_tweets_map)
# training_reddit_df = create_df(training_topic_to_reddit_map)
# test_reddit_df = create_df(test_topic_to_reddit_map)

In [13]:

training_data = training_tweets_df[['reply', 'category']].values
test_data = test_tweets_df[['reply', 'category']].values
# training_tweets_df['category'].unique()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

label_map = {'comment':0, 'support':1, 'deny':2, 'query':3}

def count_vectorize(data, ngram, count_vect=None):
    

    text_data = []
    labels = []
    for i in data:
        text_data.append(' '.join(i[0]))
        labels.append(label_map[i[1]])
    
    if count_vect is None:
        count_vect = CountVectorizer(ngram_range=(ngram, ngram), token_pattern = '[a-zA-Z0-9#?!]+')
        count_vect.fit(text_data)
    
    vectorized_data = count_vect.transform(text_data)
    
    return vectorized_data.toarray(), np.array(labels), count_vect
    
    # for i in text_data:
    #     count_vect.transform()

def tfidf_vectorize(data, ngram, count_vect=None):
    

    text_data = []
    labels = []
    for i in data:
        text_data.append(' '.join(i[0]))
        labels.append(label_map[i[1]])
    
    if count_vect is None:
        count_vect = TfidfVectorizer(ngram_range=(ngram, ngram), token_pattern = '[a-zA-Z0-9#?!]+')
        
        count_vect.fit(text_data)
        # print(count_vect.get_feature_names_out())
    
    vectorized_data = count_vect.transform(text_data)
    
    return vectorized_data.toarray(), np.array(labels), count_vect

# print(training_data[:5])
# X_train, y_train, count_vect = tfidf_vectorize(training_data[:5], 2)

In [15]:
def results(y_pred, y_test, name, ngram_size = None):
    if ngram_size is not None:
        print(f'{name}, {ngram_size}-grams:')
    else:
        print(f'{name}:')
    unique, counts = np.unique(y_pred, return_counts=True)
    print(dict(zip(unique, counts)))
    unique, counts = np.unique(y_test, return_counts=True)
    print(dict(zip(unique, counts)))
    # print(y_pred.count(0), y_test.count(0))

    
    print(" Classification accuracy: ", accuracy_score(y_test, y_pred))
    print(" Confusion matrix: \n", confusion_matrix(y_test, y_pred))
    target_names = ['comment', 'support', 'deny', 'query']
    print(classification_report(y_test, y_pred, target_names=target_names, zero_division=0))

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.linear_model import SGDClassifier


ngram_size = 1


X_train, y_train, count_vect = tfidf_vectorize(training_data, ngram_size)
X_test, y_test, _ = tfidf_vectorize(test_data, ngram_size, count_vect)


X_train.shape, y_train.shape, X_test.shape, y_test.shape


((5568, 3874), (5568,), (1066, 3874), (1066,))

In [17]:
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
y_pred = MNB.predict(X_test)

results(y_pred, y_test, 'Naive Bayes', ngram_size)

Naive Bayes, 1-grams:
{0: 1065, 1: 1}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.7223264540337712
 Confusion matrix: 
 [[770   1   0   0]
 [147   0   0   0]
 [ 92   0   0   0]
 [ 56   0   0   0]]
              precision    recall  f1-score   support

     comment       0.72      1.00      0.84       771
     support       0.00      0.00      0.00       147
        deny       0.00      0.00      0.00        92
       query       0.00      0.00      0.00        56

    accuracy                           0.72      1066
   macro avg       0.18      0.25      0.21      1066
weighted avg       0.52      0.72      0.61      1066



In [18]:
LR = LogisticRegression(max_iter = 1000)
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

results(y_pred, y_test, 'Logistic Regression', ngram_size)

Logistic Regression, 1-grams:
{0: 1013, 1: 10, 2: 4, 3: 39}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.7223264540337712
 Confusion matrix: 
 [[749   5   0  17]
 [143   1   0   3]
 [ 83   2   4   3]
 [ 38   2   0  16]]
              precision    recall  f1-score   support

     comment       0.74      0.97      0.84       771
     support       0.10      0.01      0.01       147
        deny       1.00      0.04      0.08        92
       query       0.41      0.29      0.34        56

    accuracy                           0.72      1066
   macro avg       0.56      0.33      0.32      1066
weighted avg       0.66      0.72      0.63      1066



In [19]:
SGD = SGDClassifier(max_iter=1000)
SGD.fit(X_train, y_train)
y_pred = SGD.predict(X_test)

results(y_pred, y_test, 'SGD', ngram_size)

SGD, 1-grams:
{0: 961, 1: 39, 2: 18, 3: 48}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.7110694183864915
 Confusion matrix: 
 [[725  17   6  23]
 [135   5   3   4]
 [ 66  15   9   2]
 [ 35   2   0  19]]
              precision    recall  f1-score   support

     comment       0.75      0.94      0.84       771
     support       0.13      0.03      0.05       147
        deny       0.50      0.10      0.16        92
       query       0.40      0.34      0.37        56

    accuracy                           0.71      1066
   macro avg       0.44      0.35      0.35      1066
weighted avg       0.63      0.71      0.65      1066



In [20]:
linear_SVC = lin_clf = svm.LinearSVC()
linear_SVC.fit(X_train, y_train)
y_pred = linear_SVC.predict(X_test)

results(y_pred, y_test, 'Linear_SVC')

Linear_SVC:
{0: 928, 1: 75, 2: 21, 3: 42}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.699812382739212
 Confusion matrix: 
 [[708  37   8  18]
 [129  10   3   5]
 [ 56  25  10   1]
 [ 35   3   0  18]]
              precision    recall  f1-score   support

     comment       0.76      0.92      0.83       771
     support       0.13      0.07      0.09       147
        deny       0.48      0.11      0.18        92
       query       0.43      0.32      0.37        56

    accuracy                           0.70      1066
   macro avg       0.45      0.35      0.37      1066
weighted avg       0.63      0.70      0.65      1066



## Hand crafted features

In [145]:
import gensim.downloader as api
model = api.load("glove-twitter-25")
embedding_length = 25

In [146]:
f = open('negative-words.txt', 'r')
negative_words = f.read().split('\n')
f.close()

f = open('positive-words.txt', 'r')
positive_words = f.read().split('\n')
f.close()

In [147]:
# word_vectors = model.wv
# print(len(model['it']))

def feature_extraction(sentence):
    doc = nlp(sentence)
    
    features = []
    word_embeddings = []
    
    neg_count = 0
    pos_count = 0
    number = 0
    for token in doc:
        if token.text in negative_words:
            neg_count += 1
        elif token.text in positive_words:
            pos_count += 1
        if token.is_stop:
            continue
        if token.pos_ == "NUM":
            number = 1
        # 25 features
        elif token.text in model:
            # print(len(model[token.text]))
            word_embeddings.append(model[token.text])
            
    if len(word_embeddings) == 0:
        word_embeddings.append([0]*embedding_length)
        
    word_embeddings = np.array(word_embeddings)
    
    # 
    features.extend(list(np.mean(word_embeddings, axis = 0)))
    # contains number
    features.append(number)
    
    # contains ?
    if token.text.find('?'):
        features.append(1)
    else:
        features.append(0)
    
    # contains !
    if token.text.find('!'):
        features.append(1)
    else:
        features.append(0)
    
    # negative and positive word count
    features.append(neg_count)
    features.append(pos_count)
    
    # capital ratio
    uppers = [i for i in sentence if i.isupper()]
    capitalratio = len(uppers)/len(sentence)
    features.append(capitalratio)
    
    # length of sentence
    features.append(len(sentence))

    return features

In [148]:
def create_df_features(topic_map):
    rows = []
    for topic, source_tweets in topic_map.items():
        for source_tweet in source_tweets:
            tokenized_source_tweet = feature_extraction(source_tweet.tweet.post_content)
            for reply in source_tweet.replies:
                tokenized_reply = feature_extraction(reply.post_content)
                rows.append((topic, source_tweet.tweet.post_content, reply.post_content, tokenized_source_tweet, tokenized_reply, reply.external_urls, reply.category))
    return pd.DataFrame(rows, columns=['topic', 'original_source_tweet', 'original_reply', 'source_tweet', 'reply', 'external_urls', 'category'])

In [149]:
training_tweets_df_features = create_df_features(training_topic_to_tweets_map)
test_tweets_df_features = create_df_features(test_topic_to_tweets_map)

In [150]:
training_data_features = training_tweets_df_features[['reply', 'category']].values
test_data_features = test_tweets_df_features[['reply', 'category']].values

In [151]:
X_train_pad = np.array([i for i in training_data_features[:, 0]])
y_train_pad = np.array([label_map[i] for i in training_data_features[:, 1]])

X_test_pad = np.array([i for i in test_data_features[:, 0]])
y_test_features = np.array([label_map[i] for i in test_data_features[:, 1]])

X_train_pad.shape, y_test_features.shape

((5568, 32), (1066,))

In [152]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

p = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
p.fit(X_train_pad,y_train) 

y_pred = p.predict(X_test_pad)

results(y_pred, y_test_features, 'Naive Bayes')
# X_train_nb.shape

Naive Bayes:
{0: 1066}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.723264540337711
 Confusion matrix: 
 [[771   0   0   0]
 [147   0   0   0]
 [ 92   0   0   0]
 [ 56   0   0   0]]
              precision    recall  f1-score   support

     comment       0.72      1.00      0.84       771
     support       0.00      0.00      0.00       147
        deny       0.00      0.00      0.00        92
       query       0.00      0.00      0.00        56

    accuracy                           0.72      1066
   macro avg       0.18      0.25      0.21      1066
weighted avg       0.52      0.72      0.61      1066



In [153]:
LR_features = LogisticRegression(max_iter = 5000)
LR_features.fit(X_train_pad, y_train)
y_pred = LR_features.predict(X_test_pad)

results(y_pred, y_test_features, 'Logistic Regression')

Logistic Regression:
{0: 1007, 1: 23, 3: 36}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.7467166979362101
 Confusion matrix: 
 [[755   7   0   9]
 [131  15   0   1]
 [ 91   1   0   0]
 [ 30   0   0  26]]
              precision    recall  f1-score   support

     comment       0.75      0.98      0.85       771
     support       0.65      0.10      0.18       147
        deny       0.00      0.00      0.00        92
       query       0.72      0.46      0.57        56

    accuracy                           0.75      1066
   macro avg       0.53      0.39      0.40      1066
weighted avg       0.67      0.75      0.67      1066



In [166]:
SGD_features = SGDClassifier(random_state=0)
SGD_features.fit(X_train_pad, y_train)
y_pred = SGD_features.predict(X_test_pad)

results(y_pred, y_test_features, 'SGD')

SGD:
{0: 1021, 1: 4, 3: 41}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.7345215759849906
 Confusion matrix: 
 [[757   2   0  12]
 [144   0   0   3]
 [ 90   2   0   0]
 [ 30   0   0  26]]
              precision    recall  f1-score   support

     comment       0.74      0.98      0.84       771
     support       0.00      0.00      0.00       147
        deny       0.00      0.00      0.00        92
       query       0.63      0.46      0.54        56

    accuracy                           0.73      1066
   macro avg       0.34      0.36      0.35      1066
weighted avg       0.57      0.73      0.64      1066



In [167]:
linear_SVC = svm.LinearSVC(dual=False)
linear_SVC.fit(X_train_pad, y_train)
y_pred = linear_SVC.predict(X_test_pad)

results(y_pred, y_test_features, 'Linear_SVC')

Linear_SVC:
{0: 1002, 1: 16, 3: 48}
{0: 771, 1: 147, 2: 92, 3: 56}
 Classification accuracy:  0.7401500938086304
 Confusion matrix: 
 [[750   3   0  18]
 [133  12   0   2]
 [ 90   1   0   1]
 [ 29   0   0  27]]
              precision    recall  f1-score   support

     comment       0.75      0.97      0.85       771
     support       0.75      0.08      0.15       147
        deny       0.00      0.00      0.00        92
       query       0.56      0.48      0.52        56

    accuracy                           0.74      1066
   macro avg       0.52      0.38      0.38      1066
weighted avg       0.67      0.74      0.66      1066

