# Script to apply the supervised approach as described in Section 5.4.1:

All the preprocessing, use of classifiers, and hyperparameter settings are based on the research done by [1].

Run all cells to apply the supervised approach to the CrisisMMD dataset.

##### Note: This step requires the dataset created by the create_dataset.ipynb script, which filters out tweets from the CrisisMMD dataset that no longer exist at the current time. The approach may also be applied to CrisisMMD directly.

## Initialisations:

In [1]:
# Importing python libraries
import itertools
import os

import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

In [2]:
# Initialising directory paths

# Path to the annotations folder of the downloaded CrisisMMD dataset;
# read_labelled_tweets reads the tab-separated annotation files from here.
# NOTE(review): the previous comment cited [1] as the download source, but [1] in the
# References section is the Hassan et al. paper - confirm the intended reference.
labelled_data_path = '../../Data/CrisisMMD/CrisisMMD_v2.0/annotations'

# Path to dataset created in current time by create_dataset.ipynb
# (read_labelled_tweets reads the per-event CSV file from this folder)
dataset_store_path = '../../Data/TweetCredibilityDatasets'

# Path to directory to store evaluation results of supervised approach
# (run_classifiers writes one CSV of scores per event into this folder)
evaluation_path = '../evaluation/supervised_results'

The following list gives the dataset file names as stored in the annotations folder of the CrisisMMD dataset. Set event_name and event_file_name in the next cell to run the supervised approach and save the corresponding evaluation results for a climate event.

1. 'california_wildfires_final_data.tsv'
2. 'hurricane_harvey_final_data.tsv'
3. 'hurricane_irma_final_data.tsv'
4. 'hurricane_maria_final_data.tsv'
5. 'iraq_iran_earthquake_final_data.tsv'
6. 'mexico_earthquake_final_data.tsv'
7. 'srilanka_floods_final_data.tsv'

In [3]:
# Set the event name and annotation file name of the climate event on which the supervised approach is to be run.
# event_file_name is the annotation file passed to read_labelled_tweets (see the list of file names above);
# event_name selects the dataset CSV read by read_labelled_tweets and names the results CSV written by run_classifiers.
event_name = 'california_wildfires'
event_file_name = 'california_wildfires_final_data.tsv'

## Defining functions for text preprocessing and reading tweets from dataset files:

In [4]:
# Following code uses [2] for removing url links as also done in [1]
# All pre-processing performed is as per the processing steps described in [1]

# Initialising text processing libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Initialising stemmer to extract word stems (used by tokenize_tweets)
stemmer = PorterStemmer()

# Initialising tokenizer for tweets using nltk TweetTokenizer with its default
# settings (used by tokenize_tweets)
tweet_tokenizer = TweetTokenizer() 

def tokenize_tweets(corpus):
    """Clean a collection of tweet texts for word n-gram feature extraction.

    Each tweet is split into tokens with the module-level ``tweet_tokenizer``,
    stop words are removed, the remaining tokens are stemmed with the
    module-level ``stemmer``, URL tokens and non-word characters are stripped,
    and the surviving tokens are re-joined with single spaces.

    Args:
        corpus: Iterable of tweet texts (strings).

    Returns:
        list[str]: One cleaned string per input tweet, in input order. A tweet
        whose tokens are all removed yields an empty string.
    """
    # Loading the stop word list once per call instead of once per token.
    # The NLTK stop word file id is the lower-case 'english'; the capitalised
    # spelling is expected to resolve only on case-insensitive filesystems.
    stop_words = set(stopwords.words('english'))
    # Raw strings so that the backslash escapes reach the regex engine untouched
    url_pattern = re.compile(r"http\S+")
    non_word_pattern = re.compile(r"\W+")

    tokenized_corpus = []
    for tweet in corpus:
        # Converting text into tokens
        tokens = tweet_tokenizer.tokenize(tweet)
        # Removing stop words and stemming
        processed_tokens = [stemmer.stem(token) for token in tokens if token.lower() not in stop_words]
        # Removing url tokens
        processed_tokens = [url_pattern.sub('', token) for token in processed_tokens]
        # Removing special characters
        processed_tokens = [non_word_pattern.sub('', token) for token in processed_tokens]
        # Recreating the text by joining tokens; tokens emptied by the two
        # removal steps are skipped so they do not leave runs of spaces behind
        tokenized_corpus.append(' '.join(token for token in processed_tokens if token))
    return tokenized_corpus

In [7]:
# Method to read labelled data from original CrisisMMD files for tweets in the dataset generated in current time

def read_labelled_tweets(file_name):
    """Load the CrisisMMD annotations of the tweets present in the generated dataset.

    Reads the dataset CSV of the event given by the module-level ``event_name``
    from ``dataset_store_path`` and the annotation file ``file_name`` from
    ``labelled_data_path``.

    Args:
        file_name: Name of the tab-separated annotation file inside
            ``labelled_data_path``.

    Returns:
        pandas.DataFrame: The annotation rows (columns ``tweet_id``,
        ``text_info`` and ``tweet_text``) whose ``tweet_id`` occurs in the
        ``id`` column of the dataset CSV.
    """
    # Reading tweets data from the CSV file created using create_dataset.ipynb
    tweets_data = pd.read_csv(f'{dataset_store_path}/21237189_{event_name}_final_data.csv')
    # Only the distinct tweet ids are needed for filtering the annotations
    available_ids = tweets_data['id'].drop_duplicates()
    # The 'squeeze' argument is no longer passed: pandas 2.0 removed it from
    # read_csv, and with three columns selected it had no effect anyway
    annotated_tweets = pd.read_csv(f'{labelled_data_path}/{file_name}', sep='\t',
                                   usecols=['tweet_id', 'text_info', 'tweet_text'])
    return annotated_tweets[annotated_tweets['tweet_id'].isin(available_ids)]

## Reading and processing tweet texts:

In [8]:
# Loading the labelled tweets (informative / not informative) of the selected
# event as an independent copy, keeping a single row per tweet id
tweets = (
    read_labelled_tweets(event_file_name)
    .copy()
    .drop_duplicates(subset=['tweet_id'])
)
# The raw tweet texts form the corpus, which is then cleaned for feature extraction
corpus = tweets['tweet_text'].values
processed_corpus = tokenize_tweets(corpus)

## Calculating TF-IDF features:

In [9]:
# Initialising the vectorizer for generating word n-gram features
# [1] claims that the unigrams and bigrams provide the best results,
# thus setting the parameters accordingly
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
# Label of the n-gram setting; run_classifiers uses it as a suffix of the results file name.
# It is set by hand, so keep it in sync with ngram_range above.
n_grams = '1_2'

In [10]:
# Process applied to complete dataset as also done in [1]
# NOTE(review): the vectorizer is fitted on every tweet before the cross-validation
# in run_classifiers, so the TF-IDF statistics also cover the tweets of each
# validation fold - confirm this matches the protocol intended in [1].
tweets_tfidf = vectorizer.fit_transform(processed_corpus)

In [11]:
# Encoding the labels as integers: 1 for 'informative', 0 for any other label
tweet_labels = [int(tweet_label == 'informative') for tweet_label in tweets['text_info'].values]

In [16]:
# As stated in [1] "The implementation of the classifiers in scikitlearn python library,
# is used setting all the parameters to the default values", the following function is designed,
# to run all classifiers on the dataset with their default settings using 10-fold cross validation.

def run_classifiers(X, y, n_grams):
    """Evaluate five default-configured classifiers with 10-fold cross-validation.

    Linear SVM, Logistic Regression, Random Forest, Gaussian Naive Bayes and
    K-Nearest Neighbour are each cross-validated on ``X``/``y``; the mean
    precision, recall and F1 over the folds are collected into one table,
    which is also written to
    ``{evaluation_path}/{event_name}_supervised_scores_{n_grams}.csv``
    (``evaluation_path`` and ``event_name`` are module-level settings).

    Args:
        X: Feature matrix with one row per tweet.
        y: Binary labels aligned with the rows of ``X``.
        n_grams: Label of the n-gram setting, used in the results file name.

    Returns:
        pandas.DataFrame: One row per classifier with the columns
        ``classifier``, ``precision``, ``recall`` and ``f1``.
    """
    # Display name -> estimator with default parameters, in reporting order
    classifiers = {
        'Linear SVM': LinearSVC(),
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(),
        'Naive Bayes': GaussianNB(),
        'K-Nearest Neighbour': KNeighborsClassifier(),
    }
    metrics = ['precision', 'recall', 'f1']

    results = []
    for classifier_name, clf in classifiers.items():
        # A single cross-validation pass scores all metrics on the same fitted
        # models, instead of refitting every fold once per metric
        fold_scores = cross_validate(clf, X, y, cv=10, scoring=metrics)
        result = {'classifier': classifier_name}
        for metric in metrics:
            result[metric] = np.mean(fold_scores[f'test_{metric}'])
        results.append(result)

    results_df = pd.DataFrame(results)

    # Creating the results directory if needed so that saving does not fail
    os.makedirs(evaluation_path, exist_ok=True)
    results_df.to_csv(f"{evaluation_path}/{event_name}_supervised_scores_{n_grams}.csv")

    return results_df

In [17]:
# Running all classifiers on the TF-IDF features converted to a dense array
# (presumably because not every classifier accepts sparse input - confirm);
# run_classifiers also saves the scores to a CSV file in evaluation_path
results = run_classifiers(tweets_tfidf.toarray(), tweet_labels, n_grams)
# Printing the scores as a markdown table
# NOTE(review): DataFrame.to_markdown relies on the optional 'tabulate' package - confirm it is installed
print(results.to_markdown())

|    | classifier          |   precision |   recall |     f1 |
|---:|:--------------------|------------:|---------:|-------:|
|  0 | Linear SVM          |      0.7842 |   0.9929 | 0.8763 |
|  1 | Logistic Regression |      0.7795 |   1      | 0.8761 |
|  2 | Random Forest       |      0.78   |   0.9988 | 0.8749 |
|  3 | Naive Bayes         |      0.8057 |   0.8354 | 0.82   |
|  4 | K-Nearest Neighbour |      0.7933 |   0.9539 | 0.8661 |


# References:

[1] N. Hassan, W. Gomaa, G. Khoriba, and M. Haggag, “Credibility detection in twitter using word n-gram analysis and supervised machine learning techniques,” International Journal of Intelligent Engineering and Systems, vol. 13, pp. 291–300, Dec. 2020. [Online]. Available: https://doi.org/10.22266/ijies2020.0229.27

[2] "Expression to remove URL links from Twitter tweet," 2021. Available: https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet

[3] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011