# Fake News Detection - English

In [1]:
# Import libraries

# Main
import sys
sys.path.insert(0,'../')
import os
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from tqdm import tqdm

# NLP
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

# Transformers
from transformers import AutoTokenizer, AutoModel, TFAutoModel

# Sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Display
from IPython.display import clear_output
clear_output()

### Loading data

In [2]:
from utils import Utils

# Path handed to Utils (presumably the root folder of the corpora — confirm
# against utils.Utils). The original machine-specific location stays the
# default; set the NLP_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_ROOT = os.environ.get('NLP_DATA_DIR', '/media/juan/Juan/NLP/')
utils = Utils(DATA_ROOT, num_workers=10)

In [None]:
# Language of the corpora to load
lang = 'en'

print('Starting...')

# Both classes are loaded with the same loader settings; the second value
# returned by data_loader is not used in this notebook.
loader_options = dict(total_data=10000, max_size=None, return_dates=False)
news_data, _ = utils.data_loader(lang, 'news', **loader_options)
fake_news_data, _ = utils.data_loader(lang, 'FakeNews', **loader_options)

print(f'Loaded {len(news_data)} news {len(fake_news_data)} fake news')

# Documents from 'news' come first and are labelled 1, 'FakeNews' documents
# follow and are labelled 0, so data[i] and tags[i] stay aligned.
data = news_data + fake_news_data
tags = [1] * len(news_data) + [0] * len(fake_news_data)

Starting...
Starting 10 threads to load 10000 documents from news in en
Loaded 10000 files in 301.18 seconds.
Removed 0 files becasuse they were too large
Starting 10 threads to load 10000 documents from FakeNews in en


### Glimpse at data

In [None]:
# Print the document at the same position in each class, separated by a rule
news_num = 5
print(news_data[news_num], '-----------------------', fake_news_data[news_num], sep='\n')

### Feature extraction using BERT

In [7]:
# English stop-word list
stop_words = stopwords.words('english')
# Stemmer is instantiated but not applied below (stemmer=None is passed)
stem = SnowballStemmer('english')
# Tokenizer that keeps runs of word characters
tk = nltk.RegexpTokenizer(r'\w+')
# WordNet lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

# Run the project preprocessing on every document, with a progress bar
preprocess_options = dict(stop_words=stop_words, stemmer=None, tokenizer=tk, lemmatizer=lemma)
corpus = [utils.preprocessing(d, **preprocess_options) for d in tqdm(data)]

100%|██████████| 10000/10000 [00:27<00:00, 363.37it/s]


####  Reduce data size

In [8]:
# Keep only the first 200 elements of every preprocessed document
# (presumably tokens — confirm against utils.preprocessing). The list object
# is updated in place.
corpus[:] = [doc[:200] for doc in corpus]

#### Load model

In [None]:
# Encoder used to embed the English documents
model_name = "mrm8488/distilroberta-finetuned-age_news-classification"

# add_prefix_space=True is passed because later cells feed pre-split words
# (is_split_into_words=True) to this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)
model = AutoModel.from_pretrained(
    model_name,
    output_hidden_states=False,
)

# Alternative checkpoints tried for other languages / experiments
# (load them with the same two from_pretrained calls):
#   FR: "dbmdz/bert-base-french-europeana-cased"
#   ES: "dccuchile/bert-base-spanish-wwm-uncased"
#   EN: "textattack/bert-base-uncased-ag-news"
#   EN: "mrm8488/bert-mini-finetuned-age_news-classification"

# Hide the download / loading messages
clear_output()

#### Obtain first doc's embedding

In [10]:
# Smoke test: embed only the first document
inputs = tokenizer(corpus[0], return_tensors="pt", is_split_into_words=True)

# Inference only: no_grad() avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Use the model's pooler output as the document embedding
embedding = outputs['pooler_output'].detach().numpy()

#### Tokenize and embed each doc in corpus

In [11]:
# One embedding per successfully processed document, in corpus order
reu_embeddings = []

# Positions (indices into corpus) of the documents that could not be embedded
failed_doc_ids = []

# NOTE(review): the embedding is the encoder's pooler_output. The load-time
# messages are cleared by clear_output() in the model cell, so any warning
# about newly initialised pooler weights would be hidden — worth confirming.
for i, doc in enumerate(tqdm(corpus)):
    try:
        # Tokenize the already-split document and run the encoder.
        inputs = tokenizer(doc, return_tensors="pt", is_split_into_words=True)
        # Inference only: skip autograd bookkeeping for every forward pass.
        with torch.no_grad():
            outputs = model(**inputs)

        # A single document is passed, so row 0 is its embedding vector.
        embedding = outputs['pooler_output'].detach().numpy()[0]
    except Exception:
        # Best effort: remember the document and keep going. Unlike a bare
        # `except:`, this lets KeyboardInterrupt stop the long-running loop.
        failed_doc_ids.append(i)
    else:
        reu_embeddings.append(embedding)

print(f'Failed to tokenize {len(failed_doc_ids)} documents')

100%|██████████| 10000/10000 [16:41<00:00,  9.98it/s]

Failed to tokenize 0 documents





#### Remove failed documents and corresponding tags

In [12]:
# Drop the failed documents and their labels. failed_doc_ids was filled in
# increasing order, so deleting from the highest index down keeps the
# remaining indices valid without any offset arithmetic.
for doc_id in reversed(failed_doc_ids):
    del corpus[doc_id]
    del tags[doc_id]

#### Split data and train MLP classifier

In [None]:
# Classifiers to compare, keyed by the short name used to select them
model_dict = dict(
    NB=GaussianNB(),
    LR=LogisticRegression(random_state=0, max_iter=700),
    MLP=MLPClassifier(hidden_layer_sizes=(500, 250, 100, 20), random_state=1, max_iter=700),
    SVM=svm.SVC(),
)

In [None]:
def test_sklearn_models(model, metrics_df, X, y, vectorizer_type, val_percentage = 0.2):
    """Train one classifier on a random split and append its validation metrics.

    The estimator is looked up in the notebook-level ``model_dict`` and fitted
    in place, and the notebook-level ``lang`` is written to the result row, so
    both must be defined before calling this function.

    Args:
        model (str): Key of the estimator in ``model_dict``.
        metrics_df (pd.DataFrame): Results collected so far; it is not modified.
        X (np.ndarray): Features of data
        y (list): Tags of X
        vectorizer_type (str): Label stored in the ``features`` column
            describing how ``X`` was produced.
        val_percentage (float): Fraction of the data held out for validation.

    Returns:
        pd.DataFrame: ``metrics_df`` followed by one new row with the columns
        ``model``, ``features``, ``lang``, ``accuracy``, ``precision``,
        ``recall`` and ``f1``. Existing index labels are kept as they are.

    Raises:
        KeyError: If ``model`` is not a key of ``model_dict``.
    """
    # The split is not seeded, so repeated calls evaluate on different splits.
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=1-val_percentage)

    clf = model_dict[model]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Metrics are computed with scikit-learn's default arguments.
    row = pd.DataFrame(data={
        'model': [model],
        'features': [vectorizer_type],
        'lang': [lang],
        'accuracy': [accuracy_score(y_val, y_pred)],
        'precision': [precision_score(y_val, y_pred)],
        'recall': [recall_score(y_val, y_pred)],
        'f1': [f1_score(y_val, y_pred)],
    })

    # DataFrame.append was removed in pandas 2.0; concat gives the same result.
    return pd.concat([metrics_df, row])

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Build the train/validation split here so the cell runs on a fresh kernel:
# features are the document embeddings, labels are the real(1)/fake(0) tags.
# The split is seeded so that re-running the cell trains on the same data.
X_train, X_val, y_train, y_val = train_test_split(
    np.array(reu_embeddings), np.array(tags), train_size=0.8, random_state=0)

# Fully connected binary classifier on top of the embeddings.
# input_shape describes ONE sample without the batch axis, i.e. a vector with
# as many entries as the embedding has dimensions.
# The network gets its own name so the transformer bound to `model` (used by
# the embedding cells) is not overwritten.
keras_model = keras.Sequential([
        layers.Dense(500, activation="relu", input_shape=(X_train.shape[1],)),
        layers.Dense(250, activation='relu'),
        layers.Dense(100, activation='relu'),
        layers.Dense(20, activation='relu'),
        layers.Dense(1, activation='sigmoid')
        ])
keras_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
# `history` keeps the per-epoch metrics for the plotting cells.
history = keras_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=300)
# Hide the per-epoch training log
clear_output()

In [None]:
# Accuracy per epoch on the validation and training data.
# NOTE(review): no cell in this notebook imports `plt`; it presumably stands
# for matplotlib.pyplot and needs an import in the imports cell — confirm.
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.plot(history.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Accuracy per epoch')
plt.legend();

In [None]:
# Loss per epoch on the validation and training data.
# NOTE(review): no cell in this notebook imports `plt`; it presumably stands
# for matplotlib.pyplot and needs an import in the imports cell — confirm.
plt.plot(history.history['val_loss'], label='val_loss')
plt.plot(history.history['loss'], label='loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('Loss per epoch')
plt.legend();

In [None]:
# Features are the document embeddings, labels are the real(1)/fake(0) tags;
# both lists are aligned once the failed documents have been removed.
X = np.array(reu_embeddings)
y = np.array(tags)

# Hold out 20% for validation; the seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)

# scikit-learn MLP baseline; the cell displays its validation accuracy.
clf = MLPClassifier(hidden_layer_sizes=(500,250,100,20), random_state=1, max_iter=700).fit(X_train, y_train)
clf.score(X_val, y_val)

In [None]:
# Support vector classifier with default settings; shows validation accuracy
clf = svm.SVC().fit(X_train, y_train)
clf.score(X_val, y_val)

In [None]:
# Logistic regression baseline; shows validation accuracy
clf = LogisticRegression(random_state=0, max_iter=700)
clf.fit(X_train, y_train)
clf.score(X_val, y_val)

In [None]:
# Gaussian naive Bayes baseline; shows validation accuracy
clf = GaussianNB().fit(X_train, y_train)
clf.score(X_val, y_val)