In [1]:
# Import libraries

# Main
import sys
sys.path.insert(0,'../')
import os
import datetime
import numpy as np
# import tensorflow as tf
from tqdm import tqdm

# NLP
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier

# Transformers
from transformers import AutoTokenizer, AutoModel

# Sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Display
from IPython.display import clear_output
clear_output()

In [2]:
from utils import Utils
utils = Utils('/media/juan/Juan/NLP/', num_workers=10)

In [3]:
# Define language
lang = 'es'

print('Starting...')

news_data, _ = utils.data_loader(lang, 'news', total_data=500, max_size = None, return_dates = False)
fake_news_data, _ = utils.data_loader(lang, 'FakeNews', total_data=500, max_size = None, return_dates = False)

print(f'Loaded {len(news_data)} Tweets {len(fake_news_data)} Reddit docs')

Starting...
Starting 10 threads to load 500 documents from news in es
Loaded 500 files in 14.98 seconds.
Removed 0 files becasuse they were too large
Starting 10 threads to load 500 documents from FakeNews in es
Loaded 500 files in 8.33 seconds.
Removed 0 files becasuse they were too large
Loaded 500 Tweets 500 Reddit docs


In [4]:
for i in range(2):
    print(news_data[i])
    print(fake_news_data[i])
    print('-----------------------')

Numerosas Asociaciones del Sector Turístico europeo, abarcando agencias de viajes, aerolíneas o cruceros, entre otros, han aplaudido las nuevas propuesta de Reglamento de la Unión Europea (UE) sobre el Certificado Verde Digital, que pone en marcha las negociaciones a tres bandas entre el Parlamento, el Consejo y la Comisión Europea. Coinciden en que "es fundamental que las instituciones actúen con celeridad y se pongan de acuerdo para que los certificados sean operativos en junio y se garantice la reciprocidad con los sistemas de fuera de la UE".Instan a negociar y llegar a un acuerdo sobre el pasaporte rápidamente antes de mediados de mayoEn este sentido, los miembros del Sector, entre los que se encuentran la Agrupación Europea de Asociaciones de Agencias de Viajes y Turoperadores (ECTAA), se posicionan del lado del Parlamento europeo, quien propone que las pruebas sean gratuitas para todos los públicos, así como dejar a un lado medidas adicionales, como la cuarentena o la realizació

In [5]:
data = news_data + fake_news_data
tags = [1]*len(news_data) + [0]*len(fake_news_data)

In [6]:
# Stop Words
stop_words = stopwords.words('english')

# Stemmers
stem = SnowballStemmer('english')
#p_stem = PorterStemmer()

# Tokenizers
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

# Preprocess data
corpus = []
for d in data:
    corpus.append(utils.preprocessing(d, stop_words = stop_words,
                                         stemmer = None,
                                         tokenizer = tk,
                                         lemmatizer = lemma))

In [7]:
for i in range(len(corpus)):
    corpus[i] = corpus[i][:200]

In [9]:
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-ag-news", device=0)
model = AutoModel.from_pretrained("textattack/bert-base-uncased-ag-news", output_hidden_states=False)

Some weights of the model checkpoint at textattack/bert-base-uncased-ag-news were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Test Model for first sentence
inputs = tokenizer(corpus[0][:259], return_tensors="pt", is_split_into_words=True)
outputs = model(**inputs)

# Just pooler output as embeddings
embedding = outputs['pooler_output'].detach().numpy()[0]

In [11]:
# Array to save embeddings
reu_embeddings = []

failed_doc_ids = []

for i, doc in enumerate(tqdm(corpus)):
    try:
        # Run Bert for each document
        inputs = tokenizer(doc, return_tensors="pt", is_split_into_words=True)
        outputs = model(**inputs)

        # CLS Token Output
        embedding = outputs['pooler_output'].detach().numpy()[0]
        
        # Append representation
        reu_embeddings.append(embedding)
        
    except:
        failed_doc_ids.append(i)
    
print(f'Failed to tokenize {len(failed_doc_ids)} documents')

100%|██████████| 1000/1000 [05:01<00:00,  3.31it/s]

Failed to tokenize 0 documents





In [12]:
# Remove failed docs
for i, doc_id in enumerate(failed_doc_ids):
    corpus.pop(doc_id - i)
    tags.pop(doc_id - i)

In [13]:
X = reu_embeddings
y = tags
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)
clf = MLPClassifier(random_state=1, max_iter=700).fit(X_train, y_train)
clf.score(X_val, y_val)

0.55