In [65]:
# Import libraries

# Main
import sys
sys.path.insert(0,'../')
import os
import datetime
import numpy as np
# import tensorflow as tf
from tqdm import tqdm

# NLP
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier

# Transformers
from transformers import AutoTokenizer, AutoModel

# Sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Display
from IPython.display import clear_output
clear_output()

In [66]:
from utils import Utils
utils = Utils('/media/juan/Juan/NLP/', num_workers=10)

In [67]:
# Define language
lang = 'es'

print('Starting...')

news_data, _ = utils.data_loader(lang, 'news', total_data=500, max_size = None, return_dates = False)
fake_news_data, _ = utils.data_loader(lang, 'FakeNews', total_data=500, max_size = None, return_dates = False)

print(f'Loaded {len(news_data)} Tweets {len(fake_news_data)} Reddit docs')

Starting...
Starting 10 threads to load 500 documents from news in es
Loaded 500 files in 15.38 seconds.
Removed 0 files becasuse they were too large
Starting 10 threads to load 500 documents from FakeNews in es
Loaded 500 files in 11.06 seconds.
Removed 0 files becasuse they were too large
Loaded 500 Tweets 500 Reddit docs


In [68]:
for i in range(2):
    print(news_data[i])
    print(fake_news_data[i])
    print('-----------------------')

Cuenta de usuarioPremiumServiciosEl León de El Español Publicaciones S.A.
                    
                                    Portada Sport (21/03/21)
                            
        
        
        
            "Al asalto de La Liga", así titula el diario Sport su portada de este domingo 21 de marzo. El Barcelona necesita la victoria después del triunfo del Real Madrid frente al Celta de Vigo este sábado. Los culés se enfrentan a la Real Sociedad y lo harán sabiendo el resultado del Atlético.Regístrate gratis y recibe cada mañana las noticias en tu correoEugenia Martínez de Irujo explica cómo se encuentra su hermano CayetanoLlegada masiva de inmigrantes a las playas de CeutaMbappe: "No me da vértigo ser el jugador más...Pallete, Galán y Garralda, entre los asistentes a la presentación del documento 'España 2050'El increíble aspecto del párking y de los nuevos accesos en las obras del Estadio Santiago BernabéuSergio Ramos aprieta y quiere convencer a Zidane para estar contr

In [69]:
data = news_data + fake_news_data
tags = [1]*len(news_data) + [0]*len(fake_news_data)

In [70]:
# Stop Words
stop_words = stopwords.words('english')

# Stemmers
stem = SnowballStemmer('english')
#p_stem = PorterStemmer()

# Tokenizers
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

# Preprocess data
corpus = []
for d in data:
    corpus.append(utils.preprocessing(d, stop_words = stop_words,
                                         stemmer = None,
                                         tokenizer = tk,
                                         lemmatizer = lemma))

In [71]:
for i in range(len(corpus)):
    corpus[0] = corpus[0][:200]

In [72]:
len(corpus)

1000

In [73]:
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-ag-news")
model = AutoModel.from_pretrained("textattack/bert-base-uncased-ag-news", output_hidden_states=False)

Some weights of the model checkpoint at textattack/bert-base-uncased-ag-news were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [74]:
# Test Model for first sentence
inputs = tokenizer(corpus[0][:259], return_tensors="pt", is_split_into_words=True)
outputs = model(**inputs)

# Just pooler output as embeddings
embedding = outputs['pooler_output'].detach().numpy()[0]

In [None]:
# Array to save embeddings
reu_embeddings = []

failed_doc_ids = []

for i, doc in enumerate(tqdm(corpus)):
    try:
        # Run Bert for each document
        inputs = tokenizer(doc, return_tensors="pt", is_split_into_words=True)
        outputs = model(**inputs)

        # CLS Token Output
        embedding = outputs['pooler_output'].detach().numpy()[0]
        
        # Append representation
        reu_embeddings.append(embedding)
        
    except:
        failed_doc_ids.append(i)
    
print(f'Failed to tokenize {len(failed_doc_ids)} documents')

  0%|          | 1/1000 [00:00<06:20,  2.62it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
  8%|▊         | 81/1000 [00:08<03:13,  4.74it/s]

In [None]:
# Remove failed docs
for i, doc_id in enumerate(failed_doc_ids):
    corpus.pop(doc_id - 1)
    tags.pop(doc_id - 1)

In [None]:
X = reu_embeddings
y = tags
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)
clf = MLPClassifier(random_state=1, max_iter=700).fit(X_train, y_train)
clf.score(X_val, y_val)