In [1]:
# Import libraries

# Main
import sys
sys.path.insert(0,'../')
import os
import datetime
import numpy as np
import tensorflow as tf
import torch

from tqdm import tqdm

# NLP
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier

# Transformers
from transformers import AutoTokenizer, AutoModel

# Sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Display
from IPython.display import clear_output
clear_output()

In [2]:
# Project-local helper class; the instance is used below for loading and
# preprocessing documents.
from utils import Utils

# NOTE(review): absolute, machine-specific path — consider making this
# configurable before sharing the notebook.
DATA_ROOT = '/media/juan/Juan/NLP/'
NUM_WORKERS = 10

utils = Utils(DATA_ROOT, num_workers=NUM_WORKERS)

In [3]:
# Language code of the documents to load (also passed to the data loader).
lang = 'es'

# Number of documents requested from each source.
DOCS_PER_SOURCE = 10000

print('Starting...')

# data_loader returns a pair; only the first element (the documents) is kept.
# The second element is discarded (return_dates is disabled).
news_data, _ = utils.data_loader(lang, 'news', total_data=DOCS_PER_SOURCE,
                                 max_size=None, return_dates=False)
fake_news_data, _ = utils.data_loader(lang, 'FakeNews', total_data=DOCS_PER_SOURCE,
                                      max_size=None, return_dates=False)

# The summary names the sources that were actually loaded (news / FakeNews).
print(f'Loaded {len(news_data)} news docs and {len(fake_news_data)} fake-news docs')

Starting...
Starting 10 threads to load 10000 documents from news in es
Loaded 10000 files in 256.46 seconds.
Removed 0 files becasuse they were too large
Starting 10 threads to load 10000 documents from FakeNews in es
Loaded 10000 files in 20.23 seconds.
Removed 0 files becasuse they were too large
Loaded 10000 Tweets 10000 Reddit docs


In [4]:
# Eyeball the first couple of raw documents from each source, real news first.
N_PREVIEW = 2
for sample_idx in range(N_PREVIEW):
    for source in (news_data, fake_news_data):
        print(source[sample_idx])
    print('-----------------------')


                    
                    
                       S24H   
   
   
            
                
                    12/05/2021 10:02
                
            
                
                    
                        
                        
                      Comentarios
                
            Inserta Empleo, a través de la  Asociación Salud Mental Salamanca AFEMC, ha adjudicado 3  acciones formativas dirigidas al colectivo de personas con discapacidad.  En esta ocasión los títulos de estas formaciones son: “Planificación y  Organización 15h”, “Atención básica al cliente 50h” y “Motivación  30h”.  Un grupo integrado por 30 personas participarán en estas  formaciones, donde podrán adquirir habilidades enfocadas a la mejora de  la empleabilidad. Los cursos se enmarcan en el Programa Operativo de Inclusión Social y  Economía Social (POISES) y Programa Operativo de Empleo Juvenil  (POEJ), que está desarrollando Fundación ONCE a través de Inserta  Empleo, 

In [5]:
# Binary labels: documents from the 'news' source are the positive class.
REAL_LABEL, FAKE_LABEL = 1, 0

# Single corpus with real news first, followed by fake news ...
data = news_data + fake_news_data

# ... and a parallel label list in the same order.
tags = [REAL_LABEL] * len(news_data)
tags.extend([FAKE_LABEL] * len(fake_news_data))

In [6]:
# NLTK identifies languages by name rather than ISO code. Map the notebook's
# `lang` setting so the stop-word list and stemmer match the loaded documents
# (previously both were hard-coded to English although lang is 'es').
NLTK_LANGUAGE_NAMES = {'es': 'spanish', 'en': 'english'}
nltk_language = NLTK_LANGUAGE_NAMES.get(lang, 'english')  # fall back to the old default

# Stop Words
stop_words = stopwords.words(nltk_language)

# Stemmers (currently unused: preprocessing is called with stemmer=None)
stem = SnowballStemmer(nltk_language)
#p_stem = PorterStemmer()

# Tokenizers: keep runs of word characters, dropping punctuation
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
# NOTE(review): WordNetLemmatizer is built on the English WordNet, so it is
# presumably close to a no-op on Spanish text — confirm whether it is wanted.
lemma = nltk.stem.WordNetLemmatizer()

# Preprocess data: one preprocessed entry per raw document, in the same order
corpus = [
    utils.preprocessing(raw_doc,
                        stop_words=stop_words,
                        stemmer=None,
                        tokenizer=tk,
                        lemmatizer=lemma)
    for raw_doc in tqdm(data)
]

100%|██████████| 20000/20000 [01:03<00:00, 313.12it/s]


In [7]:
# Cap every preprocessed document at its first MAX_DOC_LEN items.
# Slice assignment keeps the update in place on the existing list object.
MAX_DOC_LEN = 200
corpus[:] = [entry[:MAX_DOC_LEN] for entry in corpus]

In [8]:
# Checkpoint shared by the tokenizer and the encoder so the two cannot drift apart.
MODEL_NAME = "textattack/bert-base-uncased-ag-news"

# `device` is not a tokenizer loading option, so it is no longer passed here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# AutoModel loads the bare encoder. The checkpoint's classification head is
# dropped, which is what the "weights ... were not used" warning reports.
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=False)

Some weights of the model checkpoint at textattack/bert-base-uncased-ag-news were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Smoke test: embed the first document only.
# The corpus entries were already capped at 200 items above, so no extra
# slicing is needed here.
inputs = tokenizer(corpus[0], return_tensors="pt", is_split_into_words=True)

# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    outputs = model(**inputs)

# Use the pooler output of the (single) sequence in the batch as the
# document embedding.
embedding = outputs['pooler_output'].numpy()[0]

In [None]:
# One pooler-output vector per successfully embedded document, in corpus order.
reu_embeddings = []

# Corpus indices of documents that could not be embedded, in ascending order.
failed_doc_ids = []

for doc_id, doc in enumerate(tqdm(corpus)):
    try:
        # Run BERT on the pre-split document
        inputs = tokenizer(doc, return_tensors="pt", is_split_into_words=True)

        # Inference only: skip autograd bookkeeping to save time and memory.
        with torch.no_grad():
            outputs = model(**inputs)

        # Pooler output of the single sequence in the batch
        embedding = outputs['pooler_output'].numpy()[0]
    except Exception:
        # Catch Exception rather than everything, so a KeyboardInterrupt still
        # stops this long-running loop instead of being logged as a failed doc.
        # NOTE(review): failures are presumably inputs the tokenizer/model
        # cannot handle (e.g. too many sub-word tokens) — confirm.
        failed_doc_ids.append(doc_id)
    else:
        reu_embeddings.append(embedding)

print(f'Failed to tokenize {len(failed_doc_ids)} documents')

 21%|██        | 4147/20000 [15:14<56:42,  4.66it/s]  

In [None]:
# Drop the documents (and their labels) that could not be embedded, so that
# corpus and tags stay aligned with the embeddings.
# failed_doc_ids is in ascending order, so deleting from the highest index down
# leaves the positions of the remaining entries untouched.
for doc_id in reversed(failed_doc_ids):
    del corpus[doc_id]
    del tags[doc_id]

In [None]:
# Feature matrix (one embedding per row) and the matching label vector.
X = np.asarray(reu_embeddings)
y = np.asarray(tags)

# Seed the split as well as the classifier so the validation score is
# reproducible across runs.
SPLIT_SEED = 1
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=SPLIT_SEED
)

# Baseline: scikit-learn MLP on top of the frozen BERT embeddings.
clf = MLPClassifier(hidden_layer_sizes=(500,250,100,20), random_state=1, max_iter=700).fit(X_train, y_train)

# Mean accuracy on the held-out 20%.
clf.score(X_val, y_val)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Keras expects dense arrays. The split above may hand back Python lists of
# per-document vectors, so convert explicitly.
X_train_arr = np.asarray(X_train, dtype="float32")
X_val_arr = np.asarray(X_val, dtype="float32")
y_train_arr = np.asarray(y_train)
y_val_arr = np.asarray(y_val)

# Input dimensionality is the embedding size (columns), not the sample count.
n_features = X_train_arr.shape[1]

# NOTE(review): this rebinds `model`, which previously held the BERT encoder.
# Re-running the embedding cells afterwards needs the encoder reloaded.
model = keras.Sequential([
        # First hidden layer is half the embedding width; integer division
        # keeps the unit count an int.
        layers.Dense(n_features // 2, activation="relu", input_shape=(n_features,)),
        layers.Dense(300, activation='relu'),
        layers.Dense(100, activation='relu'),
        # Single sigmoid unit for the binary real/fake decision
        layers.Dense(1, activation='sigmoid')
        ])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
history = model.fit(X_train_arr, y_train_arr,
                    validation_data=(X_val_arr, y_val_arr), epochs=15)

In [None]:
import matplotlib.pyplot as plt