In [1]:
# PARAMS
lang = 'en'

# Relevant classes
relevant_2 = ["vaccines", "vaccination", "mental health"]
relevant_3 = [""]

%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False

In [2]:
# Imports the OS library
import os

# Imports the time library
from time import time

# Imports the document class
from document import Document

# Import TQDM for time measurements
from tqdm import tqdm

# Imports NLTK
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
    
# Imports gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation

# Imports matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker

from sklearn.model_selection import train_test_split

import numpy as np

# Imports tensorflow
import tensorflow as tf
from tensorflow import keras

# Imports the BERT tokenizer and model
from transformers import AutoTokenizer, TFAutoModel

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing

In [3]:
def preprocess_text(data, language='en'):
    
    # Creates the language dictionary
    lang_dict = {
        "en": "english",
        "es": "spanish",
        "fr": "french"
    }
    
    # Sets text into lowercase
    data = data.lower()
    
    # Removes punctuation
    data = strip_punctuation(data)
    
    # Tokenizes by word
    data = word_tokenize(data)
    
    # Removes stopwords
    data = [token for token in data if token not in stopwords.words(lang_dict[language])]
    
    # Creates the stemmer
    stemmer = SnowballStemmer(lang_dict[language])
    
    # Stems data
    data = [ stemmer.stem(token) for token in data]
    
    # Hard padding
    if len(data) > 256:
        data = data[:256]

    return data

## Embedding

In [4]:
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFAutoModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
def get_embeddings(data):    
    encoded_input = bert_tokenizer(data, padding='max_length', return_tensors='tf', is_split_into_words=True)
    output = bert_model(encoded_input)
    return output["pooler_output"]

## Data Loading

In [7]:
# Creates the news path
path_to_news = "../../data/news/" + lang + '/'

# Gets the file list
files = os.listdir(path_to_news)

# Splits the files into training and validation
train_files, test_files = train_test_split(files, train_size=0.85)

# Creates the list
train_data = []
train_labels = []

# Iterates over files
for i, file in enumerate(tqdm(train_files)):
    
    doc = Document()

    doc.load_from_json(path_to_news + file)
    
    train_data.append(get_embeddings(preprocess_text(doc.text, lang)))
    
    if doc.keyword in relevant_2:
        train_labels_2.append(1)
    else:
        train_labels_2.append(0)
        
    if doc.keyword in relevant_3:
        train_labels_3.append(1)
    else:
        train_labels_3.append(0)
        
    if i == 100000:
        break
    

  0%|          | 0/269637 [00:00<?, ?it/s]


NameError: name 'relevant_2' is not defined

## Model Definition

In [7]:
model = keras.Sequential()

model.add(keras.layers.Dense(128))
model.add(keras.layers.Dense(64))
model.add(keras.layers.Dense(1, activation="sigmoid"))