In [1]:
# Import libraries
# Every third-party and standard-library name used by the cells below is
# imported here; the project-local `Utils` class is imported in a later cell.

# Main
import sys
# Put the parent directory first on the module search path.
# NOTE(review): presumably so that the later `from utils import Utils`
# resolves to a module living one level up -- confirm.
sys.path.insert(0,'../')
import os
import numpy as np
from tqdm import tqdm
import string, json
import torch, re

# NLP
# NOTE(review): this gensim PorterStemmer binding is overwritten further down
# by `from nltk.stem.porter import PorterStemmer`; after this cell runs the
# name refers to the NLTK class.
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
import nltk
# Download the NLTK data packages by name. Presumably these back
# `word_tokenize`, `WordNetLemmatizer` and `stopwords.words` used in the
# cells below -- verify against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
# NOTE(review): duplicate of the `remove_stopwords` import above (harmless).
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from nltk.stem import SnowballStemmer
# Rebinds `PorterStemmer` to the NLTK implementation (see note above).
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

# Transformers
# NOTE(review): `TFAutoModel` is not referenced in the visible cells.
from transformers import AutoTokenizer, AutoModel, TFAutoModel

# Display
from IPython.display import clear_output
# Clear whatever this cell printed so far (e.g. the download messages).
clear_output()

In [2]:
def preprocess_social(data, language='en'):
    """Turn the raw text of a social-media post into a list of stemmed tokens.

    Steps, in order: remove URLs, lowercase, tokenize with NLTK's
    ``TweetTokenizer`` (configured to strip @handles and shorten repeated
    characters), drop punctuation tokens, drop stopwords, Snowball-stem.

    Args:
        data: Raw text of one post.
        language: Language code; one of ``'en'``, ``'es'`` or ``'fr'``.

    Returns:
        list[str]: The stemmed tokens that survived the filters.

    Raises:
        KeyError: If ``language`` is not one of the supported codes.
    """
    # Maps the short language code to the language name NLTK expects
    lang_dict = {
        "en": "english",
        "es": "spanish",
        "fr": "french"
    }
    language_name = lang_dict[language]

    # Removes URLs
    text = re.sub(r'http\S+', '', data)

    # Sets text into lowercase
    text = text.lower()

    # Tokenizes by word
    tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tokens = tk.tokenize(text)

    # Removes punctuation tokens. `in string.punctuation` is a substring test:
    # every single punctuation character is dropped, while a multi-character
    # token is dropped only if it is a contiguous slice of string.punctuation.
    tokens = [token for token in tokens if token not in string.punctuation]

    # Removes stopwords. The list is loaded once (not once per token) and kept
    # in a set so each membership test is a constant-time lookup.
    stop_words = set(stopwords.words(language_name))
    tokens = [token for token in tokens if token not in stop_words]

    # Stems what is left
    stemmer = SnowballStemmer(language_name)
    return [stemmer.stem(token) for token in tokens]

In [3]:
def preprocess_news(data, language='en'):
    """Turn the raw text of a news article into a list of stemmed tokens.

    Steps, in order: lowercase, strip punctuation (gensim's
    ``strip_punctuation``), tokenize with NLTK's ``word_tokenize``, drop
    stopwords, Snowball-stem.

    Args:
        data: Raw text of one article.
        language: Language code; one of ``'en'``, ``'es'`` or ``'fr'``.

    Returns:
        list[str]: The stemmed tokens that survived the filters.

    Raises:
        KeyError: If ``language`` is not one of the supported codes.
    """
    # Maps the short language code to the language name NLTK expects
    lang_dict = {
        "en": "english",
        "es": "spanish",
        "fr": "french"
    }
    language_name = lang_dict[language]

    # Sets text into lowercase and removes punctuation
    text = strip_punctuation(data.lower())

    # Tokenizes by word
    tokens = word_tokenize(text)

    # Removes stopwords. The list is loaded once (not once per token) and kept
    # in a set so each membership test is a constant-time lookup.
    stop_words = set(stopwords.words(language_name))
    tokens = [token for token in tokens if token not in stop_words]

    # Stems what is left
    stemmer = SnowballStemmer(language_name)
    return [stemmer.stem(token) for token in tokens]

In [4]:
from utils import Utils
utils = Utils('/media/juan/Juan/NLP/', num_workers=10)
!pip install sentencepiece

You should consider upgrading via the '/home/juan/NLP/project/bin/python3 -m pip install --upgrade pip' command.[0m


In [5]:
# Bert instance EN (COVID)
# Loads the tokenizer/model pair that `infer_embedding_and_save` reads as
# the globals `tokenizer`, `model` and `device`.
model_name = "microsoft/xtremedistil-l6-h384-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModel.from_pretrained(model_name, output_hidden_states=False)
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on a machine without CUDA instead of
# failing on `model.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Alternative checkpoints, kept for reference. Swap one in (together with
# `langs` in the driver cell) to embed another language.

# Bert instance FR
# tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")
# model = AutoModel.from_pretrained("dbmdz/bert-base-french-europeana-cased", output_hidden_states=True)

# Bert instance ES (BETICO)
# tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
# model = AutoModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", output_hidden_states=False)

# tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-ag-news", device=0)
# model = AutoModel.from_pretrained("textattack/bert-base-uncased-ag-news", output_hidden_states=False)

# Clear the download/progress output printed while loading.
clear_output()

In [6]:
def load_sub_data(sub_data, data_type='social'):
    """Load and preprocess one chunk of JSON documents.

    Relies on notebook globals set by the driver cell: ``path_prefix``,
    ``source`` and ``lang`` locate the files; ``utils``, ``stop_words``,
    ``tk`` and ``lemma`` are only used by the non-social branch.

    Args:
        sub_data: Iterable of file names inside ``path_prefix/source/lang``.
        data_type: ``'social'`` runs ``preprocess_social`` on the document's
            ``'text'`` field; any other value goes through
            ``utils.preprocessing``.

    Returns:
        tuple[list, list]: The preprocessed documents and the matching file
        names, in the same order as ``sub_data``.
    """
    data = []
    names = []
    for file in sub_data:
        doc_path = os.path.join(path_prefix, source, lang, file)
        # JSON text is UTF-8 by specification; decode it explicitly instead
        # of depending on the locale's default encoding.
        with open(doc_path, encoding='utf-8') as data_file:
            data_dict = json.load(data_file)
        names.append(file)
        if data_type == 'social':
            data.append(preprocess_social(data_dict['text'], language=lang))
        else:
            # NOTE(review): this passes the file *name*, not data_dict['text'].
            # Whether `utils.preprocessing` expects a name or the text cannot
            # be told from this notebook -- confirm. The earlier alternative
            # was: preprocess_news(data_dict['text'], language=lang)
            data.append(utils.preprocessing(file, stop_words = stop_words, stemmer = None,
                                            tokenizer = tk,
                                            lemmatizer = lemma))
    return data, names

def infer_embedding_and_save(sub_data, names, failed):
    """Embed each pre-tokenized document with the global BERT model.

    Uses the notebook globals ``tokenizer``, ``model`` and ``device``.
    Nothing is written to disk here; saving is done by the caller.

    Args:
        sub_data: List of documents, each a list of word tokens.
        names: File names aligned with ``sub_data``; they become the keys of
            the returned dictionary.
        failed: Running count of documents that could not be embedded.

    Returns:
        tuple[dict, int]: A mapping from file name to the document's
        ``pooler_output`` vector (a list of floats), and the failure count.
        The count is returned unchanged: there is no per-document error
        handling, so a tokenizer or model error propagates to the caller.
    """
    embeddings = []
    # Inference only: without no_grad() every forward pass would build an
    # autograd graph that is thrown away immediately.
    with torch.no_grad():
        for doc in sub_data:
            # The document is already a list of words. truncation=True caps
            # the input at the tokenizer's configured maximum length so an
            # over-long document is shortened instead of overflowing the model.
            inputs = tokenizer(doc, return_tensors="pt", is_split_into_words=True,
                               truncation=True)
            inputs.to(device)
            outputs = model(**inputs)
            # Pooled output of the single sequence in the batch (the original
            # comment describes it as the CLS token output).
            embedding = outputs['pooler_output'].detach().cpu().numpy()[0].tolist()
            embeddings.append(embedding)

    embedding_dict = dict(zip(names, embeddings))
    return embedding_dict, failed

def load_and_save_file(embedding_dict, counter):
    """Write one chunk of embeddings to ``BERT_<counter>.json``.

    Despite the name, nothing is loaded: the dictionary is only dumped as
    JSON under ``path_prefix/embeddings/source/lang`` (``path_prefix``,
    ``source`` and ``lang`` are notebook globals set by the driver cell).

    Args:
        embedding_dict: JSON-serializable mapping to save.
        counter: Chunk index used in the output file name.
    """
    out_dir = os.path.join(path_prefix, 'embeddings', source, lang)
    # Create the output folder on first use so a fresh run does not fail
    # with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, 'BERT_' + str(counter) + '.json'), 'w') as file:
        json.dump(embedding_dict, file)

In [7]:
# Driver: for every (language, source) pair, list the documents, split them
# into chunks, embed each chunk with BERT and write one JSON file per chunk.
# The helper functions defined earlier read `path_prefix`, `lang`, `source`,
# `stop_words`, `tk` and `lemma` as globals, so these names must stay as they are.
path_prefix = '/media/juan/Juan/NLP/'
langs = ['en']
sources = ['tweets']

# Tunables (previously inline magic numbers)
NUM_CHUNKS = 1000   # number of chunks, i.e. of BERT_<i>.json files, per pair
MAX_ITEMS = 450     # leading items kept from each preprocessed document

# Language code -> language name expected by NLTK (loop-invariant)
lang_dict = {
    "en": "english",
    "es": "spanish",
    "fr": "french"
}

for lang in langs:
    for source in sources:
        # Stop Words
        stop_words = stopwords.words(lang_dict[lang])
        # Stemmers
        # NOTE(review): `stem` is not used by the visible cells
        # (load_sub_data passes stemmer=None); kept in case a later cell reads it.
        stem = SnowballStemmer(lang_dict[lang])
        #p_stem = PorterStemmer()
        # Tokenizers
        #tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
        tk = nltk.RegexpTokenizer(r'\w+')
        # Lemmatizer
        lemma = nltk.stem.WordNetLemmatizer()
        failed = 0
        print('Processing ' + source + ' in ' + lang)
        # os.listdir returns names in arbitrary order; sorting makes the
        # contents of every chunk reproducible from run to run.
        files_list = sorted(os.listdir(os.path.join(path_prefix, source, lang)))
        print(len(files_list))
        # NOTE(review): 'news' sends these documents through utils.preprocessing
        # rather than preprocess_social, even though the source is 'tweets' --
        # confirm this is intended.
        data_type = 'news'
        # NOTE(review): `file_name` is not used by the visible cells.
        file_name = source + '_' + lang + 'embeddings.json'
        splitted_files_list = np.array_split(files_list, NUM_CHUNKS)
        for i, chunk in enumerate(tqdm(splitted_files_list)):
            data, names = load_sub_data(chunk, data_type = data_type)
            # Keep only the leading items of each document before it is
            # handed to the BERT tokenizer.
            data = [doc[:MAX_ITEMS] for doc in data]
            embedding_dict, failed = infer_embedding_and_save(data, names, failed)
            load_and_save_file(embedding_dict, i)
        print('Failed to tokenize ' + str(failed))

Processing tweets in en


  0%|          | 0/1000 [00:00<?, ?it/s]

635776


100%|██████████| 1000/1000 [1:01:35<00:00,  3.70s/it]

Failed to tokenize 0



