# Create Dataset

## Load Data 

In [1]:
# General imports
import os
import sys
import re
import random
import json
import string

# Data Science
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# NLP
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.doc2vec import TaggedDocument, Doc2Vec


# Transformers
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import torch, re

# Display
from tqdm import tqdm
from IPython.display import clear_output
clear_output()

In [2]:
# PARAMS
# Language code of the documents; it is interpolated into the tag/model file
# names and passed to the preprocessing step.
lang = "en"
# Embedding backend used further down: 'bert' or 'doc2vec'.
emb = "doc2vec"
# Sub-folder of `tags/` from which the non-manual tag files are read.
tag_type = "keywords"

In [3]:
# Import utils class
# `utils` is not an installed package: '../' (relative to the working directory)
# is put at the front of the module search path so the import below can find it.
# The insert therefore has to run before the import.
sys.path.insert(0,'../')
from utils import Utils

# Instantiate utils class
# NOTE(review): the first argument looks like the root folder of the corpus and
# `num_workers` like the number of loader threads — confirm against `Utils`.
utils = Utils('/media/juan/Juan/NLP/', num_workers=10)

In [4]:
# Load manual tags
# The tag files are only read here, so they are opened read-only ('r') rather
# than 'r+', which would additionally require write permission on them.
with open(f'tags/manual/reddit_{lang}.json', 'r') as file_str:
    reddit_manual_tags = json.load(file_str)
with open(f'tags/manual/tweets_{lang}.json', 'r') as file_str:
    twitter_manual_tags = json.load(file_str)
with open(f'tags/manual/news_{lang}.json', 'r') as file_str:
    news_manual_tags = json.load(file_str)

# Load keyword tags (folder chosen by `tag_type`)
with open(f'tags/{tag_type}/reddit_{lang}_words.json', 'r') as file_str:
    reddit_enhanced_tags = json.load(file_str)
with open(f'tags/{tag_type}/tweets_{lang}_words.json', 'r') as file_str:
    twitter_enhanced_tags = json.load(file_str)

# Load Tagged data
# The keys of each tag dictionary are handed to the loader as the documents to fetch.
print('Starting to load manual tagged data...')
manual_reddit_data, manual_reddit_file_names = utils.tagged_data_loader(list(reddit_manual_tags.keys()), 'reddit', lang)
manual_twitter_data, manual_twitter_file_names = utils.tagged_data_loader(list(twitter_manual_tags.keys()), 'tweets', lang)
manual_news_data, manual_news_file_names = utils.tagged_data_loader(list(news_manual_tags.keys()), 'news', lang)
print(f'Loaded {len(manual_twitter_data)} tagged Tweets, {len(manual_reddit_data)} tagged Reddit docs and {len(manual_news_data)} tagged news docs')
print('')

# Load Enhanced Tagged data
print('Starting to load keyword tagged data...')
enhanced_reddit_data, enhanced_reddit_file_names = utils.tagged_data_loader(list(reddit_enhanced_tags.keys()), 'reddit', lang)
enhanced_twitter_data, enhanced_twitter_file_names = utils.tagged_data_loader(list(twitter_enhanced_tags.keys()), 'tweets', lang)
print(f'Loaded {len(enhanced_twitter_data)} tagged Tweets and {len(enhanced_reddit_data)} tagged Reddit docs')
print('')

# Load Not Tagged data
# Only the first value returned by `data_loader` (the documents) is kept.
print('Starting to load not tagged data...')
reddit_data, _, _ = utils.data_loader(lang, 'reddit', total_data=10000, max_size = None)
twitter_data, _, _ = utils.data_loader(lang, 'tweets', total_data=10000, max_size = None)
news_data, _, _= utils.data_loader(lang, 'news', total_data=10000, max_size = None)
print(f'Loaded {len(twitter_data)} Tweets, {len(reddit_data)} Reddit docs and {len(news_data)} news docs')
print('')

Starting to load manual tagged data...
Starting 10 threads to load 201 documents from reddit in en
Loaded 201 files in 0.02 seconds.
Starting 10 threads to load 454 documents from tweets in en
Loaded 454 files in 0.05 seconds.
Starting 10 threads to load 100 documents from news in en
Loaded 100 files in 0.01 seconds.
Loaded 454 tagged Tweets 201 and tagged Reddit docs

Starting to load keyword tagged data...
Starting 10 threads to load 30233 documents from reddit in en
Loaded 30233 files in 3.27 seconds.
Starting 10 threads to load 114795 documents from tweets in en
Loaded 114795 files in 17.68 seconds.
Loaded 114795 tagged Tweets 30233 and tagged Reddit docs
Starting to load not tagged data...
Starting 10 threads to load 10000 documents from reddit in en
Loaded 10000 files in 43.35 seconds.
Removed 0 files becasuse they were too large
Starting 10 threads to load 10000 documents from tweets in en
Loaded 10000 files in 32.03 seconds.
Removed 0 files becasuse they were too large
Starting

In [5]:
# Add lists
# Manually tagged documents: Twitter first, then Reddit. News is loaded above
# but currently left out here (see the commented-out parts).
manual_tags = {**twitter_manual_tags , **reddit_manual_tags} #,**news_manual_tags}
manual_tagged_data = manual_twitter_data + manual_reddit_data #+ manual_news_data
manual_tagged_file_names = manual_twitter_file_names + manual_reddit_file_names #+ manual_news_

# Keyword-tagged documents, in the same source order (Twitter, then Reddit).
enhanced_tags = {**twitter_enhanced_tags , **reddit_enhanced_tags}
enhanced_tagged_data = enhanced_twitter_data + enhanced_reddit_data
enhanced_tagged_file_names = enhanced_twitter_file_names + enhanced_reddit_file_names

# Untagged documents that do not also appear among the tagged ones.
# A plain `set` difference would give an iteration order that is not guaranteed
# to be the same from one kernel session to the next, which defeats the fixed
# `random_state` of the train/validation split. `dict.fromkeys` removes
# duplicates as well but keeps the first-seen order, so the result is a
# deterministic list.
_tagged_docs = set(manual_tagged_data + enhanced_tagged_data)
extended_data = list(dict.fromkeys(
    doc for doc in twitter_data + reddit_data + news_data
    if doc not in _tagged_docs
))

## Preprocessing

In [6]:
def preprocess_social(data, language='en'):
    """Turn one raw social-media text into a list of stemmed tokens.

    Steps, in order: strip URLs, lowercase, tokenize with NLTK's
    ``TweetTokenizer`` (handles removed, long character runs shortened),
    drop punctuation tokens, drop stop words and apply the Snowball stemmer.

    Args:
        data (str): Raw text of a single document.
        language (str): Language code; one of 'en', 'es' or 'fr'.

    Returns:
        list[str]: The stemmed tokens that survived the filters.

    Raises:
        KeyError: If ``language`` is not one of the supported codes.
    """
    # Maps the language code onto the language name NLTK expects
    lang_dict = {
        "en": "english",
        "es": "spanish",
        "fr": "french"
    }
    nltk_language = lang_dict[language]

    # Removes URLs
    data = re.sub(r'http\S+', '', data)

    # Sets text into lowercase
    data = data.lower()

    # Tokenizes by word
    tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tokens = tk.tokenize(data)

    # Removes punctuation tokens. `in string.punctuation` is a substring test
    # against the punctuation string, so a token made of a repeated mark such
    # as '...' is kept; this is left unchanged on purpose so results stay the same.
    tokens = [token for token in tokens if token not in string.punctuation]

    # Removes stopwords. The list is fetched once and turned into a set instead
    # of being requested again for every single token.
    stop_words = set(stopwords.words(nltk_language))
    tokens = [token for token in tokens if token not in stop_words]

    # Creates the stemmer
    stemmer = SnowballStemmer(nltk_language)

    # Stems data and returns the preprocessed text
    return [stemmer.stem(token) for token in tokens]

## Create Embeddings

### BERT

In [7]:
# Load Models
# Alternative checkpoint that was tried: "microsoft/xtremedistil-l6-h384-uncased"
model_name = "Darkrider/covidbert_medmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModel.from_pretrained(model_name, output_hidden_states=False)
# Run on the GPU when one is available; otherwise fall back to the CPU instead
# of failing on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Hide the download/progress output produced while loading
clear_output()

In [8]:
def get_bert_embedding(data):
    """Build one BERT embedding per document.

    Every document is preprocessed with ``preprocess_social`` (using the
    notebook-level ``lang``), tokenized with the notebook-level ``tokenizer``
    and passed through ``model`` on ``device``. The first row of the model's
    ``pooler_output`` is used as the document embedding.

    Args:
        data: Iterable of raw document texts.

    Returns:
        tuple: ``(embeddings, failed_doc_id)`` where ``embeddings`` is a list
        of NumPy vectors for the documents that succeeded, and
        ``failed_doc_id`` lists the positions (in iteration order of ``data``)
        of the documents for which an error occurred. Failed documents have
        no entry in ``embeddings``, so callers must drop the matching labels.
    """
    # Preprocess data
    print('Preprocessing data...')
    corpus = [preprocess_social(d, language=lang) for d in tqdm(data)]
    clear_output()

    # Arrays to save embeddings and the positions of documents that failed
    embeddings = []
    failed_doc_id = []
    print('Building embeddings...')
    for i, doc in enumerate(tqdm(corpus)):
        try:
            # Run Bert for each document (the document is already a token list)
            inputs = tokenizer(doc, return_tensors="pt", is_split_into_words=True)
            inputs.to(device)
            # Inference only: skip building the autograd graph
            with torch.no_grad():
                outputs = model(**inputs)
            # CLS Token Output
            embedding = outputs['pooler_output'].detach().cpu().numpy()[0]
            # Append representation
            embeddings.append(embedding)
        except Exception:
            # Best effort: remember the document and carry on. `Exception`
            # (rather than a bare except) lets Ctrl-C still stop the loop.
            # NOTE(review): no truncation is requested from the tokenizer, so
            # over-long documents presumably end up here — confirm.
            failed_doc_id.append(i)
    clear_output()
    print(f"Created embeddings for {len(embeddings)} docs and fail to create {len(failed_doc_id)} embeddings")

    return embeddings, failed_doc_id

### Doc2Vec

In [9]:
# Load Model
# The Doc2Vec model file is selected by the configured language code.
path_to_model = f'/media/juan/Juan/NLP/models/doc2vec_{lang}.model'
d2v_model = Doc2Vec.load(path_to_model)

In [10]:
def get_doc2vec_embeddings(data):
    """Infer one Doc2Vec vector per document.

    Each document is first tokenized with ``preprocess_social`` (using the
    notebook-level ``lang``); the token lists are then fed one by one to
    ``d2v_model.infer_vector``.

    Args:
        data: Iterable of raw document texts.

    Returns:
        list: One inferred vector per document, in iteration order of ``data``.
    """
    # Stage 1: raw text -> token lists
    print('Preprocessing data...')
    token_lists = [preprocess_social(text, language=lang) for text in tqdm(data)]
    clear_output()

    # Stage 2: token lists -> vectors
    print('Building embeddings...')
    vectors = [d2v_model.infer_vector(tokens) for tokens in tqdm(token_lists)]
    clear_output()

    return vectors

In [11]:
# Get tags

def _fill_one_hot(labels, file_names, tags):
    """Write a 3-class one-hot label into ``labels`` for every file name.

    Row ``i`` of ``labels`` is set from ``tags[file_names[i]]``: class 0 when
    the 2nd or 3rd tag value is truthy, otherwise class 1 when the 4th or 5th
    is truthy, otherwise class 2.

    NOTE(review): the tag values are addressed by position, so this relies on
    every tag dictionary listing its fields in the same order — confirm
    against the files that produce the tags.
    """
    for row, file_name in enumerate(file_names):
        values = list(tags[file_name].values())
        if values[1] or values[2]:
            labels[row][0] = 1
        elif values[3] or values[4]:
            labels[row][1] = 1
        else:
            labels[row][2] = 1
    return labels


# Create manual tags matrix for testing.
# One row per manually tagged document, so the matrix always lines up with the
# embeddings computed from `manual_tagged_data` (sizing it by the merged tag
# dictionary could differ if two sources shared a key).
y_test = np.zeros((len(manual_tagged_data), 3))
_fill_one_hot(y_test, manual_tagged_file_names, manual_tags)

# Create enhanced tags matrix for training.
# Rows are laid out as: keyword-tagged documents first, then the untagged
# (extended) documents, which have no tag entry and are filled in below.
y_aux = np.zeros((len(enhanced_tagged_data) + len(extended_data), 3))
_fill_one_hot(y_aux, enhanced_tagged_file_names, enhanced_tags)

# Add tag to last position on array if not tagged
for labels in (y_test, y_aux):
    untagged_rows = labels.sum(axis=1) == 0
    labels[untagged_rows, -1] = 1


In [12]:
# Get Embeddings
if emb == 'bert':
    # Test input embeddings
    X_test, f_test_id = get_bert_embedding(manual_tagged_data)

    # Train input embeddings
    enhanced_embeddings, f_enhanced_id = get_bert_embedding(enhanced_tagged_data)
    extended_embeddings, f_extended_id = get_bert_embedding(extended_data)
    X_aux = enhanced_embeddings + extended_embeddings

    # Remove failed docs.
    # `f_extended_id` holds positions inside `extended_data`, but in `y_aux`
    # the extended rows come after the enhanced ones, so those positions have
    # to be shifted by the number of enhanced documents before deleting.
    extended_offset = len(enhanced_tagged_data)
    failed_doc_id = f_enhanced_id + [extended_offset + i for i in f_extended_id]

    y_test = np.delete(y_test, f_test_id, axis=0)
    y_aux = np.delete(y_aux, failed_doc_id, axis=0)

elif emb == 'doc2vec':
    # Test input embeddings
    X_test = get_doc2vec_embeddings(manual_tagged_data)

    # Train input embeddings
    enhanced_embeddings = get_doc2vec_embeddings(enhanced_tagged_data)
    extended_embeddings = get_doc2vec_embeddings(extended_data)
    X_aux = enhanced_embeddings + extended_embeddings

else:
    # Fail here rather than with a NameError on `X_aux` in a later cell
    raise ValueError(f"Unsupported embedding type {emb!r}; expected 'bert' or 'doc2vec'")

In [13]:
# Split the auxiliary samples into training (70%) and validation (30%) subsets;
# the seed is fixed so the shuffle is the same for the same input order.
X_train, X_val, y_train, y_val = train_test_split(
    X_aux,
    y_aux,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Save data
# Every split is written as `<name>_<emb>_<lang>.npy` into the datasets folder:
# the three input matrices first, then the three tag matrices.
dataset_dir = '/media/juan/Juan/NLP/datasets'
arrays_to_save = {
    # Input
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    # Tags
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for array_name, array in arrays_to_save.items():
    np.save(f'{dataset_dir}/{array_name}_{emb}_{lang}.npy', array)