# Enhanced Tagged Data with Word2Vec

## Load Data

In [1]:
# General imports
import pandas as pd
import numpy as np
import os
import sys
import re
import random
import json

import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

# Import Gensim
#import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CESAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Import utils class
sys.path.insert(0,'../')
from utils import Utils

In [3]:
# Instanciate utils class
utils = Utils(r'D:\Cesard\Documents\NLP', num_workers=10)

In [4]:
# Define language
lang = 'es'

# Load manual tags
with open(f'tags/manual/reddit_{lang}.json', 'r+') as file_str:
    reddit_manual_tags = json.load(file_str)
with open(f'tags/manual/tweets_{lang}.json', 'r+') as file_str:
    twitter_manual_tags = json.load(file_str)

# Load keyword tags
with open(f'tags/keywords/reddit_{lang}_words.json', 'r+') as file_str:
    reddit_kw_tags = json.load(file_str)
with open(f'tags/keywords/tweets_{lang}_words.json', 'r+') as file_str:
    twitter_kw_tags = json.load(file_str)

# Load Tagged data
print('Starting to load manual tagged data...')
tagged_reddit_data, tagged_reddit_file_names = utils.tagged_data_loader(list(reddit_manual_tags.keys()), 'reddit', lang)
tagged_twitter_data, tagged_twitter_file_names = utils.tagged_data_loader(list(twitter_manual_tags.keys()), 'tweets', lang)
print(f'Loaded {len(tagged_twitter_data)} tagged Tweets {len(tagged_reddit_data)} and tagged Reddit docs')
print('')

# Load Keyword tagged data
print('Starting to load keyword tagged data...')
reddit_data, reddit_file_names = utils.tagged_data_loader(list(reddit_kw_tags.keys()), 'reddit', lang)
twitter_data, twitter_file_names = utils.tagged_data_loader(list(twitter_kw_tags.keys()), 'tweets', lang)
print(f'Loaded {len(twitter_data)} tagged Tweets {len(reddit_data)} and tagged Reddit docs')

Starting to load manual tagged data...
Starting 10 threads to load 1500 documents from reddit in es
Loaded 1500 files in 0.16 seconds.
Starting 10 threads to load 1000 documents from tweets in es
Loaded 1000 files in 0.11 seconds.
Loaded 1000 tagged Tweets 1500 and tagged Reddit docs

Starting to load keyword tagged data...
Starting 10 threads to load 2283 documents from reddit in es
Loaded 2283 files in 0.27 seconds.
Starting 10 threads to load 35239 documents from tweets in es
Loaded 35239 files in 5.63 seconds.
Loaded 35239 tagged Tweets 2283 and tagged Reddit docs


In [5]:
# Add lists
manual_tags = {**twitter_manual_tags , **reddit_manual_tags}
manual_tagged_data = tagged_twitter_data + tagged_reddit_data 
manual_tagged_file_names = tagged_twitter_file_names + tagged_reddit_file_names 

kw_tags = {**twitter_kw_tags , **reddit_kw_tags}
kw_tagged_data = twitter_data + reddit_data
kw_tagged_file_names = twitter_file_names + reddit_file_names

## Preprocessing

In [6]:
# Stop Words
stop_words = stopwords.words('english')

# Stemmers
stem = SnowballStemmer('english')
#p_stem = PorterStemmer()

# Tokenizers
#tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

In [7]:
# Preprocess tagged data
processed_data = []
for d in manual_tagged_data:
    processed_data.append(utils.preprocessing(d, stop_words = stop_words,
                                                 stemmer = None,
                                                 tokenizer = tk,
                                                 lemmatizer = lemma))

# Preprocess data tagged with keywords
for d in kw_tagged_data:
    processed_data.append(utils.preprocessing(d, stop_words = stop_words,
                                                 stemmer = None,
                                                 tokenizer = tk,
                                                 lemmatizer = lemma))

In [8]:
print(manual_tagged_data[0])
print(processed_data[0])

En este feriado del Día del Trabajador “Viernes 30” si habrá toque de queda -#DiaDelTrabajo #Guayaquil #Quito #Ecuador #ToqueDeQueda #COVID19  https://t.co/raebk1Iubj
['En', 'este', 'feriado', 'del', 'Día', 'del', 'Trabajador', 'Viernes', '30', 'si', 'habrá', 'toque', 'de', 'queda', 'DiaDelTrabajo', 'Guayaquil', 'Quito', 'Ecuador', 'ToqueDeQueda', 'COVID19', 'https', 'co', 'raebk1Iubj']


In [9]:
# Group most common bigrams
sent = [row for row in processed_data]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
corpus = bigram[sent]

## Doc2Vec

In [10]:
def train_doc2vec(string_data, max_epochs, vec_size, alpha):
    
    # Tagging each of the data with an ID, and I use the most memory efficient one of just using it's ID
    tagged_data = [TaggedDocument(words=d, tags=[str(i)]) for i, d in enumerate(string_data)]
    
    # Instantiating my model
    model = Doc2Vec(alpha=alpha, min_alpha=0.00025, min_count=10, dm =1)

    model.build_vocab(tagged_data)

    for epoch in tqdm(range(max_epochs)):
        model.train(tagged_data, total_examples = model.corpus_count, epochs=model.epochs)
        # Decrease the learning rate
        model.alpha -= 0.0002
        # Fix the learning rate, no decay
        model.min_alpha = model.alpha

    # Saving model
    model.save("en_d2v.model")
    print("Model Saved")
    
# Training
train_doc2vec(processed_data, max_epochs = 25, vec_size = 50, alpha = 0.025)

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [05:08<00:00, 12.34s/it]


Model Saved


In [11]:
# Create Manual Tags matrix
manual_tags_matrix = np.zeros((len(manual_tags), 5))

for i, file_name in enumerate(manual_tagged_file_names):
    for j, tag in enumerate(list(manual_tags[file_name].values())):
        if tag:
            manual_tags_matrix[i][j] = 1
            
# Create KW Tags matrix
kw_tags_matrix = np.zeros((len(kw_tags), 5))

for i, file_name in enumerate(kw_tagged_file_names):
    for j, tag in enumerate(list(kw_tags[file_name].values())):
        if tag:
            kw_tags_matrix[i][j] = 1

In [12]:
len(manual_tagged_file_names)

2500

In [13]:
# Loading in my model
model = Doc2Vec.load("en_d2v.model")

In [19]:
# Class
cat = 2

# Print top
for i in range(1,len(manual_tags)):
    if manual_tags_matrix[i][cat]:
        # Representative Doc (Tagged)
        print(f'TAGGED DOC WITH ID {i}')
        print(manual_tagged_data[i])
        print(processed_data[i])
        print()
        
        # Similar Docs
        print('TOP SIMILAR DOCS')
        similar_docs = model.docvecs.most_similar(str(i), topn = 100)
        for docs in similar_docs:
            index = int(docs[0]) - len(manual_tags)
            if kw_tags_matrix[index][cat]:
                print(f"DOC_ID{int(docs[0])}: {kw_tagged_data[index]}")
            
        print('--------------------------------------------------')
        print()

TAGGED DOC WITH ID 117
El carpintero William López diseñó y creó un pupitre especial para su hija de seis años, quien vuelve a la escuela junto con sus compañeros en medio de la pandemia de coronavirus
#COVID19 #coronavirus #ElSalvador #RegresoAClasesSeguro #ClasesPresenciales
https://t.co/cB2mwXMY39
['El', 'carpintero', 'William', 'López', 'diseñó', 'creó', 'un', 'pupitre', 'especial', 'para', 'su', 'hija', 'de', 'seis', 'años', 'quien', 'vuelve', 'la', 'escuela', 'junto', 'con', 'sus', 'compañeros', 'en', 'medio', 'de', 'la', 'pandemia', 'de', 'coronavirus', 'COVID19', 'coronavirus', 'ElSalvador', 'RegresoAClasesSeguro', 'ClasesPresenciales', 'https', 'co', 'cB2mwXMY39']

TOP SIMILAR DOCS
DOC_ID3415:"Excelsior Pass" busca acelerar la reapertura de estadios deportivos, locales de música, restaurantes y otros negocios con limitaciones de capacidad debido al coronavirus. https://t.co/XTE5uue8sz
--------------------------------------------------

TAGGED DOC WITH ID 127
🤔 no 😷 por favor 🙏