# Enhanced Tagged Data with Word2Vec

## Load Data

In [1]:
# General imports
import nltk
import pandas as pd
import numpy as np
import os
import sys
import re
import random
import json

# Import Gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser

import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

from tqdm import tqdm

# Import utils class
sys.path.insert(0,'../')
from utils import Utils

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CESAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Instanciate utils class
utils = Utils(r'D:\Cesard\Documents\NLP', num_workers=10)

In [3]:
# Define language
lang = 'es'

# Load tags
with open(f'tags/tweets_{lang}.json', 'r+') as file_str:
    twitter_tags = json.load(file_str)
    
with open(f'tags/reddit_{lang}.json', 'r+') as file_str:
    reddit_tags = json.load(file_str)


# Load Tagged data
print('Starting to load tagged data...')

tagged_reddit_data, tagged_reddit_file_names = utils.tagged_data_loader(list(reddit_tags.keys()), 'reddit', lang)
tagged_twitter_data, tagged_twitter_file_names = utils.tagged_data_loader(list(twitter_tags.keys()), 'tweets', lang)

print(f'Loaded {len(tagged_twitter_data)} tagged Tweets {len(tagged_reddit_data)} and tagged Reddit docs')

# Load Not tagged data
print('')
print('Starting to load not tagged data...')

reddit_data, _ = utils.data_loader(lang, 'reddit', total_data=None, max_size = 300, return_dates = False)
twitter_data, _ = utils.data_loader(lang, 'tweets', total_data=None, max_size = 300, return_dates = False)

print(f'Loaded {len(twitter_data)} Tweets {len(reddit_data)} and Reddit docs whithout tag')

Starting to load tagged data...
Starting 10 threads to load 1500 documents from reddit in es
Loaded 1500 files in 0.19 seconds.
Starting 10 threads to load 1000 documents from tweets in es
Loaded 1000 files in 0.11 seconds.
Loaded 1000 tagged Tweets 1500 and tagged Reddit docs

Starting to load not tagged data...
Starting 10 threads to load 9113 documents from reddit in es
Loaded 8437 files in 1.41 seconds.
Removed 676 files becasuse they were too large
Starting 10 threads to load 241995 documents from tweets in es
Loaded 241994 files in 192.68 seconds.
Removed 1 files becasuse they were too large
Loaded 241994 Tweets 8437 and Reddit docs whithout tag


In [4]:
# Add lists
data = twitter_data + reddit_data

tags = {**twitter_tags , **reddit_tags}
tagged_data = tagged_twitter_data + tagged_reddit_data
tagged_file_names = tagged_twitter_file_names + tagged_reddit_file_names

In [5]:
# Append Ideal Docs
vaccines = ['inicia vacunación', 'vacunas pfizer', 'vacunas aztraseneca']
mental_health = ['estrés, ansiedad y problemas psicologicos asociados a la salud mental']
school_reopening = ['cierre y reapertura de colegios o instituciones educativas']
household_violence = ['']

ideal_docs = vaccines + mental_health + school_reopening + household_violence

## Preprocessing

In [6]:
# Stop Words
stop_words = stopwords.words('english')

# Stemmers
stem = SnowballStemmer('english')
#p_stem = PorterStemmer()

# Tokenizers
tk = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
#tk = nltk.RegexpTokenizer(r'\w+')

# Lemmatizer
lemma = nltk.stem.WordNetLemmatizer()

# Preprocess tagged data
processed_data = []
for d in tagged_data:
    processed_data.append(utils.preprocessing(d, stop_words = stop_words,
                                                 stemmer = None,
                                                 tokenizer = tk,
                                                 lemmatizer = lemma))

# Preprocess data without tag
for d in data:
    processed_data.append(utils.preprocessing(d, stop_words = stop_words,
                                                 stemmer = None,
                                                 tokenizer = tk,
                                                 lemmatizer = lemma))
# Preprocess data without tag
for d in ideal_docs:
    processed_data.append(utils.preprocessing(d, stop_words = stop_words,
                                                 stemmer = None,
                                                 tokenizer = tk,
                                                 lemmatizer = lemma))

In [7]:
# Group most common bigrams
sent = [row for row in processed_data]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
corpus = bigram[sent]

## Doc2Vec

In [10]:
def train_doc2vec(string_data, max_epochs, vec_size, alpha):
    
    # Tagging each of the data with an ID, and I use the most memory efficient one of just using it's ID
    tagged_data = [TaggedDocument(words=d, tags=[str(i)]) for i, d in enumerate(string_data)]
    
    # Instantiating my model
    model = Doc2Vec(alpha=alpha, min_alpha=0.00025, min_count=10, dm =1)

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data, total_examples = model.corpus_count, epochs=model.epochs)
        # Decrease the learning rate
        model.alpha -= 0.0002
        # Fix the learning rate, no decay
        model.min_alpha = model.alpha

    # Saving model
    model.save("en_d2v.model")
    print("Model Saved")
    
# Training
train_doc2vec(corpus, max_epochs = 25, vec_size = 20, alpha = 0.025)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
Model Saved


In [11]:
# Create Tags matrix
tags_matrix = np.zeros((len(tagged_file_names), 5))

for i, file_name in enumerate(tagged_file_names):
    for j, tag in enumerate(list(tags[file_name].values())):
        if tag:
            tags_matrix[i][j] = 1

In [12]:
# Loading in my model
model = Doc2Vec.load("en_d2v.model")

In [17]:
for i in range(1,len(tagged_data)):
    if tags_matrix[i][3]:
        # Representative Doc (Tagged)
        print(f'TAGGED DOC WITH ID {i}')
        print((tagged_data + data)[i])
        print(processed_data[i])
        print()
        
        # Similar Docs
        print('TOP SIMILAR DOCS')
        similar_docs = model.docvecs.most_similar(str(i), topn = 5)
        for docs in similar_docs:
            print((tagged_data + data)[int(docs[0])])
            #print(processed_data[int(docs[0])])
        print('--------------------------------------------------')
        print()