In [1]:
# General imports
import nltk
import pandas as pd
import os
import sys
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
import re
import random
# Import utils class
sys.path.insert(0,'../')
from utils import Utils

In [2]:
# Instanciate utils class
utils = Utils('/media/juan/Juan/NLP/', num_workers=15)

In [3]:
# Load tweets
en_tweets = utils.data_loader('en', 'tweets', total_data=10000)

Starting threads to load 10000 documents from tweets in en
Loaded 10000 files in 14.039345979690552 seconds.


In [4]:
# Create objects for preprocessing method
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
stemmer = SnowballStemmer('spanish')
stop_words = stopwords.words('spanish')

processed_tweets = []
for tweet in tqdm(en_tweets):
    tweet = re.sub(r'http\S+', '', tweet)
    processed_tweets.append(utils.preprocessing(tweet,
                            stop_words = stop_words,
                            stemmer = stemmer,
                            tokenizer = tokenizer))

100%|██████████| 10000/10000 [00:02<00:00, 4370.04it/s]


In [5]:
for tweet in range(5):
    rand_num = random.randint(0, len(en_tweets) - 1)
    print('Original tweet: ' + en_tweets[rand_num])
    print('Processed tweet: ' + str(processed_tweets[rand_num]) + '\n')
    print('-------------------------------------------------------\n')

Original tweet: When #COVID19 forced them to go remote, schools found creative ways to expand broadband access for students and families. A year later, community leaders say long-term solutions are still needed in Eastern NC. https://t.co/17pwmu479j
Processed tweet: ['when', 'covid', '19', 'forced', 'them', 'to', 'go', 'remote', 'schools', 'found', 'creative', 'ways', 'to', 'expand', 'broadband', 'access', 'for', 'students', 'and', 'families', 'year', 'later', 'community', 'leaders', 'say', 'longterm', 'solutions', 'are', 'still', 'needed', 'in', 'eastern', 'nc']

-------------------------------------------------------

Original tweet: Duh, President Pudding Brain said we have 800 million vaccines.  That would be more than we need for our population because of math. 

Send the excess to the open border for goodness sake. 

States have a new Covid problem: Too much vaccine - POLITICO https://t.co/CcpFlf3jV8
Processed tweet: ['duh', 'president', 'pudding', 'brain', 'said', 'we', 'have', 

In [6]:
# Group most common bigrams
from gensim.models.phrases import Phrases, Phraser
sent = [row for row in processed_tweets]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
tweets = bigram[sent]

In [None]:
def train_doc2vec(string_data, max_epochs, vec_size, alpha):
    # Tagging each of the data with an ID, and I use the most memory efficient one of just using it's ID
    tagged_data = [TaggedDocument(words=d, tags=[str(i)]) for i, d in enumerate(string_data)]
    
    # Instantiating my model
    model = Doc2Vec(alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data, total_examples = model.corpus_count, epochs=model.epochs)
        # Decrease the learning rate
        model.alpha -= 0.0002
        # Fix the learning rate, no decay
        model.min_alpha = model.alpha

    # Saving model
    model.save("en_d2v.model")
    print("Model Saved")
    
# Training
train_doc2vec(tweets, max_epochs = 100, vec_size = 20, alpha = 0.025)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
