In [34]:
# General imports
import nltk
import pandas as pd
import os
import sys
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
import re
import random
# Import utils class
sys.path.insert(0,'../')
from utils import Utils

In [2]:
# Instanciate utils class
utils = Utils('/media/juan/Juan/NLP/', num_workers=15)

In [3]:
# Load tweets
en_tweets = utils.data_loader('en', 'tweets', total_data=100000)

Starting threads to load 100000 documents from tweets in en
Loaded 100000 files in 898.5474643707275 seconds.


In [4]:
# Create objects for preprocessing method
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
stemmer = SnowballStemmer('spanish')
stop_words = stopwords.words('spanish')

processed_tweets = []
for tweet in tqdm(en_tweets):
    tweet = re.sub(r'http\S+', '', tweet)
    processed_tweets.append(utils.preprocessing(tweet,
                            stop_words = stop_words,
                            stemmer = stemmer,
                            tokenizer = tokenizer))

100%|██████████| 100000/100000 [00:24<00:00, 4147.17it/s]


In [5]:
for tweet in range(5):
    rand_num = random.randint(0, len(en_tweets) - 1)
    print('Original tweet: ' + en_tweets[rand_num])
    print('Processed tweet: ' + str(processed_tweets[rand_num]) + '\n')
    print('-------------------------------------------------------\n')

Original tweet: Don’t rehab criminals. A good PR team, a pandemic and a dysfunctional gomen is giving Najib the golden opportunity to wash his slate clean cuz he knws yall dumbass will eat it up. 

Lest you forget he robbed our nation off billions. And he ain’t even served time yet. https://t.co/d6gG0CH6rN
Processed tweet: ['don', '’', 't', 'rehab', 'criminals', 'good', 'pr', 'team', 'pandemic', 'and', 'dysfunctional', 'gomen', 'is', 'giving', 'najib', 'the', 'golden', 'opportunity', 'to', 'wash', 'his', 'slate', 'clean', 'cuz', 'knws', 'yall', 'dumbass', 'will', 'eat', 'it', 'up', 'lest', 'you', 'forget', 'robbed', 'our', 'nation', 'off', 'billions', 'and', 'ain', '’', 't', 'even', 'served', 'time', 'yet']

-------------------------------------------------------

Original tweet: @jen_jstephen @drphiliplee1 I can understand your concern. We need a world wide effort to get more vaccine into developing nations to stop more new covid variants coming into being that could spread faster or 

In [32]:
# Group most common bigrams
from gensim.models.phrases import Phrases, Phraser
sent = [row for row in processed_tweets]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
tweets = bigram[sent]

In [None]:
def train_doc2vec(string_data, max_epochs, vec_size, alpha):
    # Tagging each of the data with an ID, and I use the most memory efficient one of just using it's ID
    tagged_data = [TaggedDocument(words=d, tags=[str(i)]) for i, d in enumerate(string_data)]
    
    # Instantiating my model
    model = Doc2Vec(alpha=alpha, min_alpha=0.00025, min_count=1, dm =1)

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data, total_examples = model.corpus_count, epochs=model.epochs)
        # Decrease the learning rate
        model.alpha -= 0.0002
        # Fix the learning rate, no decay
        model.min_alpha = model.alpha

    # Saving model
    model.save("models/d2v.model")
    print("Model Saved")
    
# Training
train_doc2vec(tweets, max_epochs = 100, vec_size = 20, alpha = 0.025)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
