In [1]:
# Language code of the corpus to process. It selects the data sub-folder,
# the NLTK stopword list and the Snowball stemmer further down.
lang = 'en'
# Data sources to embed; each one is a sub-folder under ../../data/.
sources = ["news", "tweets", "reddit"]
# Doc2Vec model file loaded in the "Model Loading" section.
# NOTE(review): the file name carries an "es" suffix while lang is 'en' —
# confirm that this is the intended model for the selected language.
path_to_model = "./doc2vec_es.model"


# Interactive tab-completion settings for the notebook session.
%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False

# Used below to hide the noisy output of the install and import cells.
from IPython.display import clear_output

In [2]:
!pip install gensim
!pip install nltk
!pip install scikit-learn

clear_output()

In [3]:
# Imports the OS library
import os

# Imports the JSON library
import  json

# Imports the time library
from time import time

# Imports numpy
import numpy as np

# Imports the document class
from document import Document

# Import TQDM for time measurements
from tqdm import tqdm

# Imports string
import string

# Imports regular expressions
import re

# Imports NLTK
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
    
# Imports gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import strip_punctuation

# Imports scikit learn
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Imports matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker

# Imports random
from random import sample

clear_output()

## Preprocessing Definition

In [4]:
def preprocess_text(data, lang_stopwords, stemmer, twt_token, source="news"):
    """Lowercases, tokenizes, removes stopwords from and stems each document.

    Parameters
    ----------
    data : iterable of str
        Raw document texts.
    lang_stopwords : iterable of str
        Tokens to discard. They are compared against the lowercased tokens.
    stemmer : object
        Anything exposing ``stem(token)``.
    twt_token : object
        Anything exposing ``tokenize(text)``; used for every source other
        than ``"news"``.
    source : str, optional
        ``"news"`` strips punctuation and uses ``word_tokenize``; any other
        value tokenizes with ``twt_token``.

    Returns
    -------
    list of list of str
        One list of stemmed tokens per input document, in input order.
    """

    # Built once per call: membership checks become constant time and any
    # iterable of stopwords (including a one-shot iterator) is accepted
    stopword_set = frozenset(lang_stopwords)

    # The tokenization strategy is the same for every document
    is_news = source == "news"

    preprocessed_text = []

    for doc in tqdm(data):

        # Sets text into lowercase
        doc = doc.lower()

        if is_news:
            # Removes punctuation, then tokenizes by word
            tokens = word_tokenize(strip_punctuation(doc))
        else:
            # Tweet tokenizer
            tokens = twt_token.tokenize(doc)

        # Removes stopwords and stems what is left
        preprocessed_text.append(
            [stemmer.stem(token) for token in tokens if token not in stopword_set]
        )

    # Returns the preprocessed text
    return preprocessed_text
        
        

## Data Loading Functions

In [5]:
def load_files(filenames, prefix):
    """Reads the ``"text"`` field of every JSON file in ``filenames``.

    Parameters
    ----------
    filenames : iterable of str
        File names, each appended to ``prefix`` to form the path to open.
    prefix : str
        String prepended verbatim to every file name (plain concatenation,
        so it must already end with a path separator when it is a folder).

    Returns
    -------
    list
        The ``"text"`` value of each file, in the order of ``filenames``.

    Raises
    ------
    KeyError
        If a file has no ``"text"`` key.
    """

    text_list = []

    for filename in tqdm(filenames):

        # Explicit UTF-8 so decoding does not depend on the platform default
        with open(prefix + filename, "r", encoding="utf-8") as handle:
            doc = json.load(handle)

        text_list.append(doc["text"])

    return text_list

## Model Loading

In [6]:
# Loads the saved Doc2Vec model from the path set in the configuration cell;
# it is used below to infer one vector per document.
d2v_model = Doc2Vec.load(path_to_model)

## Vector Generation

In [7]:
def batch(iterable, n=1):
    """Yields consecutive slices of ``iterable`` holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing. The last slice is
    shorter when the length is not a multiple of ``n``.
    """
    size = len(iterable)
    for start in range(0, size, n):
        # Clamp the upper bound so it never runs past the end
        stop = start + n
        if stop > size:
            stop = size
        yield iterable[start:stop]

# Maps the language code to the language name used by NLTK
lang_dict = {
    "en": "english",
    "es": "spanish",
    "fr": "french"
}

# Folder that receives the embedding files
embeddings_dir = "./embeddings"

# Number of documents embedded and written per output file
batch_size = 20000

# Makes sure the stopwords corpus is available on a fresh environment;
# only 'punkt' is downloaded in the imports cell
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# Gets the stopwords
lang_sw = stopwords.words(lang_dict[lang])

# Creates the stemmer
stemmer = SnowballStemmer(lang_dict[lang])

# Creates the tweet tokenizer
twt_token = TweetTokenizer()

# Creates the output folder if it does not exist yet
os.makedirs(embeddings_dir, exist_ok=True)

# Creates a batch count; it is shared by all sources so that the output
# file names never collide
j = 0

# Iterates over sources
for src in sources:

    path_to_files = "../../data/" + src + '/' + lang + '/'

    # os.listdir returns names in arbitrary order; sorting keeps the content
    # of every batch reproducible between runs
    filenames = batch(sorted(os.listdir(path_to_files)), batch_size)

    for files_batch in filenames:

        vector_dict = {}

        docs = load_files(files_batch, path_to_files)

        docs = preprocess_text(docs, lang_sw, stemmer, twt_token, src)

        for i, file in enumerate(tqdm(files_batch)):

            words_list = docs[i]

            # tolist() converts the NumPy values into plain Python floats that
            # json can serialize. This replaces eval(str(...)), which breaks
            # under NumPy 2 where the list repr is "[np.float32(...), ...]".
            # The stored numbers are the float32 values widened to double.
            vector = d2v_model.infer_vector(words_list).tolist()

            vector_dict[file] = vector

        dict_fname = embeddings_dir + "/embeddings_{}_{}.json".format(lang, j)

        with open(dict_fname, "w", encoding="utf-8") as dict_file:
            json.dump(vector_dict, dict_file)

        j += 1


100%|██████████| 20000/20000 [00:05<00:00, 3912.02it/s]
100%|██████████| 20000/20000 [05:30<00:00, 60.58it/s]
100%|██████████| 20000/20000 [03:34<00:00, 93.08it/s] 
100%|██████████| 20000/20000 [00:04<00:00, 4084.36it/s]
100%|██████████| 20000/20000 [05:45<00:00, 57.88it/s]
100%|██████████| 20000/20000 [03:41<00:00, 90.14it/s] 
100%|██████████| 20000/20000 [00:04<00:00, 4292.87it/s]
100%|██████████| 20000/20000 [06:09<00:00, 54.15it/s]
100%|██████████| 20000/20000 [04:10<00:00, 79.85it/s] 
100%|██████████| 20000/20000 [00:04<00:00, 4392.86it/s]
100%|██████████| 20000/20000 [05:49<00:00, 57.27it/s]
100%|██████████| 20000/20000 [05:51<00:00, 56.87it/s] 
100%|██████████| 20000/20000 [00:07<00:00, 2652.48it/s]
100%|██████████| 20000/20000 [06:37<00:00, 50.30it/s]
100%|██████████| 20000/20000 [03:42<00:00, 89.98it/s] 
100%|██████████| 20000/20000 [00:04<00:00, 4385.75it/s]
100%|██████████| 20000/20000 [05:33<00:00, 60.04it/s]
100%|██████████| 20000/20000 [03:33<00:00, 93.51it/s] 
100%|█████