In [1]:
lang = 'fr'
sources = ["news", "tweets", "reddit"]


%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False

from IPython.display import clear_output

In [2]:
!pip install gensim
!pip install nltk
!pip install scikit-learn

clear_output()

In [3]:
# Imports the OS library
import os

# Imports the JSON library
import  json

# Imports the time library
from time import time

# Imports numpy
import numpy as np

# Imports the document class
from document import Document

# Import TQDM for time measurements
from tqdm import tqdm

# Imports string
import string

# Imports regular expressions
import re

# Imports NLTK
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
    
# Imports gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import strip_punctuation

# Imports scikit learn
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Imports matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker

# Imports random
from random import sample

clear_output()

## Preprocessing Definition

In [4]:
def preprocess_text(data, lang_stopwords, stemmer, twt_token, source="news"):
    
    preprocessed_text = []
    
    for doc in tqdm(data):
    
        # Sets text into lowercase
        doc = doc.lower()
    
        if source == "news":

            # Removes punctuation
            doc = strip_punctuation(doc)

            # Tokenizes by word
            doc = word_tokenize(doc)

        else:

            # Tweet tokenizer
            doc = twt_token.tokenize(doc)

        # Removes stopwords
        doc = [token for token in doc if token not in lang_stopwords]

        # Stems data
        doc = [stemmer.stem(token) for token in doc]
            
        # Appends doc to preprocessed text list
        preprocessed_text.append(doc)

    # Returns the preprocessed text
    return preprocessed_text
        
        

## Data load functions

In [5]:
def load_files(filenames, prefix):
    
    text_list = []
    
    for file in tqdm(filenames):
        
        with open(prefix + file, "r") as file:
        
            doc = json.load(file)
        
            text = doc["text"]
            
            text_list.append(text)
        
    return text_list

## Data Loading

In [6]:
# Loads the documents
train_docs = []

# Creates the language dictionary
lang_dict = {
    "en": "english",
    "es": "spanish",
    "fr": "french"
}

# Gets the stopwords
lang_sw = stopwords.words(lang_dict[lang])

# Creates the stemmer
stemmer = SnowballStemmer(lang_dict[lang])

# Creates the tweet tokenizer
twt_token = TweetTokenizer()

# Iterates over sources
for src in sources:
    
    path_to_files = "../../data/" + src + '/' + lang + '/'
    
    filenames = os.listdir(path_to_files)
    
    docs = load_files(filenames, path_to_files)
    
    docs = preprocess_text(docs, lang_sw, stemmer, twt_token, src)
    
    train_docs.extend(docs)
    
# Creates the tagged documents array
train_docs = [TaggedDocument(words, tags="{}".format(i)) for i, words in enumerate(tqdm(train_docs))]

100%|██████████| 224894/224894 [01:05<00:00, 3441.85it/s]
100%|██████████| 224894/224894 [1:20:13<00:00, 46.72it/s]
100%|██████████| 119670/119670 [00:19<00:00, 6100.20it/s]
100%|██████████| 119670/119670 [01:21<00:00, 1472.87it/s]
100%|██████████| 4645/4645 [00:00<00:00, 6013.01it/s]
100%|██████████| 4645/4645 [00:06<00:00, 683.35it/s]
100%|██████████| 349209/349209 [00:03<00:00, 94790.89it/s] 


In [7]:
ALPHA = 0.025
MIN_COUNT = 10
MAX_EPOCHS = 10
DIMENSIONS = 512

model = Doc2Vec(
    alpha=ALPHA, 
    min_alpha=0.00025, 
    min_count=MIN_COUNT, 
    vector_size=DIMENSIONS,
    dm=1
)

dtime = time()

model.build_vocab(train_docs)

dtime = time() - dtime

print("Built model vocab. Took {:2.2f}s".format(dtime))

for epoch in range(MAX_EPOCHS):
    
    dtime = time()
    
    model.train(train_docs, total_examples = model.corpus_count, epochs=model.epochs)
        
    # Decrease the learning rate
    model.alpha -= 0.0002
    
    # Fix the learning rate, no decay
    model.min_alpha = model.alpha
    
    dtime = time() - dtime
    
    print("Trained epoch {}. Took {:4.2f}s".format(epoch, dtime), end='\r')
    
    model.save("doc2vec_{}.model".format(lang))

Built model vocab. Took 26.73s
Trained epoch 9. Took 1860.00s