In [None]:
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

#spacy
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

#vis
import pyLDAvis
import pyLDAvis.gensim_models
#nltk.download(punkt)
#nltk.download(wordnet) 


In [None]:
def load_data(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return the decoded object."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    """Serialise ``data`` as JSON with a 4-space indent into ``file`` (UTF-8), overwriting it."""
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)
# Pull the value stored under the "File" key of the local JSON dump
# (presumably a list of story strings; verify against the data file).
stories = load_data("local_data.json")["File"]
# Sanity check: show the first 100 characters of the first entry.
print(stories[0][0:100])

In [None]:
# English stop-word vocabulary from NLTK, de-duplicated into a set.
# NOTE(review): `stops` is not referenced by any of the cells visible here.
stops = {word for word in stopwords.words("english")}

def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Args:
        texts: Iterable of raw text strings.
        allowed_postags: Container of spaCy coarse-grained POS tags
            (compared against ``token.pos_``). The default is an immutable
            tuple so it cannot be mutated across calls.

    Returns:
        A list with one string per input text: the lemmas of the retained
        tokens, in document order, joined by single spaces. A text with no
        retained tokens yields an empty string.
    """
    #Use "python -m spacy download en_core_web_sm" if error [E050]
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe processes the texts as a stream (batched internally) instead of
    # paying the per-call overhead of nlp(text) for every document.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]

# Lemmatize the whole corpus rather than only the first story: the Phrases and
# TF-IDF steps downstream rely on corpus-level statistics, and with a single
# document every term's IDF is 0, so the TF-IDF filter would drop every term.
# NOTE(review): this can be slow on a large corpus; slice `stories` only for
# quick experiments.
lemmatized_texts = lemmatization(stories)
print(lemmatized_texts[0][:100])

In [None]:
#tokenizing and deaccenting (note: no stop words are removed in this step)
def gen_words(texts):
    """Tokenise every text with ``gensim.utils.simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# One token list per lemmatized text.
data_words = gen_words(lemmatized_texts)
# Preview the first 20 tokens of the first document.
print(data_words[0][:20])

In [None]:
#BIGRAMS AND TRIGRAMS
# Learn phrase statistics: first on the raw token lists, then on the
# bigram-transformed corpus so that longer phrases can be detected.
bigram_phrases = gensim.models.Phrases(
    data_words,
    min_count=2,
    threshold=50,
)
trigram_phrases = gensim.models.Phrases(
    bigram_phrases[data_words],
    threshold=50,
)

# Wrap each trained Phrases model in a Phraser; these are what the helper
# functions below index into.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Lazily apply the module-level ``bigram`` model to each token list.

    Returns a one-shot generator; wrap it in ``list`` if it must be reused.
    """
    merged_docs = (bigram[tokens] for tokens in texts)
    return merged_docs

def make_trigrams(texts):
    """Lazily apply ``bigram`` and then ``trigram`` to each token list.

    The bigram model is applied inside this function, before the trigram
    model. Returns a one-shot generator.
    """
    merged_docs = (trigram[bigram[tokens]] for tokens in texts)
    return merged_docs

# Materialise both stages as lists. The helpers return one-shot generators, so
# leaving `data_bigrams` as a generator would make it silently empty for any
# later cell once it has been consumed here.
data_bigrams = list(make_bigrams(data_words))
# NOTE(review): make_trigrams() applies the bigram model itself, so on this
# path the bigram pass runs twice — confirm that is intended.
data_bigrams_trigrams = list(make_trigrams(data_bigrams))

# Preview only the first 20 tokens of the first document (as the other cells
# do) instead of dumping the whole corpus into the cell output.
print(data_bigrams_trigrams[0][0:20])
   
#texts = [word.encode("utf-8").split('/')[0] for word in texts]
# Lemmatize the documents. (Other methods can do this too; this one is best suited to Colab or gensim V3, because gensim.utils.lemmatize was removed in V4.)
# texts = [[word.decode("utf-8").split('/')[0] for word in lemmatize(' '.join(line), allowed_tags=re.compile('(NN)'), min_length=5)] for line in texts]
#trigrams_ = [t for t in trigram[bigram[sent]] if t.count(' ') == 2]

In [None]:
#TF-IDF REMOVAL
# Build the dictionary and bag-of-words corpus, then strip from every document
# the terms whose TF-IDF weight is below `low_value` or that the TF-IDF model
# leaves out of its output altogether.
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words  = []  # every dropped term (as a string), document by document, kept for inspection
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    # Score the document once and reuse the result for both checks below.
    doc_weights = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in doc_weights}
    low_value_words = [term_id for term_id, weight in doc_weights if weight < low_value]
    # The words with tf-idf score 0 will be missing from the model output.
    # This is computed BEFORE `drops` so that `words` records the terms missing
    # from the current document rather than those of the previous iteration.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set membership keeps the filter linear in the size of the document.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

In [None]:
# Restore a previously saved LDA model from disk.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# from a trusted source.
lda_model = gensim.models.LdaModel.load("Models/test_model_1.model")

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# NOTE(review): `lda_model` is loaded from disk while `corpus` and `id2word`
# are rebuilt in this notebook — confirm the model was trained on the same
# dictionary, otherwise the term ids will not line up.
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis