In [54]:


import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
# lda_viz = gensimvis.prepare(ldamodel, corpus, dictionary)

import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [55]:
df = pd.read_pickle("../data/dataframes/SDG/all_sdg_fixed_dst.pkl")
df['TXT'] = df['AB'] +" "+ df['TI'] +" "+ df ['DE']

data = df.TXT.values.tolist()

In [56]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])



[['the', 'paris', 'agreement', 'success', 'depends', 'on', 'parties', 'implementation', 'of', 'their', 'nationally', 'determined', 'contributions', 'ndcs', 'towards', 'the', 'paris', 'agreement', 'goals', 'in', 'these', 'climate', 'action', 'plans', 'most', 'developing', 'countries', 'make', 'their', 'mitigation', 'and', 'adaptation', 'contributions', 'conditional', 'upon', 'receiving', 'international', 'support', 'finance', 'technology', 'transfer', 'and', 'or', 'capacity', 'building', 'while', 'provision', 'of', 'support', 'for', 'ndc', 'implementation', 'could', 'enhance', 'equity', 'among', 'countries', 'the', 'feasibility', 'of', 'ndc', 'implementation', 'might', 'be', 'challenged', 'by', 'the', 'large', 'number', 'of', 'conditional', 'ndcs', 'this', 'paper', 'addresses', 'the', 'implications', 'of', 'this', 'tension', 'based', 'on', 'an', 'analysis', 'of', 'all', 'ndcs', 'we', 'find', 'that', 'feasibility', 'is', 'challenged', 'because', 'conditions', 'applied', 'to', 'ndcs', 'ar

In [57]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['the', 'paris_agreement', 'success', 'depends_on', 'parties', 'implementation', 'of', 'their', 'nationally_determined_contributions_ndcs', 'towards', 'the', 'paris_agreement_goals', 'in', 'these', 'climate', 'action_plans', 'most', 'developing_countries', 'make', 'their', 'mitigation', 'and', 'adaptation', 'contributions', 'conditional_upon', 'receiving', 'international', 'support', 'finance', 'technology_transfer', 'and', 'or', 'capacity_building', 'while', 'provision', 'of', 'support', 'for', 'ndc', 'implementation', 'could', 'enhance', 'equity', 'among', 'countries', 'the', 'feasibility', 'of', 'ndc', 'implementation', 'might_be', 'challenged', 'by', 'the', 'large_number', 'of', 'conditional_ndcs', 'this_paper', 'addresses', 'the', 'implications', 'of', 'this', 'tension', 'based', 'on', 'an', 'analysis', 'of', 'all', 'ndcs', 'we_find', 'that', 'feasibility', 'is', 'challenged', 'because', 'conditions', 'applied', 'to', 'ndcs', 'are', 'often', 'not', 'well', 'defined', 'moreover', '

In [58]:
# spacy stopwords
sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words
print(len(all_stopwords))
all_stopwords.update(["technology", "use","uses", "used", "model", "models", "system","systems", "base","based","bases", "high", "index","approach",
                      "information", "datum", "basis", "process", "tool","new", "problem", "result", "results", "resource",
                      "method", "image", "study", "feature", "technique", "different", "test", "low", "low", "class", "analysis",
                      "paper", "provide", "provides", "provided", "data", "increase", "increases", "increased", "propose", "proposes",
                     "proposed", "feature", "features", "big", "level", 'object', "levels", "objects", "method","methods",
                     "modelled", "modeled", "modelling", "modeling", "large", "case", "cases", "present", "presents", "consider",
                     "considering", "considered", "prediction", "predicts", "predict", "predicted", "compare", "compares", "compared",
                     "comparing", "improve", "improves", "improving", "improved", "estimate", "estimating", "estimates", "estimated",
                     "network", "networks", "control", "controls", "controlled", "controlling", "include", "includes", "including",
                     "included", "show", "shows", "showed", "showing", "important", "high", "develop", "develops", "developed", "developing",
                     "change", "changes", "changed", "changing", "performance", "apply", "applies", "applied", "applying",
                     "observe", "observes", "observing", "observed", "lean", "learns", "learned", "value", "obtain", "obtained",
                     "obtains", "obtained", "indicate", "indicates", "indicating","indicated", "application", "applications", "reduce", "time", "design", "research", "management"])
print(len(all_stopwords))



466
466


In [59]:
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in all_stopwords] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [60]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_trigrams = make_trigrams(data_words_nostops)



# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])



[['paris_agreement', 'success', 'depend', 'party', 'implementation', 'paris_agreement_goal', 'climate', 'action_plan', 'country', 'mitigation', 'adaptation', 'contribution', 'conditional', 'receive', 'international', 'support', 'finance', 'transfer', 'capacity_builde', 'provision', 'support', 'ndc', 'implementation', 'enhance', 'equity', 'country', 'implementation', 'challenge', 'number', 'conditional_ndcs', 'address', 'implication', 'tension', 'ndcs', 'find', 'feasibility', 'challenge', 'condition', 'ndcs', 'define', 'cost', 'implement', 'conditional', 'contribution', 'cover', 'exist', 'promise', 'support', 'country', 'entire', 'annual', 'climate', 'finance', 'earmark', 'implementation', 'consistent', 'principle', 'equity', 'prioritization', 'paris_agreement', 'high', 'proportion', 'small_island', 'states_sid', 'conditional_ndcs', 'country', 'difference', 'distribution', 'country', 'request', 'support', 'currently', 'receive', 'support', 'particular', 'middle_income_countrie', 'demons

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
# print(corpus[:1])


In [None]:


# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]



In [None]:
all_stopwords

In [None]:
all_stopwords.update(
    {"rights_reserve", "decision_make", "decision_maker", "principal_component", "recent_year", "spatial_revolution",
     "authors_publishe", "classification_accuracy"}
)
all_stopwords

In [None]:
from collections import Counter

frequencies = Counter()
frequencies_ngram = Counter()
stuff = [[(id2word[id], freq) for id, freq in cp] for cp in corpus]
for i, tok in enumerate(stuff):
    for tup in tok:
        if tup[0] not in all_stopwords:
            if "_" in tup[0]:
                frequencies_ngram[tup[0]] += 1
            frequencies[tup[0]] +=1


print(frequencies_ngram.most_common(10))


In [None]:


# Nuage de mots
from wordcloud import WordCloud

wordcloud = WordCloud(background_color="white", max_words=100, scale=2)
wordcloud.generate_from_frequencies(frequencies_ngram)

wordcloud.to_file("../img/worldcloud_sdg.png", )
wordcloud.to_image()



In [None]:
# Nuage de mots
from wordcloud import WordCloud

wordcloud = WordCloud(background_color="white", max_words=100)
wordcloud.generate_from_frequencies(frequencies)
wordcloud.to_image()
# wordcloud.to_file("img/worldcloud.png")
wordcloud.to_image()