In [1]:
### Topic Modelling in in French / English ### 

""" 
        @author: Hair Albeiro Parra Barrera (Jair)
        Dependencies: nltk, spaCy, gensim, googletrans
        Copyright: This work is licensed under a Creative Commons 
                    Attribution-NonCommercial-NoDerivatives 4.0 International License.
                    Please see https://creativecommons.org/licenses/by-nc-nd/4.0/
                    Contact https://blog.jairparraml.com/ for information. 
        Based on tutorials by Susan Li, "Topic Modeling and Latent Dirichlet Allocation (LDA) in Python"
        at https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
"""

# Ignore
import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

In [2]:
### Language Detection ### 

# We first one to find out whether a certain text is in English or French. 

from langdetect import detect

sample_text_fr = """ Il n’a pas été possible de faire des observations au début et à la fin des ateliers puisque pour observer correctement chaque enfant du groupe et animer l’atelier, 4 semaines d’observation ont été nécessaire, à raison d’une demie journée par semaine, sur les 12 semaines que se donne l’atelier
Il a fallu 4 ateliers sur 12 pour d'abord coté le niveau de l'enfant au départ. Il y avait donc trop d'indicateur pour pouvoir faire l'exercice du début à la fin. 
Nous avons donc pris la décisions de réduire le nombre d'indicateur à 3 ou 4 pour l'avenir afin de faire le processus du début à la fin, c'est ce que nous mettrons en place dans la prochaine année.
C'est donc à suivre...
"""

sample_text_en = """Virtually all participants indicated they enjoyed being outdoors, learning new skills, making new friends and learning about the natural environment. Most agreed they learned how to be an effective leader. Similarly, virtually all participants indicated a desire to participate in further camps."""

print(detect(sample_text_fr)) # fr 
print(detect(sample_text_en)) # en 


fr
en


In [3]:
# We will use Latent Dirichlet Allocation to model 
# We will apply LDA to convert a set of reserach papers to a set
# of topics. 

# **Research paper topic modelling** is an unsupervised ML 
#  method that helps us discover hidden semantic structures in a paper, 
# that allows us to learn topic representations of papers in a corpus. 
# The model can be applied to ay kinds of lables on documens, 
# such as tags on post on the website. 

# *****************************************************************8

### The Process ### 

    # - Pick the number of topics ahead of time even if wer're not 
    #   sure of that the topics are. 
    # - Each document is represented as a distribution over topics 
    # - Each topic is represented as a distribution over words. 
    
# The research paper text data is just a bunch of unlabeles texts


In [10]:
### The Data ### 

import pandas as pd
from IPython.display import display, HTML

PATH = "topic_modelling_dataset.xlsx"
df = pd.read_excel(PATH) # load into a data-frame 
display(df.head())

# In order to keep it simple , and for reference, we will use only two of the columns
df2 = df[['NO','RESULTATS_2018']]
display(df2.head())

Unnamed: 0,NO,ORG_NAME,TERRITOIRE,SOUS-TERRITOIRE,SECTEUR,RESULTATS_2018
0,4215,La Relance-jeunes et familles (RJF) inc.,Centre-est de l'île,Centre-Sud,Famille,IDEM
1,2591,Projet TRIP,Centre-est de l'île,Centre-Sud,Jeunes,Voir documenrt en annexe
2,187408,"Spectre de rue, programme de travail de milieu",Centre-est de l'île,Centre-Sud,Jeunes,"À ce stade du projet, cet item est sans objet."
3,49,Les Grands Frères et Grandes Soeurs du Grand M...,Centre-est de l'île,Le Plateau Mont-Royal,Jeunes,
4,7929,Suicide-Action Montréal inc.,Centre-nord de l'île,Saint-Michel,Écoute et référence,Les participants doivent remplir une appréciat...


Unnamed: 0,NO,RESULTATS_2018
0,4215,IDEM
1,2591,Voir documenrt en annexe
2,187408,"À ce stade du projet, cet item est sans objet."
3,49,
4,7929,Les participants doivent remplir une appréciat...


In [23]:
### Topic Modelling info ### 

# We will use Latent Dirichlet Allocation to model 
# We will apply LDA to convert a set of reserach papers to a set
# of topics. 

# **Research paper topic modelling** is an unsupervised ML 
#  method that helps us discover hidden semantic structures in a paper, 
# that allows us to learn topic representations of papers in a corpus. 
# The model can be applied to ay kinds of lables on documens, 
# such as tags on post on the website. 

# *****************************************************************8

### The Process ### 

    # - Pick the number of topics ahead of time even if wer're not 
    #   sure of that the topics are. 
    # - Each document is represented as a distribution over topics 
    # - Each topic is represented as a distribution over words. 
    
# The research paper text data is just a bunch of unlabeles texts

# *******************************************************************

In [12]:
### Text Cleaning ### 

## Lemmatizing 

import nltk 
import spacy
from stop_words import get_stop_words

nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
fr_stop_w = get_stop_words('fr')
en_stop_w = get_stop_words('en')
fr_stop_w.append('plus') # We noticed it appears a lot 
en_stop_w.append('plus')

# create a filtered lemmas to exclude determiners (DET), adpositions(aka prepositions) (ADP), 
# punctuation (PUNCT), conjuctions (CONJ,CCONJ), numerals (NUM), symbols (SYM), spaces (NUM), 
# and non-alpha tokens. We will also filter stopwords. 
# Full list can be found at https://spacy.io/api/annotation

def filter_lemmas(doc, tags = ["DET","ADP","PUNCT","CONJ","CCONJ","SCONJ","NUM","SYM","SPACE"], lang='fr'): 
    """Expects a spaCY doc object processed with a nlp pipeline"""
    lemmas = []
    if lang == 'fr': 
        lemmas = [(token.text, token.lemma_) for token in doc if token.pos_ not in tags 
                  and token.text.isalpha() and token.text not in fr_stop_w and len( str(token.text)) > 2]
    else: 
        lemmas = [(token.text, token.lemma_) for token in doc if token.pos_ not in tags 
                  and token.text.isalpha() and token.text not in en_stop_w and len( str(token.text)) > 2]
    return(lemmas)

# Test with the sample 
doc = nlp_fr(sample_text_fr)
print(filter_lemmas(doc))

print("\ndoc tokens:\n", len(doc))

[('possible', 'possible'), ('observations', 'observation'), ('fin', 'fin'), ('ateliers', 'atelier'), ('puisque', 'puisqu'), ('observer', 'observer'), ('correctement', 'correctement'), ('enfant', 'enfant'), ('groupe', 'groupe'), ('animer', 'animer'), ('atelier', 'atelier'), ('semaines', 'semaine'), ('observation', 'observation'), ('nécessaire', 'nécessaire'), ('raison', 'raison'), ('demie', 'demie'), ('journée', 'journer'), ('semaine', 'semaine'), ('semaines', 'semaine'), ('donne', 'donne'), ('atelier', 'atelier'), ('fallu', 'falloir'), ('ateliers', 'atelier'), ('abord', 'abord'), ('coté', 'coter'), ('niveau', 'niveau'), ('enfant', 'enfant'), ('départ', 'départ'), ('indicateur', 'indicateur'), ('pouvoir', 'pouvoir'), ('exercice', 'exercice'), ('fin', 'fin'), ('Nous', 'nous'), ('pris', 'prendre'), ('décisions', 'décision'), ('réduire', 'réduire'), ('nombre', 'nombre'), ('indicateur', 'indicateur'), ('avenir', 'avenir'), ('processus', 'processus'), ('fin', 'fin'), ('mettrons', 'mettre'), 

In [15]:
### Text filtering ### 

nlp_fr = spacy.load("fr_core_news_md")
nlp_en = spacy.load("en_core_web_md")

list1 = df2['NO']
list2 = df2['RESULTATS_2018']
dict_df = dict(zip(list1, list2))

def filter_texts(texts_list):
    output = []
    for text in texts_list: 
        text = str(text)
        if len(text) > 0: 
            # lang = 'fr'
            lang = detect(text) # detect language
            if lang == 'fr': 
                doc = nlp_fr(text)
                lemmas = [lemma[1] for lemma in filter_lemmas(doc)]
                output.append(lemmas)
            else: 
                doc = nlp_en(text)
                lemmas = [lemma[1] for lemma in filter_lemmas(doc, lang='en')]
                output.append(lemmas)
        else: 
            output.append("[???]")
        
    return output

filtered_resultats = filter_texts(list2)

# Notice that the lengths do match
print("Length of original list (RESULTATS_2018): ", len(list2))
print("\nLength of filtered RESULTATS_2018{}\n".format(len(filtered_resultats)))

# print the results 
print(filtered_resultats[2])

df['filtered'] = filtered_resultats

display(df.head())


Length of original list (RESULTATS_2018):  269

Length of filtered RESULTATS_2018269

['stade', 'projet', 'item', 'objet']


Unnamed: 0,NO,ORG_NAME,TERRITOIRE,SOUS-TERRITOIRE,SECTEUR,RESULTATS_2018,filtered
0,4215,La Relance-jeunes et familles (RJF) inc.,Centre-est de l'île,Centre-Sud,Famille,IDEM,[IDEM]
1,2591,Projet TRIP,Centre-est de l'île,Centre-Sud,Jeunes,Voir documenrt en annexe,"[voir, documenrt, annexe]"
2,187408,"Spectre de rue, programme de travail de milieu",Centre-est de l'île,Centre-Sud,Jeunes,"À ce stade du projet, cet item est sans objet.","[stade, projet, item, objet]"
3,49,Les Grands Frères et Grandes Soeurs du Grand M...,Centre-est de l'île,Le Plateau Mont-Royal,Jeunes,,[nan]
4,7929,Suicide-Action Montréal inc.,Centre-nord de l'île,Saint-Michel,Écoute et référence,Les participants doivent remplir une appréciat...,"[participant, devoir, remplir, appréciation, a..."


In [16]:
### LDA with Gensim ### 
            
# First, we are creating a dictionary from the data,  
# Then convert to bag of words corpus and save the dictionary
# and corpus for future use. 

# ignore
import warnings
warnings.filterwarnings("ignore")
# ignore

import pickle
from gensim import corpora 

# pass the text to the corpora object and create a dictionary object
dictionary = corpora.Dictionary(filtered_resultats)
corpus = [dictionary.doc2bow(text) for text in filtered_resultats]

# Save the corpus with pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim') 


# We are asking LDA to find 5 topics in the data: 

import gensim 

# load the LDA model 
ldamodel = gensim.models.LdaModel(corpus, num_topics = 10, 
                                  id2word=dictionary, passes=100)
ldamodel.save('model_fr.gensim')

# Get the topics 
topics = ldamodel.print_topics(num_words=3)
for topic in topics:
    print(topic)


(0, '0.011*"jeune" + 0.009*"organisme" + 0.009*"famille"')
(1, '0.018*"logement" + 0.016*"locataire" + 0.008*"santé"')
(2, '0.017*"moyenne" + 0.016*"membre" + 0.011*"résultat"')
(3, '0.013*"activité" + 0.013*"jeune" + 0.012*"service"')
(4, '0.034*"enfant" + 0.014*"année" + 0.013*"fin"')
(5, '0.018*"participant" + 0.009*"increase" + 0.008*"activité"')
(6, '0.012*"participant" + 0.011*"famille" + 0.011*"parent"')
(7, '0.018*"bénévole" + 0.009*"organisme" + 0.007*"satisfaction"')
(8, '0.014*"activité" + 0.012*"participant" + 0.009*"jeune"')
(9, '0.015*"groupe" + 0.012*"femme" + 0.011*"participant"')


In [46]:
# Get the topics 
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)
    
    
# Recover words in the topics 
print("\n", ldamodel.show_topic(2), "\n") 

topic_words = [tup[0] for tup in ldamodel.show_topic(0,topn=10)] 
print("Example extraction words for a topic:\n", topic_words)

(0, '0.011*"jeune" + 0.009*"organisme" + 0.009*"famille" + 0.008*"intervention" + 0.007*"développer"')
(1, '0.018*"logement" + 0.016*"locataire" + 0.008*"santé" + 0.008*"membre" + 0.006*"objectif"')
(2, '0.017*"moyenne" + 0.016*"membre" + 0.011*"résultat" + 0.008*"augmentation" + 0.008*"service"')
(3, '0.013*"activité" + 0.013*"jeune" + 0.012*"service" + 0.009*"groupe" + 0.008*"participant"')
(4, '0.034*"enfant" + 0.014*"année" + 0.013*"fin" + 0.012*"parent" + 0.010*"groupe"')
(5, '0.018*"participant" + 0.009*"increase" + 0.008*"activité" + 0.007*"social" + 0.007*"objectif"')
(6, '0.012*"participant" + 0.011*"famille" + 0.011*"parent" + 0.010*"enfant" + 0.010*"activité"')
(7, '0.018*"bénévole" + 0.009*"organisme" + 0.007*"satisfaction" + 0.007*"guide" + 0.006*"service"')
(8, '0.014*"activité" + 0.012*"participant" + 0.009*"jeune" + 0.006*"femme" + 0.006*"permettre"')
(9, '0.015*"groupe" + 0.012*"femme" + 0.011*"participant" + 0.007*"atelier" + 0.007*"résultat"')

 [('moyenne', 0.016800

In [153]:
# We can now output a visualization of the possible topics

dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl','rb'))
lda = gensim.models.LdaModel.load('model_fr.gensim')

# Library to visualize the LDA
import pyLDAvis.gensim

# set up the display features 
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [162]:
### Try with a new text ### 

test_text = """C'est très difficile de faire des avances à moins qu'on commence 
                à facilitier des activités pour des enfants et les familles. Une 
                activité de plus peut faire la différence dans des projets sociaux. 
                On a donc besoin de la collaboration des organismes pour obtenir 
                des meilleurs résultats. 
""" 
doc = nlp_fr(test_text)
text_lemmas = [lemma[1] for lemma in filter_lemmas(doc)]
new_doc_bow = dictionary.doc2bow(text_lemmas)
print(ldamodel.get_document_topics(new_doc_bow))

## Note maps go to x -> x+1

[(6, 0.17447309), (9, 0.7755191)]
