In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
!ls ../data

DEMOCRATIE_ET_CITOYENNETE.csv
EVENTS.csv
LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.csv
LA_TRANSITION_ECOLOGIQUE.csv
ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.csv
Population_par_departement.csv
readme.md


In [6]:
df = pd.read_csv('../data/DEMOCRATIE_ET_CITOYENNETE.csv', index_col=0)

In [105]:
texts = (df.iloc[:, -2]+' '+df.iloc[:, -3]+' '+df.iloc[:, -4]+' '+df.iloc[:, -5]).dropna().values

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000
stop_words = ['le', 'la', 'est', 'de', 'qui', 'et', 'se', 'il', 'pour', 'un', 'des', 'une', 'sur', 'ou', 'je', 'les', 'que', 'dans', 'suis', 'sont', 'au', 'en', 'on', 'par', 'ai', 'du', 'ce', 'mais', 'ne', 'pas', 'plus', 'qu', 'être', 'leur', 'aux', 'mon', 'moi', 'me', 'ma', 'son', 'avoir', 'car', 'nous', 'ils', 'etc', 'vous', 'faire', 'si']

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tfidf = tfidf_vectorizer.fit_transform(texts)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tf = tf_vectorizer.fit_transform(texts)
tf_feature_names = tf_vectorizer.get_feature_names()

In [107]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)



In [108]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 8
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
france français tout ont peut personnes ceux monde
Topic 1:
quotas langue oui apprentissage française besoins fonction français
Topic 2:
politique accueil migratoire intégration doit avec européenne niveau
Topic 3:
pays aider origine développement leurs ces aide développer
Topic 4:
notre nos valeurs lois respect culture vie société
Topic 5:
travail logement oui intégration donner accueillir éducation non
Topic 6:
faut intégrer arrêter accueillir mettre cela aussi donc
Topic 7:
migrants accueillir économiques accueil non migrant france accepter
Topic 8:
sais oui rien sujet pense difficile trop pourquoi
Topic 9:
immigration choisie droit asile trop regroupement familial frontières
Topic 0:
voir ci dessus main oeuvre réponse accompagnement humanité
Topic 1:
français france doit ans nationalité française cas personne
Topic 2:
pays développement aider immigration faut origine contre oui
Topic 3:
politique immigration doit avec intégration france migratoire cette
Topic 4:
immigratio

In [109]:
nmf.components_.shape

(10, 1000)

In [110]:
first_topic = pd.DataFrame(nmf.transform(tfidf)).apply(np.argmax, axis=1)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


In [111]:
first_topic.groupby(first_topic.values).count()

0    6727
1    2757
2    4177
3    2462
4    1981
5    1652
6    1475
7    1861
8     738
9    2920
dtype: int64

In [112]:
pd.DataFrame(nmf.transform(tfidf)).apply(np.argmax)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


0     2575
1    16273
2    20422
3    20273
4    19584
5    21747
6    18007
7     2370
8      285
9    20377
dtype: int64