In [157]:
import tomotopy as tp
import spacy
from spacy.tokens import DocBin, Doc
Doc.set_extension("ID", default='')
Doc.set_extension("headline", default='')
Doc.set_extension("label", default='')
import pandas as pd 
import os 
from tqdm import tqdm
import numpy as np
import pandas as pd
import regex as re
re.DEFAULT_VERSION = re.VERSION1
import plotly.express as px
from IPython.display import Image
from gensim.models.phrases import Phraser, Phrases
import time
import gc
from bertopic import BERTopic 
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

## Apply NLP pipeline for tokenization, lemmatization and other features for latter uses

In [2]:
textDf = pd.read_csv('data/le_figaro.csv')
textDf = textDf.fillna('')
textDf

Unnamed: 0,date,pageNum,paragraphNum,content,headline,label
0,1870-01-01,0,9,Admbûstrateur . AUGUSTE DUNIONT ABOMMBHXins Pa...,,
1,1870-01-01,0,13,m part ri pas demain à l'occasion du jour de l...,Le Figaro,
2,1870-01-01,0,20-21-22-23-24,"Notre éclectisme en roli tique qui, pour être ...",,
3,1870-01-01,0,25-26,"fœil bravement fixé sur le couteau, gravi les ...",,
4,1870-01-01,0,27-28,". Hier matii , lés gens dé Batignolles considé...",,
...,...,...,...,...,...,...
791868,1910-12-31,5,7,Le sujet imaginé par MM. Gheusi et Mé rane aur...,LA SOIRÉE LE MIRACLE A L'OPÉRA,culture
791869,1910-12-31,5,9-10-11-12-13-14-15,M lle Chenal M. Muratore si l'on en juge parla...,A L'OPÉRA Le Miracle,culture
791870,1910-12-31,5,16,à 8 heures pour les représentations de M. Gili...,"Ce-soir : A l'Opéra,",culture
791871,1910-12-31,5,18-19,l'Habitation forcée SUITE derrière le vert ble...,Feuilleton du FIGARO du 31 Décembre,autres


In [8]:
doc_bin = DocBin(store_user_data=True)

nlp = spacy.load('fr_core_news_lg', exclude=["ner"])
texts = ((row.content, ('_'.join([str(row['date']),str(row['pageNum']),str(row['paragraphNum'])]), 
                        row['headline'], row['label']))
         for _, row in textDf.iterrows())
count = 0
nDocs = len(textDf)
with tqdm(total=len(textDf),mininterval = 5, miniters =1000) as pbar:
    for doc, (ID, headline, label) in nlp.pipe(texts,as_tuples = True, batch_size=2048,n_process=16):
        doc._.ID = ID
        doc._.headline = headline
        doc._.label = label
        doc_bin.add(doc)
        pbar.update(1)
        #split data into several docbin
        if (count%50_000 == 0) or (count == nDocs-1):
            i = int(count/50_000)-4 if (count != nDocs-1) else int(count/50_000) -3
            if count>0:
                doc_bin.to_disk(f"data/spacy/le_figaro{i}.spacy")
                del doc_bin
                gc.collect()
            doc_bin = DocBin(store_user_data=True)
            
        count+=1


 49%|███████████████▎               | 391872/791873 [1:42:38<1:44:46, 63.63it/s]


In [9]:
len(doc_bin)

0

# Pachinko allocation

In [10]:
nlp = spacy.load('fr_core_news_lg', disable=["ner"])

doc_bin = DocBin(store_user_data=True)
for root, dirs, files in os.walk('data/spacy'):
    for name in tqdm(files):
        if name.endswith((".spacy")):
            doc_bin.merge(DocBin(store_user_data=True).from_disk("./data/spacy/"+name))


100%|███████████████████████████████████████████| 12/12 [02:14<00:00, 11.19s/it]
0it [00:00, ?it/s]


In [47]:
def preprocess_text(doc):
    ID = doc._.ID
    headline = doc._.headline
    label = doc._.label
    oovRatio = np.mean([1 if word.is_oov else 0 for word in doc if word.is_alpha])
    lemmas = [word.lemma_.lower() for word in doc 
           if word.is_alpha and (not word.is_stop) and (len(word.lemma_)>2) and (not word.is_oov)] 
    
    return ID, doc.text, lemmas, oovRatio, headline, label

In [55]:
docs = [preprocess_text(doc) for doc in doc_bin.get_docs(nlp.vocab)]
textDf = pd.DataFrame(docs, columns=['ID','content','lemmatized','oovRatio','headline','label'])
textDf = textDf[(textDf.lemmatized.apply(len)>5) &
                (textDf.oovRatio<0.6) &
                (textDf.label=='')]
print(len(textDf))

709265


In [65]:
raw_articles = textDf["lemmatized"].tolist()
bigram = Phrases(raw_articles, min_count=10, threshold=10)
bigram_mod = Phraser(bigram)
raw_articles = list(bigram_mod[raw_articles])
trigram = Phrases(raw_articles, min_count=10, threshold=10)
trigram_mod = Phraser(bigram)
raw_articles = list(trigram_mod[raw_articles])
textDf["nGram"] = raw_articles
textDf["nGram"] = textDf["nGram"].apply(' '.join)
textDf["lemmatized"] = textDf["lemmatized"].apply(' '.join)
textDf.to_csv('data/le_figaro_lemmatized_without_stop.csv',index=False)

In [None]:
textDf = pd.read_csv('data/le_figaro_lemmatized_without_stop.csv')
raw_articles = textDf.nGram.str.split().to_list()

In [66]:
corpus = tp.utils.Corpus()
for doc in raw_articles:
    if doc:
        corpus.add_doc(doc)

In [None]:
k1_max = 7
k1_min = 2 
k2_max = 18
k2_min = 8
params = []
scores = []
num_iter = 0
max_iter = 0
for k1 in range(k1_min, k1_max):
    for k2 in range(max(k1,k2_min),k2_max):
        max_iter +=1
        
start = time.time()
for k1 in range(k1_min, k1_max):
    for k2 in range(max(k1,k2_min),k2_max):
        num_iter+=1
        model = tp.PAModel(tw=tp.TermWeight.IDF,min_cf = 10, min_df=5, rm_top=25, k1=k1, k2=k2, corpus=corpus)
        model.burn_in = 20
        model.train(40, workers=24)
        score = tp.coherence.Coherence(model, coherence="c_v").get_score()
        params.append((k1,k2))
        scores.append(score)
        print("Runtime: %.2f seconds" %(time.time() - start), "|| Number of Searches: %s out of  %s" %(num_iter, max_iter), "|| k1: %s & k2: %s || coherence : %.3f" %(k1,k2,score))

In [82]:
for ind,score in enumerate(scores):
    if score>max(scores)*0.95:
        print("k1: %s & k2: %s || coherence : %.3f" %(params[ind][0],params[ind][1],score))


k1: 3 & k2: 9 || coherence : 0.659
k1: 4 & k2: 11 || coherence : 0.685
k1: 4 & k2: 14 || coherence : 0.694
k1: 5 & k2: 15 || coherence : 0.668


In [91]:
k1 = 4
k2 = 14
model = tp.PAModel(tw=tp.TermWeight.IDF,min_cf = 10, min_df=5, rm_top=25, k1=k1, k2=k2, corpus=corpus)
model.burn_in = 20
model.train(100, workers=24)
score = tp.coherence.Coherence(model, coherence="c_v").get_score()
print(score)

0.7261144071817398


In [94]:
for k in range(k2):
    print('Topic #{}'.format(k))
    print("\t", [w for w, p in model.get_topic_words(k)])

Topic #0
	 ['gouvernement', 'angleterre', 'anglais', 'allemagne', 'ministre', 'russie', 'france', 'français', 'allemand', 'empereur']
Topic #1
	 ['officier', 'armée', 'troupe', 'soldat', 'capitaine', 'militaire', 'navire', 'commandant', 'guerre', 'colonel']
Topic #2
	 ['comte', 'mlle', 'comtesse', 'comte_comtesse', 'marquis', 'baron', 'vicomte', 'baronne', 'paul', 'docteur']
Topic #3
	 ['politique', 'france', 'pays', 'république', 'peuple', 'chose', 'esprit', 'gouvernement', 'estper', 'idée']
Topic #4
	 ['monsieur', 'femme', 'aller', 'oeil', 'dire', 'rien', 'main', 'cœur', 'vie', 'jamais']
Topic #5
	 ['rue', 'maison', 'eau', 'vendre', 'prix', 'franc', 'blanc', 'vin', 'noir', 'gramme']
Topic #6
	 ['prix', 'roi', 'course', 'fête', 'prince', 'cheval', 'palais', 'voiture', 'empereur', 'majesté']
Topic #7
	 ['théâtre', 'représentation', 'pièce', 'artiste', 'opéra', 'jouer', 'mlle', 'soir', 'succès', 'rôle']
Topic #8
	 ['franc', 'chambre', 'commission', 'député', 'loi', 'séance', 'million', 

# BERTopic

In [149]:

## tokenizer ici meme, découper pour doc entre 256 et 512 bert token, en respectant les phrases et fournir embeddings + token + text
def splitLongDoc(doc):
    if len(doc._.label) != 0:
        return '','', ''
    length = len(doc)
    if length>=800:
        nSegment = int(length/800)+1
        maxLen = length/nSegment
        content = ['']
        for sentence in doc.sents:
            sentLen = len(sentence)
            if (len(content[-1])+sentLen<maxLen) or (len(content[-1]) < 200):
                content[-1] = content[-1] +' '+ sentence.text
            else:
                content.append(sentence.text)
        if len(content[-1])<100:
            content[-2] = content[-2] + ' ' +content[-1]
            content = content[:-1]
            
    else:
        text = doc.text
        content = [text] if len(text)>100 else ''
        
    return doc._.ID, content, doc._.headline
            
docs = [splitLongDoc(doc) for doc in doc_bin.get_docs(nlp.vocab)]
docs = [(ID, content, headline) for ID, content, headline in docs if content != '']
BERTDf = pd.DataFrame(docs, columns=['ID','content','headline'])
BERTDf= BERTDf.explode('content')


In [172]:
embedding_model = SentenceTransformer('dbmdz/bert-base-french-europeana-cased',device='cuda:1')

No sentence-transformers model found with name /home/gsicard/.cache/torch/sentence_transformers/dbmdz_bert-base-french-europeana-cased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/gsicard/.cache/torch/sentence_transformers/dbmdz_bert-base-french-europeana-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect 

In [174]:
texts = BERTDf.content.tolist()
vectorizer_model = CountVectorizer(ngram_range=(1, 3), 
                                   max_df = 0.6,
                                   min_df = 5,
                                   strip_accents = 'unicode')
topic_model = BERTopic(verbose=True,
                       embedding_model=embedding_model,#'dbmdz/bert-base-french-europeana-cased',
                       nr_topics = 'auto',
                       min_topic_size = 100,
                       vectorizer_model = vectorizer_model)
topics, probs = topic_model.fit_transform(texts)


Batches:   0%|          | 0/53489 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [181]:
embedding_model.encode(texts[:2000],show_progress_bar=True)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

array([[-0.4239223 ,  0.02248213, -0.20564407, ..., -0.8670402 ,
        -0.22687754, -0.03607981],
       [-0.24473065, -0.05312259, -0.09280771, ..., -0.5015501 ,
         0.09548544,  0.5582943 ],
       [-0.5172163 , -0.2801716 , -0.24598305, ..., -0.21981826,
         0.14662059, -0.07403921],
       ...,
       [-0.33199543, -0.01667139, -0.2851535 , ..., -0.7983566 ,
        -0.14050919,  0.04579991],
       [-0.37848774,  0.03212813, -0.37159485, ..., -0.7773362 ,
         0.20419456, -0.01591571],
       [-0.46303445,  0.16347334, -0.29598263, ..., -0.7773549 ,
        -0.1193127 ,  0.17085151]], dtype=float32)

In [None]:
t = time.time()
a = tokenizer(texts[:20])
print(time.time()-t)
print(a)

In [195]:
from transformers.pipelines import pipeline, autoTokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/electra-base-french-europeana-cased-discriminator")
model =  pipeline("feature-extraction", model='dbmdz/bert-base-french-europeana-cased')


Some weights of the model checkpoint at dbmdz/bert-base-french-europeana-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [196]:
b = model(texts[:1])

In [204]:
len(tokenizer(texts[0])['input_ids'])

206

In [213]:
np.array(b) == np.array(model(tokenizer.decode(tokenizer(texts[:1])['input_ids'])))

TypeError: 'list' object cannot be interpreted as an integer

In [144]:

BERTDf[(BERTDf.content.apply(len)<120) & (BERTDf.content.apply(len)>75)]

Unnamed: 0,ID,content,headline
63,1898-03-22_0_34,"Mais, monsieur, il y a plus fort que cela en F...",
73,1898-03-22_1_9,"ble, que lesJAnglais occupent la baie de Mirs,...",
90,1898-03-22_2_24,"CENTRAL-HOTEL, le plus grand et le plus élégan...",AVIS DIVERS
91,1898-03-22_2_25,Demandez dans les bons restaurants l'exquis PE...,
169,1898-03-23_2_30,Consultez votre médecin sur l'usage du Képhir ...,VOUS QUI SOUFFREZ DE L'ESTOMAC
...,...,...,...
791821,1878-05-12_2_162,ENFANTS naturels. Constitution secrète d'ass '...,
791823,1878-05-12_2_164,Ns hautes référ dem régie d'une g J propriété ...,INGÉ
791828,1878-05-12_2_230,"FONDS ETRANGERS 3 0 0 CONSOLIDÉS, Midi heure é...",
791846,1878-05-13_0_29,Deux médecins sont invités et présentés l'un à...,
