In [266]:
import tomotopy as tp
import spacy
from spacy.tokens import DocBin, Doc
Doc.set_extension("ID", default='')
Doc.set_extension("headline", default='')
Doc.set_extension("label", default='')
import pandas as pd 
import os 
from tqdm import tqdm
import numpy as np
import pandas as pd
import regex as re
re.DEFAULT_VERSION = re.VERSION1
import plotly.express as px
from IPython.display import Image
from gensim.models.phrases import Phraser, Phrases
import time
import gc
from bertopic import BERTopic 
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModel
import zipfile
from unidecode import unidecode


## Apply NLP pipeline for tokenization, lemmatization and other features for latter uses

In [38]:
textDf = pd.read_csv('data/le_figaro.csv')
textDf = textDf.fillna('')
textDf

Unnamed: 0,date,pageNum,paragraphNum,content,headline,label
0,1870-01-01,0,9,Admbûstrateur . AUGUSTE DUNIONT ABOMMBHXins Pa...,,
1,1870-01-01,0,13,m part ri pas demain à l'occasion du jour de l...,Le Figaro,
2,1870-01-01,0,20-21-22-23-24,"Notre éclectisme en roli tique qui, pour être ...",,
3,1870-01-01,0,25-26,"fœil bravement fixé sur le couteau, gravi les ...",,
4,1870-01-01,0,27-28,". Hier matii , lés gens dé Batignolles considé...",,
...,...,...,...,...,...,...
791868,1910-12-31,5,7,Le sujet imaginé par MM. Gheusi et Mé rane aur...,LA SOIRÉE LE MIRACLE A L'OPÉRA,culture
791869,1910-12-31,5,9-10-11-12-13-14-15,M lle Chenal M. Muratore si l'on en juge parla...,A L'OPÉRA Le Miracle,culture
791870,1910-12-31,5,16,à 8 heures pour les représentations de M. Gili...,"Ce-soir : A l'Opéra,",culture
791871,1910-12-31,5,18-19,l'Habitation forcée SUITE derrière le vert ble...,Feuilleton du FIGARO du 31 Décembre,autres


In [8]:
doc_bin = DocBin(store_user_data=True)

nlp = spacy.load('fr_core_news_lg', exclude=["ner"])
texts = ((row.content, ('_'.join([str(row['date']),str(row['pageNum']),str(row['paragraphNum'])]), 
                        row['headline'], row['label']))
         for _, row in textDf.iterrows())
count = 0
nDocs = len(textDf)
with tqdm(total=len(textDf),mininterval = 5, miniters =1000) as pbar:
    for doc, (ID, headline, label) in nlp.pipe(texts,as_tuples = True, batch_size=2048,n_process=16):
        doc._.ID = ID
        doc._.headline = headline
        doc._.label = label
        doc_bin.add(doc)
        pbar.update(1)
        #split data into several docbin
        if (count%50_000 == 0) or (count == nDocs-1):
            i = int(count/50_000)-4 if (count != nDocs-1) else int(count/50_000) -3
            if count>0:
                doc_bin.to_disk(f"data/spacy/le_figaro{i}.spacy")
                del doc_bin
                gc.collect()
            doc_bin = DocBin(store_user_data=True)
            
        count+=1


 49%|███████████████▎               | 391872/791873 [1:42:38<1:44:46, 63.63it/s]


In [9]:
len(doc_bin)

0

# Pachinko allocation

In [2]:
nlp = spacy.load('fr_core_news_lg', disable=["ner"])

doc_bin = DocBin(store_user_data=True)
for root, dirs, files in os.walk('data/spacy'):
    for name in tqdm(files):
        if name.endswith((".spacy")):
            doc_bin.merge(DocBin(store_user_data=True).from_disk("./data/spacy/"+name))


100%|███████████████████████████████████████████| 12/12 [01:34<00:00,  7.88s/it]
0it [00:00, ?it/s]


In [47]:
def preprocess_text(doc):
    ID = doc._.ID
    headline = doc._.headline
    label = doc._.label
    oovRatio = np.mean([1 if word.is_oov else 0 for word in doc if word.is_alpha])
    lemmas = [word.lemma_.lower() for word in doc 
           if word.is_alpha and (not word.is_stop) and (len(word.lemma_)>2) and (not word.is_oov)] 
    
    return ID, doc.text, lemmas, oovRatio, headline, label

In [55]:
docs = [preprocess_text(doc) for doc in doc_bin.get_docs(nlp.vocab)]
textDf = pd.DataFrame(docs, columns=['ID','content','lemmatized','oovRatio','headline','label'])
textDf = textDf[(textDf.lemmatized.apply(len)>5) &
                (textDf.oovRatio<0.6) &
                (textDf.label=='')]
print(len(textDf))

709265


In [65]:
raw_articles = textDf["lemmatized"].tolist()
bigram = Phrases(raw_articles, min_count=10, threshold=10)
bigram_mod = Phraser(bigram)
raw_articles = list(bigram_mod[raw_articles])
trigram = Phrases(raw_articles, min_count=10, threshold=10)
trigram_mod = Phraser(bigram)
raw_articles = list(trigram_mod[raw_articles])
textDf["nGram"] = raw_articles
textDf["nGram"] = textDf["nGram"].apply(' '.join)
textDf["lemmatized"] = textDf["lemmatized"].apply(' '.join)
textDf.to_csv('data/le_figaro_lemmatized_without_stop.csv',index=False)

In [None]:
textDf = pd.read_csv('data/le_figaro_lemmatized_without_stop.csv')
raw_articles = textDf.nGram.str.split().to_list()

In [66]:
corpus = tp.utils.Corpus()
for doc in raw_articles:
    if doc:
        corpus.add_doc(doc)

In [None]:
k1_max = 7
k1_min = 2 
k2_max = 18
k2_min = 8
params = []
scores = []
num_iter = 0
max_iter = 0
for k1 in range(k1_min, k1_max):
    for k2 in range(max(k1,k2_min),k2_max):
        max_iter +=1
        
start = time.time()
for k1 in range(k1_min, k1_max):
    for k2 in range(max(k1,k2_min),k2_max):
        num_iter+=1
        model = tp.PAModel(tw=tp.TermWeight.IDF,min_cf = 10, min_df=5, rm_top=25, k1=k1, k2=k2, corpus=corpus)
        model.burn_in = 20
        model.train(40, workers=24)
        score = tp.coherence.Coherence(model, coherence="c_v").get_score()
        params.append((k1,k2))
        scores.append(score)
        print("Runtime: %.2f seconds" %(time.time() - start), "|| Number of Searches: %s out of  %s" %(num_iter, max_iter), "|| k1: %s & k2: %s || coherence : %.3f" %(k1,k2,score))

In [82]:
for ind,score in enumerate(scores):
    if score>max(scores)*0.95:
        print("k1: %s & k2: %s || coherence : %.3f" %(params[ind][0],params[ind][1],score))


k1: 3 & k2: 9 || coherence : 0.659
k1: 4 & k2: 11 || coherence : 0.685
k1: 4 & k2: 14 || coherence : 0.694
k1: 5 & k2: 15 || coherence : 0.668


In [91]:
k1 = 4
k2 = 14
model = tp.PAModel(tw=tp.TermWeight.IDF,min_cf = 10, min_df=5, rm_top=25, k1=k1, k2=k2, corpus=corpus)
model.burn_in = 20
model.train(100, workers=24)
score = tp.coherence.Coherence(model, coherence="c_v").get_score()
print(score)

0.7261144071817398


In [94]:
for k in range(k2):
    print('Topic #{}'.format(k))
    print("\t", [w for w, p in model.get_topic_words(k)])

Topic #0
	 ['gouvernement', 'angleterre', 'anglais', 'allemagne', 'ministre', 'russie', 'france', 'français', 'allemand', 'empereur']
Topic #1
	 ['officier', 'armée', 'troupe', 'soldat', 'capitaine', 'militaire', 'navire', 'commandant', 'guerre', 'colonel']
Topic #2
	 ['comte', 'mlle', 'comtesse', 'comte_comtesse', 'marquis', 'baron', 'vicomte', 'baronne', 'paul', 'docteur']
Topic #3
	 ['politique', 'france', 'pays', 'république', 'peuple', 'chose', 'esprit', 'gouvernement', 'estper', 'idée']
Topic #4
	 ['monsieur', 'femme', 'aller', 'oeil', 'dire', 'rien', 'main', 'cœur', 'vie', 'jamais']
Topic #5
	 ['rue', 'maison', 'eau', 'vendre', 'prix', 'franc', 'blanc', 'vin', 'noir', 'gramme']
Topic #6
	 ['prix', 'roi', 'course', 'fête', 'prince', 'cheval', 'palais', 'voiture', 'empereur', 'majesté']
Topic #7
	 ['théâtre', 'représentation', 'pièce', 'artiste', 'opéra', 'jouer', 'mlle', 'soir', 'succès', 'rôle']
Topic #8
	 ['franc', 'chambre', 'commission', 'député', 'loi', 'séance', 'million', 

# BERTopic

In [425]:
'''## tokenizer ici meme, découper pour doc entre 256 et 512 bert token, en respectant les phrases et fournir embeddings + token + text
def splitLongDoc(doc):
    if len(doc._.label) != 0:
        return '','', ''
    length = len(doc)
    if length>=800:
        nSegment = int(length/800)+1
        maxLen = length/nSegment
        content = ['']
        for sentence in doc.sents:
            sentLen = len(sentence)
            if (len(content[-1])+sentLen<maxLen) or (len(content[-1]) < 200):
                content[-1] = content[-1] +' '+ sentence.text
            else:
                content.append(sentence.text)
        if len(content[-1])<100:
            content[-2] = content[-2] + ' ' +content[-1]
            content = content[:-1]
            
    else:
        text = doc.text
        content = [text] if len(text)>100 else ''
        
    return doc._.ID, content, doc._.headline'''

class DocSplitor:
    def __init__(self, model = None, tokenizer = None, lim = 382, device='cuda:1'):
        self.device = device
        self.model = model.to(device)
        print(f'Computation on {model.device}')
        self.tokenizer = tokenizer
        self.lim = lim
    
    def mergeList(self, l1,l2):
        return l1+l2
    
    def mergeStr(self, s1,s2):
        return s1+' '+s2
    
    def merge(self, toks1, toks2):
        return {'content':self.mergeStr(toks1['content'],toks2['content']), 
                'tokens':self.mergeList(toks1['tokens'],toks2['tokens']), 
                'lemmas':self.mergeList(toks1['lemmas'],toks2['lemmas']),
                'nTokens':toks1['nTokens']+toks2['nTokens']}
    
    def mergeAll(self, tokenized):
        if len(tokenized)>2:
            return self.merge(tokenized[0],self.mergeAll(tokenized[1:]))
        else:
            return self.merge(*tokenized) if len(tokenized)>1 else tokenized[0]
    
    def checkLim(self, toks1,toks2):
        return ((toks1['nTokens'] + toks2['nTokens']) <= self.lim)
    
    
    def resize(self, sentence): 
        tokens = self.tokenizer.encode(sentence.text)[1:-1]
        length = len(tokens)
        ratio = len(tokens)/self.lim
        if (ratio<=1): 
            segments = [(sentence,tokens)]
        else:
            segmentSize = int(length/(int(ratio)+2))+1
            segments = [sentence[segmentSize*i:segmentSize*(i+1)] for i in range((int(ratio)+1))]
            segments  =[(segment,self.tokenizer.encode(segment.text)[1:-1])
                        for segment in segments]
                         
        
        return segments
    
    
    def sentenceProcessing(self, sentence, tokens):
        lemmas = [word.lemma_.lower() for word in sentence 
                  if word.is_alpha and (not word.is_stop) and (len(word.lemma_)>2) and (not word.is_oov)]
        return {'content': sentence.text,
                'tokens': tokens,
                'lemmas': lemmas,
                'nTokens': len(tokens)}
    
    
    def groupSentences(self, tokens, length):
        # function to merge list such that it's stay smaller than lim and each block is approximatly the same size 
        if (length>=3):
            if (tokens[0]['nTokens'] <= tokens[2]['nTokens']) and self.checkLim(tokens[0],tokens[1]):
                return ([self.merge(tokens[0],tokens[1])] + 
                        self.groupSentences(tokens[2:],length-2 ))
            
            elif self.checkLim(tokens[1],tokens[2]):
                return ([tokens[0], self.merge(tokens[1],tokens[2])] + 
                        self.groupSentences(tokens[3:],length-3 ))
            
            else: 
                return (tokens[:2] +
                        self.groupSentences(tokens[2:],length-2 ))
            
        elif (length==2):
            if self.checkLim(tokens[0],tokens[1]):
                return [self.merge(tokens[0],tokens[1])] 
            
            else:
                return tokens
            
        else:
            return tokens

    def tokenize(self, doc):

        if len(doc._.label) != 0 or (len(doc)<50):
            return ('','',[], [], [], [])
        # [1:-1] to remove delimiter
        
        tokenized = [self.sentenceProcessing(segment,tokens) 
                     for sentence in doc.sents
                     for (segment,tokens) in self.resize(sentence)]
        if sum([sentence['nTokens'] for sentence in tokenized])>self.lim:
            
            mergeable = True
            while mergeable:
                nBlock = len(tokenized)
                tokenized = self.groupSentences(tokenized,nBlock)
                mergeable = not (nBlock == len(tokenized))
        else: 
            tokenized = [self.mergeAll(tokenized)]

        return (doc._.ID, 
                doc._.headline, 
                [str(i) for i in range(len(tokenized))],
                [sentence['content'] for sentence in tokenized], 
                [sentence['lemmas'] for sentence in tokenized], 
                [[2]+sentence['tokens']+[3] for sentence in tokenized])
    
    def getAttentionMask(self, attentionLen): 
        return [1]*attentionLen + [0]*(self.lim+2-attentionLen)
    
    
    def batchEmbeds(self, input_ids,attention_mask, tokEmb = False, nToks = None):
        
        output = self.model(input_ids=input_ids.to(self.device),
                            attention_mask=attention_mask.to(self.device),
                            output_hidden_states=tokEmb,
                            output_attentions=False)
        if tokEmb:
            tEmbs = sum(output.hidden_states[-4:])/4
            tEmbs = torch.vstack([tEmbs[ind,:attentionLen,:] 
                                  for ind, attentionLen in enumerate(nToks)])
            return tEmbs.cpu().numpy().astype('float16')
        
        else:
            return output.pooler_output.cpu().numpy().astype('float16')
    
    
    def saveNumpyAsZip(self, embs,index,path,nbatch):
        
        if os.path.isfile(path):
            with zipfile.ZipFile(path, 'a') as archive:
                with archive.open(f'index{nbatch}.npy','w') as file:
                    np.save(file, index)
                with archive.open(f'embs{nbatch}.npy','w') as file:
                    np.save(file, embs)
        else: 
            with open(path, 'wb') as file:
                np.savez_compressed(file,**{ f'index{nbatch}' :index , f'embs{nbatch}' : embs})
                
                
    def getEmbeds(self, tokens, ID, tokEmb = False):
        nToks= [len(toks) for toks in tokens]
        attention_mask = torch.ByteTensor([self.getAttentionMask(attentionLen)
                                          for attentionLen in nToks],device='cpu')
        
        input_ids = torch.nn.utils.rnn.pad_sequence([torch.IntTensor(toks,device='cpu') 
                                                     for toks in [[0]*(self.lim+2)]+tokens], 
                                                     batch_first=True)[1:,:]

        if tokEmb:
            self.getTokEmbeds(input_ids, attention_mask, ID, nToks, batchSize=256)
        else:
            self.getDocEmbeds(input_ids, attention_mask, ID, batchSize=512)
        
    
    def getDocEmbeds(self, input_ids, attention_mask, ID, saveFreq =100, batchSize = 512):
        docEmbs = []
        count = 0
        with torch.no_grad():
            for batch in tqdm(range(0,len(ID),batchSize)):
                
                dEmbs = self.batchEmbeds(input_ids[batch:batch+batchSize,:],
                                         attention_mask[batch:batch+batchSize,:])
                
                docEmbs.append(dEmbs)
                
                if ((batch%(batchSize*saveFreq)== 0) and(batch>0))  or (batch+batchSize>=len(ID)):
                    
                    docInd = np.array(ID[batchSize*saveFreq*count:batchSize*saveFreq*(count+1)])
                    self.saveNumpyAsZip(np.vstack(docEmbs),docInd,'data/bertEmbeddings/docEmbs.npz',count)
                    del docEmbs, docInd
                    docEmbs = []
                    torch.cuda.empty_cache()
                    gc.collect()
                    count+=1
                    
                    
    def getTokEmbeds(self, input_ids, attention_mask, ID, nToks, saveFreq =1000, batchSize = 256):
        tokEmbs =  []
        count = 0
        
        with torch.no_grad():
            for batch in tqdm(range(0,len(ID),batchSize)):
                tEmbs = self.batchEmbeds(input_ids[batch:batch+batchSize,:],
                                         attention_mask[batch:batch+batchSize,:],
                                         tokEmb = True,
                                         nToks = nToks[batch:batch+batchSize])
                
                tokEmbs.append(tEmbs)
                
                if ((batch%(batchSize*saveFreq)== 0) and(batch>0))  or (batch+batchSize>=len(ID)):
                    
                    tokInd = np.array([(ID[batchSize*saveFreq*count+i],tokIndex) 
                                       for i, attentionLen 
                                       in enumerate(nToks[batchSize*saveFreq*count:batchSize*saveFreq*(count+1)])
                                       for tokIndex in range(attentionLen)])
                    #self.tokInd = tokInd
                    #self.tokEmbs = tokEmbs
                    self.saveNumpyAsZip(np.vstack(tokEmbs),tokInd,'data/bertEmbeddings/tokEmbs.npz',count)
                    del tokEmbs, tokInd
                    tokEmbs =  []
                    torch.cuda.empty_cache()
                    gc.collect()
                    count+=1
                    
            

        
        
    

In [406]:
globals()['_361']

Unnamed: 0,ID,headline,segmentID,content,lemmas,tokens
0,1898-03-21_0_27_0,De notre correspondant de,0,Budapest : A l'occasion du prochain jubilé du ...,"[budapest, occasion, prochain, jubilé, règne, ...","[2, 16466, 30, 37, 79, 11, 2213, 378, 2086, 71..."
2,1898-03-21_0_29_0,,0,Au Tribunal correctionnel : X Le président . E...,"[tribunal, correctionnel, président, avoir, vo...","[2, 748, 11746, 13330, 30, 60, 447, 1031, 18, ..."
3,1898-03-21_0_30_0,,0,. Nous aurons le plaisir d'offrir demain à nos...,"[avoir, plaisir, offrir, demain, lecteur, prim...","[2, 18, 970, 10172, 354, 3906, 71, 11, 7002, 2..."
4,1898-03-21_0_31_0,POUR ALICE LA VIGNE AVEUGLE -,0,Nous ne publierons que d tnain la quatrième li...,"[publier, liste, souscription, recevoir, repré...","[2, 970, 446, 7212, 406, 393, 71, 6477, 468, 3..."
5,1898-03-21_0_32-33_0,De notre correspondant de,0,"Nice : t Ce matin, deux services ont-été céléb...","[nice, matin, service, être, célébrer, neuf, h...","[2, 5759, 30, 87, 860, 974, 16, 603, 3186, 507..."
...,...,...,...,...,...,...
791865,1878-05-13_2_9-10-11-12-13-14-15_11,PREMIÈRES KEFEÉSEHTATIONS,11,-soir dimanche à la représentation clés Sept C...,"[soir, dimanche, représentation, clé, château,...","[2, 17, 897, 1993, 130, 348, 4079, 25010, 1242..."
791866,1878-05-13_2_17-18_0,,0,coursés au bois de boulogne Les journées se su...,"[bois, boulogne, journée, longchamps, brillant...","[2, 1009, 381, 366, 2590, 336, 5502, 2293, 519..."
791866,1878-05-13_2_17-18_1,,1,Ce costume est très à-la mode. Il m'a paru aus...,"[costume, mode, paraître, chapeau, fort, genre...","[2, 860, 10300, 401, 739, 130, 17, 348, 4726, ..."
791866,1878-05-13_2_17-18_2,,2,Mais nous ne manquons pas de courses en semain...,"[manquer, course, semaine, pouvoir, rattraper,...","[2, 811, 512, 446, 774, 19943, 435, 336, 4843,..."


In [4]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")
model =  AutoModel.from_pretrained('dbmdz/bert-base-french-europeana-cased')

Some weights of the model checkpoint at dbmdz/bert-base-french-europeana-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
splitor = DocSplitor(model,tokenizer)
            
docs = [splitor.tokenize(doc) for doc in doc_bin.get_docs(nlp.vocab)]
BERTDf = pd.DataFrame(docs, columns=['ID','headline','segmentID','content','lemmas','tokens'])
BERTDf = BERTDf.explode(['segmentID','content','lemmas','tokens'])
BERTDf  =BERTDf[~BERTDf.content.isna()].copy()
BERTDf['ID'] += '_' +BERTDf['segmentID'] 
BERTDf.to_csv('data/le_figaro_BERT.csv',index=False)

del docs
del doc_bin
gc.collect()

Computation on cuda:1


Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


0

In [246]:
#with open('data/bertEmbeddings/embs.npz', 'wb') as file:
#    np.savez_compressed(file, **{'ok':[1,2,3],'wah':[3,2,1]})
    
with open(f'data/bertEmbeddings/tokEmbs.npz', 'rb') as tokFile:
    test = np.load(tokFile)
    #ind = pd.DataFrame((test[f'index{0}'][0]))
    #embs = pd.DataFrame(test[f'embs{0}']).apply(np.array,axis=1).rename("embs")
    content = [pd.concat([pd.DataFrame((test[f'index{i}']),columns = ['ID','tokInd']),
                          pd.DataFrame(test[f'embs{i}']).apply(np.array,axis=1).rename("embs")],axis=1) 
               for i in range(int(len(test.files)/2))]
    print(len(content))
    print(len(set(content[0].ID.values)|set(content[1].ID.values)))
    print([len(np.unique(d.ID)) for d in content])
    df  = pd.concat(content)
        
        #ind, embs = zip(*[(ind, embs) for ind, embs in test.items()])

2
512
[256, 256]


In [298]:
splitor = DocSplitor(model,tokenizer,device='cuda:0')
#test = BERTDf.sample(512)
splitor.getEmbeds(BERTDf.tokens.tolist(),BERTDf.ID.tolist())

Computation on cuda:0


100%|████████████████████████████████████| 2675/2675 [11:00:31<00:00, 14.82s/it]


In [424]:
del splitor
torch.cuda.empty_cache()
gc.collect()

Exception ignored in: <function ZipFile.__del__ at 0x7fd84e986dc0>
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.9/zipfile.py", line 1816, in __del__
    self.close()
  File "/opt/anaconda/lib/python3.9/zipfile.py", line 1833, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


219

In [302]:
topic = {'science':['découverte','science','scienti','techni'],
         'futur': ['déclin','décadence','futur','avenir','progrès']}
keywords = topic['science'] + topic['futur']

BERTDfKws = BERTDf[BERTDf.content.apply(unidecode).str.lower().apply(lambda x : any(kw in x for kw in keywords))]


Computation on cuda:0


In [None]:
splitor = DocSplitor(model,tokenizer,device='cuda:1')
splitor.getEmbeds(BERTDfKws.tokens.tolist(),BERTDfKws.ID.tolist(), tokEmb=True)

Computation on cuda:1


 41%|████████████████▉                        | 145/352 [18:16<25:52,  7.50s/it]

In [313]:
with open(f'data/bertEmbeddings/docEmbs.npz', 'rb') as tokFile:
    zipedFiles = np.load(tokFile)
    #remain = [pd.DataFrame(zipedFile[f'embs{i}'][len(zipedFile[f'index{i}']):])
    #           for i in range(int(len(zipedFile.files)/2))]
    #content = [pd.DataFrame(zipedFile[f'embs{i}'][:len(zipedFile[f'index{i}'])], index= zipedFile[f'index{i}'])
    #           for i in range(int(len(zipedFile.files)/2))]
    embs = [zipedFiles[file][:len(zipedFiles[file])]
            for file in zipedFiles if 'embs' in file]
    index = [zipedFiles[file][:len(zipedFiles[file])]
            for file in zipedFiles if 'index' in file]
    docEmbs = pd.DataFrame(np.vstack(embs),index =np.concatenate(index)).apply(np.array,axis=1)
    del embs
    del index

In [None]:
vectorizer_model = CountVectorizer(ngram_range=(1, 3), 
                                   max_df = 0.7,
                                   min_df = 10,
                                   strip_accents = 'unicode')
topic_model = BERTopic(verbose=True,
                       #embedding_model=embedding_model,#'dbmdz/bert-base-french-europeana-cased',
                       nr_topics = 'auto',
                       min_topic_size = 100,
                       vectorizer_model = vectorizer_model)
topics, probs = topic_model.fit_transform(BERTDf.set_index('ID').loc[docEmbs.index].content.tolist(),
                                          np.array(docEmbs.tolist()))
topic_model.save("BERTopic.bin")

In [88]:
np.array(test['docEmbs'].tolist()).shape

(1000, 768)

In [None]:
t = time.time()
a = tokenizer(texts[:20])
print(time.time()-t)
print(a)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/222k [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-french-europeana-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [215]:
BERTDf[BERTDf.tokens.apply(len)<20]

Unnamed: 0,ID,headline,content,tokens
44,1898-03-22_0_7,H. DE VILLEMESSANï? Fondateur ABONNEMENT ' Tro...,On t'abonne dans tous les Bureaux de Poste de ...,"[2, 639, 87, 11, 8467, 422, 671, 361, 11881, 3..."
45,1898-03-22_0_9,"RÉDACTION ADMINISTRATION PUBUOTÏÎ 26, Rue Drou...","ré, i. UAIie 102.46 Rédaction TÉLÉPHONE 102.47...","[2, 437, 16, 76, 18, 57, 1913, 364, 3530, 18, ..."
354,1898-03-26_1_31,,Notre Service' de Librairie so chargé d'envoye...,"[2, 2997, 9323, 11, 336, 14730, 573, 3252, 71,..."
397,1898-03-26_3_95,pOLE j ORD pOLE j jORD,Ouvert de 8 h du matin à midi minuit. Salle en...,"[2, 7760, 336, 28, 75, 378, 974, 130, 1486, 65..."
777,1898-04-01_3_21,"Ce-soir, au Gymnase,","neuvième spectacle d'abonnement, 4 série des v...","[2, 14855, 3953, 71, 11, 9769, 16, 24, 1567, 3..."
...,...,...,...,...
733307,1878-05-11_2_56,"A. DALIFOL, 172, quai Jemmapes, 172 FORGES et ...","Mach à vapeur ver tic' 1 et horizont 1 144, Fa...","[2, 19208, 130, 5283, 1127, 5239, 11, 21, 353,..."
733326,1878-05-11_2_98,"FARINE MORTON ,",Alimentation naturelle des Enfants. Se vend ch...,"[2, 29816, 7697, 370, 14532, 18, 1083, 2733, 1..."
733377,1878-05-11_2_196,,Mme R désire trouver place de dame de compagni...,"[2, 868, 54, 4735, 2061, 1087, 336, 4211, 336,..."
733485,1878-05-12_2_100,"APP grands et petits, chamb",cabin magas boutiq terrains il louer. Pris mod...,"[2, 23585, 213, 4402, 2705, 18214, 9362, 399, ..."


In [138]:
a = model(torch.tensor([tokenizer.encode(t) for t in texts[:3]]))

NameError: name 'texts' is not defined

In [203]:
a['last_hidden_state'][:,0,:]-a['pooler_output']

tensor([[ 2.4733e-01,  1.6857e-01,  6.5338e-01,  1.7024e-01,  5.9312e-02,
          2.0989e-02, -3.5540e-01,  3.5775e-01,  1.0148e+00,  6.4225e-01,
         -1.5062e-01,  1.0290e+01, -2.5911e-01,  6.7702e-01, -1.9008e-01,
         -6.8659e-02,  1.2729e+00,  1.0137e+00,  8.2785e-01, -4.8681e-01,
          4.3917e-01,  3.0807e-01, -3.1741e-02, -1.6302e+00, -7.0986e-01,
         -1.5057e-01,  3.5652e-01,  4.2652e-01, -1.2928e-01,  6.0701e-01,
          1.6977e+00,  9.8092e-01, -4.3878e-01,  5.2851e-01, -1.0169e+00,
         -1.4644e+00, -1.0988e+00,  4.5423e-01,  7.5249e-01,  1.4139e-01,
          7.5322e-01, -1.3733e+00,  2.2834e-01,  1.5011e+00,  3.0981e-01,
         -6.1100e-01,  8.8547e-01,  6.7765e-01,  1.3227e-01,  4.6886e-01,
          1.1077e+00,  7.7351e-01,  2.5898e-01, -4.8556e-01,  5.0439e-01,
         -9.4194e-01, -3.0323e-01,  2.6878e-01,  4.5885e-01, -1.0652e-01,
         -3.8497e+00,  5.9461e-01,  6.5919e-01, -1.7917e-01,  3.8417e-01,
         -3.3791e-01,  6.4720e-03,  1.

In [198]:
[i for i in a]

['last_hidden_state', 'pooler_output']