In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!pip install bertopic
!pip install ctransformers
!pip install 'transformers[torch]'
!pip install kaleido

[0m

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.decomposition import PCA
from bertopic.representation import KeyBERTInspired
from huggingface_hub import login
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datetime import datetime
from bertopic.representation import TextGeneration
from collections import Counter
import matplotlib.pyplot as plt
import re
import os

In [4]:
def clean_text(df):
  '''
  Fusionner les textes en un rapport complet
  '''
  merged_texts = []
  current_case = None
  current_text = ''
  for i, row in df.iterrows():
    if current_case is None:
        current_case = row['case']
        current_text = str(row['text']).strip()
    elif current_case == row['case']:
        current_text += '\n' + str(row['text']).strip()
    else:
        merged_texts.append(current_text)
        current_case = row['case']
        current_text = str(row['text']).strip()

  merged_texts.append(current_text)
  df = df[df.columns.difference(['text'])].drop_duplicates('case').reset_index(drop=True)
  df['merged_texts'] = merged_texts

  return df

In [5]:
def clean_text_special(df):
  '''
  La meme fonction de clean_text, mais avec un peu de modification pour s'adapter a
  des caratercs speciaux produit quand les fichiers sont importes dans Colab
  '''
  merged_texts = []

  current_case = None

  current_text = ''

  for i, row in df.iterrows():
    if current_case is None:
        current_case = row['ï»¿case']
        current_text = str(row['text']).strip()
    elif current_case == row['ï»¿case']:
        current_text += '\n' + str(row['text']).strip()
    else:
        merged_texts.append(current_text)
        current_case = row['ï»¿case']
        current_text = str(row['text']).strip()

  merged_texts.append(current_text)

  df = df[df.columns.difference(['text'])].drop_duplicates('ï»¿case').reset_index(drop=True)
  df['merged_texts'] = merged_texts

  return df

In [6]:
def remove_abnormal(text_list, abnormal_cara):
  '''
  Nettoyer les caracteres anormaux produits dans le processus de scraper
  '''
  non_latin_chars = set()
  latin_punctuations = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

  for text in text_list:
    non_latin_chars.update(set(re.findall(r'[^\x00-\x7F]', text)))

    non_latin_chars = non_latin_chars.difference(latin_punctuations)

    non_latin_chars_list = list(non_latin_chars)

  for i, text in enumerate(text_list):
        for key, value in abnormal_cara.items():
            text_list[i] = text_list[i].replace(key, value)

        filtered_text = ''.join([char for char in text_list[i] if char.isascii() or char in latin_punctuations])
        text_list[i] = filtered_text

  return text_list

In [7]:
#importer les fichier en csv, un peu moche j'avoue
argentina_df = pd.read_csv('/content/drive/MyDrive/data/Argentina.csv',encoding='Latin-1')
dominican_df = pd.read_csv('/content/drive/MyDrive/data/Dominican.csv')
honduras_df = pd.read_csv('/content/drive/MyDrive/data/Honduras.csv')
mexico_df = pd.read_csv('/content/drive/MyDrive/data/Mexico.csv')
salvador_df = pd.read_csv('/content/drive/MyDrive/data/Salvador.csv')
peru_df = pd.read_csv('/content/drive/MyDrive/data/Peru.csv')
uruguay_df = pd.read_csv('/content/drive/MyDrive/data/Uruguay.csv')
venezuela_df = pd.read_csv('/content/drive/MyDrive/data/Venezuela.csv')
cuba_df = pd.read_csv('/content/drive/MyDrive/data/Cuba.csv')
ecuador_df = pd.read_csv('/content/drive/MyDrive/data/Ecuador.csv')
guatemala_df = pd.read_csv('/content/drive/MyDrive/data/Guatemala.csv')
nicaragua_df = pd.read_csv('/content/drive/MyDrive/data/Nicaragua.csv')
panama_df = pd.read_csv('/content/drive/MyDrive/data/Panama.csv')
paraguay_df = pd.read_csv('/content/drive/MyDrive/data/Paraguay.csv')
bolivia_df = pd.read_csv('/content/drive/MyDrive/data/Bolivia.csv')
brazil_df = pd.read_csv('/content/drive/MyDrive/data/Brazil.csv')
chile_df = pd.read_csv('/content/drive/MyDrive/data/Chile.csv')
colombia_df = pd.read_csv('/content/drive/MyDrive/data/Colombia.csv')
costa_rica_df = pd.read_csv('/content/drive/MyDrive/data/Costa_rica.csv')
haiti_df = pd.read_csv('/content/drive/MyDrive/data/Haiti.csv')
jamaica_df = pd.read_csv('/content/drive/MyDrive/data/Jamaica.csv')

In [8]:
#Nettoyer les textes
peru_df = clean_text(peru_df)
salvador_df = clean_text(salvador_df)
honduras_df = clean_text(honduras_df)
haiti_df = clean_text(haiti_df)
jamaica_df = clean_text(jamaica_df)
uruguay_df = clean_text(uruguay_df)
venezuela_df = clean_text(venezuela_df)
nicaragua_df = clean_text(nicaragua_df)
panama_df = clean_text(panama_df)
paraguay_df = clean_text(paraguay_df)
mexico_df = clean_text(mexico_df)
ecuador_df = clean_text(ecuador_df)
guatemala_df = clean_text(guatemala_df)
dominican_df = clean_text(dominican_df)
argentina_df = clean_text(argentina_df)
bolivia_df = clean_text(bolivia_df)
brazil_df = clean_text(brazil_df)
chile_df = clean_text(chile_df)
colombia_df = clean_text(colombia_df)
costa_rica_df = clean_text(costa_rica_df)
cuba_df = clean_text(cuba_df)

In [9]:
#Creer une liste de df pour faciliter les manipulation
df_list = [peru_df,salvador_df,honduras_df,haiti_df,jamaica_df,uruguay_df,venezuela_df,nicaragua_df,panama_df,
           paraguay_df,mexico_df,ecuador_df,guatemala_df,dominican_df,argentina_df,bolivia_df,brazil_df,
           chile_df,colombia_df,costa_rica_df,cuba_df]

In [10]:
#Obtenir une liste generale de texte
general_list = []
for df in df_list:
  general_list.extend(df['merged_texts'].tolist())

In [11]:
#Obtenir un dataframe general
df_main=pd.DataFrame()
df_main=pd.concat(df_list)

In [12]:
#dictionnaire des caracteres anormaux qui doit etre nettoyes
abnormal_cara = {'渕':'m',
 '鈥?':"' ",
 '淯':'U',
 '淐':'C',
 '淜':'K',
 '渧':'v',
 '脕':'Á',
 '撀爄':'- ',
 '淓':'E',
 '僒':'T',
 '揕':'L',
 '榃':'W',
 '揚':'P',
 '淚':'I',
 '淢':'M',
 '渟':'s',
 '淟':'L',
 '渞':'r',
 '淒':'D',
 '淧':'P',
 '揓':'J',
 '揈':'E',
 '脫':'Ó',
 '橲':'S',
 '眉':'ü',
 '榠':'i',
 '橫':'M',
 '減':'p',
 '搘':'w',
 '淲':'W',
 '脥':'Í',
 '僂':'E',
 '榦':'o',
 '铆':'í',
 '脿':'à',
 '谩':'á',
 '淎':'A',
 '揗':'M',
 '榯':'t',
 '淔':'F',
 '淸':'[',
 '榣':'l',
 '溾€?':'...',
 '渓':'l',
 '僉':'L',
 '揑':'I',
 '么':'ô',
 '榝':'f',
 '僐':'R',
 '榡':'j',
 '铿乧':'f',
 '渙':'o',
 '檚':'s',
 '潞':'º',
 '済':'g',
 '脷':'Ú',
 '僆':'I',
 '渇':'f',
 '淣':'N',
 '溾€β爎':'… r',
 '淩':'R',
 '淏':'B',
 '脡':'É',
 '揤':'-',
 '淵':'Y',
 '脩':'Ñ',
 '渨':'w',
 '揂':'A',
 '揊':'F',
 '榳':'w',
 '淕':'G',
 '測':'y',
 '淥':'O',
 '滻':'I',
 '櫭':'ê',
 '渘':'n',
 '淨':'Q',
 '乬':'g',
 '淶':'Z',
 '渏':'j',
 '搕':'t',
 '渂':'b',
 '渢':'t',
 '淛':'J',
 '茅':'é',
 '渁':'a',
 '猼':'t',
 '揅':'C',
 '揝':'S',
 '渮':'z',
 '揌':'H',
 '淪':'S',
 '淰':'V',
 '揹':'d',
 '渄':'d',
 '聽':' ',
 '僊':'M',
 '贸':'ó',
 '煤':'ú',
 '揢':'U',
 '揇':'D',
 '掳':'°',
 '帽':'ñ',
 '橧':'i',
 '撀爄':'i',
 '榮':'s',
 '揃':'B',
 '鈥':" ",
 '渆':'e',
 '淭':'T',
 '淗':'H',
 '揺':'e',
 '榗':'c',
 '働':'p',
 '渉':'h',
 '僃':'P',
 '渃':'c',
 '僅':'H',
 '溍':'Á',
 '揘':'N',
 '檛':'t',
 '渋':'i',
 '揟':'T',
 '猫':'è',
 '榖':'b',
 '搉':'n',
 '渦':'u'}

In [13]:
#Nettoyer la liste des textes
general_list = remove_abnormal(general_list,abnormal_cara)

In [14]:
#Nettoyer le df general
modified_list = list(df_main['merged_texts'])
modified_list = remove_abnormal(modified_list,abnormal_cara)
df_main['merged_texts'] = modified_list

In [15]:
def extract_content(text):
  '''
  Extraire les noms de pays dans le dataframe
  '''
  match = re.search(r'\((.*?)\)', text)
  if match:
        return match.group(1)
  else:
        return None

In [16]:
#Extraire les noms de pays dans le dataframe et remplacer le colonne
df_main['case'] = df_main['case'].apply(extract_content)

In [17]:
def extract_year(date_str):
  '''
  Extraire les annees au lieu des dates precises
  '''
  date_obj = datetime.strptime(date_str, '%d-%b-%y')
  year = date_obj.year
  if year <= 2024:
        return year
  else:
        return year-100

In [18]:
#Extraire les annees et remplacer la colonne
df_main['document_nr'] = df_main['document_nr'].apply(extract_year)

In [19]:
#Supprimer les colonne redondantes et vides
df_main = df_main.drop(df_main.columns[6:], axis=1)

In [20]:
#Enlever les textes moins de 500 signes, qui sont peut-etre trop courts
rows = []
for index, row in df_main.iterrows():
  if len(row['merged_texts'])>500:
     rows.append(row)
df_main_fitered = pd.DataFrame(rows)

In [21]:
#Enlever les textes moins de 500 signes, qui sont peut-etre trop courts
filtered_list = [item for item in general_list if len(item) >= 500]

In [22]:
df_main_fitered

Unnamed: 0,case,document_href,document_name,document_nr,log,merged_texts
0,Peru,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,General Confederation of Workers of Peru (CGTP),2019,Scraping https://www.ilo.org/dyn/normlex/en/f?...,536.The complaint is contained in a communicat...
1,Peru,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,Autonomous Workers' Confederation of Peru (CATP),2019,Scraping https://www.ilo.org/dyn/normlex/en/f?...,474.The complaint is contained in a communicat...
2,Peru,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,Autonomous Workers' Confederation of Peru (CATP),2018,Scraping https://www.ilo.org/dyn/normlex/en/f?...,393.The complaint is contained in a communicat...
3,Peru,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,Autonomous Workers' Confederation of Peru (CATP),2018,Scraping https://www.ilo.org/dyn/normlex/en/f?...,611.The complaint is contained in communicatio...
4,Peru,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,General Confederation of Workers of Peru (CGTP),2017,Scraping https://www.ilo.org/dyn/normlex/en/f?...,"65.The Committee last examined this case, whic..."
...,...,...,...,...,...,...
10,Cuba,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,The International Federation of Christian Trad...,1962,Scraping https://www.ilo.org/dyn/normlex/en/f?...,103.The Committee considered these three cases...
11,Cuba,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,"The Federation of Cuban Electricity, Gas and W...",1961,Scraping https://www.ilo.org/dyn/normlex/en/f?...,55.The Committee has already examined this cas...
12,Cuba,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,The Confederation of Workers of Latin America ...,1956,Scraping https://www.ilo.org/dyn/normlex/en/f?...,71.In February 1958 the Committee resumed its ...
13,Cuba,['f?p=1000:50032:0::NO:50032:P50032_COMPLAINT_...,The Confederation of Workers of Latin America ...,1953,Scraping https://www.ilo.org/dyn/normlex/en/f?...,493.In accordance with paragraph 2 of a resolu...


In [None]:
#Connecter a HuggingFace
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
#Importer Llama 3
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
tokenizer.pad_token_id=tokenizer.eos_token_id

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,max_new_tokens=50)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [None]:
#Prompt
prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>
<|user|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>
<|assistant|>"""

In [None]:
#Generer les textes, les codes venant de BERTopic j'ai pas modifier
zephyr = TextGeneration(pipe, prompt=prompt)
representation_model = {"Zephyr": zephyr}


In [23]:
#Obetenir stop words
en_stop=list(en_stop)

In [None]:
#Proceder topic modeling [CETTE PARTIE EST ABANDONNER, les codes qui marchent bien est en bas]
vectorizer_model = CountVectorizer(stop_words=en_stop,ngram_range=(1,2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
dim_model = PCA(n_components=10)

model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='multilingual',
    calculate_probabilities=True,
    verbose=True,
    ctfidf_model=ctfidf_model,
    umap_model=dim_model,
    representation_model=representation_model,
    min_topic_size=20
)
topics, probs = model.fit_transform(chunked_text)

In [None]:
model.get_topic_info()
model.visualize_hierarchy()
model.visualize_barchart()
model.visualize_topics()
model.visualize_documents(chunked_text)

In [None]:
#Couper les textes selon la longeur de signe [ABANDONNEE]
def split_text(text, max_length=512):
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

def split_list_of_texts(texts, max_length=512):
    result = []
    for text in texts:
        result.extend(split_text(text, max_length))
    return result


split_general_list = split_list_of_texts(general_list, max_length=7000)

In [24]:
def chunk_text(texte:str, number_of_words, as_strings=True):
  '''
  Couper les textes selon le nombre des mots
  '''
  mots = texte.split(' ')
  chunks = [mots[i:i+number_of_words] for i in range(0, len(mots), number_of_words)]
  if as_strings:
      return [' '.join(chunk) for chunk in chunks]
  else:
      return chunks

In [25]:
#Recouper les textes en chunk de 1500 mots, et stocker dans le dataframe

new_rows = []

for index, row in df_main_fitered.iterrows():


    chunks = chunk_text(row['merged_texts'], 1500)

    for chunk in chunks:
        new_row = row.copy()
        new_row['chunk'] = chunk
        new_rows.append(new_row)

df_chunked = pd.DataFrame(new_rows)
df_chunked.drop('merged_texts', axis=1)
df_chunked['chunk'] = df_chunked['chunk'].replace('\d', '', regex=True)

#Enlever les chiffres dans le textes
df_chunked['chunk'] = df_chunked['chunk'].replace('\d', '', regex=True)


In [26]:
def calculate_word_frequency(text_list):
  '''
  Obtenir la frequence des mots dans le corpus
  '''
  text = ' '.join(text_list)
  words = text.lower().split()
  word_freq = Counter(words)
  return word_freq

word_freq_list = calculate_word_frequency(general_list)
sorted_word_frequency = sorted(word_freq_list.items(), key=lambda x: x[1], reverse=True)

In [27]:
#Obtenir les listes des mots dont la fréquence au dessus d'un certain seuil

word_above_2000, word_above_1000, word_above_500, word_above_400, word_above_300,word_above_200 = [],[],[],[],[],[]
for word_pair in sorted_word_frequency:
   if word_pair[1] >= 2000 :
      word_above_2000.append(word_pair[0])
   else:
      continue

for word_pair in sorted_word_frequency:
   if word_pair[1] >= 1000 :
      word_above_1000.append(word_pair[0])
   else:
      continue

for word_pair in sorted_word_frequency:
   if word_pair[1] >= 500 :
      word_above_500.append(word_pair[0])
   else:
      continue

for word_pair in sorted_word_frequency:
   if word_pair[1] >= 400 :
      word_above_400.append(word_pair[0])
   else:
      continue

for word_pair in sorted_word_frequency:
   if word_pair[1] >= 300 :
      word_above_300.append(word_pair[0])
   else:
      continue

for word_pair in sorted_word_frequency:
   if word_pair[1] >= 200 :
      word_above_200.append(word_pair[0])
   else:
      continue

In [28]:
#Elargir la liste de stop words
en_stop=list(en_stop)
en_stop.extend(['ii','iii','2002','2003','2004','2004','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018'])
en_stop.extend(word_above_2000)
stop_words_2000 = en_stop

In [29]:
#Elargir la liste de stop words, c'est ce qui est utilise dans l'etape prochaine
en_stop=list(en_stop)
en_stop.extend(['1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023',
                'para','paras'])
en_stop.extend(word_above_1000)
stop_words_1000 = en_stop

In [30]:
#Elargir la liste de stop words
en_stop=list(en_stop)
en_stop.extend(['ii','iii','2002','2003','2004','2004','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018'])
en_stop.extend(word_above_500)
stop_words_500 = en_stop

In [None]:
#Executer le topic modeling
vectorizer_model = CountVectorizer(stop_words=stop_words_1000, ngram_range=(1,1))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    ctfidf_model=ctfidf_model,
    umap_model=umap_model,
    representation_model=representation_model,
    min_topic_size=20
)



In [None]:
topics, probs = model.fit_transform(list(df_chunked['chunk']))

2024-05-14 15:42:07,735 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/159 [00:00<?, ?it/s]

2024-05-14 15:42:36,591 - BERTopic - Embedding - Completed ✓
2024-05-14 15:42:36,592 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-14 15:43:04,129 - BERTopic - Dimensionality - Completed ✓
2024-05-14 15:43:04,131 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-14 15:43:04,699 - BERTopic - Cluster - Completed ✓
2024-05-14 15:43:04,707 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 32/32 [3:23:25<00:00, 381.43s/it]
2024-05-14 19:06:36,767 - BERTopic - Representation - Completed ✓


In [None]:
topics_over_time = model.topics_over_time(list(df_chunked['chunk']), list(df_chunked['document_nr']), global_tuning=True, evolution_tuning=True, nr_bins=20)

20it [00:08,  2.48it/s]


In [None]:
topics_over_time.to_csv("DTM.csv")

In [None]:
model.visualize_topics_over_time(topics_over_time)

In [None]:
#Sauvegarder les resultats
data ={'Document': list(df_chunked['chunk']) , 'Topic': topics, 'Time': list(df_chunked['document_nr']), 'Country':list(df_chunked['case'])}
results =pd.DataFrame(data)
results.to_csv('results.csv')
df = model.get_topic_info()
df.to_csv("topic_info.csv")

In [None]:
model.visualize_barchart(top_n_topics=50)

In [None]:
hierarchical_topics = model.hierarchical_topics(list(df_chunked['chunk']))

100%|██████████| 21/21 [00:00<00:00, 166.79it/s]


In [None]:
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
model.visualize_hierarchy()

In [None]:
model.visualize_topics()

In [None]:
model.visualize_documents(list(df_chunked['chunk']))

In [None]:
#Topic modeling avec stop_words_2000, [ABANDONNEE temporairement]
vectorizer_model = CountVectorizer(stop_words=stop_words_2000, ngram_range=(1,2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
dim_model = PCA(n_components=10)

model = BERTopic(
    vectorizer_model=vectorizer_model,
    language='multilingual',
    calculate_probabilities=True,
    verbose=True,
    ctfidf_model=ctfidf_model,
    umap_model=dim_model,
    representation_model=representation_model
)

topics, probs = model.fit_transform(chunked_text)

2024-05-01 20:01:47,508 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/686 [00:00<?, ?it/s]

2024-05-01 20:21:32,632 - BERTopic - Embedding - Completed ✓
2024-05-01 20:21:32,634 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 20:21:32,813 - BERTopic - Dimensionality - Completed ✓
2024-05-01 20:21:32,816 - BERTopic - Cluster - Start clustering the reduced embeddings

os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

2024-05-01 20:21:44,736 - BERTopic - Cluster - Completed ✓
2024-05-01 20:21:44,744 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 6/6 [06:30<00:00, 65.17s/it]
2024-05-01 20:28:31,500 - BERTopic - Representation - Completed ✓


In [None]:
model.get_topic_info()

In [None]:
model.visualize_barchart()

In [None]:
model.visualize_topics()

In [None]:
#Reorganiser le df selon les annees
df_chunked_ordered = df_chunked.sort_values(by='document_nr')

In [None]:
#Les annees definies pour le topic modeling suivant
year_range = [(1946, 1960),
 (1961, 1970),
 (1971, 1980),
 (1981, 1990),
 (1991, 2000),
 (2001, 2010),
 (2011, 2024)]

In [None]:
#Creer les fichiers de stockage
if not os.path.exists("images"):
    os.mkdir("images")

if not os.path.exists("results"):
    os.mkdir("results")

In [None]:
#Executer le topic modeling pour tous les 10 ans et sauvegarder les fichiers
for start, end in year_range:

    subset_df = df_chunked_ordered[(df_chunked_ordered['document_nr'] >= start) & (df_chunked_ordered['document_nr'] <= end)]
    topics, probs = model.fit_transform(list(subset_df['chunk']))

    data ={'Document': list(subset_df['chunk']) , 'Topic': topics, 'Time': list(subset_df['document_nr']), 'Country':list(subset_df['case'])}
    results =pd.DataFrame(data)
    results.to_csv('results/results_{}.csv'.format(start))

    df = model.get_topic_info()
    df.to_csv("results/topic_info_{}.csv".format(start))


2024-05-05 17:39:01,980 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-05 17:39:02,557 - BERTopic - Embedding - Completed ✓
2024-05-05 17:39:02,558 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 17:39:04,624 - BERTopic - Dimensionality - Completed ✓
2024-05-05 17:39:04,626 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 17:39:04,635 - BERTopic - Cluster - Completed ✓
2024-05-05 17:39:04,639 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 1/1 [07:42<00:00, 462.37s/it]
2024-05-05 17:46:47,258 - BERTopic - Representation - Completed ✓
2024-05-05 17:46:47,357 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2024-05-05 17:46:48,468 - BERTopic - Embedding - Completed ✓
2024-05-05 17:46:48,469 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 17:46:51,245 - BERTopic - Dimensionality - Completed ✓
2024-05-05 17:46:51,246 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 17:46:51,263 - BERTopic - Cluster - Completed ✓
2024-05-05 17:46:51,266 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 4/4 [31:36<00:00, 474.12s/it]
2024-05-05 18:18:28,172 - BERTopic - Representation - Completed ✓
2024-05-05 18:18:28,378 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

2024-05-05 18:18:30,184 - BERTopic - Embedding - Completed ✓
2024-05-05 18:18:30,186 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 18:18:32,521 - BERTopic - Dimensionality - Completed ✓
2024-05-05 18:18:32,523 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 18:18:32,545 - BERTopic - Cluster - Completed ✓
2024-05-05 18:18:32,549 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 3/3 [24:02<00:00, 480.79s/it]
2024-05-05 18:42:35,691 - BERTopic - Representation - Completed ✓
2024-05-05 18:42:35,976 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

2024-05-05 18:42:39,488 - BERTopic - Embedding - Completed ✓
2024-05-05 18:42:39,489 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 18:42:42,746 - BERTopic - Dimensionality - Completed ✓
2024-05-05 18:42:42,747 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 18:42:42,795 - BERTopic - Cluster - Completed ✓
2024-05-05 18:42:42,800 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 4/4 [33:27<00:00, 501.81s/it]
2024-05-05 19:16:11,461 - BERTopic - Representation - Completed ✓
2024-05-05 19:16:11,898 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

2024-05-05 19:16:15,691 - BERTopic - Embedding - Completed ✓
2024-05-05 19:16:15,692 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 19:16:18,924 - BERTopic - Dimensionality - Completed ✓
2024-05-05 19:16:18,925 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 19:16:18,970 - BERTopic - Cluster - Completed ✓
2024-05-05 19:16:18,975 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 3/3 [28:49<00:00, 576.60s/it]
2024-05-05 19:45:10,256 - BERTopic - Representation - Completed ✓
2024-05-05 19:45:10,703 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

2024-05-05 19:45:17,011 - BERTopic - Embedding - Completed ✓
2024-05-05 19:45:17,013 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 19:45:21,661 - BERTopic - Dimensionality - Completed ✓
2024-05-05 19:45:21,662 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 19:45:21,743 - BERTopic - Cluster - Completed ✓
2024-05-05 19:45:21,746 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 2/2 [17:45<00:00, 532.76s/it]
2024-05-05 20:03:09,677 - BERTopic - Representation - Completed ✓
2024-05-05 20:03:10,282 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

2024-05-05 20:03:15,261 - BERTopic - Embedding - Completed ✓
2024-05-05 20:03:15,262 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-05 20:03:19,006 - BERTopic - Dimensionality - Completed ✓
2024-05-05 20:03:19,007 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-05 20:03:19,081 - BERTopic - Cluster - Completed ✓
2024-05-05 20:03:19,086 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 13/13 [1:53:36<00:00, 524.33s/it]
2024-05-05 21:56:57,581 - BERTopic - Representation - Completed ✓


In [None]:
#Segmenter les phrases selon les points
new_text_list = []
for text in general_list:
    sentences = text.split('.')

    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    new_text_list.extend(sentences)

In [None]:
#Proceder le NER et sauvegarder les resultats
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(new_text_list)
df_results.to_csv('ner_results.csv')

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
df_results=pd.read_csv('/content/ner_results.csv')

  df_results=pd.read_csv('/content/ner_results.csv')


===========================LDA==========================================

In [None]:
!pip install gensim
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [32]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

  and should_run_async(code)


[0mCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [39]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
import spacy

In [31]:
chunk_txt = list(df_chunked['chunk'])

In [44]:
lemmes = ''
count = 0
for text in chunk_txt:
 nlp = spacy.load('en_core_web_sm')
 doc = nlp(text)
 lemmes += ' '.join([token.lemma_ for token in doc])
 print("{} sur {} done".format(count,len(chunk_txt)))
 count += 1

  and should_run_async(code)


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
71 sur 5071 done
72 sur 5071 done
73 sur 5071 done
74 sur 5071 done
75 sur 5071 done
76 sur 5071 done
77 sur 5071 done
78 sur 5071 done
79 sur 5071 done
80 sur 5071 done
81 sur 5071 done
82 sur 5071 done
83 sur 5071 done
84 sur 5071 done
85 sur 5071 done
86 sur 5071 done
87 sur 5071 done
88 sur 5071 done
89 sur 5071 done
90 sur 5071 done
91 sur 5071 done
92 sur 5071 done
93 sur 5071 done
94 sur 5071 done
95 sur 5071 done
96 sur 5071 done
97 sur 5071 done
98 sur 5071 done
99 sur 5071 done
100 sur 5071 done
101 sur 5071 done
102 sur 5071 done
103 sur 5071 done
104 sur 5071 done
105 sur 5071 done
106 sur 5071 done
107 sur 5071 done
108 sur 5071 done
109 sur 5071 done
110 sur 5071 done
111 sur 5071 done
112 sur 5071 done
113 sur 5071 done
114 sur 5071 done
115 sur 5071 done
116 sur 5071 done
117 sur 5071 done
118 sur 5071 done
119 sur 5071 done
120 sur 5071 done
121 sur 5071 done
122 sur 5071 done
123 sur 5071 done
124 sur 5071 done
125 sur 5071 don

In [45]:
nouveau_texte = ''
for char in lemmes:
    if char.isalpha() == True or char==' ':
        nouveau_texte += char
nouveau_texte = re.sub('\s+', ' ', nouveau_texte.lower())

  and should_run_async(code)
  nouveau_texte = re.sub('\s+', ' ', nouveau_texte.lower())


In [46]:
# on enlève les stopwords
no_sw = ''
for word in nouveau_texte.split(' '):
    if word not in stop_words_1000:
        no_sw += word + ' '

print(no_sw[0:1000])

  and should_run_async(code)


contain peru cgtp observation peru ratify telefnica saa hereinafter dismiss significant affiliate telefnica suttp rise change amparo proceeding violate record anti practice high infraction sunafil benefit policy anti criterion refer benefit policy employee adopt exclusive benefit scope consequently policy apply exclusively non remuneration fringe benefit govern benefit entitle govern respective conclude explain exclusive benefit include refreshment mobility allowance annual incentive career bonus addition fringe benefit discriminatory criterion ratify disseminate document entitle transformndonos ser far develop applicable non relationship regulate outside scope document exclusive employee employee remuneration review calendar year adjustment inflation positioning market adjustment merit add benefit policy applicable unionized far require suttp perform compensatory holiday entitle rest require perform compensatory engage systematic hostility unionized particularly vulnerable fail suttp 

In [47]:
texte_chunk_LDA = chunk_text(no_sw, 700, as_strings=False)

for texte in texte_chunk_LDA:
    print(texte[:10])

['contain', 'peru', 'cgtp', 'observation', 'peru', 'ratify', 'telefnica', 'saa', 'hereinafter', 'dismiss']
['objective', 'prior', 'transfer', 'area', 'change', 'assess', 'organizational', 'level', 'match', 'profile']
['unionized', 'employee', 'director', 'constitute', 'total', 'employee', 'practice', 'negative', 'impact', 'rate']
['insurance', 'reinsurance', 'hereinafter', 'd', 'rmac', 'eps', 'hereinafter', 'e', 'operate', 'insurance']
['sinutreapp', 'ongoing', 'promotion', 'accorde', 'h', 'include', 'sinutreapp', 'coalition', 'regulation', 'grant']
['lima', 'reduce', 'workforce', 'mega', 'plant', 'pucusana', 'accord', 'aim', 'particularly', 'unionized']
['professional', 'obligation', 'specifically', 'fall', 'asleep', 'night', 'shift', 'risk', 'safety', 'ceras']
['original', 'pende', 'recall', 'anti', 'rapidly', 'remedy', 'effective', 'excessive', 'constitute', 'attack']
['sunafil', 'verify', 'elect', 'undermine', 'position', 'concernedc', 'consider', 'close', 'th', 'occasion']
['fisca

  and should_run_async(code)


['announce', 'implementation', 'reach', 'appear', 'refer', 'reinstatementb', 'tito', 'alfredo', 'matos', 'galarza']
['repository', 'casimiro', 'ulloa', 'hospital', 'premise', 'decade', 'activity', 'available', 'sole', 'condition']
['intervene', 'creation', 'maintenance', 'offence', 'regulation', 'affect', 'impede', 'representation', 'base', 'enshrine']
['headquarters', 'textile', 'school', 'sbn', 'jar', 'reclaim', 'site', 'favour', 'ground', 'year']
['effectively', 'party', 'good', 'faith', 'fairly', 'harmonious', 'negotiating', 'mean', 'sincere', 'effort']
['proceeding', 'vidal', 'proceeding', 'harassment', 'clemente', 'rodrguez', 'sutsencico', 'individual', 'coerce', 'sign']
['mara', 'covarrubias', 'jorge', 'carrillo', 'vrtiz', 'transmit', 'follow', 'pende', 'th', 'invite']
['increase', 'remuneration', 'far', 'cover', 'budgetary', 'govern', 'activity', 'bear', 'mind', 'financing']
['sunat', 'sinaut', 'sunat', 'demand', 'good', 'faith', 'tactic', 'proposal', 'phase', 'willingness']
['

In [48]:
id2word = corpora.Dictionary(texte_chunk_LDA)

corpus = []
for text in texte_chunk_LDA:
    new = id2word.doc2bow(text) # convertit en bag of words
    corpus.append(new)

# on affiche les bigrammes
for t in corpus:
    print(t[:20])

  and should_run_async(code)


[(0, 1), (1, 1), (2, 4), (3, 1), (4, 4), (5, 2), (6, 2), (7, 1), (8, 2), (9, 3), (10, 1), (11, 1), (12, 4), (13, 3), (14, 6), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
[(0, 1), (1, 1), (2, 1), (4, 1), (6, 1), (7, 1), (12, 3), (13, 1), (14, 8), (16, 2), (18, 2), (19, 1), (20, 1), (21, 1), (23, 6), (24, 4), (25, 6), (26, 8), (27, 1), (29, 1)]
[(2, 2), (7, 1), (9, 5), (11, 4), (12, 2), (14, 4), (16, 1), (18, 2), (23, 1), (24, 4), (25, 2), (26, 5), (29, 1), (30, 1), (35, 2), (37, 1), (38, 1), (40, 7), (41, 1), (47, 1)]
[(2, 8), (5, 2), (9, 1), (12, 1), (14, 2), (16, 2), (17, 1), (18, 1), (22, 2), (23, 4), (24, 1), (25, 1), (26, 1), (36, 2), (39, 1), (40, 6), (50, 3), (53, 4), (55, 1), (58, 1)]
[(2, 3), (5, 1), (9, 2), (11, 2), (14, 2), (17, 1), (18, 2), (22, 1), (23, 10), (25, 3), (26, 1), (29, 2), (36, 3), (37, 1), (38, 1), (40, 7), (53, 3), (61, 1), (66, 2), (67, 1)]
[(2, 2), (4, 2), (5, 1), (9, 3), (11, 2), (14, 3), (16, 1), (18, 2), (23, 8), (26, 1), (29, 1), (36, 1), (37, 1), (38, 

In [49]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # on choisit le corpus
                      id2word=id2word, # le dictionnaire associé au corpus
                      num_topics=200, # le nombre de topics souhaité
                      random_state=100, # valeur du générateur de nombres aléatoires pour initialiser la séquence aléatoire
                      update_every=1, # nombre de documents à traiter à chaque mise à jour du modèle (ici il sera mis à jour après chaque document)
                      chunksize=100, # taille des lots de documents utilisés pour l'entraînement du modèle
                      passes=50, # nb d'itérations complètes sur l'ensemble du corpus
                      alpha="auto") # valeur d'alpha

  and should_run_async(code)


In [50]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# mds = "mmds" nous indique que nous utilisons la méthode "mmds" (Maximal Marginal Distance Scaling) comme méthode multidimensionnelle (MDS) utilisée pour réduire la dimensionnalité des données lors de la visualisation
# R correspond au nombre de termes les plus fréquents à afficher pour chaque sujet dans la visualisation
vis

  and should_run_async(code)
  pid = os.fork()


In [51]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=texte_chunk_LDA, dictionary=id2word)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence score: ', coherence_lda)

  and should_run_async(code)
  self.pid = os.fork()


Coherence score:  0.32184202627745717
