In [1]:
!pip install ctransformers --upgrade



In [2]:
!pip install bertopic accelerate bitsandbytes xformers adjustText



In [3]:
!pip install fastparquet



In [4]:
import pandas as pd
from torch import bfloat16
import transformers
from torch import cuda
from transformers import pipeline
from ctransformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

In [5]:
# Read the parquet file directly from its GitHub URL using the fastparquet engine.
# NOTE(review): the file name suggests per-paragraph data covering 2013-2022 -- confirm.
dataset = pd.read_parquet(
    path="https://github.com/Mlad-en/Intro_Data_Science/raw/main/cleaned_data/combined_data_2013_2022_per_paragraph.parquet",
    engine="fastparquet",
)

In [10]:
# Keep only the rows that have paragraph text. The mask is built once and shared,
# so `texts` and `dates` are selected from exactly the same rows and stay aligned
# position by position (they are later passed together to topics_over_time).
has_text = dataset["text_split"].notna()
texts = dataset.loc[has_text, "text_split"].to_list()
dates = dataset.loc[has_text, "year"].to_list()

In [8]:
# --- Topic representations ---
# Two additional keyword representations are computed per topic; the dictionary
# keys ("KeyBERT", "MMR") are the labels under which they show up in the
# topic-info table.
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)
representation_model = dict(KeyBERT=keybert, MMR=mmr)

# --- Dimensionality reduction ---
# Embeddings are reduced to 5 components using cosine distance.
# random_state is pinned, presumably so repeated runs are reproducible.
umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric="cosine", random_state=123)

# --- Clustering ---
# A cluster needs at least 200 members to be kept.
hdbscan_model = HDBSCAN(
    min_cluster_size=200, metric="euclidean", cluster_selection_method="eom", prediction_data=True
)

# --- BERTopic pipeline ---
# Wire the custom sub-models together; each topic is described by its top 10 words.
topic_model = BERTopic(
    top_n_words=10,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
)

In [11]:
# Fit the topic model on all paragraphs and keep the returned topic assignments
# and probabilities.
topics, probs = topic_model.fit_transform(documents=texts)

Batches:   0%|          | 0/1291 [00:00<?, ?it/s]

2023-09-28 12:10:38,520 - BERTopic - Transformed documents to Embeddings
2023-09-28 12:11:08,793 - BERTopic - Reduced dimensionality
2023-09-28 12:11:14,754 - BERTopic - Clustered reduced embeddings


In [12]:
# Display the per-topic summary table: topic id, document count, generated name,
# the default keyword representation plus the KeyBERT and MMR ones, and
# representative documents. Topic -1 is listed first.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,16694,-1_the_and_of_to,"[the, and, of, to, in, that, we, our, for, is]","[nations, efforts, agenda, sustainable, cooper...","[the, and, of, to, in, that, we, our, for, is]",[Our challenge is to find effective ways and m...
1,0,2120,0_pandemic_covid19_the_to,"[pandemic, covid19, the, to, health, and, vacc...","[pandemic, vaccine, vaccines, vaccination, cov...","[pandemic, covid19, the, to, health, and, vacc...","[Mr President,\nOnce the Pandemic will have be..."
2,1,2067,1_climate_change_to_the,"[climate, change, to, the, we, of, and, our, i...","[climate, sustainable, warming, emissions, nat...","[climate, change, to, the, we, of, and, our, i...","[Everything is fragile, everything is precario..."
3,2,2022,2_nations_united_the_and,"[nations, united, the, and, of, to, that, we, ...","[multilateralism, nations, multilateral, un, u...","[nations, united, the, and, of, to, that, we, ...","[As an organization, we have urgent work to do..."
4,3,1795,3_development_sustainable_goals_agenda,"[development, sustainable, goals, agenda, the,...","[sustainable, achieving, development, 2030, ac...","[development, sustainable, goals, agenda, the,...",[The second great challenge is development. Si...
5,4,1373,4_afghanistan_the_and_of,"[afghanistan, the, and, of, in, to, for, that,...","[asean, afghanistan, taliban, turkmenistan, na...","[afghanistan, the, and, of, in, to, for, that,...",[Europe to reach 33 million tons annually.\n\n...
6,5,989,5_palestinian_israel_the_peace,"[palestinian, israel, the, peace, palestine, e...","[palestinians, palestinian, palestine, gaza, i...","[palestinian, israel, the, peace, palestine, e...",[The commitment to and responsibility for peac...
7,6,895,6_ukraine_russia_the_of,"[ukraine, russia, the, of, russian, and, in, t...","[ukraine, crimea, ukraines, ukrainian, ukraini...","[ukraine, russia, the, of, russian, and, in, t...","[Imagine if, instead, Russia had engaged in tr..."
8,7,852,7_we_our_us_and,"[we, our, us, and, to, that, is, of, world, it]","[cooperation, solidarity, peace, humankind, we...","[we, our, us, and, to, that, is, of, world, it]",[The universal push to transform the world thr...
9,8,814,8_syria_syrian_the_chemical,"[syria, syrian, the, chemical, to, in, of, and...","[syria, syrian, syrias, syrians, regime, human...","[syria, syrian, the, chemical, to, in, of, and...","[In Syria, we are pursuing the fight against I..."


In [15]:
# Build the per-document view of the fitted model for the same paragraphs it was
# trained on; the "Name" and "Topic" columns are used for the exports below.
topics_per_document = topic_model.get_document_info(docs=texts)

In [18]:
# Export every document whose topic name contains "climate".
(
    topics_per_document
    .loc[lambda frame: frame["Name"].str.contains("climate")]
    .to_parquet("topics_per_paragraph.parquet")
)

In [23]:
# Export the documents assigned to topics 1 and 3, which in the topic-info table
# are "1_climate_change_to_the" and "3_development_sustainable_goals_agenda".
# NOTE(review): the ids are hard-coded; re-check them against get_topic_info()
# whenever the model is refitted.
(
    topics_per_document
    .loc[lambda frame: frame["Topic"].isin([1, 3])]
    .to_parquet("topics_per_paragraph_climate_change_sustainability.parquet")
)

In [20]:
# Compute the topics-over-time table, using each paragraph's year as its
# timestamp and requesting 20 bins.
# NOTE(review): the progress output shows only 10 iterations -- check whether
# nr_bins=20 is still the intended setting.
topics_over_time = topic_model.topics_over_time(
    docs=texts,
    timestamps=dates,
    nr_bins=20,
)

10it [00:06,  1.51it/s]


In [21]:
# Render the visualization of the `topics_over_time` result computed in the
# previous cell; the returned object is the cell's displayed output.
topic_model.visualize_topics_over_time(topics_over_time)

In [24]:
# Persist the fitted model under "topic_model_bert" using the safetensors
# serialization.
topic_model.save(path="topic_model_bert", serialization="safetensors")