In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/agriculture-data/cleaned_text_data.csv


In [2]:
import pandas as pd
data = pd.read_csv('/kaggle/input/agriculture-data/cleaned_text_data.csv')

In [3]:
#pip install bertopic --q

In [4]:
df = data.drop(columns=['file_name'])

In [5]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [6]:
df.head(-5)

Unnamed: 0,text,date
0,With a big drop in Annual Investment Allowance...,2020-10-27
1,Though the moisture was welcomed in much of th...,2020-10-27
2,Hello and welcome to the ZimmCast. This podcas...,2020-10-27
3,Ecological monitoring is the recording of biol...,2020-10-27
4,Farmers have been encouraged to develop a cont...,2020-10-27
...,...,...
22804,It comes as no shock that the government's new...,2022-06-19
22805,Sheep farmers trying to make the most of grass...,2022-06-19
22806,Basic fish farm equipment is the same for smal...,2022-06-19
22807,Market Morsel Speculators have a huge role in ...,2022-06-19


In [7]:
df_filtered = df[(df['date'].dt.year >= 2020) & (df['date'].dt.year <= 2022)]

In [8]:
texts = df_filtered['text'].tolist()
years = df_filtered['date'].dt.year.tolist()

In [9]:
len(texts), len(years)

(22814, 22814)

# UMAP
UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of document embedding into something that is easier to use with HDBSCAN in order to create good clusters. 

We can instantiate our UMAP model and pass it to BERTopic:

In [10]:
#pip install umap-learn --q

In [11]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(ngram_range=(2, 2), stop_words="english")
ctfidf_model = ClassTfidfTransformer()

2025-04-13 14:55:45.401812: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744556145.424240      78 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744556145.430976      78 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
# Initialize the topic model
topic_model = BERTopic(
    verbose=True,
    embedding_model="paraphrase-MiniLM-L12-v2",  # Compact and fast model
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=35,  # Minimum size of a topic cluster
    low_memory=True, 
    calculate_probabilities=False
)

# Fit the model on your filtered texts
topics, _ = topic_model.fit_transform(texts)

# Get number of topics (including -1 outliers)
num_topics = len(topic_model.get_topic_info())

print("Number of topics found:", num_topics)

2025-04-13 15:01:49,298 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/713 [00:00<?, ?it/s]

2025-04-13 15:04:01,271 - BERTopic - Embedding - Completed ✓
2025-04-13 15:04:01,272 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-13 15:04:37,165 - BERTopic - Dimensionality - Completed ✓
2025-04-13 15:04:37,168 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Number of topics found: 508


In [14]:
freq = topic_model.get_topic_info(); freq.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7296,-1_climate change_supply chain_united states_p...,"[climate change, supply chain, united states, ...",[Introduction to dairy farming in Uttar Prades...
1,0,414,0_soy sauce_sesame oil_stir fry_sesame seeds,"[soy sauce, sesame oil, stir fry, sesame seeds...","[This red braised chicken recipe (hongshao ji,..."
2,1,256,1_precision agriculture_precision ag_artificia...,"[precision agriculture, precision ag, artifici...",[Precision agriculture technology study shows ...
3,2,207,2_feeds way_feed websites_enter url_networks s...,"[feeds way, feed websites, enter url, networks...",[Webpage to RSS Finally you can follow website...
4,3,176,3_farm laws_minister narendra_protesting farme...,"[farm laws, minister narendra, protesting farm...",[(Representative image) NEW DELHI: Indian agri...
5,4,159,4_milk price_milk prices_dairy farmers_milk pr...,"[milk price, milk prices, dairy farmers, milk ...",[To assess the effect on dairy farming systems...
6,5,154,5_soy wax_99 reg_valentine day_scented candle,"[soy wax, 99 reg, valentine day, scented candl...",[Peter Bernik / Shutterstock.com Americans pla...
7,6,141,6_newsletters internet_based interests_deliver...,"[newsletters internet, based interests, delive...","[This website uses tracking tools, including c..."
8,7,128,7_cattle prices_prime cattle_million head_fed ...,"[cattle prices, prime cattle, million head, fe...",[Deadweight cattle prices have reached new rec...
9,8,126,8_agricultural sector_value chain_cross river_...,"[agricultural sector, value chain, cross river...","[By Chidi Opara Mr Sabo Nanono, the Minister o..."


-1 refers to all outliers and should typically be ignored. Next, let's take a look at a frequent topic that were generated:

In [15]:
topic_model.get_topic(4)

[('milk price', 0.01074803469159329),
 ('milk prices', 0.009058072384653653),
 ('dairy farmers', 0.008412760396716781),
 ('milk production', 0.007946755057481625),
 ('dairy products', 0.006082982255193204),
 ('organic dairy', 0.005183452991932935),
 ('cost production', 0.004750392328420717),
 ('dairy producers', 0.004364033051737179),
 ('price milk', 0.0043004335441997645),
 ('liquid milk', 0.0038856892349516886)]

In [16]:
# Make sure your timestamps are a list of years (or datetime)
timestamps = df_filtered['date'].dt.year.tolist()  # Or use full datetime if preferred

# Compute topics over time
topics_over_time = topic_model.topics_over_time(
    docs=texts,                # Use same texts from fit_transform             # Use topics returned by fit_transform
    timestamps=timestamps,     # List of years or datetimes
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20
)

3it [09:47, 195.85s/it]


In [17]:
topics_over_time.head()

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"covid 19, climate change, pigeon pea, food sec...",975,2019.998
1,1,"precision agriculture, agriculture market, agr...",62,2019.998
2,2,"websites rss, finally follow, rss finally, url...",159,2019.998
3,3,"farm laws, minister narendra, protesting farme...",150,2019.998
4,4,"milk price, andersons targets, scottish dairy,...",12,2019.998


In [20]:
fig2 = topic_model.visualize_barchart(top_n_topics=9, height=700, width = 700)
fig2.write_html("topic_word_scores.html")

In [21]:
fig3 = topic_model.visualize_hierarchy(top_n_topics=50, width=800)
fig3.write_html("hierarchical_clustering.html")

In [27]:
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20, height=650)
fig.write_html("topics_over_time2.html")

In [23]:
fig1 = topic_model.visualize_topics()
fig1.write_html("intertopic_distance_map2.html")

In [24]:
fig4 = topic_model.visualize_heatmap(n_clusters=20, top_n_topics=100)
fig4.write_html("similarity_matrix.html")