In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import plotly.io as pio

In [2]:
# Define some helper functions to extract the text from the documents

def extract_headline(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return [tag.text for tag in soup.find_all('h1')][0]

def remove_tags(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')

    for tag in soup.find_all('h1'):
        tag.decompose()

    return soup.get_text()

def extract_meta(df):
    raw_docs = df["body"].tolist()
    
    data = []
    for article in raw_docs:
        entry = {}
        
        for field in ["Geography", "Commodity", "Sector", "Subject Area", "Source"]:
            match = re.search(f"{field}: (.+)", article)
            entry[field] = match.group(1).strip() if match else None
        
        metadata_start = min([article.find(f"{field}:") for field in ["Geography", "Commodity", "Sector", "Subject Area", "Source"] if f"{field}:" in article] or [len(article)])
        entry["Article"] = article[:metadata_start].strip()
        
        data.append(entry)

    meta = pd.DataFrame(data).set_axis(["geography","commodity","sector","subject_area","source","article"], axis=1)
    return pd.concat([df, meta], axis=1)


In [3]:
# Create main dataset

news_meta = pd.read_excel("data/market_news.xlsx", sheet_name="meta") \
    .rename(columns=str.lower)

articles = pd.read_excel("data/market_news.xlsx", sheet_name="articles", skiprows=2) \
    .rename(columns={"body_Advanced":"raw_corpus"}) \
    .assign(headline=lambda x: x["raw_corpus"].apply(lambda x: extract_headline(x)),
            body=lambda x: x["raw_corpus"].apply(lambda x: remove_tags(x)).str.lstrip())

data_raw = news_meta \
    .merge(articles, how="inner", on=["headline"]) \
    .assign(yw = lambda x: x["date"].dt.strftime("%Y-%w")) \
    .query("~body.str.contains('Please click on the newswire')") \
    .query("~body.str.contains('Middle East sour crude')") \
    .query("~body.str.startswith('Commodity:')") \
    .reset_index(drop=True)


In [4]:
data = extract_meta(data_raw) \
    .assign(article_len = lambda x: x["article"].apply(lambda y: len(y))) \
    .query("article_len > 20")

In [5]:
data

Unnamed: 0,date,headline,id,raw_corpus,body,yw,geography,commodity,sector,subject_area,source,article,article_len
0,2025-05-07,Brazil's Petrobras cuts ULSD S10 prices by 5%;...,e12f7caf-666e-40b9-b892-eb1b59b84b8c,<h1>Brazil's Petrobras cuts ULSD S10 prices by...,Brazilian state-owned integrated oil company P...,2025-3,Brazil,Ultra low sulfur diesel fuel; Diesel fuel; Gas...,Fuels and Refining Plus; Refined Products,Prices; Company sales; Markets,Petrobras,Brazilian state-owned integrated oil company P...,3464
1,2025-05-07,Energy Transfer targets 2025 FID on Lake Charl...,9cc16a74-602d-4889-aa79-b4b1823b058e,<h1>Energy Transfer targets 2025 FID on Lake C...,Energy Transfer is sticking to its target of a...,2025-3,United States; China; Louisiana,LNG; Natural gas; Ethane; Ethylene; NGL; Propa...,Americas Gas; Americas Gas and Power Plus,Projects; Midstream operations; Exports; LNG t...,,Energy Transfer is sticking to its target of a...,3224
2,2025-05-07,Hong Kong's May ex-wharf LSFO term premiums sp...,d2f9999e-53b8-4498-996f-5d7d8fa65f82,<h1>Hong Kong's May ex-wharf LSFO term premium...,Term contract premiums for May-loading ex-whar...,2025-3,Malaysia; Singapore; Hong Kong,Heavy distillates; Fuel oil; Gasoil; Shipping;...,Fuels and Refining Plus; Refined Products; Cru...,Prices; Markets; Supply and demand; Production...,,Term contract premiums for May-loading ex-whar...,3096
3,2025-05-07,OIL FUTURES: Crude rises on US-China trade tal...,ff09ba38-0e6a-4aec-9ea3-b2be200bfa17,<h1>OIL FUTURES: Crude rises on US-China trade...,Crude futures extended gains in midmorning Asi...,2025-3,Global,Refined products; Crude oil,Fuels and Refining Plus; Crude Oil Plus; Crude...,Futures; Prices,,Crude futures extended gains in midmorning Asi...,2361
4,2025-05-07,"POSCO, LG Chem to develop carbon capture proje...",ce6c6e78-4598-4722-8fc5-8d1f3d4b88f4,"<h1>POSCO, LG Chem to develop carbon capture p...",South Korea's POSCO Holdings and LG Chem aim t...,2025-3,South Korea,Steel; Carbon dioxide; Methane; Carbon-neutral...,Ferrous Metals Plus; Steel; Chemicals Plus; Pe...,Technology; Sustainability; Project planning; ...,,South Korea's POSCO Holdings and LG Chem aim t...,2435
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17344,2024-06-26,NWE inland propane markets diverge while suppl...,78f1f125-58ef-45c7-8156-16b866742bd6,<h1>NWE inland propane markets diverge while s...,The Northwest European inland propane markets ...,2024-3,North West Europe,Propane; Natural gas,Petrochemicals; Refined Products,Arbitrage; Prices; Supply; Supply and demand,,The Northwest European inland propane markets ...,1996
17345,2024-06-26,"NWE, Med LPG imports slide in June amid margin...",546eda10-5347-4a5a-8dc5-e5adf18d2d95,"<h1>NWE, Med LPG imports slide in June amid ma...",LPG flows into the Northwest Europe and Medite...,2024-3,Asia; Belgium; Europe; France; Italy; Japan; N...,Very large gas carrier; Shipping; Propane; Pet...,Petrochemicals; Refined Products,Arbitrage; Demand; Prices; Supply; Supply and ...,,LPG flows into the Northwest Europe and Medite...,3999
17346,2024-06-26,OIL FUTURES: Crude bounces back on expectation...,e0a870eb-0ac8-4e1c-9e0a-613afec5a1d6,<h1>OIL FUTURES: Crude bounces back on expecta...,Crude oil futures bounced back in early aftern...,2024-3,Europe; France; Global; Russia; Ukraine; Unite...,Refined products; Gasoline; Crude oil; Brent c...,Crude; Refined Products,Futures; Prices,,Crude oil futures bounced back in early aftern...,2115
17347,2024-06-26,South Korea's Samsung Heavy launches LNG carri...,9574e344-fc7d-4e28-b3ae-475ac78592a6,<h1>South Korea's Samsung Heavy launches LNG c...,South Korea's Samsung Heavy Industries has lau...,2024-3,Asia; Europe; South Korea; United States,Shipping; LNG,Americas Gas; Energy Transition; LNG; Shipping,Energy efficiency; Energy policy; Energy trans...,,South Korea's Samsung Heavy Industries has lau...,1015


In [6]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(
    embedding_model=embedding_model,
    language="english",
    calculate_probabilities=True,
    verbose=True
)

In [7]:
# Train model on full dataset
docs = data['article'].tolist()
topics, probs = topic_model.fit_transform(docs)

2025-05-15 18:34:26,076 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/543 [00:00<?, ?it/s]

2025-05-15 18:35:09,352 - BERTopic - Embedding - Completed ✓
2025-05-15 18:35:09,352 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-15 18:35:18,638 - BERTopic - Dimensionality - Completed ✓
2025-05-15 18:35:18,639 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [8]:
topics = topic_model \
    .get_topic_info() \
    .rename(columns=str.lower) \
    .query("topic != -1")

topics

Unnamed: 0,topic,count,name,representation,representative_docs
1,0,339,0_methanol_fuels_maritime_shipping,"[methanol, fuels, maritime, shipping, bunker, ...",[The shipping industry is not on track to meet...
2,1,224,1_hsfo_vlsfo_sulfur_mediterranean,"[hsfo, vlsfo, sulfur, mediterranean, rotterdam...",[Fuel oil stocks in the Amsterdam-Rotterdam-An...
3,2,203,2_epa_trump_biden_republican,"[epa, trump, biden, republican, administration...",[This is part of the COMMODITIES 2025 series w...
4,3,171,3_saf_aviation_sustainable_uco,"[saf, aviation, sustainable, uco, palm, pfad, ...",[Renewables projects have been advancing in Eu...
5,4,166,4_permian_basin_bcf_gas,"[permian, basin, bcf, gas, haynesville, quarte...",[Permian Basin natural gas production is proje...
...,...,...,...,...,...
409,408,10,408_offshore_wind_ao6_windram,"[offshore, wind, ao6, windram, floating, franc...",[France's sixth offshore wind tender saw two 2...
410,409,10,409_virgin_recycled_abs_pellets,"[virgin, recycled, abs, pellets, polystyrene, ...",[European recycled polymer markets saw an impr...
411,410,10,410_minute_mtu_intraday_bidding,"[minute, mtu, intraday, bidding, coupling, vol...",[European spot power exchange operator Epex Sp...
412,411,10,411_pig_steel_iron_nlmk,"[pig, steel, iron, nlmk, mmk, ruslom, scrap, r...",[Existing Russian quotas for low-duty exports ...


In [9]:
# topic_model.get_topic_info().query("Topic == 0")["Representative_Docs"].iloc[0]

In [10]:
topic_model.visualize_topics()

In [11]:
topic_model.visualize_barchart(top_n_topics=10)

In [12]:
topic_model.visualize_heatmap()

In [13]:
topic_model.visualize_hierarchy()

In [14]:
fig1 = topic_model.visualize_topics()
fig2 = topic_model.visualize_barchart(top_n_topics=10)
fig3 = topic_model.visualize_heatmap()
fig4 = topic_model.visualize_hierarchy()


html1 = pio.to_html(fig1, include_plotlyjs='cdn', full_html=False)
html2 = pio.to_html(fig2, include_plotlyjs=False, full_html=False)
html3 = pio.to_html(fig3, include_plotlyjs=False, full_html=False)
html4 = pio.to_html(fig4, include_plotlyjs=False, full_html=False)

full_html = f"""
<html>
<head>
    <title>BERTopic Visualizations</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>
<body>
    <h1>Intertopic Distance Map</h1>{html1}
    <h1>Top Words per Topic</h1>{html2}
    <h1>Topic Similarity Heatmap</h1>{html3}
    <h1>Topic Hierarchy</h1>{html4}
</body>
</html>
"""

with open("bertopic_all_visualizations.html", "w", encoding="utf-8") as f:
    f.write(full_html)
