<a href="https://colab.research.google.com/github/GresaSm/Deep-Learning-Tutorial/blob/main/Model_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install bertopic
!pip install bertopic



In [7]:
# Try to import BERTopic
from bertopic import BERTopic

In [8]:
# Upgrade joblib to a version >= 1.1.1
!pip install --upgrade "joblib>=1.1.1"



In [9]:
# Data processing
import pandas as pd
import numpy as np
import ast

# Text preprocessing
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap import UMAP

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
df = pd.read_csv("preprocessed_subset_english_news.csv")

# NOTE(review): the column selection below is disabled and looks stale -- the
# column names it references do not appear in the frame displayed in the next
# cell (title, published_at, category, full_content, preprocessed_content).
#df = df[['processed_text_v2', 'congress', 'period']]



In [11]:
df

Unnamed: 0,title,published_at,category,full_content,preprocessed_content
0,Vienna Jewish cemetery torched,2023-11-02 04:31:58.000000,Europe,The Jewish section of a major cemetery in the ...,jewish section major cemeteri austrian capit s...
1,Bill Ackman says it's 'pathetic' that law firm...,2023-11-02 11:14:13.000000,Politics,The billionaire investorBill Ackmantook aim at...,billionair investorbil ackmantook aim univers ...
2,Netanyahu is focused on his own political 'sur...,2023-11-02 15:58:14.000000,Politics,A 30-year veteran of the Israel Defense Forces...,veteran israel defens forc former head countri...
3,Democrats sound alarms over No Labels third-pa...,2023-11-02 20:20:50.000000,Politics,Former House Speaker Nancy Pelosi is advocatin...,former hous speaker nanci pelosi advoc thirdpa...
4,"Amid Hezbollah-Israel clashes, Christian villa...",2023-11-02 06:07:16.000000,Politics,"In Pictures At Lebanon’s border with Israel, r...",pictur lebanon border israel resid christian v...
...,...,...,...,...,...
4846,Europe’s COP 29 Climate Change Goals Should In...,2023-11-29 13:54:00,Europe,Space-based solar power for Earth At this week...,spacebas solar power earth week climat chang c...
4847,Joint statement by Joint Expeditionary Force m...,2023-11-29 02:47:13,Europe,Defence Secretary Grant Shapps met virtually w...,defenc secretari grant shapp met virtual minis...
4848,Rapala VMC Corporation’s Financial Reporting i...,2023-11-29 14:00:00,Europe,"Rapala VMC Corporation, Financial calendar, N...",rapala vmc corpor financi calendar novemb eet ...
4849,Pharming Group (NASDAQ:PHAR) Shares Gap Down t...,2023-11-29 13:02:41,Europe,Pharming Group (NASDAQ:PHAR–Get Free Report)’s...,pharm group free report share price gap market...


1. With stopwords (tutorial vectorizer)

In [12]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Sentence-transformer model used to embed the documents (passed to BERTopic below).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# UMAP configuration for the dimensionality-reduction step: project the
# embeddings to 5 components. random_state is fixed so repeated runs use the
# same seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.05,
    random_state=100)

# HDBSCAN configuration for the clustering step.
# NOTE(review): min_cluster_size=80 / min_samples=40 look hand-tuned for this
# corpus -- confirm before reusing them on a different dataset.
hdbscan_model = HDBSCAN(
    min_cluster_size=80,
    min_samples=40,
    gen_min_span_tree=True,
    prediction_data=True)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords as nltk_stopwords
import spacy

# English stop words from NLTK plus a few filler words added by hand.
# The NLTK corpus reader is imported under an alias so that assigning the
# `stopwords` list does not rebind the reader's name: with the same name for
# both, any later `stopwords.words(...)` call would hit a plain list and fail.
# `stopwords` is still the list after this cell, as before.
stopwords = list(nltk_stopwords.words('english')) + ['said', "year", "also"]

# Pass the stop-word list to the vectorizer so these words do not pollute the
# topic representations; unigrams and bigrams are both counted.
vectorizer_model = CountVectorizer(ngram_range=(1, 2),
                                   stop_words=stopwords)

In [14]:


def lemmatize_and_remove_numbers(text):
    """Normalize a few country terms and drop digit-only tokens.

    The text is tokenized with ``word_tokenize``. A token whose lower-cased
    form is a key of the lookup table below is replaced by the mapped
    lower-case country name. Any other token is kept unchanged (original
    casing included) unless it consists solely of digits, in which case it
    is discarded.

    Args:
        text: The string to process.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Lower-cased token -> canonical country name.
    canonical = {
        'israel': 'israel',
        'israeli': 'israel',
        'russia': 'russia',
        'russian': 'russia',
        'ukraine': 'ukraine',
        'ukrainian': 'ukraine',
    }

    kept = []
    for word in word_tokenize(text):
        replacement = canonical.get(word.lower())
        if replacement is not None:
            kept.append(replacement)
            continue
        # Only tokens made up entirely of digits are dropped; mixed tokens
        # such as "4th" are kept.
        if re.fullmatch(r'\d+', word) is None:
            kept.append(word)

    return ' '.join(kept)
# Apply the function to the 'full_content' column.
# NOTE: the result is written back to the same column, so the raw text loaded
# from the CSV is no longer available in `df` after this cell runs.
df['full_content'] = df['full_content'].apply(lemmatize_and_remove_numbers)

In [15]:
from bertopic import BERTopic

# Assemble the topic model from the embedding, UMAP, HDBSCAN and vectorizer
# components configured in the earlier cells.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,  # five words per topic, as seen in the Representation column
    language='english',
    calculate_probabilities=True,
    verbose=True
)
# Fit on the processed 'full_content' text; fit_transform returns two values,
# bound here to `topics` and `probs`.
# In the recorded run the embedding step took roughly 17 minutes (see the log
# timestamps in the output), so expect this cell to be slow.
topics, probs = model.fit_transform(df['full_content'])

2023-12-06 19:53:02,739 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/152 [00:00<?, ?it/s]

2023-12-06 20:10:32,412 - BERTopic - Embedding - Completed ✓
2023-12-06 20:10:32,414 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-06 20:11:12,144 - BERTopic - Dimensionality - Completed ✓
2023-12-06 20:11:12,146 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-06 20:11:12,589 - BERTopic - Cluster - Completed ✓
2023-12-06 20:11:12,617 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-06 20:11:38,486 - BERTopic - Representation - Completed ✓


In [16]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1046,-1_people_israel_news_would,"[people, israel, news, would, new]",[One of the most startling scientific discover...
1,0,700,0_ukraine_russia_military_forces,"[ukraine, russia, military, forces, war]","[People 's Daily Online ( Xinhua ) 13:57 , Nov..."
2,1,684,1_shares_stock_company_quarter,"[shares, stock, company, quarter, rating]",[Prudent Man Advisors LLC lessened its positio...
3,2,583,2_climate_change_water_november,"[climate, change, water, november, global]",[Climate change is affecting practically every...
4,3,548,3_israel_gaza_hamas_palestinian,"[israel, gaza, hamas, palestinian, war]","[As the Israel-Hamas war enters the 39th day ,..."
5,4,358,4_media_news_new_google,"[media, news, new, google, digital]",[The early years sector—nurseries and childmin...
6,5,211,5_trump_biden_house_republican,"[trump, biden, house, republican, president]",[As Republicans attempt to once again elect a ...
7,6,201,6_court_state_election_government,"[court, state, election, government, elections]",[The Deputy Director of the Socio-Economic Rig...
8,7,148,7_film_marvel_season_series,"[film, marvel, season, series, star]",[This week brings the eighth and final episode...
9,8,143,8_people_police_jail_israel,"[people, police, jail, israel, mattingly]",[A debate over the war between israel and Hama...


In [17]:
# Bar chart of the top terms per topic, for up to 12 topics.
fig_1=model.visualize_barchart(top_n_topics=12)

In [18]:
fig_1

In [19]:
import plotly.graph_objs as go
import plotly.io as pio


In [20]:
!pip install -U kaleido





In [21]:
# Export the bar chart to a static PNG in the working directory.
# presumably this relies on the kaleido package installed in the cell above.
pio.write_image(fig_1, 'fig_1.png')

In [27]:
fig_2= model.visualize_term_rank()

In [28]:
fig_2

In [29]:
# Visualize intertopic distance
fig_3=model.visualize_topics()

In [30]:
fig_3

In [31]:
fig_4= model.visualize_hierarchy(top_n_topics=10)

In [32]:
fig_4

In [33]:
fig_5=model.visualize_heatmap()

In [34]:
fig_5