In [None]:
pip install bertopic


# Trial with one paragraph 
### - data too small as BERTopic requires UMAP and HDBSCAN perform better with datasets of at least 50–100 documents.

In [None]:
pip install tf-keras

In [1]:
from umap import UMAP
from bertopic import BERTopic

# Sample dataset (replace with your dataset)
documents = [
    "Climate change is a pressing issue affecting the entire planet.",
    "We need renewable energy solutions to combat global warming.",
    "Carbon emissions have reached an all-time high in recent years.",
    "Sustainability is key to preserving our environment.",
    "What are the economic impacts of climate policies?"
]

# Define a UMAP model with compatible parameters
umap_model = UMAP(
    n_neighbors=2,  # Lower for small datasets
    n_components=2,  # Reduced dimensions, small for small datasets
    metric='cosine',  # Distance metric
    init='random'  # Avoid spectral initialization issues
)

# Pass the custom UMAP model to BERTopic
topic_model = BERTopic(umap_model=umap_model)

# Fit the model to your dataset
topics, probs = topic_model.fit_transform(documents)

# Reduce the number of topics
topic_model.reduce_topics(documents, nr_topics=2)

# View the topics
print(topic_model.get_topic_info())

# Get keywords for a specific topic
print(topic_model.get_topic(0))  # Replace 0 with the topic ID of interest


  from .autonotebook import tqdm as notebook_tqdm





ValueError: k must be less than or equal to the number of training points

# Trial 2.1 with transcript2008-2018.zip (no stop words removed)

In [None]:
import os
import zipfile

# Path to the ZIP file
zip_path = r"D:\dev\annotate\Automated-text-annotation\transcripts2008-2018.zip"
extracted_path = r"D:\dev\annotate\Automated-text-annotation\Extract transcripts 2008-2018"

# Extract ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# Load all `.txt` files into a list
documents = []
for file_name in os.listdir(extracted_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(extracted_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())


In [None]:
from bertopic import BERTopic

# Initialize BERTopic
topic_model = BERTopic()

# Fit the model to dataset
topics, probs = topic_model.fit_transform(documents)

# View the topics
print(topic_model.get_topic_info())


    Topic  Count                        Name  \
0      -1   1315            -1_the_and_to_of   
1       0    100             0_and_the_of_to   
2       1     77             1_the_to_in_and   
3       2     74        2_universe_the_of_we   
4       3     71          3_energy_the_we_to   
..    ...    ...                         ...   
57     56     12             56_la_da_li_heh   
58     57     12       57_the_and_animals_to   
59     58     11          58_the_and_in_that   
60     59     11        59_fashion_to_the_it   
61     60     10  60_blind_me_wheelchair_the   

                                       Representation  \
0       [the, and, to, of, that, in, you, is, we, it]   
1     [and, the, of, to, it, that, this, you, so, in]   
2     [the, to, in, and, of, was, that, is, we, they]   
3   [universe, the, of, we, is, that, this, and, i...   
4   [energy, the, we, to, and, is, of, that, in, c...   
..                                                ...   
57  [la, da, li, heh, th

In [None]:
# # Visualize the topics
# topic_model.visualize_topics()

# # Visualize topic probabilities
# topic_model.visualize_barchart()

# # Save the model for future use
# topic_model.save("bertopic_model")




# Trial 2.2 with transcript2008-2018.zip (stop words removed)

In [3]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.




In [5]:
import os
import zipfile
from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Define stopwords
stop_words = set(stopwords.words('english'))

# Path to your ZIP file
zip_path = r"D:\dev\annotate\Automated-text-annotation\transcripts2008-2018.zip"
extracted_path = r"D:\dev\annotate\Automated-text-annotation\Extract transcripts 2008-2018"

# Extract ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# Preprocessing function
def preprocess_text(doc):
    tokens = word_tokenize(doc.lower())  # Tokenize and convert to lowercase
    return ' '.join(word for word in tokens if word.isalnum() and word not in stop_words)

# Load all `.txt` files into a list
documents = []
for file_name in os.listdir(extracted_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(extracted_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
            cleaned_text = preprocess_text(raw_text)  # Preprocess each document
            documents.append(cleaned_text)

# Initialize BERTopic with custom vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=5, max_features=10000)
topic_model = BERTopic(vectorizer_model=vectorizer)

# Fit the model to dataset
topics, probs = topic_model.fit_transform(documents)

# View the topics
print(topic_model.get_topic_info())


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HUAWEI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HUAWEI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HUAWEI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


    Topic  Count                                      Name  \
0      -1   1317          -1_money_technology_water_health   
1       0    165              0_men_war_refugees_democracy   
2       1    117             1_universe_earth_mars_planets   
3       2    116            2_energy_climate_nuclear_solar   
4       3     93       3_brain_neurons_cells_consciousness   
5       4     91      4_city_cities_architecture_buildings   
6       5     76           5_patients_health_heart_patient   
7       6     59            6_china_growth_chinese_economy   
8       7     54               7_art_artists_design_artist   
9       8     52            8_bacteria_dna_genome_microbes   
10      9     48    9_company_business_employees_companies   
11     10     47          10_musical_piano_orchestra_piece   
12     11     46              11_fish_sharks_ocean_fishing   
13     12     42             12_cancer_cells_tumor_disease   
14     13     39               13_humor_laugh_gay_laughing   
15     1

In [6]:
topic_model.visualize_topics()
topic_model.visualize_barchart(top_n_topics=10)


In [8]:
topic_docs = topic_model.get_representative_docs(0)  # Get documents for Topic 3
print(topic_docs)




# Trial 3 with tweets on climate change from Kaggle

In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Define stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(doc):
    tokens = word_tokenize(doc.lower())  # Tokenize and convert to lowercase
    return ' '.join(word for word in tokens if word.isalnum() and word not in stop_words)

# Load the CSV file
file_path = r"D:\dev\annotate\Automated-text-annotation\twitter_sentiment_data.csv"
df = pd.read_csv(file_path)

# Ensure the column containing tweets is correctly identified
tweet_column = "message"
tweets = df[tweet_column].astype(str).tolist()

# Preprocess tweets
preprocessed_tweets = [preprocess_text(tweet) for tweet in tweets]

# Initialize BERTopic with a custom vectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=5, max_features=10000)
topic_model = BERTopic(vectorizer_model=vectorizer)

# Fit BERTopic to the preprocessed tweets
topics, probs = topic_model.fit_transform(preprocessed_tweets)

# View topic information
topic_info = topic_model.get_topic_info()
print(topic_info)

# Visualize topics
topic_model.visualize_topics().show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HUAWEI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HUAWEI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
