## BERTopic
- For semantic text understanding
- Uses a different modeling library

## Imports and data loading

In [3]:
import pandas as pd
# Read the headline dataset from the CSV that sits next to this notebook.
df = pd.read_csv('titles_data.csv')

# Preview the first five rows as a quick sanity check of the loaded columns.
print(df.head(n=5))

                                               Title          Query
0                         South America  Google News  South America
1  South American trade bloc Mercosur holds summi...  South America
2  Minnesota National Guard Deploying to South Am...  South America
3  Extreme weather in Latin America unlocks vicio...  South America
4  Volkswagen aims to grow 40 in S America throug...  South America


## Preprocessing

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# tokenizer models that back word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.9 ships the Punkt models as 'punkt_tab'; without it word_tokenize
# raises LookupError on a current install.
nltk.download('punkt_tab')


# Materialise the English stop words as a set so the per-token membership
# test in preprocess_title is a hash lookup.
stop_words = set(stopwords.words('english'))


def preprocess_title(title):
    """Normalise a headline into a space-separated string of content tokens.

    The text is lower-cased and tokenised with ``word_tokenize``; English stop
    words (from the module-level ``stop_words`` set) and tokens made up
    entirely of punctuation characters are discarded.

    Parameters
    ----------
    title : str
        Raw headline text. Any non-string value (``None``, or the float NaN
        pandas uses for an empty CSV cell) is treated as an empty headline.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when nothing is
        left or the input was not a string.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); map them to an
    # empty document instead of raising.
    if not isinstance(title, str):
        return ''

    tokens = word_tokenize(title.lower())
    kept = [
        token for token in tokens
        if token not in stop_words
        # `token in string.punctuation` is a substring test and lets
        # multi-character marks such as '...', '``' or "''" through, so check
        # every character instead.
        and not all(char in string.punctuation for char in token)
    ]
    return ' '.join(kept)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tyler\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tyler\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Run the cleaning function over every headline and keep the result in a new
# column next to the raw text.
df['Preprocessed Title'] = df['Title'].map(preprocess_title)
print(df['Preprocessed Title'])

0                               south america google news
1       south american trade bloc mercosur holds summi...
2       minnesota national guard deploying south ameri...
3       extreme weather latin america unlocks vicious ...
4       volkswagen aims grow 40 america ev subscriptio...
                              ...                        
1214    us trying mend ties venezuela one big reason o...
1215    decade maduro migration marks venezuelans live...
1216    venezuelas juan guaidó seeks support washingto...
1217    joint statement venezuela negotiations united ...
1218    opinion venezuelas crisis must resolved peacef...
Name: Preprocessed Title, Length: 1219, dtype: object


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF document-term matrix from the cleaned headlines, using the
# vectorizer's default settings.
vectorizer = TfidfVectorizer()
preprocessed_titles = df['Preprocessed Title']
tfidf_matrix = vectorizer.fit_transform(preprocessed_titles)

In [10]:
from sklearn.decomposition import TruncatedSVD

# Tunable settings for the LSA step.
num_topics = 10    # number of latent components to extract
num_keywords = 5   # top-weighted terms printed per topic
random_seed = 42   # fixes the randomized SVD solver so topics are reproducible

# TruncatedSVD on a TF-IDF matrix is latent semantic analysis (LSA). Its
# default solver is randomized, so without random_state the components can
# change between runs.
lsa_model = TruncatedSVD(n_components=num_topics, random_state=random_seed)
lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

# Extract keywords from LSA components
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lsa_model.components_):
    # Indices of the highest-weighted terms, largest first.
    top_indices = topic.argsort()[::-1][:num_keywords]
    top_keywords = [terms[i] for i in top_indices]
    print(f"Topic {topic_idx + 1}: {', '.join(top_keywords)}")

Topic 1: news, google, america, south, argentina
Topic 2: google, news, guyanaparaguay, stabroek, loop
Topic 3: america, south, latin, market, caribbean
Topic 4: prensa, latina, la, venezuela, uruguay
Topic 5: english, bnamericas, brazil, new, suriname
Topic 6: reuters, brazil, latina, prensa, canada
Topic 7: brazil, espn, world, uruguay, argentina
Topic 8: 2023, reliefweb, ecuador, peru, argentina
Topic 9: suriname, argentina, international, imf, monetary
Topic 10: venezuela, united, states, latin, state
