In [1]:
!pip install datasets
!pip install bertopic
!pip install langchain
!pip install ctransformers
!pip install nltk
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting bertopic
  Using cached bertopic-0.16.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Using cached hdbscan-0.8.33-cp39-cp39-macosx_10_9_universal2.whl
Collecting umap-learn>=0.5.0 (from bertopic)
  Using cached umap-learn-0.5.5.tar.gz (90 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting scikit-learn>=0.22.2.post1 (from bertopic)
  Using cached scikit_learn-1.4.1.post1-cp39-cp39-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Using cached sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Using cached plotly-5.19.0-py3-none-any.whl.metadata (7.0 kB)
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 

## Topic Modeling with Wiki-Dataset

# Goals
- Match each description to its list of topics 
- Clean up list of topics to remove hallucinations
  - Use Vicuna and/or Mistral-Instruct
  - Can mess with top_k 
  - Can mess with temperature=0 to ask gpt to perform cleaning
- Clean Up Method #2
  - Find most common tags
  - For outlier tags send it to a model to see if the list of most common tags correlates to the provided tag
    - Need to find a method to distinguish between bad and good outlier 

In [2]:
from datasets import load_dataset
from bertopic import BERTopic 
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers #use a better method 

from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['article_text', 'topic'],
        num_rows: 22463
    })
})

In [3]:
#load in dataset
dataset = load_dataset("valurank/Topic_Classification", split='train')
dataset

Dataset({
    features: ['article_text', 'topic'],
    num_rows: 22463
})

In [4]:
descriptions = dataset['article_text']
descriptions[:10]

['NEWYou can now listen to Fox News articles! The Mercedes-Benz S-Class has always been a special car.Before it started using that name in 1972, brand’s top model was known as the Sonderklasse, which is German for "Special Class," denoting its position as the flagship of the fleet.It’s been used as a showcase for the latest technologies including new engines, airbags, anti-lock brakes and traction control, and the newest "S" follows in that tradition.Not the redesigned S-Class that launched last year, but the EQS sedan that’s now in showrooms and is Mercedes-Benz’s first purpose-built electric car. The EQS is the first purpose-built electric car from Mercedes-Benz (Mercedes-Benz)The automaker has made other electric vehicles, but on platforms shared with internal combustion engine models. The EQS is the first built on a dedicated EV chassis that will spawn other lines in the years to come.The EQS starts at $103,360, and no one would call that cheap, but it is around $9,000 less than th

#Attributes
1. Use CountVectorizer to remove stop_words
2. Used [Bert Mult-Label](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html#example)

In [5]:
topic_model = BERTopic(vectorizer_model=CountVectorizer(stop_words="english"))

In [6]:
topic_model.fit(descriptions[:1000])

<bertopic._bertopic.BERTopic at 0x103c1f190>

In [7]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,121,-1_said_like_song_just,"[said, like, song, just, new, music, im, peopl...",[By Mark SavageBBC Music CorrespondentImage ca...
1,0,112,0_film_million_films_like,"[film, million, films, like, barbie, movie, st...",[There’s finally been some movement on the liv...
2,1,94,1_england_cup_world_final,"[england, cup, world, final, win, match, socce...",[Socceroos coach Graham Arnold has called on P...
3,2,70,2_warriors_celtics_game_curry,"[warriors, celtics, game, curry, nba, finals, ...",[After a resounding victory in Game 3 at TD Ga...
4,3,47,3_heard_depp_trial_amber,"[heard, depp, trial, amber, jury, johnny, inte...",[Topline\nAmber Heard said she does not “blame...
5,4,42,4_sox_rangers_inning_innings,"[sox, rangers, inning, innings, game, said, ru...","[Published June 15, 2022 4:23AM Updated 9:28AM..."
6,5,36,5_open_golf_pga_liv,"[open, golf, pga, liv, tour, country, mickelso...",[NEWYou can now listen to Fox News articles! R...
7,6,36,6_music_album_like_song,"[music, album, like, song, songs, people, fest...","[Tauren Wells, who releases, his new album, 'J..."
8,7,33,7_griner_gantriis_team_said,"[griner, gantriis, team, said, brittney, teri,...",[NEWYou can now listen to Fox News articles! T...
9,8,33,8_lightyear_buzz_evans_disney,"[lightyear, buzz, evans, disney, toy, story, f...",[Register now for FREE unlimited access to Reu...


In [8]:
topic_distr, _ = topic_model.approximate_distribution(descriptions[:1000])
len(topic_distr)

1000

In [9]:
topic_model.visualize_distribution(topic_distr[1])

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# Topic Generation
- This code uses the word bags from each topic and passes them into an llm to parse out a topic 

Prompt: 

['england', 'cup', 'world', 'final', 'match', 'win', 'socceroos', 'league', 'players', 'group']

Given this set of word return a one concept classification of the article


In [16]:
from transformers import pipeline

pipe = pipeline(model="meta-llama/llama-2-7b-chat-hf")

Loading checkpoint shards: 100%|██████████| 2/2 [00:47<00:00, 23.69s/it]


In [17]:
def generate_topic_name(list_of_words):
  words = ', '.join([i for i in list_of_words])

  prompt = f"""Given the following set of words {words}, return at most three words that describe the generic topic of the article"""
  return pipe(prompt)[0]
  

In [23]:
topic_names = []
all_topics = topic_model.get_topics()
for i in range(1,20):
    list_of_words = []
    for word in all_topics[i]:
        list_of_words.append(word[0])
    topic_names.append(generate_topic_name(list_of_words))
    print(list_of_words)

['england', 'cup', 'world', 'final', 'win', 'match', 'socceroos', 'league', 'players', 'group']


KeyboardInterrupt: 

In [None]:
all_topics[1]

[('england', 0.01787814296082338),
 ('cup', 0.016965623358324484),
 ('world', 0.014833492911927709),
 ('final', 0.012123167488149015),
 ('win', 0.011648695460352404),
 ('match', 0.011584492898009687),
 ('socceroos', 0.011454175685328727),
 ('league', 0.010825269150387106),
 ('players', 0.010652916185838336),
 ('group', 0.010321853814448473)]

Attach Topic Tags to Each Description based on the topics it contains

In [14]:
#iterate through each description
#pull the tags that it is most familiar with

In [24]:
topic_names

[{'generated_text': 'Given the following set of words england, cup, world, final, win, match, socceroos, league, players, group, return at most three words that describe the generic topic of the article.\n\n1. England\n2. Cup\n3. World\n4. Final\n5. Win\n6. Match\n7. Socceroos\n8. League\n9. Players\n10. Group\n11. Return\n\nPlease select the three words that you think best describe the generic topic of the article.'}]