In [2]:
!pip install datasets
!pip install bertopic
!pip install langchain
!pip install ctransformers
!pip install nltk
!pip install sentencepiece



## Topic Modeling with Wiki-Dataset

# Goals
- Match each description to its list of topics 
- Clean up list of topics to remove hallucinations
  - Use Vicuna and/or Mistral-Instruct
  - Can experiment with `top_k`
  - Can set `temperature=0` when asking GPT to perform the cleaning
- Clean Up Method #2
  - Find most common tags
  - For each outlier tag, send it to a model to check whether it correlates with the list of most common tags
    - Need to find a method to distinguish between bad and good outliers

In [4]:
from datasets import load_dataset
from bertopic import BERTopic 
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers #use a better method 

from sklearn.feature_extraction.text import CountVectorizer

DatasetDict({
    train: Dataset({
        features: ['article_text', 'topic'],
        num_rows: 22463
    })
})

In [5]:
# Load only the 'train' split of the "valurank/Topic_Classification" dataset.
# The trailing bare expression displays the Dataset summary (features, row count).
dataset = load_dataset("valurank/Topic_Classification", split='train')
dataset

Dataset({
    features: ['article_text', 'topic'],
    num_rows: 22463
})

In [6]:
# Pull out the raw article texts; these are the documents fed to the topic model.
descriptions = dataset['article_text']
# Preview the first ten articles (full text, so the output is long).
descriptions[:10]

['NEWYou can now listen to Fox News articles! The Mercedes-Benz S-Class has always been a special car.Before it started using that name in 1972, brand’s top model was known as the Sonderklasse, which is German for "Special Class," denoting its position as the flagship of the fleet.It’s been used as a showcase for the latest technologies including new engines, airbags, anti-lock brakes and traction control, and the newest "S" follows in that tradition.Not the redesigned S-Class that launched last year, but the EQS sedan that’s now in showrooms and is Mercedes-Benz’s first purpose-built electric car. The EQS is the first purpose-built electric car from Mercedes-Benz (Mercedes-Benz)The automaker has made other electric vehicles, but on platforms shared with internal combustion engine models. The EQS is the first built on a dedicated EV chassis that will spawn other lines in the years to come.The EQS starts at $103,360, and no one would call that cheap, but it is around $9,000 less than th

# Attributes
1. Use CountVectorizer to remove stop words (`stop_words="english"`)
2. Use [BERTopic multi-label topic distributions](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html#example)

In [7]:
# Build the topic model with a CountVectorizer configured to drop English stop words.
# NOTE(review): no random seed is configured here, and the saved outputs further
# down disagree about topic 1 (one lists 'england', another 'warriors'), which
# suggests topic ids differ between runs — consider fixing a seed for reproducibility.
topic_model = BERTopic(vectorizer_model=CountVectorizer(stop_words="english"))

In [8]:
# Fit the topic model on the first 1000 descriptions only (a subset of the dataset).
topic_model.fit(descriptions[:1000])

<bertopic._bertopic.BERTopic at 0x171e76970>

In [9]:
# Show the per-topic summary table: topic id, document count, name,
# representation words and representative documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,123,-1_like_said_just_song,"[like, said, just, song, people, im, time, mus...",[By Mark SavageBBC Music CorrespondentImage ca...
1,0,110,0_film_million_films_like,"[film, million, films, like, barbie, movie, st...",[There’s finally been some movement on the liv...
2,1,88,1_england_cup_world_final,"[england, cup, world, final, match, socceroos,...",[Socceroos coach Graham Arnold has called on P...
3,2,70,2_warriors_celtics_game_curry,"[warriors, celtics, game, curry, nba, finals, ...",[After a resounding victory in Game 3 at TD Ga...
4,3,66,3_data_vehicles_crashes_systems,"[data, vehicles, crashes, systems, stars, mars...","[A Tesla logo is seen in Los Angeles, Californ..."
5,4,49,4_heard_depp_trial_amber,"[heard, depp, trial, amber, jury, johnny, inte...",[Topline\nAmber Heard said she does not “blame...
6,5,42,5_sox_rangers_inning_innings,"[sox, rangers, inning, innings, game, said, ru...","[Published June 15, 2022 4:23AM Updated 9:28AM..."
7,6,35,6_open_golf_pga_liv,"[open, golf, pga, liv, tour, country, mickelso...",[NEWYou can now listen to Fox News articles! R...
8,7,33,7_lightyear_buzz_evans_disney,"[lightyear, buzz, evans, disney, toy, story, f...",[Register now for FREE unlimited access to Reu...
9,8,29,8_pete_davidson_women_ed,"[pete, davidson, women, ed, images, like, gett...",[CelebrityPete DavidsonFor every person admitt...


In [10]:
# Approximate the topic distribution of each of the first 1000 descriptions.
# The second element of the returned pair is not used in this notebook; it is
# bound to a named throwaway rather than `_`, because assigning `_` in an
# IPython kernel shadows the automatic "last output" variable.
topic_distr, _topic_token_distr = topic_model.approximate_distribution(descriptions[:1000])
# Sanity check: expect one entry per input description.
len(topic_distr)

1000

In [11]:
# Plot the approximated topic distribution of a single entry: index 1 of
# topic_distr (presumably the second description — rows are assumed to follow
# the order of the input documents).
topic_model.visualize_distribution(topic_distr[1])

# Topic Generation
- This code takes the bag of top words for each topic and passes it to an LLM to derive a topic name

Prompt: 

['england', 'cup', 'world', 'final', 'match', 'win', 'socceroos', 'league', 'players', 'group']

Given this set of words, return a one-concept classification of the article


In [12]:
from transformers import pipeline

# Text-to-text generation pipeline using the google/flan-t5-large model;
# generate_topic_name() below sends its prompt through this pipeline.
pipe = pipeline("text2text-generation", model="google/flan-t5-large")

In [13]:
def generate_topic_name(list_of_words):
    """Ask the text2text pipeline for a short topic label for a set of words.

    Args:
        list_of_words: Iterable of strings, e.g. the top words of one topic.

    Returns:
        The first result produced by ``pipe`` for the prompt. In this notebook
        that is a dict of the form ``{'generated_text': ...}``, not a bare string.
    """
    # str.join accepts any iterable of strings, so no intermediate list is needed.
    words = ', '.join(list_of_words)
    prompt = f"""Given the following set of words {words}, return at most three words that describe the generic topic of the article"""

    # The pipeline returns a list of results; only the first one is used.
    results = pipe(prompt)
    return results[0]
  

In [15]:
# Generate a name for (at most) the first 20 topics from their top words.
topic_names = []
all_topics = topic_model.get_topics()

# Iterate over the topic ids that actually exist instead of a fixed range(20),
# which raises KeyError when the fitted model has fewer than 20 topics.
# Negative ids (the -1 row in the topic table, BERTopic's outlier bucket) are skipped.
topic_ids = sorted(topic_id for topic_id in all_topics if topic_id >= 0)[:20]

for topic_id in topic_ids:
    # Each topic is a list of (word, score) pairs; keep only the words.
    list_of_words = [word for word, _score in all_topics[topic_id]]
    topic_names.append(generate_topic_name(list_of_words))
    print(list_of_words)

['film', 'million', 'films', 'like', 'barbie', 'movie', 'star', 'new', 'world', 'story']
['england', 'cup', 'world', 'final', 'match', 'socceroos', 'league', 'win', 'players', 'play']
['warriors', 'celtics', 'game', 'curry', 'nba', 'finals', 'boston', 'golden', 'quarter', 'wiggins']
['data', 'vehicles', 'crashes', 'systems', 'stars', 'mars', 'space', 'new', 'nhtsa', 'tesla']
['heard', 'depp', 'trial', 'amber', 'jury', 'johnny', 'interview', 'said', 'defamation', 'guthrie']
['sox', 'rangers', 'inning', 'innings', 'game', 'said', 'runs', 'pitch', 'cubs', 'season']
['open', 'golf', 'pga', 'liv', 'tour', 'country', 'mickelson', 'club', 'mcilroy', 'said']
['lightyear', 'buzz', 'evans', 'disney', 'toy', 'story', 'film', 'movie', 'pixar', 'character']
['pete', 'davidson', 'women', 'ed', 'images', 'like', 'getty', 'love', 'cherry', 'life']
['griner', 'team', 'gantriis', 'teri', 'brittney', 'womens', 'detention', 'wnba', 'department', 'russian']
['spacey', 'sexual', 'assault', 'court', 'charges

In [None]:
# Inspect the (word, score) pairs stored for topic 1.
# NOTE(review): this cell has no execution count and its saved output lists the
# 'warriors' topic, whereas the loop output above shows 'england' words for
# topic 1 — the saved output looks stale; re-run after fitting.
all_topics[1]

[('warriors', 0.05565827089185594),
 ('celtics', 0.05456474815460534),
 ('game', 0.05168682827867708),
 ('curry', 0.035194949183678846),
 ('nba', 0.03387182432461687),
 ('finals', 0.03130945639182676),
 ('boston', 0.024335312243513048),
 ('golden', 0.021601180188983285),
 ('quarter', 0.021157467494892414),
 ('wiggins', 0.020912720345473555)]

Attach Topic Tags to Each Description based on the topics it contains

In [None]:
# TODO: iterate through each description
# and pull the topic tags it is most strongly associated with

In [16]:
# Display the generated topic names; each entry is the pipeline's result dict
# with a 'generated_text' key rather than a plain string.
topic_names

[{'generated_text': 'barbie in a new world starring a new generation of stars and a new'},
 {'generated_text': 'australian rules football team socceroos players play in a match against engl'},
 {'generated_text': 'boston celtics vs. nba warriors in the quarterfinals'},
 {'generated_text': 'new nhtsa vehicle crashes into space'},
 {'generated_text': 'johnny depp said he was defamed by amber gut'},
 {'generated_text': 'cubs said they will not pitch to the sox in the first inning of their'},
 {'generated_text': 'rory mcilroy said he was disappointed with mickelson after'},
 {'generated_text': 'buzz lightyear and friends disney animated feature film and animated shorts'},
 {'generated_text': 'ed scott davidson and his wife getty getty cherry'},
 {'generated_text': 'brittney grener wnba team russian detention'},
 {'generated_text': 'actor kevin spacey charged with sexual assault and rape in london'},
 {'generated_text': 'bts rm'},
 {'generated_text': 'sam and britney spears at a wedding wit