In [1]:
!pip install datasets
!pip install bertopic
!pip install langchain
!pip install ctransformers
!pip install nltk





In [2]:
from datasets import load_dataset
from bertopic import BERTopic
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

dataset = load_dataset("hugginglearners/netflix-shows", split="train")

In [9]:
topic_model = BERTopic(nr_topics=20)
topics, probabilities = topic_model.fit_transform(dataset['description'])

In [10]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4708,-1_the_to_and_of,"[the, to, and, of, in, his, her, with, an, for]","[Overtaken by the death of her beloved, a grie..."
1,0,779,0_and_the_standup_comedian,"[and, the, standup, comedian, in, of, comedy, ...",[Chris Rock takes the stage in Brooklyn for a ...
2,1,557,1_their_to_and_school,"[their, to, and, school, of, the, for, in, hig...",[Four friends shake up their lives when they m...
3,2,428,2_his_her_to_and,"[his, her, to, and, with, an, in, the, when, of]","[After his father's passing, a teenager sets o..."
4,3,427,3_of_to_the_his,"[of, to, the, his, detective, in, murder, an, ...",[An ex-con is just getting his life back on tr...
5,4,304,4_the_of_to_in,"[the, of, to, in, and, by, her, their, haunted...",[After losing their first child in an accident...
6,5,264,5_the_to_bheem_his,"[the, to, bheem, his, and, of, rangers, martia...","[When Dholakpur’s princess is kidnapped, Bheem..."
7,6,261,6_and_christmas_the_to,"[and, christmas, the, to, of, with, friends, s...",[A new pack of Pup Star pooches stumbles upon ...
8,7,209,7_war_the_of_to,"[war, the, of, to, agent, terrorist, in, cia, ...","[During World War II, British forces launch an..."
9,8,204,8_and_the_music_of,"[and, the, music, of, this, band, dancer, danc...","[From child prodigy to iconic music producer, ..."


In [11]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stopwords.append('the')
stopwords.append('and')
stopwords.append('to')
stopwords.append('of')
stopwords.append('in')
stopwords.append('with')
stopwords.append('an')
stopwords.append('The')
stopwords.append('And')
stopwords.append('To')
stopwords.append('Of')
stopwords.append('In')
stopwords.append('With')
stopwords.append('An')
stopwords.append('A')
stopwords.append('As')
stopwords.append('as')

descriptions = dataset['description']

def remove_stopwords(text):
    output= ' '.join([i for i in text.split() if i not in stopwords])
    return output

processed_descriptions = [remove_stopwords(text) for text in descriptions]
processed_descriptions[:10]

['father nears end life, filmmaker Kirsten Johnson stages death inventive comical ways help face inevitable.',
 'After crossing paths party, Cape Town teen sets prove whether private-school swimming star sister abducted birth.',
 'protect family powerful drug lord, skilled thief Mehdi expert team robbers pulled violent deadly turf war.',
 'Feuds, flirtations toilet talk go among incarcerated women Orleans Justice Center New Orleans gritty reality series.',
 'city coaching centers known train India’s finest collegiate minds, earnest unexceptional student friends navigate campus life.',
 'arrival charismatic young priest brings glorious miracles, ominous mysteries renewed religious fervor dying town desperate believe.',
 "Equestria's divided. But bright-eyed hero believes Earth Ponies, Pegasi Unicorns pals — and, hoof heart, she’s determined prove it.",
 'On photo shoot Ghana, American model slips back time, becomes enslaved plantation bears witness agony ancestral past.',
 "talented bat

In [12]:
updated_topic_model = BERTopic(nr_topics=20)
topics, probabilities = updated_topic_model.fit_transform(processed_descriptions)
updated_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4464,-1_life_young_when_love,"[life, young, when, love, new, family, two, fr...","[After experiencing tragic loss, woman must re..."
1,0,998,0_murder_crime_police_cop,"[murder, crime, police, cop, detective, when, ...","[Eight years young man framed murder, up-and-c..."
2,1,983,1_standup_comedian_comedy_special,"[standup, comedian, comedy, special, comic, do...",[John Mulaney kid pals tackle existential topi...
3,2,506,2_school_high_student_friends,"[school, high, student, friends, students, col...",[Degrassi's next generation high-tech newbies ...
4,3,457,3_father_family_woman_young,"[father, family, woman, young, single, man, wh...","[1979, single bohemian mom Dorothea, hoping he..."
5,4,267,4_earth_space_planet_sea,"[earth, space, planet, sea, crew, science, sci...","[After falling wormhole, space-dwelling teen m..."
6,5,209,5_christmas_dog_holiday_santa,"[christmas, dog, holiday, santa, friends, fun,...",[Madagascar goes wild holiday spirit set Valen...
7,6,174,6_soccer_team_prize_football,"[soccer, team, prize, football, compete, athle...","[competition show, contestants try earn $1 mil..."
8,7,161,7_bheem_rangers_save_evil,"[bheem, rangers, save, evil, powers, when, sup...",[When two evil entities kidnap princess plot b...
9,8,109,8_chef_food_cooking_chefs,"[chef, food, cooking, chefs, dishes, culinary,...",[female food blogger hired personal chef young...


In [17]:
def generate_topic_name(list_of_words):

    llm = CTransformers(model='models\llama-2-7b-chat.ggmlv3.q8_0.bin',
                        model_type='llama',
                        config={'max_new_tokens': 256,
                                'temperature': 0.01})

    words = ', '.join([i for i in list_of_words])

    template = """
Given the following words that describe a genre of Netflix shows\films, return a one to two word name for the genre. Your response
should be one to two words with no other text:\n\"{words}\"
    """

    prompt = PromptTemplate(input_variables = ['words'], template=template)

    response = llm(prompt.format(words=words))

    return response

In [18]:
topic_names = []
all_topics = topic_model.get_topics()
for i in range(len(all_topics) - 1):
    list_of_words = []
    for word in all_topics[i]:
        list_of_words.append(word[0])
    topic_names.append(generate_topic_name(list_of_words))

In [35]:
all_topics[14]

[('contestants', 0.08927503220670463),
 ('competition', 0.07348370732356461),
 ('prize', 0.06710982451482556),
 ('compete', 0.06584896015661276),
 ('show', 0.06096761344217699),
 ('for', 0.056810949391311934),
 ('this', 0.05540364481809931),
 ('in', 0.052369597782706105),
 ('artists', 0.051267988150305796),
 ('win', 0.04701476240106029)]

In [37]:
topic_names[14]

'\nThe genre is: Reality TV'