In [14]:
from transformers import pipeline

In [15]:
#model for question answering 
pipe = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [16]:
# reading in csvs
import csv
import chromadb

title_ids = []
genres = []
description = []

#read in genres and descriptions automagically
with open('shows.csv', 'r') as file:
  reader = csv.DictReader(file)
  #only first 100 items
  for i, row in enumerate(reader):
      if i >= 100:
          break
      genres.append(row['listed_in'])
      description.append(row['description'])
      title_ids.append(row['show_id'])    

In [17]:
client = chromadb.PersistentClient(path='./')

In [18]:
#database chroma
shows_vs = client.get_or_create_collection(name="netflix")

In [19]:
# #load netflix shows into the vectorstore 
# shows_vs.upsert(
#   documents=description,
#   metadatas = [{'source': genre} for genre in genres],
#   ids=title_ids
# )

In [20]:
shows_vs.query(query_texts=description[0], n_results=5)

#structure of a query return
return_query_example = {'ids': [['s1', 's25', 's13', 's92', 's74']],
 'distances': [[0.0,
   1.2635724544525146,
   1.3824687004089355,
   1.406484842300415,
   1.4076160192489624]],
 'metadatas': [[{'source': 'Documentaries'},
   {'source': 'Comedies, International Movies, Romantic Movies'},
   {'source': 'Dramas, International Movies'},
   {'source': 'Documentaries, International Movies'},
   {'source': 'Dramas, International Movies'}]],
 'embeddings': None,
 'documents': [['As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.',
   'When the father of the man she loves insists that his twin sons marry twin sisters, a woman creates an alter ego that might be a bit too convincing.',
   'After most of her family is murdered in a terrorist bombing, a young woman is unknowingly lured into joining the very group that killed them.',
   "This documentary traces the capture of serial killer Guy Georges through the tireless work of two women: a police chief and a victim's mother.",
   'When a powerful businesswoman’s political ambitions are threatened by her underworld connections, the ensuing power struggle could cost her everything.']],
 'uris': None,
 'data': None}

In [21]:
genres_vs = client.get_or_create_collection(name="genre_store")

In [22]:
import uuid
groups = {} #map of descriptions to genre

return_gpt_query = {'score': 0.8608009219169617, 'start': 6, 'end': 12, 'answer': '3.1415'}
#generate a tag and compare that tag to other generated tags OR do a call back

first_desc = description[0]
qa_return = pipe(question="What is a good set of genres for this tag", context=first_desc)

#add to the map 
groups[first_desc] = qa_return['answer']
#add to the genre_vs
genres_vs.upsert(   
  documents=[qa_return['answer']],
  metadatas = [{'source': first_desc}],
  ids=str(uuid.uuid4())
)

In [27]:
genres_vs_return = genres_vs.query(query_texts=first_desc, n_results=1)
genres_vs_return

{'ids': [['d75bc75a-f2b9-4126-a369-3f790ace9686']],
 'distances': [[1.6926115262590287]],
 'metadatas': [[{'source': 'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'}]],
 'embeddings': None,
 'documents': [['comical']],
 'uris': None,
 'data': None}

In [33]:
#iterate through each description
for desc in description[1:10]:
  #check if the description has a topic that is it is familiar to 
  genres_vs_return = genres_vs.query(query_texts=desc, n_results=1)
  distance = genres_vs_return['distances'][0][0]
  
  #if distance is less than 0.5 then set its genre to the found genre 
  if distance < 0.5:
    groups[desc] = genres_vs_return['documents'][0][0]
  else:
    #generate a tag with llama
    #compare that tag to previous tags 
    #otherwise keep it going along 
    qa_return = pipe(question="What is a good set of genres for this tag", context=desc)
    #simliarity search the answer 
    genres_vs_return = genres_vs.query(query_texts=qa_return['answer'], n_results=1)
    distance = genres_vs_return['distances'][0][0]
    #check if distance less than 0.5
    if distance < 0.5:
      #if it is less than 0.5 then add it to the genre 
      groups[desc] = genres_vs_return['documents'][0][0]
    else:
      groups[desc] = qa_return['answer']
      #add new genre to genrevs
      genres_vs.upsert(   
        documents=[qa_return['answer']],
        metadatas = [{'source': desc}],
        ids=str(uuid.uuid4())
      )

In [34]:
print(groups)

{'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.': 'comical', 'After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.': 'swimming', 'To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.': 'drug lord, skilled thief', 'Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.': 'Feuds, flirtations and toilet talk', 'In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.': 'collegiate', 'The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town 