<a href="https://colab.research.google.com/github/MatthewHsu1/RAGsystemOpenAI/blob/main/Open_AI_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets pandas openai pymongo

In [None]:
from datasets import load_dataset
import pandas as pd

# Load the "AIatMongoDB/embedded_movies" dataset through the `datasets` library.
dataset = load_dataset("AIatMongoDB/embedded_movies")

# Convert the 'train' split into a pandas DataFrame for the rest of the notebook.
train_split = dataset['train']
dataset_df = pd.DataFrame(train_split)

# Show the first five rows as the cell output.
dataset_df.head(n=5)

In [None]:
# Drop rows that have no 'plot' value and remove the existing 'plot_embedding'
# column from the frame.
# errors='ignore' keeps this cell re-runnable: dataset_df is rebound here, so on
# a second run the column is already gone and a plain drop() would raise KeyError.
dataset_df = (
    dataset_df
    .dropna(subset=['plot'])
    .drop(columns=['plot_embedding'], errors='ignore')
)
dataset_df.head(5)

In [None]:
import openai
from google.colab import userdata

# Name of the model passed to every embeddings request made by get_embedding.
EMBEDDING_MODEL = 'text-embedding-3-small'

# The API key is fetched from Colab's userdata store under the name 'open_ai',
# so it never appears in the notebook itself.
openai.api_key = userdata.get('open_ai')

def get_embedding(text):
  """Return the OpenAI embedding for ``text`` using ``EMBEDDING_MODEL``.

  Args:
    text: The string to embed.

  Returns:
    The ``embedding`` of the first item in the API response, or ``None`` when
    ``text`` is not a non-empty string or when the request raises.
  """
  # Check the type before the truthiness: values such as pandas.NA or a NumPy
  # array raise on `not value`, so testing the type first lets every
  # non-string input take the "return None" path instead of crashing.
  if not isinstance(text, str) or not text:
    return None

  try:
    response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
    return response.data[0].embedding
  except Exception as e:
    # Deliberately best-effort: a single failed request is reported and yields
    # None rather than aborting the caller.
    print(f'Error in get_embedding: {e}')
    return None

# Run get_embedding on each 'plot' value and store the results in a new
# 'plot_embedding_optimised' column.
plot_embeddings = dataset_df['plot'].apply(get_embedding)
dataset_df['plot_embedding_optimised'] = plot_embeddings

# Preview the frame, now including the new column.
dataset_df.head()

In [7]:
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """Create a MongoDB client for ``mongo_uri`` and verify the server is reachable.

  Args:
    mongo_uri: MongoDB connection string.

  Returns:
    A connected ``pymongo.MongoClient``, or ``None`` if the connection check
    raises ``pymongo.errors.ConnectionFailure``.
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri)
    # MongoClient() does not block on connecting, so constructing it proves
    # nothing about reachability. A cheap 'ping' forces a real round trip and
    # surfaces ConnectionFailure here instead of on the first query.
    client.admin.command('ping')
    print('Connection to MongoDB successful')
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f'Connection failed: {e}')
    # Release the client's resources; it is unusable for this notebook.
    if client is not None:
      client.close()
    return None

# Read the connection string from Colab's userdata store and fail fast when it
# is empty, instead of printing a message and continuing without a URI.
mongo_uri = userdata.get('mongo_url')
if not mongo_uri:
  raise ValueError("MongoDB URI not set: add a 'mongo_url' secret to this Colab notebook")

# get_mongo_client returns None on a connection failure; stop here with a clear
# error rather than hitting a TypeError when subscripting None below.
mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  raise ConnectionError('Could not connect to MongoDB; data ingestion aborted')

db = mongo_client['movies']
collection = db['movie_collection']

# One document per DataFrame row.
# NOTE(review): insert_many only appends, so re-running this cell inserts the
# same records again; clear the collection first if a clean re-run is needed.
documents = dataset_df.to_dict('records')
collection.insert_many(documents)

print('Data ingestion into MongoDB completed')

Connection to MongoDB successful
Data ingestion into MongoDB completed


In [12]:
def vector_search(user_query, collection, limit=5, num_candidates=150, index_name="vector_index"):
  """Embed ``user_query`` and run a ``$vectorSearch`` aggregation on ``collection``.

  Args:
    user_query: Text to embed with ``get_embedding`` and search for.
    collection: Collection object whose ``aggregate`` method runs the pipeline.
    limit: Value passed as the ``limit`` of the ``$vectorSearch`` stage.
    num_candidates: Value passed as ``numCandidates`` of the ``$vectorSearch`` stage.
    index_name: Name of the vector index to query.

  Returns:
    A list of result documents projected to ``plot``, ``title``, ``genres``
    and ``score`` (the ``vectorSearchScore`` metadata). If the query could not
    be embedded, an error message string is returned instead of a list, so
    callers must check the type before iterating.
  """
  query_embedding = get_embedding(user_query)

  if query_embedding is None:
    return "Invalid query or embedding generation failed."

  search_stage = {
      "$vectorSearch": {
          "index": index_name,
          "queryVector": query_embedding,
          # Field written by the ingestion step of this notebook.
          "path": "plot_embedding_optimised",
          "numCandidates": num_candidates,
          "limit": limit
      }
  }
  project_stage = {
      "$project": {
          "plot": 1,
          "title": 1,
          "genres": 1,
          "score": {
              "$meta": "vectorSearchScore"
          }
      }
  }

  # Materialise the cursor so callers get a plain list.
  return list(collection.aggregate([search_stage, project_stage]))

In [37]:
def handle_user_query(query, collection, model='gpt-3.5-turbo'):
  """Answer ``query`` with a chat completion grounded in vector-search results.

  Args:
    query: The user's question.
    collection: Collection passed through to ``vector_search``.
    model: Chat model name sent to the completions API.

  Returns:
    A ``(response, source_information)`` tuple: the model's reply and the
    "Title / Plot" context text it was given. When ``vector_search`` reports
    a failure, the tuple is ``(error_message, '')`` and no completion is
    requested.
  """
  matches = vector_search(query, collection)

  # vector_search returns an error message (a str) instead of a list when the
  # query cannot be embedded. Iterating that string would yield characters and
  # fail on .get(), so surface the message to the caller instead.
  if isinstance(matches, str):
    return matches, ''

  # One "Title: ..., Plot: ..." line per retrieved document.
  search_result = ''.join(
      f"Title: {result.get('title', 'N/A')}, Plot: {result.get('plot', 'N/A')} \n"
      for result in matches
  )

  completion = openai.chat.completions.create(
      model=model,
      messages=[
          {'role': "system", 'content': 'You are a movie recommendation system that picks the best fit movie out the choices you are given. You are not allowed to pick anything outside of your choices. And do not mention anyhting along the lines "based on the context provided"'},
          {'role': 'user', 'content': 'Answer this user query: ' + query + ' using the following context: ' + search_result}
      ]
  )

  return completion.choices[0].message.content, search_result

In [38]:
# Example question: retrieve matching movies, then ask the chat model to pick one.
query = "What is a good movie related to bees?"
answer = handle_user_query(query, collection)
response, source_information = answer

# Print the model's reply followed by the context it was given.
print(f"Response: {response}")
print(f"Source Information: \n{source_information}")

Response: Starship Troopers 2: Hero of the Federation
Source Information: 
Title: Killer Bean 2: The Party, Plot: 3D Animated coffee beans get into a gun fight over a loud party next door. Bean1 cant sleep and so crashes the party and starts shooting all the Beans. Bean2 avoids being shot and sneaks ... 
Title: Paragraph 78, Plot: This is a wonderful movie about a group of men frontier the Nashville area who like to make puppets out of real animals 
Title: Starship Troopers 2: Hero of the Federation, Plot: In the sequel to Paul Verhoeven's loved/reviled sci-fi film, a group of troopers taking refuge in an abandoned outpost after fighting alien bugs, failing to realize that more danger lays in wait. 
Title: Tremors II: Aftershocks, Plot: When an army of Graboids - giant, carnivorous underground worms - threaten the Petromaya oil refinery in Mexico, its owners call on Earl Bassett, who once helped kill four of the creatures ... 
Title: The Animatrix, Plot: The Animatrix is a collection o