# Get-Collection-Schema Re-write

In [1]:
!pip install pymongo[srv]

Collecting pymongo[srv]
  Downloading pymongo-4.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (670 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.0/670.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.6.1 pymongo-4.7.2


In [2]:
import pymongo
from pymongo import MongoClient
import json

In [11]:
# Connect to your MongoDB Atlas cluster
# Replace the connection string with your own
client = MongoClient("mongodb+srv://<username>:<password>@cluster0.r8pymzo.mongodb.net/")

In [12]:
# Access the sample_mflix database
db = client.sample_mflix

In [13]:
# Function to extract schema for a given collection
def get_collection_schema(collection_name):
    collection = db[collection_name]
    sample_document = collection.find_one()
    schema = {}
    for field, value in sample_document.items():
        field_type = type(value).__name__
        if isinstance(value, list):
            schema[field] = {"type": "array"}
        else:
            schema[field] = {"type": field_type}
    return schema

In [14]:
# Function to generate the JSON schema for all collections
def generate_schema():
    collections_schema = {}
    for collection_name in db.list_collection_names():
        collections_schema[collection_name] = {"name": collection_name, "description": f"Contains detailed information about {collection_name}."}
        collections_schema[collection_name]["fields"] = get_collection_schema(collection_name)
    return collections_schema

In [15]:
# Generate the schema
schema = generate_schema()

In [None]:
schema

In [16]:
# Function to generate the schema for a single collection and write it to a JSON file
def generate_collection_schema(collection_name):
    schema = {"name": collection_name, "description": f"Contains detailed information about {collection_name}."}
    schema["fields"] = get_collection_schema(collection_name)
    with open(f"{collection_name}.json", "w") as f:
        json.dump(schema, f, indent=4)

In [17]:
# Generate schema for each collection and write to separate files
for collection_name in db.list_collection_names():
    generate_collection_schema(collection_name)

Adding Examples

In [None]:
movies_collection = db.movies

# Fetch a document from the movies collection
movie_document = movies_collection.find_one()

In [None]:
# Helper function to structure the data according to the schema
def add_example_data(schema, document):
    for field in schema['fields']:
        if field in document:
            schema['fields'][field]['example'] = document[field]
        else:
            schema['fields'][field]['example'] = None  # Set None if field not found in document
    return schema

In [None]:
# Load the existing movies.json file
with open('movies.json', 'r') as file:
    movies_schema = json.load(file)

In [None]:
# Add example data to the schema
updated_schema = add_example_data(movies_schema, movie_document)

In [None]:
from bson import ObjectId
from datetime import datetime

# Define a custom JSON encoder for ObjectId and datetime
class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, ObjectId):
            return str(obj)
        if isinstance(obj, datetime):
            return obj.isoformat()
        return json.JSONEncoder.default(self, obj)

In [None]:
# Save the updated schema back to the JSON file
with open('movies_added_example.json', 'w') as file:
    json.dump(updated_schema, file, indent=4, cls=JSONEncoder)

LLM - OpenAI

In [18]:
!pip install openai

Collecting openai
  Downloading openai-1.30.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [50]:
import time
import json
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt):
    client = OpenAI(
        api_key = <API_KEY>,
    )

    chat_completion = client.chat.completions.create(
        model = "gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens = 300
    )

    return chat_completion

In [70]:
# Load JSON file
with open('movies.json', 'r') as f:
    data = json.load(f)

# Extract field names and descriptions
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Construct the schema description
json_schema = json.dumps(fields)

# Generate description for each field
prompt_str = f"""{json_schema} is the schema of collections, fields in a mongodb database.
Description refers to what information the collection or field holds.
Give the description as
field name : description
"""

description_generated = generate_description(prompt_str)



In [71]:
description_generated

ChatCompletion(id='chatcmpl-9PCHiU6fwnwAn5wNtSBPexc0O4izO', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="_id : unique identifier for each document\nplot : short summary of the movie's plot\ngenres : array of genres the movie belongs to\nruntime : duration of the movie in minutes\ncast : array of actors and actresses in the movie\nnum_mflix_comments : number of comments on the movie in the Mflix platform\nposter : URL of the movie's poster image\ntitle : title of the movie\nfullplot : detailed description of the movie's plot\ncountries : array of countries where the movie was filmed or produced\nreleased : release date of the movie\ndirectors : array of directors of the movie\nwriters : array of writers of the movie\nawards : information about any awards won by the movie\nlastupdated : date of last update to the document\nyear : year the movie was released\nimdb : information about the movie from the IMDb database\ntype : type of m

In [72]:
description_string = str(description)

In [73]:
description_string

"_id : unique identifier for the document\nplot : brief summary of the movie's plot\ngenres : array of genres that the movie belongs to\nruntime : duration of the movie in minutes\ncast : array of actors/actresses starring in the movie\nnum_mflix_comments : number of comments or reviews on the movie\nposter : URL to the movie's poster image\ntitle : title of the movie\nfullplot : detailed description of the movie's plot\ncountries : array of countries where the movie was filmed or produced\nreleased : release date of the movie\ndirectors : array of directors who worked on the movie\nwriters : array of writers who contributed to the movie\nawards : dictionary containing information about any awards the movie has received\nlastupdated : timestamp indicating when the document was last updated\nyear : year when the movie was released\nimdb : dictionary containing IMDb rating and other related information\ntype : type of movie (e.g. movie, series)\ntomatoes : dictionary containing ratings a

In [74]:
print(description_string)

_id : unique identifier for the document
plot : brief summary of the movie's plot
genres : array of genres that the movie belongs to
runtime : duration of the movie in minutes
cast : array of actors/actresses starring in the movie
num_mflix_comments : number of comments or reviews on the movie
poster : URL to the movie's poster image
title : title of the movie
fullplot : detailed description of the movie's plot
countries : array of countries where the movie was filmed or produced
released : release date of the movie
directors : array of directors who worked on the movie
writers : array of writers who contributed to the movie
awards : dictionary containing information about any awards the movie has received
lastupdated : timestamp indicating when the document was last updated
year : year when the movie was released
imdb : dictionary containing IMDb rating and other related information
type : type of movie (e.g. movie, series)
tomatoes : dictionary containing ratings and reviews from the

In [76]:
description_list = description_string.split("\n")

In [77]:
# description_list

In [78]:
# # Initialize an empty dictionary
# description_dict = {}

# # Iterate over the list and split each string into key and value
# for item in description_list:
#     key, value = item.split(" : ", 1)
#     description_dict[key.strip()] = value.strip()

# # Print the resulting dictionary
# print(description_dict)


In [79]:
import json

# Read data from movies.json file
with open('movies.json', 'r') as f:
    existing_json_data = json.load(f)

for item in description_list:
    field, description_value = item.split(" : ", 1)
    existing_json_data["fields"][field]["description"] = description_value

In [80]:
existing_json_data

{'name': 'movies',
 'description': 'Contains detailed information about movies.',
 'fields': {'_id': {'type': 'ObjectId',
   'description': 'unique identifier for the document'},
  'plot': {'type': 'str', 'description': "brief summary of the movie's plot"},
  'genres': {'type': 'array',
   'description': 'array of genres that the movie belongs to'},
  'runtime': {'type': 'int',
   'description': 'duration of the movie in minutes'},
  'cast': {'type': 'array',
   'description': 'array of actors/actresses starring in the movie'},
  'num_mflix_comments': {'type': 'int',
   'description': 'number of comments or reviews on the movie'},
  'poster': {'type': 'str', 'description': "URL to the movie's poster image"},
  'title': {'type': 'str', 'description': 'title of the movie'},
  'fullplot': {'type': 'str',
   'description': "detailed description of the movie's plot"},
  'countries': {'type': 'array',
   'description': 'array of countries where the movie was filmed or produced'},
  'released

In [81]:
# Write the updated JSON data back to the file
with open('movies_rewrite.json', 'w') as f:
    json.dump(existing_json_data, f, indent=4)

print("Descriptions updated and written to movies_rewrite.json file.")

Descriptions updated and written to movies_rewrite.json file.


LLM - TogetherAI

In [82]:
import os
import time
import json
from together import Together

TOGETHER_API_KEY = <TOGETHER_API_KEY>

# Function to generate description using Together.ai Language Model
def generate_description(prompt):
    client = Together(api_key = TOGETHER_API_KEY)

    response = client.completions.create(
      # model = "codellama/CodeLlama-34b-Python-hf",
      # model = "codellama/CodeLlama-34b-Instruct-hf",
      model = "Phind/Phind-CodeLlama-34B-v2",
      prompt = prompt,
    )

    # print(response.choices[0].text)

    return response.choices[0].text