# Collection To JSON Schema

In [2]:
!pip install pymongo[srv]

Collecting pymongo[srv]
  Downloading pymongo-4.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (669 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.0/670.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.6.1 pymongo-4.7.1


In [4]:
import pymongo
from pymongo import MongoClient
import json

In [5]:
# Connect to your MongoDB Atlas cluster
# Replace the connection string with your own
client = MongoClient("mongodb+srv://<username>:<password>@cluster0.r8pymzo.mongodb.net/")

In [6]:
# Access the sample_mflix database
db = client.sample_mflix

In [7]:
# Function to extract schema for a given collection
def get_collection_schema(collection_name):
    collection = db[collection_name]
    sample_document = collection.find_one()
    schema = {}
    for field, value in sample_document.items():
        field_type = type(value).__name__
        if isinstance(value, list):
            if value:
                if isinstance(value[0], dict):
                    inner_fields = {}
                    for inner_doc in value:
                        inner_fields.update(get_collection_schema(collection_name))
                    schema[field] = {"type": "array", "description": f"List of objects with fields: {inner_fields}"}
                else:
                    inner_type = type(value[0]).__name__
                    schema[field] = {"type": "array", "description": f"List of {inner_type}s"}
            else:
                schema[field] = {"type": "array"}
        else:
            schema[field] = {"type": field_type}
    return schema

In [8]:
# Function to generate the JSON schema for all collections
def generate_schema():
    collections_schema = {}
    for collection_name in db.list_collection_names():
        collections_schema[collection_name] = {"name": collection_name, "description": f"Contains detailed information about {collection_name}."}
        collections_schema[collection_name]["fields"] = get_collection_schema(collection_name)
    return collections_schema

In [9]:
# Generate the schema
schema = generate_schema()

In [10]:
schema

{'sessions': {'name': 'sessions',
  'description': 'Contains detailed information about sessions.',
  'fields': {'_id': {'type': 'ObjectId'},
   'user_id': {'type': 'str'},
   'jwt': {'type': 'str'}}},
 'movies': {'name': 'movies',
  'description': 'Contains detailed information about movies.',
  'fields': {'_id': {'type': 'ObjectId'},
   'plot': {'type': 'str'},
   'genres': {'type': 'array', 'description': 'List of strs'},
   'runtime': {'type': 'int'},
   'cast': {'type': 'array', 'description': 'List of strs'},
   'poster': {'type': 'str'},
   'title': {'type': 'str'},
   'fullplot': {'type': 'str'},
   'languages': {'type': 'array', 'description': 'List of strs'},
   'released': {'type': 'datetime'},
   'directors': {'type': 'array', 'description': 'List of strs'},
   'rated': {'type': 'str'},
   'awards': {'type': 'dict'},
   'lastupdated': {'type': 'str'},
   'year': {'type': 'int'},
   'imdb': {'type': 'dict'},
   'countries': {'type': 'array', 'description': 'List of strs'},
 

In [49]:
# # Write the schema to a JSON file - Everything in a single file
# # Don't Run This
# with open("movie.json", "w") as f:
#     json.dump({"collections": schema}, f, indent=4)

# print("Schema generation complete. Schema saved as 'movie.json'.")

In [11]:
# Function to generate the schema for a single collection and write it to a JSON file
def generate_collection_schema(collection_name):
    schema = {"name": collection_name, "description": f"Contains detailed information about {collection_name}."}
    schema["fields"] = get_collection_schema(collection_name)
    with open(f"{collection_name}.json", "w") as f:
        json.dump(schema, f, indent=4)

In [12]:
# Generate schema for each collection and write to separate files
for collection_name in db.list_collection_names():
    generate_collection_schema(collection_name)

# LLM

In [13]:
!pip install openai

Collecting openai
  Downloading openai-1.25.1-py3-none-any.whl (312 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/312.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py

In [50]:
import os
import time
import json
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt):
    client = OpenAI(
        api_key = "<OpenAI_API-Key>", # Replace with your OpenAI API Key
    )

    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens = 20
    )

    return chat_completion

# Load JSON file
with open('movies.json', 'r') as f:
    data = json.load(f)

# Extract field names and descriptions
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Generate description for each field with rate limiting
field_generated_descriptions = {}
for key, value in field_descriptions.items():
    description = generate_description(f"Generate a single line description for the {key} field: {value}")
    field_generated_descriptions[key] = description

# Print final descriptions
print("Final Descriptions:")
print(field_generated_descriptions)

Final Descriptions:
{'_id': ChatCompletion(id='chatcmpl-9La4fqfqsbH6lQJZ2P16mNWatDJXS', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='A unique identifier for each document in a database.', role='assistant', function_call=None, tool_calls=None))], created=1714930585, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=10, prompt_tokens=19, total_tokens=29)), 'plot': ChatCompletion(id='chatcmpl-9La4fpz5BjKF0juY8lKr7h8dDiH0h', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='A young woman navigates the cutthroat world of high fashion modeling while uncovering dark secrets about the', role='assistant', function_call=None, tool_calls=None))], created=1714930585, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_a450710239', usage=CompletionUsage(completion_tokens=20, prompt_tokens

In [51]:
# Separate keys and values
keys = list(field_generated_descriptions.keys())
values = list(field_generated_descriptions.values())

# Print keys and values
print("Keys:", keys)
print("Values:", values)

Keys: ['_id', 'plot', 'genres', 'runtime', 'cast', 'poster', 'title', 'fullplot', 'languages', 'released', 'directors', 'rated', 'awards', 'lastupdated', 'year', 'imdb', 'countries', 'type', 'tomatoes', 'num_mflix_comments']
Values: [ChatCompletion(id='chatcmpl-9La4fqfqsbH6lQJZ2P16mNWatDJXS', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='A unique identifier for each document in a database.', role='assistant', function_call=None, tool_calls=None))], created=1714930585, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=10, prompt_tokens=19, total_tokens=29)), ChatCompletion(id='chatcmpl-9La4fpz5BjKF0juY8lKr7h8dDiH0h', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='A young woman navigates the cutthroat world of high fashion modeling while uncovering dark secrets about the', role='assistant', function_

In [52]:
description = []

for i in (range(len(values))):
  # Use regular expressions to extract the content
  content_match = re.search(r"content='(.*?)'", str(values[i]))

  if content_match:
      content = content_match.group(1)
      description.append(content)
      # print("Extracted Content:", content)
  else:
    description.append("")

In [53]:
print("Description:")
print(description)

Description:
['A unique identifier for each document in a database.', 'A young woman navigates the cutthroat world of high fashion modeling while uncovering dark secrets about the', 'A collection of different types or styles of music, literature, or art.', 'The duration in which a program or process is active and executing on a computer.', 'A list of names of the cast members starring in a production.', '"Captivating images that tell a story at a glance."', '"Revolutionizing the way we connect and communicate."', '"An in-depth narrative detailing the unfolding events and complexities of the story."', 'A collection of string values representing different programming languages.', 'A diverse and innovative collection of new music that pushes boundaries and challenges conventional genres.', 'A collection of names and contact information for directors in various industries.', '"Revealing the satisfaction level of a product or service through user feedback and reviews."', 'Honoring excellenc

In [54]:
field_descriptions_final = {}
# Update the dictionary with new values
for key, new_value in zip(field_generated_descriptions.keys(), description):
    field_descriptions_final[key] = new_value

In [56]:
# Print the final dictionary of descriptions
print("Final Dictionary of Descriptions:")
print(field_descriptions_final)

Final Dictionary of Descriptions:
{'_id': 'A unique identifier for each document in a database.', 'plot': 'A young woman navigates the cutthroat world of high fashion modeling while uncovering dark secrets about the', 'genres': 'A collection of different types or styles of music, literature, or art.', 'runtime': 'The duration in which a program or process is active and executing on a computer.', 'cast': 'A list of names of the cast members starring in a production.', 'poster': '"Captivating images that tell a story at a glance."', 'title': '"Revolutionizing the way we connect and communicate."', 'fullplot': '"An in-depth narrative detailing the unfolding events and complexities of the story."', 'languages': 'A collection of string values representing different programming languages.', 'released': 'A diverse and innovative collection of new music that pushes boundaries and challenges conventional genres.', 'directors': 'A collection of names and contact information for directors in vari

In [57]:
import json

# Read data from movies.json file
with open('movies.json', 'r') as f:
    existing_json_data = json.load(f)

# Update the JSON data with descriptions
for field, description in field_descriptions_final.items():
    existing_json_data["fields"][field]["description"] = description

In [58]:
existing_json_data

{'name': 'movies',
 'description': 'Contains detailed information about movies.',
 'fields': {'_id': {'type': 'ObjectId',
   'description': 'A unique identifier for each document in a database.'},
  'plot': {'type': 'str',
   'description': 'A young woman navigates the cutthroat world of high fashion modeling while uncovering dark secrets about the'},
  'genres': {'type': 'array',
   'description': 'A collection of different types or styles of music, literature, or art.'},
  'runtime': {'type': 'int',
   'description': 'The duration in which a program or process is active and executing on a computer.'},
  'cast': {'type': 'array',
   'description': 'A list of names of the cast members starring in a production.'},
  'poster': {'type': 'str',
   'description': '"Captivating images that tell a story at a glance."'},
  'title': {'type': 'str',
   'description': '"Revolutionizing the way we connect and communicate."'},
  'fullplot': {'type': 'str',
   'description': '"An in-depth narrative 

In [59]:
# Write the updated JSON data back to the file
with open('movies_new.json', 'w') as f:
    json.dump(existing_json_data, f, indent=4)

print("Descriptions updated and written to movies_new.json file.")

Descriptions updated and written to movies_new.json file.


# Trial & Error - Don't Run This

In [63]:
import os
import time
import json
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt):
    client = OpenAI(
        api_key = "<OpenAI_API-Key>", # Replace with your OpenAI API Key
    )

    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=50
    )

    return chat_completion.data.text

# Load JSON file
with open('movies.json', 'r') as f:
    data = json.load(f)

# Extract field names and descriptions
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Generate description for each field with rate limiting
field_generated_descriptions = {}
for key, value in field_descriptions.items():
    description = generate_description(f"Generate a description for the {key} field: {value}")
    field_generated_descriptions[key] = description.split(':', 1)[-1].strip() if description else ""  # Extract content part
    time.sleep(60 / 200)  # Delay to respect the RPD limit (200 requests per minute)

# Print final descriptions
print("Final Descriptions:")
print(field_generated_descriptions)

AttributeError: 'ChatCompletion' object has no attribute 'data'

In [62]:
import os
import time
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt):
    client = OpenAI(
        api_key = "<OpenAI_API-Key>", # Replace with your OpenAI API Key
    )

    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=50
    )

    return chat_completion.choices[0].message['content']

# Load JSON file
with open('movies.json', 'r') as f:
    data = json.load(f)

# Extract field names and descriptions
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Generate description for each field with rate limiting
field_generated_descriptions = {}
for key, value in field_descriptions.items():
    description = generate_description(f"Generate a description for the {key} field: {value}")
    field_generated_descriptions[key] = description.split(':', 1)[-1].strip()  # Extract content part
    time.sleep(60 / 200)  # Delay to respect the RPD limit (200 requests per minute)

# Create a final description combining all field descriptions
final_description = " ".join(field_generated_descriptions.values())

# Print final description
print("Final Description:", final_description)


TypeError: 'ChatCompletionMessage' object is not subscriptable

In [23]:
import re

# Provided output
output_text = "ChatCompletion(id='chatcmpl-9LZaHgLkyeosjEamDYNusxNKMY704', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Unique identifier for each document in a database collection.', role='assistant', function_call=None, tool_calls=None))], created=1714928701, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=10, prompt_tokens=19, total_tokens=29))"

# Use regular expressions to extract the content
content_match = re.search(r"content='(.*?)'", output_text)

if content_match:
    content = content_match.group(1)
    print("Extracted Content:", content)
else:
    print("No content found.")


Extracted Content: Unique identifier for each document in a database collection.


In [None]:
# Create a final description combining all field descriptions
final_description = " ".join(field_generated_descriptions.values())

# Print final description
print("Final Description:", final_description)