# Collection To JSON Schema

In [1]:
!pip install pymongo[srv]

Collecting pymongo[srv]
  Downloading pymongo-4.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (670 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.0/670.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.6.1 pymongo-4.7.2


In [2]:
import pymongo
from pymongo import MongoClient
import json

In [3]:
# Connect to your MongoDB Atlas cluster.
# The connection string embeds the database password, so it is taken from the
# MONGODB_URI environment variable (or typed at a hidden prompt) instead of
# being pasted into the notebook. Expected shape:
#   mongodb+srv://<username>:<password>@<cluster-host>/?retryWrites=true&w=majority
import os
from getpass import getpass

MONGODB_URI = os.environ.get("MONGODB_URI") or getpass("MongoDB connection string: ")
client = MongoClient(MONGODB_URI)

In [4]:
# Access the sample_mflix database
db = client.sample_mflix

In [5]:
# Function to extract schema for a given collection
def get_collection_schema(collection_name):
    """Infer a field schema for a collection from one sample document.

    A single document is fetched with ``find_one()`` from the module-level
    ``db`` and every top-level field is mapped to a small schema entry.

    Args:
        collection_name: Name of the collection to look up in ``db``.

    Returns:
        dict mapping each field name to ``{"type": ...}``; list fields get
        ``"type": "array"`` plus, when non-empty, a ``"description"`` of the
        element type. An empty dict is returned when the collection has no
        documents.
    """
    sample_document = db[collection_name].find_one()
    if sample_document is None:
        # find_one() yields None when the collection is empty.
        return {}
    return _document_schema(sample_document)


def _document_schema(document):
    """Build the schema entries for the fields of one (sub)document."""
    schema = {}
    for field, value in document.items():
        if not isinstance(value, list):
            schema[field] = {"type": type(value).__name__}
        elif not value:
            # Nothing to inspect, so only the container type is known.
            schema[field] = {"type": "array"}
        elif isinstance(value[0], dict):
            # Describe the embedded documents themselves. Re-querying the
            # parent collection here would recurse forever.
            inner_fields = {}
            for inner_doc in value:
                if isinstance(inner_doc, dict):
                    inner_fields.update(_document_schema(inner_doc))
            schema[field] = {"type": "array", "description": f"List of objects with fields: {inner_fields}"}
        else:
            inner_type = type(value[0]).__name__
            schema[field] = {"type": "array", "description": f"List of {inner_type}s"}
    return schema

In [6]:
# Function to generate the JSON schema for all collections
def generate_schema():
    """Collect name, description and inferred fields for every collection in ``db``.

    Returns:
        dict keyed by collection name; each value holds "name", "description"
        and "fields" (the result of ``get_collection_schema``).
    """
    def describe(name):
        # One entry per collection, keys in the order name -> description -> fields.
        return {
            "name": name,
            "description": f"Contains detailed information about {name}.",
            "fields": get_collection_schema(name),
        }

    return {name: describe(name) for name in db.list_collection_names()}

In [7]:
# Generate the schema
schema = generate_schema()

In [8]:
schema

{'sessions': {'name': 'sessions',
  'description': 'Contains detailed information about sessions.',
  'fields': {'_id': {'type': 'ObjectId'},
   'user_id': {'type': 'str'},
   'jwt': {'type': 'str'}}},
 'movies': {'name': 'movies',
  'description': 'Contains detailed information about movies.',
  'fields': {'_id': {'type': 'ObjectId'},
   'plot': {'type': 'str'},
   'genres': {'type': 'array', 'description': 'List of strs'},
   'runtime': {'type': 'int'},
   'cast': {'type': 'array', 'description': 'List of strs'},
   'poster': {'type': 'str'},
   'title': {'type': 'str'},
   'fullplot': {'type': 'str'},
   'languages': {'type': 'array', 'description': 'List of strs'},
   'released': {'type': 'datetime'},
   'directors': {'type': 'array', 'description': 'List of strs'},
   'rated': {'type': 'str'},
   'awards': {'type': 'dict'},
   'lastupdated': {'type': 'str'},
   'year': {'type': 'int'},
   'imdb': {'type': 'dict'},
   'countries': {'type': 'array', 'description': 'List of strs'},
 

In [None]:
# # Write the schema to a JSON file - Everything in a single file
# # Don't Run This
# with open("movie.json", "w") as f:
#     json.dump({"collections": schema}, f, indent=4)

# print("Schema generation complete. Schema saved as 'movie.json'.")

In [9]:
# Function to generate the schema for a single collection and write it to a JSON file
def generate_collection_schema(collection_name):
    """Write the inferred schema of one collection to ``<collection_name>.json``.

    The file is created in the current working directory and holds the
    collection name, a generic description and the fields returned by
    ``get_collection_schema``, indented by four spaces.
    """
    schema = {
        "name": collection_name,
        "description": f"Contains detailed information about {collection_name}.",
        "fields": get_collection_schema(collection_name),
    }
    output_path = f"{collection_name}.json"
    with open(output_path, "w") as f:
        json.dump(schema, f, indent=4)

In [10]:
# Generate schema for each collection and write to separate files
for collection_name in db.list_collection_names():
    generate_collection_schema(collection_name)

# LLM - OpenAI

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.25.2-py3-none-any.whl (312 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 

In [None]:
import os
import time
import json
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt, model="gpt-3.5-turbo", max_tokens=20):
    """Request a chat completion from OpenAI for a single user prompt.

    Args:
        prompt: Text sent as the content of the one "user" message.
        model: Name of the chat model to query.
        max_tokens: Cap on the number of generated tokens. Replies that hit
            the cap come back cut off mid-sentence; raise it if that happens.

    Returns:
        The response object from ``client.chat.completions.create``. The
        generated text is at ``.choices[0].message.content``.

    Raises:
        KeyError: If the OPENAI_API_KEY environment variable is not set.
    """
    client = OpenAI(
        # Read the key from the environment so the secret never lives in the notebook.
        api_key=os.environ["OPENAI_API_KEY"],
    )

    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=max_tokens,
    )

    return chat_completion

# Load the schema file for the movies collection
with open('movies.json', 'r') as f:
    data = json.load(f)

# Map every field name to its existing description ('' when the field has none)
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Ask the model for a description of each field, one request per field.
# No delay or retry is applied between requests.
# Each stored value is whatever generate_description returns, not just the text.
field_generated_descriptions = {}
for key, value in field_descriptions.items():
    description = generate_description(f"Generate a single line description for the {key} field: {value}")
    field_generated_descriptions[key] = description

# Print the raw mapping of field name -> generate_description result
print("Final Descriptions:")
print(field_generated_descriptions)

Final Descriptions:
{'_id': ChatCompletion(id='chatcmpl-9LtMSZcOwt7ZsivxOjkp1kaifc30I', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='A unique identifier for each document in a MongoDB collection.', role='assistant', function_call=None, tool_calls=None))], created=1715004724, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=11, prompt_tokens=19, total_tokens=30)), 'plot': ChatCompletion(id='chatcmpl-9LtMTEQD9Hl3ofXETL6GxBnt2baPk', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='A young woman discovers she has the ability to communicate with ghosts and must navigate the challenges that come with', role='assistant', function_call=None, tool_calls=None))], created=1715004725, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_toke

In [None]:
# Separate keys and values
keys = list(field_generated_descriptions.keys())
values = list(field_generated_descriptions.values())

# Print keys and values
print("Keys:", keys)
print("Values:", values)

Keys: ['_id', 'plot', 'genres', 'runtime', 'cast', 'poster', 'title', 'fullplot', 'languages', 'released', 'directors', 'rated', 'awards', 'lastupdated', 'year', 'imdb', 'countries', 'type', 'tomatoes', 'num_mflix_comments']
Values: [ChatCompletion(id='chatcmpl-9LtMSZcOwt7ZsivxOjkp1kaifc30I', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='A unique identifier for each document in a MongoDB collection.', role='assistant', function_call=None, tool_calls=None))], created=1715004724, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=11, prompt_tokens=19, total_tokens=30)), ChatCompletion(id='chatcmpl-9LtMTEQD9Hl3ofXETL6GxBnt2baPk', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='A young woman discovers she has the ability to communicate with ghosts and must navigate the challenges that come with', role='a

In [None]:
import re

In [None]:
description = []

for completion in values:
  # Read the generated text straight from the response object. Matching
  # content='...' against the object's repr breaks as soon as the text holds an
  # apostrophe: the repr then switches to double quotes (no match) or escapes
  # the quote (match cut short), and escapes such as \n come back literally.
  content = completion.choices[0].message.content
  # Keep one entry per response so positions still line up with the keys.
  description.append(content or "")

In [None]:
print("Description:")
print(description)

Description:
['A unique identifier for each document in a MongoDB collection.', 'A young woman discovers she has the ability to communicate with ghosts and must navigate the challenges that come with', 'A list of strings representing different genres of music, movies, literature, or other forms of creative expression', 'The runtime field specifies the total duration of a film, television show, or video game.', 'A list of strings representing the cast members of a film or production.', '"Captivating visuals that will draw you in"', '"Creative solutions for modern living."', 'A detailed summary of the entire storyline and narrative of a movie or TV show.', 'A collection of string values representing different programming languages.', '"The released field indicates the date that the product or software was officially made available to the public for use', 'A collection of strings representing a list of directors.', 'The rated field is a system where items or individuals are evaluated and 

In [None]:
field_descriptions_final = {}
# Pair each field name with its extracted text by position. This relies on
# `description` having been built by walking the values of
# field_generated_descriptions in order, so both sequences line up.
for key, new_value in zip(field_generated_descriptions.keys(), description):
    field_descriptions_final[key] = new_value

In [None]:
# Print the final dictionary of descriptions
print("Final Dictionary of Descriptions:")
print(field_descriptions_final)

Final Dictionary of Descriptions:
{'_id': 'A unique identifier for each document in a MongoDB collection.', 'plot': 'A young woman discovers she has the ability to communicate with ghosts and must navigate the challenges that come with', 'genres': 'A list of strings representing different genres of music, movies, literature, or other forms of creative expression', 'runtime': 'The runtime field specifies the total duration of a film, television show, or video game.', 'cast': 'A list of strings representing the cast members of a film or production.', 'poster': '"Captivating visuals that will draw you in"', 'title': '"Creative solutions for modern living."', 'fullplot': 'A detailed summary of the entire storyline and narrative of a movie or TV show.', 'languages': 'A collection of string values representing different programming languages.', 'released': '"The released field indicates the date that the product or software was officially made available to the public for use', 'directors': '

In [None]:
import json

# Read data from movies.json file
with open('movies.json', 'r') as f:
    existing_json_data = json.load(f)

# Update the JSON data with descriptions.
# The loop variable is deliberately not called `description`: that name holds
# the list of extracted texts built in an earlier cell, and rebinding it to a
# string here would corrupt a re-run of the cell that zips field names with it.
for field, field_description in field_descriptions_final.items():
    existing_json_data["fields"][field]["description"] = field_description

In [None]:
existing_json_data

{'name': 'movies',
 'description': 'Contains detailed information about movies.',
 'fields': {'_id': {'type': 'ObjectId',
   'description': 'A unique identifier for each document in a MongoDB collection.'},
  'plot': {'type': 'str',
   'description': 'A young woman discovers she has the ability to communicate with ghosts and must navigate the challenges that come with'},
  'genres': {'type': 'array',
   'description': 'A list of strings representing different genres of music, movies, literature, or other forms of creative expression'},
  'runtime': {'type': 'int',
   'description': 'The runtime field specifies the total duration of a film, television show, or video game.'},
  'cast': {'type': 'array',
   'description': 'A list of strings representing the cast members of a film or production.'},
  'poster': {'type': 'str',
   'description': '"Captivating visuals that will draw you in"'},
  'title': {'type': 'str',
   'description': '"Creative solutions for modern living."'},
  'fullplot

In [None]:
# Save the updated schema to a separate file, movies_new.json.
# movies.json is only read above and is left unchanged.
with open('movies_new.json', 'w') as f:
    json.dump(existing_json_data, f, indent=4)

print("Descriptions updated and written to movies_new.json file.")

Descriptions updated and written to movies_new.json file.


# Few Shot Prompting - OpenAI

In [None]:
import os
import time
import json
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt, model="gpt-3.5-turbo", max_tokens=20):
    """Request a chat completion from OpenAI for a single user prompt.

    Args:
        prompt: Text sent as the content of the one "user" message.
        model: Name of the chat model to query.
        max_tokens: Cap on the number of generated tokens; replies that reach
            it are cut off.

    Returns:
        The response object from ``client.chat.completions.create``. The
        generated text is at ``.choices[0].message.content``.

    Raises:
        KeyError: If the OPENAI_API_KEY environment variable is not set.
    """
    client = OpenAI(
        # Read the key from the environment so the secret never lives in the notebook.
        api_key=os.environ["OPENAI_API_KEY"],
    )

    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=max_tokens,
    )

    return chat_completion

# Load the schema file for the movies collection
with open('movies.json', 'r') as f:
    data = json.load(f)

# Map every field name to its existing description ('' when the field has none)
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Ask the model for a description of each field, one request per field.
# No delay or retry is applied between requests.
field_generated_descriptions = {}
for key, value in field_descriptions.items():
  # Few-shot prompt: two worked examples steer the model toward movie-specific
  # wording. Only `key` is interpolated; `value` is not used by this prompt.
  prompt_str = f"""Generate a single line description for the {key} field such that it gives the meaning of the field in the context of movies.
                For example: if key is 'tomatoes' give the description like 'Rotten Tomatoes specific information.',
                if key is 'runtime' give the description like 'The runtime of the movie in minutes.'
                Don't consider a database. Give a general description for the key such that it suits for any movie taken"""

  # Earlier prompt variants, kept for reference:
  # prompt_str = f"""Generate a single line description for the {key} field such that it gives the meaning of the field in the context of movies"""

  # prompt_str = f"Generate a single line description for the {key} field: {value}"

  # description = generate_description(f"Generate a single line description for the {key} field: {value}")
  description = generate_description(prompt_str)
  field_generated_descriptions[key] = description

# Print the raw mapping of field name -> generate_description result
print("Final Descriptions:")
print(field_generated_descriptions)

Final Descriptions:
{'_id': ChatCompletion(id='chatcmpl-9Lu8V46yoNLIySO0LlMNvTcOa5H4U', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Unique identifier for a specific movie entry.', role='assistant', function_call=None, tool_calls=None))], created=1715007703, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=8, prompt_tokens=98, total_tokens=106)), 'plot': ChatCompletion(id='chatcmpl-9Lu8WuOPZZdX2tKXRrLcmrtyB4NZS', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='A concise summary of the main events and conflicts that drive the storyline of the movie.', role='assistant', function_call=None, tool_calls=None))], created=1715007704, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=17, prompt_tokens=97, total_tokens=114)), 'ge

In [None]:
# Separate keys and values
keys = list(field_generated_descriptions.keys())
values = list(field_generated_descriptions.values())

# Print keys and values
print("Keys:", keys)
print("Values:", values)

Keys: ['_id', 'plot', 'genres', 'runtime', 'cast', 'poster', 'title', 'fullplot', 'languages', 'released', 'directors', 'rated', 'awards', 'lastupdated', 'year', 'imdb', 'countries', 'type', 'tomatoes', 'num_mflix_comments']
Values: [ChatCompletion(id='chatcmpl-9Lu8V46yoNLIySO0LlMNvTcOa5H4U', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Unique identifier for a specific movie entry.', role='assistant', function_call=None, tool_calls=None))], created=1715007703, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=8, prompt_tokens=98, total_tokens=106)), ChatCompletion(id='chatcmpl-9Lu8WuOPZZdX2tKXRrLcmrtyB4NZS', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='A concise summary of the main events and conflicts that drive the storyline of the movie.', role='assistant', function_call=None, tool_calls=None))

In [None]:
import re

description = []

for completion in values:
  # Read the generated text straight from the response object. Matching
  # content='...' against the object's repr breaks as soon as the text holds an
  # apostrophe: the repr then switches to double quotes (no match) or escapes
  # the quote (match cut short), and escapes such as \n come back literally.
  content = completion.choices[0].message.content
  # Keep one entry per response so positions still line up with the keys.
  description.append(content or "")

In [None]:
print("Description:")
print(description)

Description:
['Unique identifier for a specific movie entry.', 'A concise summary of the main events and conflicts that drive the storyline of the movie.', 'The genre or genres of the movie, categorizing it based on style, tone, and themes.', 'The duration of the movie from beginning to end in minutes.', 'A list of actors and actresses who portray characters in the film.', 'The genre of the movie.', 'The title of the movie.', 'A detailed summary of the entire storyline and plot developments of the movie from beginning to end.', 'The languages spoken in the movie.', 'The date when the movie was initially released in theaters.', 'The directors field provides information about the individual or individuals responsible for overseeing and guiding the artistic vision and execution', 'The MPAA rating of the movie providing age-appropriate content levels.', 'Recognition and accolades received by the movie and its cast and crew.', 'The date and time when the movie information was last updated.'

In [None]:
field_descriptions_final = {}
# Pair each field name with its extracted text by position. This relies on
# `description` having been built by walking the values of
# field_generated_descriptions in order, so both sequences line up.
for key, new_value in zip(field_generated_descriptions.keys(), description):
    field_descriptions_final[key] = new_value

# Print the final dictionary of descriptions
print("Final Dictionary of Descriptions:")
print(field_descriptions_final)

Final Dictionary of Descriptions:
{'_id': 'Unique identifier for a specific movie entry.', 'plot': 'A concise summary of the main events and conflicts that drive the storyline of the movie.', 'genres': 'The genre or genres of the movie, categorizing it based on style, tone, and themes.', 'runtime': 'The duration of the movie from beginning to end in minutes.', 'cast': 'A list of actors and actresses who portray characters in the film.', 'poster': 'The genre of the movie.', 'title': 'The title of the movie.', 'fullplot': 'A detailed summary of the entire storyline and plot developments of the movie from beginning to end.', 'languages': 'The languages spoken in the movie.', 'released': 'The date when the movie was initially released in theaters.', 'directors': 'The directors field provides information about the individual or individuals responsible for overseeing and guiding the artistic vision and execution', 'rated': 'The MPAA rating of the movie providing age-appropriate content level

In [None]:
import json

# Read data from movies.json file
with open('movies.json', 'r') as f:
    existing_json_data = json.load(f)

# Update the JSON data with descriptions.
# The loop variable is deliberately not called `description`: that name holds
# the list of extracted texts built in an earlier cell, and rebinding it to a
# string here would corrupt a re-run of the cell that zips field names with it.
for field, field_description in field_descriptions_final.items():
    existing_json_data["fields"][field]["description"] = field_description

# Save the updated schema to a separate file; movies.json is left unchanged
with open('movies_new.json', 'w') as f:
    json.dump(existing_json_data, f, indent=4)

print(existing_json_data)

print("Descriptions updated and written to movies_new.json file.")

{'name': 'movies', 'description': 'Contains detailed information about movies.', 'fields': {'_id': {'type': 'ObjectId', 'description': 'Unique identifier for a specific movie entry.'}, 'plot': {'type': 'str', 'description': 'A concise summary of the main events and conflicts that drive the storyline of the movie.'}, 'genres': {'type': 'array', 'description': 'The genre or genres of the movie, categorizing it based on style, tone, and themes.'}, 'runtime': {'type': 'int', 'description': 'The duration of the movie from beginning to end in minutes.'}, 'cast': {'type': 'array', 'description': 'A list of actors and actresses who portray characters in the film.'}, 'poster': {'type': 'str', 'description': 'The genre of the movie.'}, 'title': {'type': 'str', 'description': 'The title of the movie.'}, 'fullplot': {'type': 'str', 'description': 'A detailed summary of the entire storyline and plot developments of the movie from beginning to end.'}, 'languages': {'type': 'array', 'description': 'T

# Trying with together.ai

In [1]:
!pip3 install --upgrade together

# !pip3 install together



In [None]:
# Don't Run This

# import os
# from together import Together

# TOGETHER_API_KEY = <TOGETHER_API_KEY>

# client = Together(api_key = TOGETHER_API_KEY)

# response = client.completions.create(
#     model="codellama/CodeLlama-34b-Python-hf",
#     prompt="Write a Next.js component with TailwindCSS for a header component.",
# )

# print(response.choices[0].text)

In [5]:
import os
import time
import json
from together import Together

# Function to generate description using Together.ai Language Model
def generate_description(prompt, model="Phind/Phind-CodeLlama-34B-v2"):
    """Request a text completion from Together.ai and return the generated text.

    Args:
        prompt: Prompt passed to the completions endpoint.
        model: Model identifier. Earlier experiments used
            "codellama/CodeLlama-34b-Python-hf" and
            "codellama/CodeLlama-34b-Instruct-hf".

    Returns:
        The ``text`` of the first choice in the response.

    Raises:
        KeyError: If the TOGETHER_API_KEY environment variable is not set.
    """
    # Read the key from the environment so the secret never lives in the notebook.
    client = Together(api_key=os.environ["TOGETHER_API_KEY"])

    response = client.completions.create(
        model=model,
        prompt=prompt,
    )

    return response.choices[0].text


# Load JSON file
with open('movies.json', 'r') as f:
    data = json.load(f)

# Extract field names and descriptions
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

In [11]:
# Ask the model for a description of each field, one request per field.
# No delay or retry is applied between requests.
field_generated_descriptions = {}
for key, value in field_descriptions.items():
  # Earlier prompt variants, kept for reference:
  # prompt_str = f"""Generate a single line description for the {key} field such that it gives the meaning of the field in the context of movies. Limit number of words generated to 10 words.
  #               For example: if key is 'tomatoes' give the description like 'Rotten Tomatoes specific information.',
  #               if key is 'runtime' give the description like 'The runtime of the movie in minutes.'
  #               Don't consider a database. Give a general description for the key such that it suits for any movie taken"""

  # prompt_str = f"""Generate a single line description for the {key} field such that it gives the meaning of the field in the context of movies.
  # Don't generate code or many contents. Give a single sentence as output"""

  # prompt_str = f"""Give a single line description for the {key} field such that it gives the meaning of the field in the context of movies. Limit number of words generated to 10 words"""

  # Prompt in use: only `key` is interpolated; `value` is not used.
  prompt_str = f"""Give a single line description for the meaning of {key} field in the context of movies. Limit number of words generated to 10 words. Don't generate extra contents"""

  # Progress marker: one line per field as its request is sent
  print(key)

  description = generate_description(prompt_str)
  field_generated_descriptions[key] = description

_id
plot
genres
runtime
cast
poster
title
fullplot
languages
released
directors
rated
awards
lastupdated
year
imdb
countries
type
tomatoes
num_mflix_comments


In [12]:
# Print final descriptions
print("Final Descriptions:")

for key, value in field_generated_descriptions.items():
  print(key)
  print(value)

Final Descriptions:
_id
._The _id field represents the unique identifier for each movie in the context of movies.
plot
.The plot field in the context of movies refers to the main events and developments that occur in the storyline of a film.
genres
.The genres field in the context of movies refers to the classification of films based on their themes, styles, or storylines.
runtime
.runtime field refers to the duration of a movie, usually expressed in minutes.
cast
.cast field in movies context refers to the actors or actresses who portray the characters in the film.
poster
.The poster field in the context of movies refers to a promotional image or visual representation of the movie, typically showcasing the main actors or a scene from the film.
title
.The title field represents the name of the movie in the context of movies.
fullplot
.The fullplot field in the context of movies refers to a comprehensive summary of the movie's storyline, including its characters, setting, and main event

In [13]:
# Separate keys and values
keys = list(field_generated_descriptions.keys())
values = list(field_generated_descriptions.values())

# Print keys and values
print("Keys:", keys)
print("Values:", values)

Keys: ['_id', 'plot', 'genres', 'runtime', 'cast', 'poster', 'title', 'fullplot', 'languages', 'released', 'directors', 'rated', 'awards', 'lastupdated', 'year', 'imdb', 'countries', 'type', 'tomatoes', 'num_mflix_comments']
Values: ['._The _id field represents the unique identifier for each movie in the context of movies.', '.The plot field in the context of movies refers to the main events and developments that occur in the storyline of a film.', '.The genres field in the context of movies refers to the classification of films based on their themes, styles, or storylines.', '.runtime field refers to the duration of a movie, usually expressed in minutes.', '.cast field in movies context refers to the actors or actresses who portray the characters in the film.', '.The poster field in the context of movies refers to a promotional image or visual representation of the movie, typically showcasing the main actors or a scene from the film.', '.The title field represents the name of the movi

In [14]:
import json

# Read data from movies.json file
with open('movies.json', 'r') as f:
    existing_json_data = json.load(f)

# Store each generated text as the field's description.
# field_generated_descriptions is used directly, with no extraction step.
for field, description in field_generated_descriptions.items():
    existing_json_data["fields"][field]["description"] = description

# Save the updated schema to a separate file, movies_new_together_ai.json.
# movies.json is left unchanged.
with open('movies_new_together_ai.json', 'w') as f:
    json.dump(existing_json_data, f, indent=4)

# print(existing_json_data)

print("Descriptions updated and written to movies_new_together_ai.json file.")

Descriptions updated and written to movies_new_together_ai.json file.


# BERT Embedding and Cosine Similarity

In [2]:
!pip install transformers scikit-learn



In [3]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import json
import unittest

In [29]:
class TestMovieDescriptionSimilarity(unittest.TestCase):
    """Compare reference and generated field descriptions via BERT embeddings.

    For every field of the movies schema, the description in
    ``movies_json.json`` (reference) and the one in
    ``movies_new_together_ai.json`` (generated) are embedded with
    ``bert-base-uncased`` and must have a cosine similarity above
    ``SIMILARITY_THRESHOLD``.
    """

    # Pairs whose [CLS] embeddings are not more similar than this fail the test.
    SIMILARITY_THRESHOLD = 0.5

    def setUp(self):
        """Load both schema files and the pretrained BERT tokenizer and model."""
        with open('movies_json.json', 'r') as f1, open('movies_new_together_ai.json', 'r') as f2:
            self.movies_data_1 = json.load(f1)
            self.movies_data_2 = json.load(f2)

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        # Inference only: eval mode turns dropout off so embeddings are repeatable.
        self.model.eval()

    def test_movie_description_similarity(self):
        """Each field's two descriptions must be strings with similarity above the threshold."""
        fields_ref = self.movies_data_1['collections']['movies']['fields']
        fields_together = self.movies_data_2['fields']

        # Pair descriptions by field name. Pairing by position would compare
        # unrelated fields as soon as the two files list fields in a different
        # order, and would silently drop the tail of the longer file.
        shared_fields = [field for field in fields_ref if field in fields_together]
        self.assertTrue(shared_fields, "The two schema files have no field names in common")

        for field in shared_fields:
            # subTest reports the offending field name and keeps checking the rest.
            with self.subTest(field=field):
                movie_desc_1 = fields_ref[field].get('description')
                movie_desc_2 = fields_together[field].get('description')

                # A missing description shows up here as None instead of a KeyError.
                self.assertIsInstance(movie_desc_1, str)
                self.assertIsInstance(movie_desc_2, str)

                movie_desc_embedding_1 = self._calculate_bert_embedding(movie_desc_1)
                movie_desc_embedding_2 = self._calculate_bert_embedding(movie_desc_2)

                similarity = self._calculate_cosine_similarity(movie_desc_embedding_1, movie_desc_embedding_2)
                print(f"Cosine Similarity between movie descriptions: {similarity}")
                self.assertGreater(
                    similarity,
                    self.SIMILARITY_THRESHOLD,
                    f"Similarity between movie descriptions is too low: {similarity}",
                )

    def _calculate_bert_embedding(self, text):
        """Return the final-layer [CLS] vector for ``text`` as a one-row numpy array."""
        # The tokenizer call adds [CLS]/[SEP] itself and truncates to BERT's
        # 512-token input limit, which a hand-built token list does not guard against.
        encoded = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

        with torch.no_grad():
            outputs = self.model(**encoded)

        last_hidden_states = outputs[0]
        # Position 0 of the sequence is the [CLS] token.
        return last_hidden_states[:, 0, :].numpy()

    def _calculate_cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two one-row 2-D arrays as a scalar."""
        return cosine_similarity(vec1, vec2)[0][0]

# Build the suite and run it with an explicit TextTestRunner so results are
# printed in the cell output. unittest.main() is avoided in a notebook: by default
# it parses sys.argv (the kernel's own arguments) and exits the interpreter.
test_runner = unittest.TextTestRunner()
suite = unittest.TestLoader().loadTestsFromTestCase(TestMovieDescriptionSimilarity)
result = test_runner.run(suite)


Cosine Similarity between movie descriptions: 0.8486945629119873
Cosine Similarity between movie descriptions: 0.7991851568222046
Cosine Similarity between movie descriptions: 0.7245012521743774
Cosine Similarity between movie descriptions: 0.7719172835350037
Cosine Similarity between movie descriptions: 0.8596510887145996
Cosine Similarity between movie descriptions: 0.6955277323722839
Cosine Similarity between movie descriptions: 0.8798936605453491
Cosine Similarity between movie descriptions: 0.8048529624938965
Cosine Similarity between movie descriptions: 0.7927318811416626
Cosine Similarity between movie descriptions: 0.7694969773292542
Cosine Similarity between movie descriptions: 0.8115817308425903
Cosine Similarity between movie descriptions: 0.7189486026763916
Cosine Similarity between movie descriptions: 0.7836775779724121
Cosine Similarity between movie descriptions: 0.8522384166717529
Cosine Similarity between movie descriptions: 0.883621335029602
Cosine Similarity between 

.
----------------------------------------------------------------------
Ran 1 test in 7.495s

OK


Cosine Similarity between movie descriptions: 0.7713451385498047


In [None]:
# class TestMovieDescriptionSimilarity(unittest.TestCase):
#     def setUp():
#         # Load movies JSON data
#         with open('movies.json', 'r') as f1, open('movies_new_together_ai.json', 'r') as f2:
#             movies_data_1 = json.load(f1)
#             movies_data_2 = json.load(f2)

#         # Initialize BERT tokenizer and model
#         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         model = BertModel.from_pretrained('bert-base-uncased')

#         return movies_data_1, movies_data_2, tokenizer, model

#     def test_movie_description_similarity():
#         movies_data_1, movies_data_2, tokenizer, model = TestMovieDescriptionSimilarity.setUp()

#         for movie1, movie2 in zip(movies_data_1, movies_data_2):
#             # field = "description"

#             print("Movie 1:\n", movie1["fields"][field]["description"])
#             print("Movie 2:\n", movie2["fields"][field]["description"])

#             movie_desc_1 = movie1["fields"][field]["description"]
#             movie_desc_2 = movie2["fields"][field]["description"]

#             # Check if both descriptions are strings
#             assert isinstance(movie_desc_1, str)
#             assert isinstance(movie_desc_2, str)

#             movie_desc_embedding_1 = _calculate_bert_embedding(movie_desc_1, tokenizer, model)
#             movie_desc_embedding_2 = _calculate_bert_embedding(movie_desc_2, tokenizer, model)

#             # Ensure that description embeddings are not empty
#             assert movie_desc_embedding_1 != []
#             assert movie_desc_embedding_2 != []

#             # Calculate cosine similarity between embeddings
#             similarity = _calculate_cosine_similarity(movie_desc_embedding_1, movie_desc_embedding_2)
#             print(f"Cosine Similarity between movie descriptions: {similarity}")
#             assert similarity > 0.5, f"Similarity between movie descriptions is too low: {similarity}"

# def _calculate_bert_embedding(text, tokenizer, model):
#     tokens = tokenizer.tokenize(text)
#     tokens = ['[CLS]'] + tokens + ['[SEP]']
#     token_ids = tokenizer.convert_tokens_to_ids(tokens)
#     token_tensor = torch.tensor([token_ids])

#     with torch.no_grad():
#         outputs = model(token_tensor)

#     last_hidden_states = outputs[0]
#     cls_embedding = last_hidden_states[:, 0, :].numpy()  # Convert to numpy array
#     return cls_embedding

# def _calculate_cosine_similarity(vec1, vec2):
#     return cosine_similarity(vec1, vec2)[0][0]

# # Manually run the tests
# unittest.main()



# Trial & Error - Don't Run This

In [None]:
import os
import time
import json
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt):
    """Ask the OpenAI chat model for a short completion of ``prompt``.

    Args:
        prompt: Text sent to the model as the single user message.

    Returns:
        The assistant's reply text, or an empty string when the response
        carries no text content.
    """
    # Read the key from the environment rather than embedding it in the notebook.
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=50,  # caps the reply length, so long answers are cut off
    )

    # The reply text is on the first choice's message object; a chat completion
    # has no `.data.text` attribute.
    content = chat_completion.choices[0].message.content
    return content or ""

# Load the schema JSON. Pin the encoding so the read does not depend on the
# platform's default text encoding.
with open('movies.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Map each field name to its existing description ('' when the entry has none)
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Pause between calls so the loop stays under 200 requests per minute.
REQUESTS_PER_MINUTE = 200
REQUEST_DELAY_SECONDS = 60 / REQUESTS_PER_MINUTE

# Generate a description for each field, one request per field
field_generated_descriptions = {}
for key, value in field_descriptions.items():
    description = generate_description(f"Generate a description for the {key} field: {value}")
    # Keep only the text after the first ':' (the whole text when there is no
    # colon); store '' when nothing came back.
    field_generated_descriptions[key] = description.split(':', 1)[-1].strip() if description else ""
    time.sleep(REQUEST_DELAY_SECONDS)

# Print final descriptions
print("Final Descriptions:")
print(field_generated_descriptions)

In [None]:
import os
import time
from openai import OpenAI

# Function to generate description using OpenAI Language Model
def generate_description(prompt):
    """Ask the OpenAI chat model for a short completion of ``prompt``.

    Args:
        prompt: Text sent to the model as the single user message.

    Returns:
        The assistant's reply text, or an empty string when the response
        carries no text content.
    """
    # Read the key from the environment rather than embedding it in the notebook.
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=50,  # caps the reply length, so long answers are cut off
    )

    # The message is an object, not a dict: read `.content` as an attribute
    # instead of subscripting with ['content'].
    content = chat_completion.choices[0].message.content
    return content or ""

# Load the schema JSON. Pin the encoding so the read does not depend on the
# platform's default text encoding.
with open('movies.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Map each field name to its existing description ('' when the entry has none)
fields = data['fields']
field_descriptions = {key: value.get('description', '') for key, value in fields.items()}

# Pause between calls so the loop stays under 200 requests per minute.
REQUESTS_PER_MINUTE = 200
REQUEST_DELAY_SECONDS = 60 / REQUESTS_PER_MINUTE

# Generate a description for each field, one request per field
field_generated_descriptions = {}
for key, value in field_descriptions.items():
    description = generate_description(f"Generate a description for the {key} field: {value}")
    # Keep only the text after the first ':' (the whole text when there is no
    # colon). Guard against an empty/None reply so .split is never called on it.
    field_generated_descriptions[key] = description.split(':', 1)[-1].strip() if description else ""
    time.sleep(REQUEST_DELAY_SECONDS)

# Create a final description combining all field descriptions
final_description = " ".join(field_generated_descriptions.values())

# Print final description
print("Final Description:", final_description)


In [None]:
import ast
import re

# Provided output
output_text = "ChatCompletion(id='chatcmpl-9LZaHgLkyeosjEamDYNusxNKMY704', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Unique identifier for each document in a database collection.', role='assistant', function_call=None, tool_calls=None))], created=1714928701, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_3b956da36b', usage=CompletionUsage(completion_tokens=10, prompt_tokens=19, total_tokens=29))"

# Matches the whole quoted literal after `content=`, in either quote style and
# with backslash escapes. A plain non-greedy '(.*?)' stops at the first
# apostrophe and never matches a double-quoted repr.
_CONTENT_LITERAL = re.compile(r"""content=('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")""")


def extract_message_content(repr_text):
    """Pull the message text out of a printed chat-completion repr.

    Args:
        repr_text: String form of a response, containing ``content=<quoted text>``.

    Returns:
        The decoded content string, or None when no quoted ``content=`` value
        is present (for example ``content=None``).
    """
    found = _CONTENT_LITERAL.search(repr_text)
    if found is None:
        return None
    literal = found.group(1)
    try:
        # Decode the literal so escapes such as \' and \n become real characters.
        return ast.literal_eval(literal)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to the raw text between the quotes.
        return literal[1:-1]


# Extract the content from the sample output
content = extract_message_content(output_text)

if content is not None:
    print("Extracted Content:", content)
else:
    print("No content found.")


Extracted Content: Unique identifier for each document in a database collection.


In [None]:
# Merge every generated field description into one space-separated string
description_parts = field_generated_descriptions.values()
final_description = " ".join(description_parts)

# Show the merged text
print("Final Description:", final_description)

BERT Embedding

In [11]:
from transformers import BertTokenizer, BertModel
import torch

# Fetch the pre-trained uncased BERT checkpoint: tokenizer first, then the encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sentence to embed
text = "I love natural language processing!"

# Split the sentence into tokens and wrap them with the [CLS] / [SEP] markers
tokens = ['[CLS]', *tokenizer.tokenize(text), '[SEP]']

# Look up vocabulary ids and add a leading batch dimension of size 1
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_tensor = torch.tensor(token_ids).unsqueeze(0)

# Forward pass with gradient tracking switched off
with torch.no_grad():
    outputs = model(token_tensor)

# First element of the model output: hidden states of the final layer
last_hidden_states = outputs[0]

# Position 0 holds the [CLS] token prepended above; take its vector for every batch row
cls_embedding = last_hidden_states[:, 0]

# cls_embedding is used here as the embedding of the whole input text
print(cls_embedding)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tensor([[ 1.0501e-01,  2.3739e-01, -1.9970e-02, -1.8423e-01, -2.7452e-01,
         -4.2344e-01,  2.7119e-01,  7.4514e-01,  7.1286e-03, -2.8330e-01,
         -1.8654e-02,  9.7414e-03,  1.7744e-01,  1.8370e-01,  2.9469e-01,
         -8.6571e-02, -1.3720e-01,  3.8465e-01,  1.8496e-01, -8.4644e-02,
         -7.5815e-02, -1.9936e-01, -1.5177e-03, -1.0480e-01,  2.5557e-01,
         -2.4288e-01,  1.3063e-01,  3.8064e-02,  1.7991e-01, -4.0952e-02,
          3.2467e-02,  1.2837e-01, -5.5229e-02,  3.3944e-02,  2.8307e-02,
         -1.0063e-01, -1.1154e-01, -1.7855e-01,  1.5756e-03, -1.1465e-02,
         -2.6244e-02, -6.8964e-02,  2.1842e-02,  1.9151e-03, -1.2272e-01,
         -1.9624e-01, -2.6947e+00, -1.5402e-01, -2.6869e-01, -1.2473e-01,
          3.4256e-01,  2.1459e-01,  2.1122e-01,  7.9208e-02,  5.3878e-02,
          2.1164e-01, -3.6505e-01,  5.7705e-01, -9.7463e-02,  1.3112e-01,
          2.2906e-01,  1.7993e-01, -2.1684e-01,  1.1254e-01, -1.9131e-02,
         -1.8134e-01, -1.7209e-01,  2.