In [None]:
from dotenv import load_dotenv
from threading import Thread
import openai
import google.generativeai as genai
import os
import asyncio
import time
import ast
import re
from datetime import datetime, timedelta
import calendar
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility, db
from transformers import LlamaTokenizerFast
load_dotenv('../.env')


In [None]:
# Load the fast Llama tokenizer from the "hf-internal-testing/llama-tokenizer" checkpoint.
# NOTE(review): `tokenizer` is not referenced anywhere in the visible part of this
# notebook — presumably intended for token counting; confirm before removing.
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")


In [None]:
# Number of worker threads the main loop starts in each 60-second cycle.
MAX_QUERIES_PER_MINUTE = 60
# Name of the Milvus database that is created when missing and then selected.
DATABASE_NAME = 'CUSTOM_DATASETS'
# Name of the Milvus collection that is created when missing and then loaded.
COLLECTION_NAME = 'calculator_v2_test'


In [None]:
# Few-shot prompt sent to Gemini (optionally followed by previously generated text)
# and also stored as the Milvus collection description.
# The datetime examples must be self-consistent because the model imitates them:
# 2024-02-01 is a Thursday in February and 2024-02-02 is a Friday.
SYSTEM_PROMPT = """
Generate a sample Multi-Turn conversation between a user and a voice assistant named JARVIS in which the voice assistant has the capability of using a calculator (for calculation), datetime (to see current day date time in `DAY YYYY-MM-DD HH:MM:SS` format). The assistant has a frank personality and is very helpful.
The calculator can perform these arithmetic operations.
1. Addition (+): adds two operands
2. Subtraction (-): subtracts two operands
3. Multiplication (*): multiplies two operands
4. Division (/): divides two operands
5. Modulus (%): returns the remainder when the first operand is divided by the second
6. Floor division (//): returns the quotient when the first operand is divided by the second
7. Exponent (**): returns the first operand raised to the power of the second operand
NOTE: `^` is not a valid operator for exponentiation. Use `**` instead.

NOTE: This calculator works on the BODMAS rule which means multiplication and division are performed before addition and subtraction.
Example Usage:
<calculator>(2+3)</calculator>

NOTE: If you need to use two or more operators in a single expression, every time you "must" apply brackets to specify the order of operations.
Precedence of operators:
1. Exponentiation (**)
2. Multiplication (*), Division (/), Floor division (//), and Modulus (%)
3. Addition (+) and Subtraction (-)
Example Usage:
<calculator>(2+(3*4))</calculator> Here, 3 and 4 will be multiplied first and then the result will be added to 2.

## Example use of `<calculator>` tag
<|user|>
Hey, buddy, I'm planning a road trip, and I want to calculate the total distance I'll be driving. The trip involves multiple stops, and I have the distances between each pair of stops. Can you help me find the total distance?</s>
<|assistant|>
Absolutely! I'd be happy to assist. Could you provide me with the distances between each pair of stops and the list of stops on your road trip?</s>
<|user|>
Sure, Home to Gas Station is 10 miles Gas Station to Mountain View is 30 miles Mountain View to Lakeside Park is 15 miles Lakeside Park to Beach Resort is 25 miles Beach Resort to Home is 40 miles.</s>
<|assistant|>
Great, <calculator>(10+30+15+25+40)<stop>120</calculator>the total distance is 120 miles. Is there anything else I can help you with?</s>
<|user|>
That's perfect. One more thing, what's the average speed I should maintain if I want to reach the Beach Resort in 2 hours?</s>
<|assistant|>
Let me calculate that for you. <calculator>(120/2)<stop>60</calculator>You should maintain an average speed of 60 miles per hour. Is there anything else I can help you with?</s>
<|user|>
That's all I need. Thanks for your help.</s>
<|assistant|>
You're welcome. Have a great trip! Drive Safe, and feel free to ask for any help.</s>

## Example use of `<datetime>` tag, Output of `<datetime>` tag should be in the format of `DAY YYYY-MM-DD HH:MM:SS`. eg. Thu 2024-02-01 18:17:04
<|user|>
Hey, what is the date today?</s>
<|assistant|>
Today is <datetime><stop>Thu 2024-02-01 18:17:04</datetime> first February.</s>
<|user|>
Hey, what is the time now?</s>
<|assistant|>
The time is <datetime><stop>Thu 2024-02-01 18:17:04</datetime> six seventeen PM.</s>
<|user|>
Can you tell me exact time including seconds?</s>
<|assistant|>
The time is <datetime><stop>Thu 2024-02-01 18:17:14</datetime> six seventeen and fourteen seconds PM.</s>
<|user|>
What time it will be from twenty two hours from now?</s>
<|assistant|>
The time will be <datetime><stop>Fri 2024-02-02 16:17:14</datetime> four seventeen and fourteen seconds PM.</s>

Generate more such conversations to train the assistant. 
- The main calculation should be surrounded by "<calculator>" and "</calculator>" which will not be shown to the user and is evaluated by a computer. 
- You should use the output of the calculator in your response, eg. <|user|>\nWhat is 2 + 2?\n<|assistant|>\n2 plus 2 is <calculator>(2+2)<stop>4</calculator>four.
- Assume this system is being used to do day-to-day tasks.
- Do not explain your calculation to the user.
- User query can't contain any kind of symbols such as (%, $,₹, .,), etc. Instead, it should use the corresponding word representation of the symbols eg. dollar for $ and percentage for %
- If the user query contains currency or a number, convert it into words eg. ₹100 -> one Hundred rupees, 3500 rupees -> thirty-five hundred rupees, ₹500 -> five hundred rupees, ₹.25 -> twenty-five paisa.
- Try to use currency as rupees
- Both User queries and Assistant response should end with "</s>"
- the assistant response should contain "<stop>" after the expression in between the opening and closing tags of "<calculator>" and "</calculator>". eg. ASSISTANT: The cost of a dozen eggs would be <calculator>(1*12)<stop>12</calculator>twelve rupees.</s>
- If the calculator gives a float value, round it accordingly. eg. calculator gives you 4.98 output, you can say "almost five" to the user.
- If you need to use some constants such as "pi" during calculations use its value 3.14.

Generate more such MT-conversations to train the assistant. You should use all available tags in your conversations integratively.
"""


## Function to get embeddings

In [None]:
# def get_embeding(text: str) -> tuple[list[float], str]:
#     __model_name = "text-embedding-ada-002"
#     return openai.embeddings.create(
#         model=__model_name,
#         input=[text]
#     ).data[0].embedding, __model_name


## Setting API Keys and Connecting to the Database

In [None]:
# openai.api_key = os.getenv('OPENAI_API_KEY')


In [None]:
# Configure the google.generativeai client with the key read from the
# GEMINI_API_KEY environment variable (os.getenv yields None when it is unset).
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))


In [None]:
# Register/open the Milvus connection under the alias "default".
# Host, port and credentials come from the MILVUS_* environment variables;
# any variable that is not set is passed through as None.
connections.connect(
    alias="default",
    host=os.getenv("MILVUS_HOST"),
    port=os.getenv("MILVUS_PORT"),
    user=os.getenv("MILVUS_USER"),
    password=os.getenv("MILVUS_PASSWORD"),
)


## Creating Milvus Database to store embeddings and text data

In [None]:
# Create the target database on first use, then switch the pymilvus `db`
# module over to it via `using_database`.
if DATABASE_NAME not in db.list_database():
    db.create_database(DATABASE_NAME)
db.using_database(DATABASE_NAME)


In [None]:
# Create the collection on first run, otherwise attach to the existing one,
# then load it and bind the handle to the module-level name `collection`.
if COLLECTION_NAME not in utility.list_collections():
    # Create collection
    schema = CollectionSchema(
        fields=[
            FieldSchema(
                name="conversation_id", 
                dtype=DataType.INT64, 
                is_primary=True, 
                auto_id=True, 
                description="Unique id for each conversation"
            ), 
            # FieldSchema(
            #     name="embeding", 
            #     dtype=DataType.FLOAT_VECTOR, 
            #     dim=len(get_embeding("Hello")[0]),
            #     description=f"Embedings generated using '{get_embeding('Hello')[1]}'"
            # ),
            FieldSchema(
                name="conversation",
                dtype=DataType.VARCHAR,
                max_length=2048,
                description="Conversation between user and assistant"
                )
            ], 
        enable_dynamic_field=True,
        description=SYSTEM_PROMPT
    )

    collection = Collection(name=COLLECTION_NAME, schema=schema)

    # The "embeding" vector field is currently commented out of the schema.
    # Indexing a field the schema does not declare raises *after* the collection
    # has already been created, so only build the index when the field exists.
    # NOTE(review): whether Milvus accepts/loads a collection with no vector
    # field at all depends on the server version — confirm against the deployment.
    if any(field.name == "embeding" for field in schema.fields):
        collection.create_index(
            field_name="embeding", 
            index_params={
                "index_type": "IVF_FLAT",
                "metric_type": "COSINE",
                "params": {"nlist": 128}
            }
        )

    print(f'Collection {COLLECTION_NAME} created successfully.')

else:
    collection = Collection(name=COLLECTION_NAME)

collection.load()
print(f'Collection {COLLECTION_NAME} loaded successfully.')


## Gemini Config

In [None]:
# Sampling settings handed to the Gemini model below: a single candidate per
# request, temperature 0.9, and at most 1024 output tokens.
generation_config = genai.types.GenerationConfig(
    temperature=0.9,
    max_output_tokens=1024,
    candidate_count=1,
    # stop_sequences=["<stop>"]
)

# "gemini-pro" model handle used by the generation cells; `safety_settings`
# is passed explicitly as None.
model = genai.GenerativeModel(
    generation_config=generation_config,
    safety_settings=None,
    model_name="gemini-pro",
)


## Test Cases

In [None]:
def assistant_response_endswith_eos_token(text: str):
    """Report whether any assistant turn in ``text`` is missing its ``</s>`` terminator.

    NOTE: the return value is a *failure* flag — ``True`` means the check failed.

    Args:
        text (`str`): Conversation using `<|user|>` / `<|assistant|>` turn markers.

    Returns:
        `bool`: ``True`` if at least one assistant response does not end with
        ``</s>`` (after stripping whitespace), ``False`` otherwise.
    """
    # Assistant turns that are followed by another user turn.
    turns = re.findall(r'<\|assistant\|>\n(.*?)<\|user\|>', text, re.DOTALL)
    # The final turn has no trailing user marker: take everything after the
    # last assistant marker (the whole text when the marker is absent).
    turns.append(text.split('<|assistant|>')[-1])

    for turn in turns:
        if not turn.strip().endswith('</s>'):
            return True
    return False


In [None]:
def stop_token_between_calculator_tags(text):
    """Report whether any single-line ``<calculator>...</calculator>`` span lacks exactly one ``<stop>``.

    NOTE: the return value is a *failure* flag — ``True`` means the check failed.
    Spans containing a newline are not matched (the pattern is applied without
    ``re.DOTALL``) and are therefore not inspected.

    Args:
        text (`str`): Generated conversation text.

    Returns:
        `bool`: ``True`` if some matched calculator span holds zero or more than
        one ``<stop>`` token, ``False`` otherwise (including when nothing matches).
    """
    for span in re.findall(r'<calculator>(.*?)</calculator>', text):
        if span.count('<stop>') != 1:
            return True
    return False


In [None]:
def number_of_open_calc_tags_equal_number_of_close_calc_tags(text):
    """Report whether ``<calculator>`` and ``</calculator>`` occur a different number of times.

    NOTE: the return value is a *failure* flag — ``True`` means the counts differ.

    Args:
        text (`str`): Generated conversation text.

    Returns:
        `bool`: ``True`` when the opening and closing tag counts are unequal.
    """
    opening = text.count('<calculator>')
    closing = text.count('</calculator>')
    return opening != closing


In [None]:
# def is_calculator_calculation_correct(text, debug=False):
#     expr = r'<calculator>(.*?)<stop>(.*?)</calculator>'
#     calculator_expressions = re.findall(expr, text)
#     if debug:
#         return [f'{x} -> {eval(x[0]) == eval(x[1])}' for x in calculator_expressions]
#     return not all([eval(x[0]) == eval(x[1]) for x in calculator_expressions])


In [None]:
def user_input_should_not_contain_special_token(text):
    """Report whether any user turn in ``text`` contains a reserved special token.

    NOTE: the return value is a *failure* flag — ``True`` means the check failed.

    A user turn is the text between ``<|user|>\\n`` and the next ``<|assistant|>``.
    Turns are expected to end with a single ``</s>`` terminator followed by a
    newline, so the pattern is matched with ``re.DOTALL`` and that one trailing
    terminator is removed before scanning; any other occurrence of a special
    token (including an extra ``</s>``) is a violation.

    Args:
        text (`str`): Conversation using `<|user|>` / `<|assistant|>` turn markers.

    Returns:
        `bool`: ``True`` if at least one user turn contains a special token,
        ``False`` otherwise (including when there are no user turns).
    """
    eos = '</s>'
    spl_tokens = ['<s>', '</s>', '<unk>', '<calculator>', '</calculator>', '<stop>']
    expr = r'<\|user\|>\n(.*?)<\|assistant\|>'

    # DOTALL is required: the user text and the assistant marker sit on
    # different lines, so without it the pattern never matches a valid turn.
    for user_input in re.findall(expr, text, re.DOTALL):
        content = user_input.strip()
        # The mandatory end-of-turn terminator is not a violation by itself.
        if content.endswith(eos):
            content = content[:-len(eos)]
        if any(token in content for token in spl_tokens):
            return True
    return False


In [None]:
def number_of_user_inputs_equal_number_of_assistant_responses(text):
    """Report whether ``<|user|>`` and ``<|assistant|>`` markers occur a different number of times.

    NOTE: the return value is a *failure* flag — ``True`` means the counts differ.

    Args:
        text (`str`): Generated conversation text.

    Returns:
        `bool`: ``True`` when the two marker counts are unequal.
    """
    user_turns = text.count('<|user|>')
    assistant_turns = text.count('<|assistant|>')
    return user_turns != assistant_turns


In [None]:
def test_response(text):
    """Run every format check on ``text`` and report whether any of them failed.

    Each individual check returns ``True`` on failure, so this returns ``True``
    when the conversation is malformed and ``False`` when all checks pass.
    Checks run in the listed order and evaluation stops at the first failure.

    Args:
        text (`str`): Generated conversation text.

    Returns:
        `bool`: ``True`` if at least one check failed.
    """
    checks = (
        assistant_response_endswith_eos_token,
        stop_token_between_calculator_tags,
        number_of_open_calc_tags_equal_number_of_close_calc_tags,
        user_input_should_not_contain_special_token,
        number_of_user_inputs_equal_number_of_assistant_responses,
    )
    return any(check(text) for check in checks)


## Generate Data

In [None]:
# AST node types the calculator accepts: numeric literals combined with the
# seven documented binary operators and unary plus/minus. Anything else
# (names, calls, attribute access, subscripts, ...) is rejected before evaluation.
_ALLOWED_CALC_NODES = (
    ast.Expression, ast.BinOp, ast.UnaryOp, ast.Constant,
    ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Mod, ast.FloorDiv, ast.Pow,
    ast.UAdd, ast.USub,
)


def calculator(expr: str) -> str:
    """Evaluates an arithmetic expression and returns the result.

    SECURITY: expressions come from model-generated text, so the parsed tree is
    validated against a whitelist of arithmetic nodes before it is evaluated,
    and evaluation runs with no builtins. Arbitrary Python (function calls,
    imports, attribute access) is reported as an invalid expression instead of
    being executed.

    Args:
        expr (`str`): The expression to be evaluated. Supported operators:
            `+`, `-`, `*`, `/`, `%`, `//`, `**`, unary `+`/`-` and parentheses.

    Returns:
        `str`: The result of the expression rounded to 2 decimal places.
        If the expression is invalid, it will return `Invalid expression: <error>`.

    Example:
        <calculator>(2+3)<stop>5</calculator>
    """
    try:
        parsed_expression = ast.parse(expr, mode='eval')

        for node in ast.walk(parsed_expression):
            if not isinstance(node, _ALLOWED_CALC_NODES):
                raise ValueError(f'unsupported syntax: {type(node).__name__}')
            # Only plain numbers are allowed as literals (no strings, bytes, bools, ...).
            if isinstance(node, ast.Constant) and type(node.value) not in (int, float):
                raise ValueError(f'unsupported literal: {node.value!r}')

        # The tree now contains arithmetic only; evaluate it without builtins.
        compiled = compile(parsed_expression, filename='<string>', mode='eval')
        result = eval(compiled, {'__builtins__': {}}, {})
        return str(round(result, 2))

    except Exception as e:
        return f'Invalid expression: {e}'

calculator('5/3)')


In [None]:
def get_DayDateTime():
    """Return the current local date and time as `DAY YYYY-MM-DD HH:MM:SS`.

    `DAY` is the abbreviated weekday name produced by the `%a` format code.

    Returns:
        `str`: The formatted timestamp, or `ERROR: <error>` if formatting raises.

    Example:
    <datetime><stop>Thu 2024-02-01 18:17:04</datetime>
    """
    try:
        now = datetime.now()
        return f"{now:%a %Y-%m-%d %H:%M:%S}"

    except Exception as e:
        return f'ERROR: {e}'

get_DayDateTime()


In [None]:
def deltatime(start_time: str, end_time: str):
    """Return the absolute gap between two `HH:MM:SS` clock times as text.

    The order of the arguments does not matter: the earlier time is always
    subtracted from the later one, so the result is never negative.

    Args:
        start_time (`str`): First time in HH:MM:SS format.
        end_time (`str`): Second time in HH:MM:SS format.

    Returns:
        `str`: `x hours y minutes z seconds`, or `Invalid Time: <error>` when
        either argument cannot be parsed.

    Example:
    >>> deltatime('18:03:18', '20:02:33')
    '1 hours 59 minutes 15 seconds'
    """
    try:
        # Parse in argument order so a bad first argument is reported first.
        first, second = (
            datetime.strptime(stamp, "%H:%M:%S") for stamp in (start_time, end_time)
        )

        # abs() on the timedelta is equivalent to swapping the operands.
        gap = abs(second - first)

        whole_minutes, seconds = divmod(gap.seconds, 60)
        hours, minutes = divmod(whole_minutes, 60)
        return f'{hours} hours {minutes} minutes {seconds} seconds'

    except Exception as e:
        return f'Invalid Time: {e}'
    
deltatime('20:2:33', '18:03:18')
# deltatime('18:03:18', '20:02:33')


In [None]:
def _shift_months(moment, months):
    """Return `moment` moved forward by `months` calendar months, clamping the day to the target month's length."""
    month_index = moment.year * 12 + (moment.month - 1) + months
    year, zero_based_month = divmod(month_index, 12)
    month = zero_based_month + 1
    last_day = calendar.monthrange(year, month)[1]
    return moment.replace(year=year, month=month, day=min(moment.day, last_day))


def deltadate(start_date: str, end_date: str):
    """Compute the calendar difference between two date strings in YYYY-MM-DD format and return the result in x years, x months, x days format.

    The difference is calendar-aware: it counts the whole months that fit
    between the two dates (respecting real month lengths and leap years) and
    reports the leftover days, so months is always in 0..11.

    `NOTE`: If the start date is greater than the end date, it will swap the start and end date, so the result will always be positive.
    
    Args:
        start_date (`str`): Start date in YYYY-MM-DD format.
        end_date (`str`): End date in YYYY-MM-DD format.

    Returns:
        `str`: Date difference in x years, x months, x days format, or
        `Invalid Date: <error>` when either argument cannot be parsed.
    
    Example:
    >>> deltadate('2021-09-01', '2021-09-09')
    '0 years, 0 months, 8 days'
    >>> deltadate('2020-01-01', '2021-01-01')
    '1 years, 0 months, 0 days'
    """
    try:
        # Parse input date strings
        start_date = datetime.strptime(start_date, "%Y-%m-%d")
        end_date = datetime.strptime(end_date, "%Y-%m-%d")

        if start_date > end_date:
            start_date, end_date = end_date, start_date

        # Upper bound on the number of whole months between the two dates.
        whole_months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
        # If stepping that many months overshoots the end date, the last month is incomplete.
        if _shift_months(start_date, whole_months) > end_date:
            whole_months -= 1

        # Remaining days are counted from the last whole-month boundary.
        anchor = _shift_months(start_date, whole_months)
        years, months = divmod(whole_months, 12)
        days = (end_date - anchor).days

        # Format the result as x years, x months, x days
        return f'{years} years, {months} months, {days} days'

    except Exception as e:
        return f'Invalid Date: {e}'
# deltadate('2021-09-01', '2090-09-09')
deltadate('2021-09-09', '2021-09-01')


In [None]:
def get_day_for_date(date_string: str):
    """Return the full weekday name for a date given as YYYY-MM-DD.

    Args:
        date_string (`str`): The date in YYYY-MM-DD format.

    Returns:
        `str`: The weekday name (the `%A` format code), or `ERROR: <error>`
        when the string cannot be parsed.

    Example:
    >>> get_day_for_date('2024-02-02')
    'Friday'
    """
    try:
        parsed = datetime.strptime(date_string, "%Y-%m-%d")
    except Exception as e:
        return f'ERROR: {e}'

    # The format spec is forwarded to strftime.
    return f"{parsed:%A}"

get_day_for_date('2024-02-03')


In [None]:
def _ordinal_suffix(number):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for a positive integer."""
    # 11th, 12th and 13th (and the rest of the teens) always take 'th'.
    if 10 <= number % 100 <= 20:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')


def get_all_dates_for_a_day_in_a_given_month_and_year(day: str, month: str, year: int):
    """Get all the dates for a given day in a month and year.

    Args:
        day (`str`): The day of the week. Day should be one of the following: [Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday].
        month (`str`): The month of the year. Month should be one of the following: [January, February, March, April, May, June, July, August, September, October, November, December].
        year (`int`): The year.

    Returns:
        `dict`: Maps the occurrence label (`1st Friday`, `2nd Friday`, ...) to
        the spoken-style date (`2nd February 2024`), in chronological order.
        If any argument is invalid, the string `ERROR: <error>` is returned instead.
    
    Example:
    >>> get_all_dates_for_a_day_in_a_given_month_and_year('Friday', 'February', 2024)
    {'1st Friday': '2nd February 2024', '2nd Friday': '9th February 2024', '3rd Friday': '16th February 2024', '4th Friday': '23rd February 2024'}
    """
    try:
        # Create a dictionary to store the result
        result = {}

        # Convert month name to its corresponding number
        month_number = list(calendar.month_name).index(month)

        # Find the day of the week for the 1st day of the given month and year
        first_day_of_month = datetime(year, month_number, 1)
        day_of_week = first_day_of_month.weekday()

        # Calculate the offset to get to the desired day
        days_to_add = (list(calendar.day_name).index(day) - day_of_week + 7) % 7

        # Generate dates for the specified day in the given month and year
        current_date = first_day_of_month + timedelta(days=days_to_add)
        nth_day = 1
        while current_date.month == month_number:
            day_of_month = current_date.day
            # Use the real ordinal of the day of month instead of a fixed 'th'
            # (which produced values such as '02th' or '23th').
            spoken_date = f'{day_of_month}{_ordinal_suffix(day_of_month)} {current_date.strftime("%B %Y")}'
            result[f'{nth_day}{_ordinal_suffix(nth_day)} {day}'] = spoken_date
            current_date += timedelta(weeks=1)
            nth_day += 1

        return result

    except Exception as e:
        return f'ERROR: {e}'

get_all_dates_for_a_day_in_a_given_month_and_year('Saturday', 'March', 2024)


In [None]:
# def generate_response_gemini(text: str = ''):
#     response = text + model.generate_content(SYSTEM_PROMPT + text).text.strip()
#     if not response.endswith('</s>'):
#         # print(response)
#         response = f'{response}<stop>{calculator(response[response.rfind('<calculator>') + len('<calculator>'):])}</calculator>'
#         return generate_response_gemini(response)
#     return response


In [None]:
def generate_content(prev_text: str = ''):
    """Ask the Gemini model to continue a conversation and print the outcome.

    The request text is `SYSTEM_PROMPT` followed by `prev_text`. On success the
    stripped completion is printed, prefixed with `prev_text`. Any exception is
    caught and printed instead of propagating. Nothing is returned.

    Args:
        prev_text (`str`): Conversation text generated so far; defaults to empty.
    """
    try:
        prompt = SYSTEM_PROMPT + prev_text
        completion = model.generate_content(prompt)
        response = completion.text.strip()
        # if not response.endswith('</s>'):
        #     response = f'{response}<stop>{round(eval(response[response.rfind("<calculator>") + len("<calculator>"):]), 2)}</calculator>'
        #     return generate_content(response)

        # if not test_response(response):
        print(prev_text + response)

    except Exception as e:
        print(e)

generate_content()


In [None]:
# One-off request: send the bare system prompt to the model and keep the raw
# response object in `response` for inspection.
response = model.generate_content(SYSTEM_PROMPT)


In [None]:
# Show the text of the response held in `response`.
print(response.text)


In [None]:
if __name__ == '__main__':
    # Endless generation loop: each cycle starts MAX_QUERIES_PER_MINUTE worker
    # threads, waits 60 seconds, then joins them before starting the next batch.
    # NOTE(review): `insert_data_point` is not defined in the visible part of
    # this notebook — confirm it exists before running this cell.
    while True:
        threads = []
        for _ in range(MAX_QUERIES_PER_MINUTE):
            threads.append(Thread(target=insert_data_point))

        for thread in threads:
            thread.start()

        time.sleep(60)

        for thread in threads:
            thread.join()
