In [23]:
import aiohttp
import asyncio
import pandas as pd
import json
import numpy as np
import plotly.express as px
import time
import re
import random

# --- Benchmark configuration -------------------------------------------------
# Plain-text corpus that prompt excerpts are sampled from.
file_path = "/Users/velo1/SynologyDrive/GIT_syno/Mac/Netology/NLP/data/book-war-and-peace.txt"
# Prompt size in (approximate) tokens; extract_random_sequence multiplies it by
# approx_chars_per_token to get a character count.
seq_len = 5000
approx_chars_per_token = 3
# Completion budget sent with every request (kept as a string here).
max_tokens = '200'
# Accumulates one result row per request across all experiments.
res = []

# Load the whole corpus once; every request slices its excerpt from this string.
with open(file_path, encoding='utf-8') as file:
    long_text = file.read()

def extract_random_sequence(text, seq_length, chars_per_token=None):
    '''Return a random, whitespace-stripped excerpt of roughly `seq_length` tokens.

    The token count is converted to a character count with a fixed
    chars-per-token factor, and a window of that many characters is cut
    from a uniformly random position in `text`.

    Args:
        text: Source string to sample from.
        seq_length: Desired excerpt length in (approximate) tokens.
        chars_per_token: Characters assumed per token. Defaults to the
            module-level `approx_chars_per_token`.

    Returns:
        The excerpt with leading/trailing whitespace removed. If `text` is
        shorter than the requested window, the whole text (stripped) is
        returned instead of raising.
    '''
    if chars_per_token is None:
        chars_per_token = approx_chars_per_token
    window = seq_length * chars_per_token
    # Clamp so randint never receives an empty range (len(text) < window
    # previously raised ValueError).
    last_start = max(0, len(text) - window)
    start_index = random.randint(0, last_start)
    return text[start_index:start_index + window].strip()


async def measure_llm_response_time(context, session, concurency):
    '''Send one chat-completion request and time the full round trip.

    Args:
        context: Text excerpt embedded in the prompt.
        session: Open aiohttp client session used to POST the request.
        concurency: Concurrency level of the current batch; copied into the
            result row unchanged.

    Returns:
        dict with the keys `concurency`, `context_length` (the module-level
        `seq_len`, not a measured length), `max_tokens` (the module-level
        value as configured), `total_response_time` (seconds from sending
        the request until the JSON body is fully read), and
        `completion_tokens` / `prompt_tokens` taken from the response's
        `usage` field.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        KeyError: If the response JSON has no `usage` section.
    '''
    data = {
        "model": "/model-store/mistralai/Mistral-7B-Instruct-v0.1",
        "messages": [
            {"content": "<s>[INST]You are a helpful assistant. Analyze the context and answer users question.[/INST]", "role": "system"},
            {"content": f"The context: {context}</s>", "role": "assistant"},
            {"content": "Try to count all the unique names in the context. What are these names and their count? Try to do your best and make your answer as detailed as possible", "role": "user"}
        ],
        # OpenAI-style chat-completion endpoints document max_tokens as an
        # integer; the module constant is a string, so convert before sending.
        "max_tokens": int(max_tokens),
    }
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json'
    }

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments during a long request.
    start_time = time.perf_counter()
    async with session.post('http://194.135.112.219:3003/v1/chat/completions', json=data, headers=headers) as response:
        # Fail loudly on HTTP errors instead of a later KeyError on 'usage'.
        response.raise_for_status()
        response_data = await response.json()
    total_response_time = time.perf_counter() - start_time

    usage_data = response_data['usage']
    return {
        'concurency': concurency,
        'context_length': seq_len,
        'max_tokens': max_tokens,
        'total_response_time': total_response_time,
        'completion_tokens': usage_data['completion_tokens'],
        'prompt_tokens': usage_data['prompt_tokens'],
    }

async def run_experiment(full_text, concurrency):
    '''Launch one batch of simultaneous requests and return their result rows.

    Each request gets its own random excerpt of `full_text` (sized by the
    module-level `seq_len`). All requests share a single client session and
    are awaited together.

    Args:
        full_text: Corpus the excerpts are sampled from.
        concurrency: Number of requests started per grid entry.

    Returns:
        list of the dicts produced by measure_llm_response_time.
    '''
    base_power = np.log10(200)
    # With num=1 the grid holds a single entry, so the sweep below makes one
    # pass; the entry's value itself is not used (excerpts use seq_len).
    length_grid = np.logspace(base_power, base_power + 1, num=1, dtype=int)

    async with aiohttp.ClientSession() as session:
        pending = [
            measure_llm_response_time(
                # Fresh random excerpt per request.
                extract_random_sequence(full_text, seq_len),
                session,
                concurrency,
            )
            for _ in length_grid
            for _ in range(concurrency)
        ]
        responses = await asyncio.gather(*pending)

    return list(responses)

async def main():
    '''Run the benchmark for every max_tokens / concurrency combination.

    Result rows from each experiment are appended to the module-level `res`
    list. Progress is printed to stdout.
    '''
    # The request builder reads the module-level max_tokens. Without this
    # declaration the loop variable below would be local to main(), and
    # any additional values added to the sweep would be printed but never sent.
    global max_tokens
    for max_tokens in ['200']:  # Add other values as needed
        print(f"Max Tokens: {max_tokens}{'*'*50}")
        for concurrency in [8, 16, 32, 64, 128]:  # Add other concurrency levels as needed
            print(f"Testing concurrency: {concurrency}")
            # Each experiment draws its own random excerpts from `long_text`.
            res.extend(await run_experiment(long_text, concurrency))




# Top-level await: relies on an environment that supports it (this looks like
# a notebook cell). Fills the module-level `res` list with result rows.
await main()

# Create a DataFrame from the results (one row per request)
df = pd.DataFrame(res)

# Display the dataframe, then persist it as CSV without the index column
print(df)
df.to_csv('/Users/velo1/SynologyDrive/GIT_syno/Mac/Netology/NLP/LLM_batching_3090.csv', index=False)



Max Tokens: 200**************************************************
Testing concurrency: 8
Testing concurrency: 16
Testing concurrency: 32
Testing concurrency: 64
Testing concurrency: 128
     concurency  context_length max_tokens  total_response_time  \
0             8            5000        200            15.645329   
1             8            5000        200            15.643634   
2             8            5000        200            15.641177   
3             8            5000        200            15.638423   
4             8            5000        200            15.638022   
..          ...             ...        ...                  ...   
243         128            5000        200           216.836032   
244         128            5000        200           212.989813   
245         128            5000        200           216.835996   
246         128            5000        200           216.834008   
247         128            5000        200           216.835090   

     comp