In [1]:
#!pip install langgraph
#!pip3  install torch torchvision torchaudio transformers
#!pip3 install packaging ninja
#!pip3 install accelerate
#!pip3 install protobuf
#!pip3 install sentencepiece
#!pip3 install bitsandbytes
#!pip3 install scipy

from transformers import pipeline
import torch, os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer, LlamaForCausalLM, MistralForCausalLM
import random, json
import inspect
import json
from typing import Dict, Any, Optional, Callable, List

class Agent:
    def __init__(self, model_name='Qwen/Qwen2.5-Coder-7B-Instruct',
                agent_name='dummy_model', message=''):
        # Load Qwen model and tokenizer            
        bnb_config = BitsAndBytesConfig(
            torch_dtype="auto",
            device_map="auto",
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quantw_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,  # Changed from bfloat16 to float16
            bnb_4bit_quant_storage=torch.uint8,    # Added for storage optimization
            use_nested_quant=True,                 # Added for nested quantization
        )
        max_length=8500  # Total tokens (input + output)
        max_new_tokens=500  # Limit output tokens
        save_directory = model_name.replace('/','_')+'_saved_response'
        try:
            
            print('Trying to load the mode:',save_directory,'from local repo')
            #self.model = AutoModelForCausalLM.from_pretrained(save_directory)
            #self.tokenizer = AutoTokenizer.from_pretrained(save_directory)
            self.model_pipeline = pipeline(
                "text-generation", 
                model=save_directory,
                model_kwargs={
                    "quantization_config": bnb_config,
                    "torch_dtype": torch.float16,
                },
                max_length=max_length,
                max_new_tokens=max_new_tokens
            )
        except:  
            print('The model:',model_name,'is not found locally, downloading it')
            #self.model = AutoModelForCausalLM.from_pretrained(
            #    model_name, quantization_config=bnb_config, token="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"
            #)
            #self.tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
            self.model_pipeline = pipeline(
                "text-generation", 
                model=model_name,
                model_kwargs={
                    "quantization_config": bnb_config,
                    "torch_dtype": torch.float16,
                },
                max_length=max_length,
                max_new_tokens=max_new_tokens,
                token="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"
            )
            print("Saving the model:",model_name," locally")
            #self.model.save_pretrained(save_directory)
            #self.tokenizer.save_pretrained(save_directory)
            self.model_pipeline.model.save_pretrained(save_directory)
            self.model_pipeline.tokenizer.save_pretrained(save_directory)
            
            
        self.agent_name = agent_name
        self.model_name = model_name
        if not message:
            message = "You are a helpful AI assistant. Maintain context and be concise.\n\n"
        self.message = message
        if 'nstruct' in model_name:
                self.messages = [dict({"role": "system", "content": self.message})]
        else:
            self.messages = ""
        self.instruct_history = 10
        self.instruct_history_counter = 0
        self.tools = []
        
    def clear_response(self,messages, response_string):
        #print('agent_name',agent_name)
        if all(keyword in self.model_name for keyword in ['Qwen','Instruct']):
            #print('//////////////',response_string,'\n','//////////')
            return response_string.replace('ssistant.','%').split('ssistant\n')[1]
        if all(keyword in self.model_name for keyword in ['falcon','instruct']): 
            return response_string.split('ssistant:')[1].split('User')[0]
        if all(keyword in self.model_name for keyword in ['lama','nstruct']):
            return response_string.split('ssistant\n')[1].split('User')[0]
        if all(keyword in self.model_name for keyword in ['mistralai','nstruct']):
            return response_string[len(messages[0]['content'])+len(messages[1]['content'])+2:]
        if all(keyword in self.model_name for keyword in ['OpenHermes','OpenHermes']):
            return response_string.split('ssistant\n')[1].split('User')[0]
    def create_system_prompt(self,prompt, extras=[]):
            if 'nstruct'in self.model_name:
                return self.instruct_create_system_prompt(prompt)
            else:
                return self.llm_create_system_prompt(prompt)

    def instruct_create_system_prompt(self,prompt, extras=[]):  ### for instruct models
        
        self.messages.append({
            "role": "user", 
            "content": prompt
        })
        self.messages = self.messages + extras + self.tools
        for msg in self.messages[-self.instruct_history:]:  # Limit context to last 5 messages
            prompt += f"{msg['role'].capitalize()}: {msg['content']}\n"
        prompt += "Assistant:"
        return prompt
            
    def llm_create_system_prompt(self,prompt):
        return self.system_messages+'\n'+prompt
    
        
    def get_tool_schema(self,func: Callable) -> dict:
        """
        Generate a JSON schema for a tool function.

        Args:
            func (Callable): The function to generate a schema for.

        Returns:
            dict: A JSON schema representing the function's parameters.
        """
        import inspect
        signature = inspect.signature(func)
        parameters = {}

        for name, param in signature.parameters.items():
            parameters[name] = {
                "type": "string",  # Assume string type for simplicity
                "description": f"Parameter {name}"
            }

        return {
            "type": "function",
            "function": {
                "name": func.__name__,
                "description": func.__doc__.split('\n')[0] if func.__doc__ else "",
                "parameters": {
                    "type": "object",
                    "properties": parameters
                }
            }
        }
    def generate_response(self,prompt):
        if 'nstruct' in self.model_name:
            return self.instruct_generate_response(prompt)
        else:
            return self.llm_generate_response(prompt)
        
    def instruct_generate_response(self,prompt):  #### given we are using instruct model and it is  tools compatible (if tools are stated)
        
        # Generate response
        messages =  self.create_system_prompt(prompt)
        schema_tools = []
        response = self.model_pipeline(messages)[0]['generated_text']
        self.messages.append({
            "role": "assistant", 
            "content": response.split('ssistant:')[-1]
        })
        
        return response.split('ssistant:')[-1]
    
    
        for tool in self.tools:
            schema_tools.append(self.get_tool_schema(tool))
        text = self.tokenizer.apply_chat_template(
            messages,
            tools= schema_tools,
            tokenize=False,
            add_generation_prompt=True
        )

        # Generate response
        print(" I am here 2 ")
        inputs = self.tokenizer(text, return_tensors="pt", return_attention_mask=True).to(self.model.device)
        generated_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=384,
            #pad_token_id=self.model.config.eos_token_id
        )

        # Decode response
        response = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return response
    
    def llm_generate_response(self, prompt): # if the model is not instruct
        # Prepare input
        
        messages = self.create_system_prompt(prompt)
        text = messages
        print('llm result: ')
        response = self.model_pipeline(prompt)[0]['generated_text']
        return response[len(prompt):].strip()
        # Generate response
        inputs = self.tokenizer(question, return_tensors="pt").to(self.model.device)
        generated_ids = self.model.generate(
            #input_ids=inputs["input_ids"],
            **inputs,
            max_new_tokens=100, 
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=self.model.config.eos_token_id
        )

        # Decode response
        response = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return response[len(text):].strip()

In [2]:
agent1 = agent2 = None

In [3]:
def agentthis(prompt="",message=""):
    global agent1,agent2
    READER_MODEL_NAME1 = "Qwen/Qwen2.5-Coder-7B-Instruct"
    READER_MODEL_NAME2 = "tiiuae/falcon-7b-instruct"
    READER_MODEL_NAME3 = 'teknium/OpenHermes-2.5-Mistral-7B'
    READER_MODEL_NAME4 = 'meta-llama/Llama-3.2-3B-Instruct'
    READER_MODEL_NAME5 = "mistralai/Mistral-7B-Instruct-v0.3"
    READER_MODEL_NAME6 = "meta-llama/Llama-3.1-8B"
    READER_MODEL_NAME7 = "meta-llama/Llama-3.1-8B-Instruct"
    READER_MODEL_NAME8 = "meta-llama/Meta-Llama-3.1-8b-Instruct"
    READER_MODEL_NAME9 = "meta-llama/Llama-3.2-1B-Instruct"
    READER_MODEL_NAME10 = "mistralai/Mixtral-8x22B-Instruct-v0.1"
    
    
    if agent1 == None:
        agent1 = Agent(READER_MODEL_NAME10,'agent1', message)
    if not message:
        agent1.message ="You are a helpful AI assistant. Maintain context and be concise.\n\n"
    else:
            agent1.message = message
    if not prompt:
        prompt = 'rephrase this in a precise brief way: I tried many times and it was very unsuccessfull to reach \
     the disired results. this is out of my hand and I cannod to anything'
    agent1_response = agent1.generate_response(prompt)
    return agent1_response, agent1

In [4]:
if __name__ == "__main__":
    response,agent = agentthis("how many continents in the world ? Name them only ", "You are a helpful AI assistant.Your name is Assisto. Maintain context and be concise.\n\n")
    print(response)

Unused kwargs: ['torch_dtype', 'device_map', 'bnb_4bit_quantw_type', 'use_nested_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Trying to load the mode: mistralai_Mixtral-8x22B-Instruct-v0.1_saved_response from local repo
The model: mistralai/Mixtral-8x22B-Instruct-v0.1 is not found locally, downloading it


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/59 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 

In [None]:
agent.messages

In [None]:
response, _ = agentthis("my name Moataz, what is your name ?")
print(response)

In [None]:
response, _ = agentthis("do you remember my name ?")
print(response)

In [None]:
response, _ = agentthis("What is your name ?")
print(response)