Merge pull request #4 from NavinKumarMNK/mnk-models-support
[Model Support] Now supports 4 models [Nous-Capybara-34B, Qwen-32B, Mistral-7B, C4AI-35B].
NavinKumarMNK committed Apr 11, 2024
2 parents a13330e + c65b4e4 commit aff8dff
Showing 27 changed files with 417 additions and 444 deletions.
7 changes: 6 additions & 1 deletion .readthedocs.yaml
@@ -4,7 +4,12 @@ version: 2
mkdocs:
configuration: mkdocs.yml

build:
os: ubuntu-22.04
tools:
python: "3.10"

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
- requirements: docs/requirements.txt
4 changes: 2 additions & 2 deletions README.md
@@ -1,10 +1,10 @@
# AI-Learning Platform

## Documentation
## Documentation [![Documentation Status](https://readthedocs.org/projects/ai-learning-platform/badge/?version=latest)](https://ai-learning-platform.readthedocs.io/en/latest/?badge=latest)
- This project uses `mkdocs` as the documentation service
- Serve the documentation:

```bash
pip install mkdocs
mkdocs serve
```
```
2 changes: 2 additions & 0 deletions ml_service/.gitattributes
@@ -0,0 +1,2 @@
*.html binary
*.ipynb binary
4 changes: 3 additions & 1 deletion ml_service/.gitignore
@@ -6,4 +6,6 @@ __pycache__
.env
cudnn.tar.xz
*.tar.gz
core.*
core.*
.env
.history
2 changes: 2 additions & 0 deletions ml_service/Dockerfile
@@ -10,4 +10,6 @@ RUN conda install -y rocketce::onnxruntime
# COPY . /app
ENV RAY_DEDUP_LOGS=1

RUN printenv | awk -F= '{print "export " $1 "=\"" $2 "\""}' >> ~/.bashrc

WORKDIR /app/
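The new `printenv` line persists the image's environment variables into `~/.bashrc`, so later interactive shells (for example via `ray attach`) see the same values. As a hedged illustration, here is the transformation it applies, sketched in Python — `RAY_DEDUP_LOGS` comes from this diff, while `ROOT_PATH=/app` is only an assumed example:

```python
# Sketch of the Dockerfile's `printenv | awk` transform: every NAME=value
# pair becomes an `export NAME="value"` line appended to ~/.bashrc.
# Note the awk version splits on '=', so a value containing a second '='
# would be truncated; this sketch keeps the whole value.
for line in ["RAY_DEDUP_LOGS=1", "ROOT_PATH=/app"]:  # ROOT_PATH is illustrative
    name, _, value = line.partition("=")
    print(f'export {name}="{value}"')
# -> export RAY_DEDUP_LOGS="1"
#    export ROOT_PATH="/app"
```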
40 changes: 20 additions & 20 deletions ml_service/Makefile
@@ -29,53 +29,53 @@ run-vllm-ray:
docker run -it --runtime=nvidia --gpus all -v /data/navin-kumar-m:/data --ipc=host --privileged ray-vllm

ray-up:
ray up -y config/ray-cluster.yaml --no-config-cache -v
ray up -y config/ray/ray-cluster.yaml --no-config-cache -v

ray-down:
ray down -y config/ray-cluster.yaml
ray down -y config/ray/ray-cluster.yaml

ray-attach:
ray attach config/ray-cluster.yaml
ray attach config/ray/ray-cluster.yaml

RAY_BASH_CODE ?= 'python -c "import ray; ray.init()"' # Default Python code
ray-exec:
ray exec config/ray-cluster.yaml $(RAY_BASH_CODE)
ray exec config/ray/ray-cluster.yaml $(RAY_BASH_CODE)

ray-serve-deploy:
serve deploy config/ray-serve.yaml -v
serve deploy config/ray/ray-serve.yaml -v

ray-serve-run:
serve run config/ray-serve.yaml
serve run config/ray/ray-serve.yaml

ray-serve-status:
serve status

ray-log:
ray logs config/ray-cluster.yaml
ray logs config/ray/ray-cluster.yaml

ray-rsync-up:
ray rsync-up config/ray-cluster.yaml -v
ray rsync-up config/ray/ray-cluster.yaml -v

ray-rsync-down:
ray rsync-down config/ray-cluster.yaml
ray rsync-down config/ray/ray-cluster.yaml

ray-up-dev:
ray up -y config/ray-cluster.dev.yaml --no-config-cache -v
ray up -y config/ray/ray-cluster.dev.yaml --no-config-cache -v

ray-down-dev:
ray down -y config/ray-cluster.dev.yaml
ray down -y config/ray/ray-cluster.dev.yaml

ray-attach-dev:
ray attach config/ray-cluster.dev.yaml
ray attach config/ray/ray-cluster.dev.yaml


ray-dev-ssh-keygen:
if [ -z "$(IP)" ]; then echo "IP is not set"; exit 1; fi
if [ -z "$(PORT)" ]; then echo "PORT is not set"; exit 1; fi
ssh-keygen -f "/root/.ssh/known_hosts" -R "[$(IP)]:$(PORT)"
ray-dev-ssh-rmkey:
ssh-keygen -f "$(KH_PATH)" -R "[$(IP)]:$(PORT)"

ray-dev-ssh:
if [ -z "$(IP)" ]; then echo "IP is not set"; exit 1; fi
if [ -z "$(PORT)" ]; then echo "PORT is not set"; exit 1; fi

ssh -p $(PORT) root@$(IP)

rm-pycache:
find . -type d -name __pycache__ -exec rm -r {} +

llm_chat:
python3 ./test/llm_client_http.py
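For context, the `ray-exec` target's default `RAY_BASH_CODE` is the one-liner `python -c "import ray; ray.init()"` — in effect a connectivity smoke test against the cluster brought up from the YAML config. Expanded, with an extra illustrative print that is not part of the default:

```python
# The Makefile's default RAY_BASH_CODE, expanded: attach to the running
# Ray cluster (or start a local instance if none is found).
import ray

ray.init()
print(ray.cluster_resources())  # illustrative addition: list what the cluster exposes
```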
43 changes: 16 additions & 27 deletions ml_service/README.md
@@ -1,40 +1,29 @@
# AI Learning Management System

- The code is written to run on a local Linux PC in the lab.

## Plan
- [ ] Full website of Video Tutorials
- [ ] Deployed Website
- [ ] Deployed LLM Model
- [ ] LLM Model SFT, DPO
- [ ] Dataset
- [ ] PDF parsing
- [ ] Speech-to-text (transcript)
- [ ] LLM chat integration with website

## Stack
### ML
- Python
- PyTorch
- Ray

### Backend
- Go
- PostgreSQL

### Frontend
- React.js

## Setup
> **Note:** Look at the `Makefile` for available commands.
Fill in the configuration in `.env`:
```env
ROOT_PATH=
```

For development
- Before building the image, download the cuDNN archive and place it in the main directory; this avoids authentication when downloading the library during the build.
- Rename the `*.tar.xz` archive to `cudnn.tar.xz` so the Dockerfile can pick it up while building the image.

```bash
docker build . -t <image_name>
```

Run the container
```bash
docker run -it --runtime=nvidia --gpus all --ipc=host --privileged llm_serve
```
docker run -it --runtime=nvidia --gpus all --ipc=host --privileged ml_service
```

Setup Using Ray:
```bash
make ray-up          # starts the cluster (use the -dev targets for a dev environment)
make ray-attach      # attach to the container shell
make ray-serve-run   # start the Ray deployments
```
15 changes: 9 additions & 6 deletions ml_service/config.yaml
@@ -2,19 +2,22 @@ app:
name: vit-ray

llm:
model_name: C4AI-35B # supported = [Nous-Capybara-34B, Qwen-32B, Mistral-7B, C4AI-35B]
time_consecutive_res: 0.5
serve_config:
model: /data/c4ai-35b # name / path
download_dir: null # path to model
load_format: safetensors # format of model {auto, pt, dummy, safetensors}
model: /data/c4ai-35b # supported - [/data/nous-34b, /data/qwen-32b, /data/mistral-7b, /data/c4ai-35b]
download_dir: null # download model dir
load_format: auto #safetensors # format of model {auto, pt, dummy, safetensors}
dtype: float16 # data type {auto, float32, float16, bfloat16}
max_model_len: 8192 # max length of model
max_model_len: 16384 # max length of model
worker_use_ray: false # use ray for worker
engine_use_ray: false # use ray for engine
pipeline_parallel_size: 1 # size of pipeline parallel
# pipeline_parallel_size: 1 # size of pipeline parallel
tensor_parallel_size: 4 # size of tensor parallel
# gpu_memory_utilization: 0.95 # gpu memory utilization
# gpu_memory_utilization: 0.9 # gpu memory utilization
enforce_eager: true
disable_custom_all_reduce: True
# trust_remote_code: true # for cohere models comment it

emb:
serve_config:
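The `model_name` key now selects one of the four per-model prompt-format files added under `ml_service/config/model/`. A minimal sketch of how that lookup might work — the lower-casing convention that maps `C4AI-35B` to `c4ai-35b.yaml` is an assumption, since the repo's actual loader is not part of this diff:

```python
# Sketch (assumption): resolve llm.model_name from config.yaml to its
# prompt-format file under config/model/. File names follow the files
# added in this PR; the project's real loader may differ.
import os
import yaml  # PyYAML

with open("config.yaml") as fh:
    cfg = yaml.safe_load(fh)

model_name = cfg["llm"]["model_name"]  # e.g. "C4AI-35B"
model_cfg = os.path.join("config", "model", f"{model_name.lower()}.yaml")
with open(model_cfg) as fh:
    prompt_format = yaml.safe_load(fh)["prompt_format"]
```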
40 changes: 40 additions & 0 deletions ml_service/config/model/c4ai-35b.yaml
@@ -0,0 +1,40 @@
prompt_format:
system: "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
# System Preamble
## Basic Rules
You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
# User Preamble
## Task and Context
You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
## Style Guide
Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.
## Available Tools
Here is a list of tools that you have available to you:
```python
def internet_search(query: str) -> List[Dict]:
    '''Returns a list of relevant document snippets for a textual query retrieved from the internet
    Args:
        query (str): Query to search the internet with
    '''
    pass
```
```python
def directly_answer() -> List[Dict]:
    '''Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history
    '''
    pass
```<|END_OF_TURN_TOKEN|>"
assistant: "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{instruction}"
trailing_assistant: ""
user: "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{instruction}<|END_OF_TURN_TOKEN|>"
system_in_user: false
accept_sys_from_req: false
recursive_sys_prompt: true
10 changes: 10 additions & 0 deletions ml_service/config/model/mistral-7b.yaml
@@ -0,0 +1,10 @@
prompt_format:
system: "Your name is MegAcad and you are an AI Assitant helps in tutoring & guiding undergraduate students \
Think carefully through the topic, step by step in a systematic manner, and allow each step to logically build on the previous one. \
Dont answer to any questions other than studies. Everyone one of your \
response should be in english. use other languages only if its necessary.\n"
assistant: "{instruction}"
trailing_assistant: ""
user: " [INST] {system} {instruction} [/INST]"
system_in_user: true
accept_sys_from_req: false
10 changes: 10 additions & 0 deletions ml_service/config/model/nous-capybara-34b.yaml
@@ -0,0 +1,10 @@
prompt_format:
system: "Your name is MegAcad and you are an AI Assitant helps in tutoring & guiding undergraduate students \
Think carefully through the topic, step by step in a systematic manner, and allow each step to logically build on the previous one. \
Dont answer to any questions other than studies. Everyone one of your \
response should be in english. use other languages only if its necessary.\n"
assistant: "{instruction}"
trailing_assistant: ""
user: "USER: {system} {instruction} ASSISTANT:"
system_in_user: true
accept_sys_from_req: false
12 changes: 12 additions & 0 deletions ml_service/config/model/qwen-32b.yaml
@@ -0,0 +1,12 @@
prompt_format:
system: "<|im_start|>system
Your name is MegAcad and you are an AI Assistant that helps in tutoring & guiding undergraduate students. \
Think carefully through the topic, step by step in a systematic manner, and allow each step to logically build on the previous one. \
Don't answer any questions other than studies. Every one of your \
responses should be in English. Don't use other languages unless necessary.<|im_end|>
assistant: "<|im_start|>assistant{instruction}"
trailing_assistant: ""
user: "<|im_start|>user
{instruction}<|im_end|>"
system_in_user: false
accept_sys_from_req: false
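All four model files share the same `prompt_format` keys; `system_in_user` controls whether the system text is spliced into the user turn (Mistral, Nous-Capybara) or emitted as its own turn (Qwen, C4AI). A hedged sketch of how these templates might be combined for a single turn — the repo's actual formatter is not shown in this diff:

```python
# Sketch (assumption): build a single-turn prompt from a prompt_format
# mapping like the YAML files above; multi-turn history and the
# accept_sys_from_req / recursive_sys_prompt flags are ignored here.
def build_prompt(fmt: dict, system_text: str, user_text: str) -> str:
    if fmt.get("system_in_user"):
        # Mistral / Nous-Capybara style: system text inside the user turn
        prompt = fmt["user"].format(system=system_text, instruction=user_text)
    else:
        # Qwen / C4AI style: the system block is its own turn
        prompt = fmt["system"] + fmt["user"].format(instruction=user_text)
    # open the assistant turn so generation continues from there
    return prompt + fmt["assistant"].format(instruction="") + fmt["trailing_assistant"]
```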
File renamed without changes.
File renamed without changes.
ml_service/config/ray/ray-serve.yaml
@@ -37,18 +37,7 @@ applications:
# num_cpus: 32
# num_gpus: 4

# - name: stt
# route_prefix: /stt
# import_path: stt_serve:app
#
# deployments:
# - name: STTDeployment
# num_replicas: 1
# max_concurrent_queries: 8
# ray_actor_options:
# num_gpus: 1

- name: embedding
- name: emb
route_prefix: /api/v1/embedder
import_path: emb_serve:main
args:
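With the app renamed from `embedding` to `emb`, the embedder stays reachable at `route_prefix: /api/v1/embedder`. A hedged example of calling it once `serve run` is up — the port is Ray Serve's default (8000) and the `{"text": ...}` payload shape is an assumption, since `generate_embedding`'s request schema is outside this diff:

```python
# Sketch (assumptions: default Serve port 8000, JSON body {"text": ...}).
import json
import urllib.request

req = urllib.request.Request(
    "http://127.0.0.1:8000/api/v1/embedder",
    data=json.dumps({"text": "hello world"}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))  # JSONResponse body from the deployment
```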
4 changes: 2 additions & 2 deletions ml_service/emb_serve.py
@@ -84,8 +84,8 @@ async def generate_embedding(self, request: Request) -> JSONResponse:
def main(args: Dict[str, str]) -> Application:
# load env
load_env()
EMB_PATH = os.getcwd()
CONFIG_FILE = os.path.join(EMB_PATH, "config.yaml")
ROOT_PATH = os.environ.get("ROOT_PATH", None)
CONFIG_FILE = os.path.join(ROOT_PATH, "config.yaml")
if not os.path.exists(CONFIG_FILE):  # note: os.path.exists() returns a bool, so the old `is None` check could never fire
    raise ConfigFileMissingError(
        "config.yaml not found under ROOT_PATH."
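The change above stops deriving the config path from the working directory and reads `ROOT_PATH` from the environment instead, matching the `ROOT_PATH=` entry the README asks for in `.env`. The project's `load_env` helper is not included in this diff; a minimal sketch of what it presumably does:

```python
# Sketch (assumption): a load_env that copies KEY=VALUE pairs from a
# .env file into os.environ without overwriting existing variables.
import os

def load_env(path: str = ".env") -> None:
    with open(path) as fh:
        for raw in fh:
            line = raw.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip())
```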