### 2.1 环境准备

In [1]:
!pip install BCEmbedding==0.1.1

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# 查看已安装依赖
! pip list

Package                           Version
--------------------------------- --------------------
absl-py                           2.1.0
accelerate                        0.33.0
adaseq                            0.6.6
addict                            2.4.0
aiohttp                           3.9.5
aiosignal                         1.3.1
albucore                          0.0.12
albumentations                    1.4.11
alias-free-torch                  0.0.6
aliyun-python-sdk-core            2.15.1
aliyun-python-sdk-kms             2.16.3
altair                            5.4.0
aniso8601                         9.0.1
annotated-types                   0.7.0
antlr4-python3-runtime            4.9.3
anyio                             4.4.0
apex                              0.1
appdirs                           1.4.4
argon2-cffi                       23.1.0
argon2-cffi-bindings              21.2.0
arrow                             1.3.0
asttokens                         2.4.1
astunparse        

In [3]:
# 安装 streamlit
! pip install streamlit==1.24.0

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### 2.2 模型下载

In [4]:
# 向量模型下载
from modelscope import snapshot_download
model_dir = snapshot_download("AI-ModelScope/bge-small-zh-v1.5", cache_dir='.')


Downloading: 100%|██████████| 91.4M/91.4M [00:01<00:00, 67.2MB/s]


In [5]:
# 源大模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('IEITYuan/Yuan2-2B-Mars-hf', cache_dir='.')
# model_dir = snapshot_download('IEITYuan/Yuan2-2B-July-hf', cache_dir='.')

Downloading: 100%|██████████| 39.0/39.0 [00:00<00:00, 71.7B/s]
Downloading: 100%|██████████| 4.41G/4.41G [00:18<00:00, 259MB/s] 


### 2.3 RAG实战

In [6]:
#加入网络搜索
!pip install langchain
!pip install -U langchain-community

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
# 导入所需的库
from typing import List
import numpy as np

import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [8]:
# # 定义向量模型类
# class EmbeddingModel:
#     """
#     class for EmbeddingModel
#     """

#     def __init__(self, path: str) -> None:
#         self.tokenizer = AutoTokenizer.from_pretrained(path)

#         self.model = AutoModel.from_pretrained(path).cuda()
#         print(f'Loading EmbeddingModel from {path}.')

#     def get_embeddings(self, texts: List) -> List[float]:
#         """
#         calculate embedding for text list
#         """
#         encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
#         encoded_input = {k: v.cuda() for k, v in encoded_input.items()}
#         with torch.no_grad():
#             model_output = self.model(**encoded_input)
#             sentence_embeddings = model_output[0][:, 0]
#         sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
#         return sentence_embeddings.tolist()

In [25]:
#用新模型的向量模型类
torch.cuda.empty_cache()

class EmbeddingModel:
    def __init__(self, model_name: str, device: str = 'cuda'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = device
        self.model.to(self.device)

#     #不知道这个返回tensor行不行先试试吧
#     def get_embeddings(self, sentences: List[str]) -> torch.Tensor:
#         inputs = self.tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
#         inputs_on_device = {k: v.to(self.device) for k, v in inputs.items()}
#         outputs = self.model(**inputs_on_device, return_dict=True)
#         embeddings = outputs.last_hidden_state[:, 0]
#         embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
#         return embeddings.cpu().detach().numpy()
    
    def get_embeddings(self, sentences: List[str], batch_size: int = 4) -> torch.Tensor:
        all_embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            inputs = self.tokenizer(sentences, padding=True, truncation=True, max_length=256, return_tensors="pt")

            # inputs = self.tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="pt")
            inputs_on_device = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs_on_device, return_dict=True)
            embeddings = outputs.last_hidden_state[:, 0]
            embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
            all_embeddings.append(embeddings.cpu().numpy())
        return np.vstack(all_embeddings)



In [27]:
print("> Create embedding model...")
# embed_model_path = './AI-ModelScope/bge-small-zh-v1___5'
# embed_model = EmbeddingModel(embed_model_path)

embed_model = EmbeddingModel('AI-ModelScope/BCEmbeddingmodel')

# 使用VectorStoreIndex类
index = VectorStoreIndex(doecment_path="knowledge.txt", embed_model=embed_model)



# # init embedding model
# model = EmbeddingModel(model_name_or_path="AI-ModelScope/BCEmbeddingmodel")

> Create embedding model...
Loading 183 documents for knowledge.txt.


In [28]:
import os
from langchain.utilities import GoogleSerperAPIWrapper
import pprint

In [30]:
# 定义向量库索引类
class VectorStoreIndex:
    """
    class for VectorStoreIndex
    """

    def __init__(self, doecment_path: str, embed_model: EmbeddingModel) -> None:
        self.documents = []
        for line in open(doecment_path, 'r', encoding='utf-8'):
            line = line.strip()
            self.documents.append(line)

        self.embed_model = embed_model
        self.vectors = self.embed_model.get_embeddings(self.documents)

        print(f'Loading {len(self.documents)} documents for {doecment_path}.')

    def get_similarity(self, vector1: List[float], vector2: List[float]) -> float:
        """
        calculate cosine similarity between two vectors
        """
        dot_product = np.dot(vector1, vector2)
        magnitude = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        if not magnitude:
            return 0
        return dot_product / magnitude

    def query(self, question: str, k: int = 1) -> List[str]:
        question_vector = self.embed_model.get_embeddings([question])[0]
        result = np.array([self.get_similarity(question_vector, vector) for vector in self.vectors])
        return np.array(self.documents)[result.argsort()[-k:][::-1]].tolist()
    
    def web_search(self, search_list):
        os.environ["SERPER_API_KEY"] = "88a8892a02409063f02a3bb97ac08b36fb213ae7"
        search = GoogleSerperAPIWrapper()
        search_result = ''
        for prof_name in search_list:
            search_item = prof_name + "research interest"
            search_result+= str(search.run(search_item)) + '\n'
            
        return search_result
            # results = search.results(search_item)
            # pprint.pp(results)
        

In [31]:
print("> Create index...")
doecment_path = './knowledge.txt'
index = VectorStoreIndex(doecment_path, embed_model)

#查看向量库的shape
_vector = np.array(index.vectors)
print(_vector.shape)

> Create index...
Loading 183 documents for ./knowledge.txt.
(8418, 768)


In [None]:
question = 'introduce professors in duke university'
print('> Question:', question)

context = index.query(question)
print('> Context:', context)

context_web = index.web_search(context)
print('> Context_web:', context_web)

In [None]:

# 定义大语言模型类
class LLM:
    """
    class for Yuan2.0 LLM
    """

    def __init__(self, model_path: str) -> None:
        print("Creat tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
        self.tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)

        print("Creat model...")
        self.model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda()

        print(f'Loading Yuan2.0 model from {model_path}.')

    def generate(self, question: str, context: List, context_web: str):
        if context:
            prompt = f'背景：{context}\n{context_web}\n问题：{question}\n请根据以上提供的信息回答问题。'
        else:
            prompt = question

        prompt += "<sep>"
        inputs = self.tokenizer(prompt, return_tensors="pt")["input_ids"].cuda()
        outputs = self.model.generate(inputs, do_sample=False, max_length=2048)
        output = self.tokenizer.decode(outputs[0])

        print(output.split("<sep>")[-1])

In [None]:
print("> Create Yuan2.0 LLM...")
model_path = './IEITYuan/Yuan2-2B-Mars-hf'
# model_path = './IEITYuan/Yuan2-2B-July-hf'
llm = LLM(model_path)

In [None]:
print('> Without RAG:')
llm.generate(question, [])
# llm.generate(question, [],'')

print('> With RAG:')
# llm.generate(question, context)
llm.generate(question, context, context_web)