In [None]:
# !pip install BCEmbedding==0.1.1
# 安装 streamlit
# ! pip install streamlit==1.24.0
# !pip install langchain
# !pip install -U langchain-community

In [2]:
# 导入所需的库
from typing import List
import numpy as np

import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
import BCEmbedding  

In [39]:
# Embedding model class that uses the new model.
# Release unoccupied cached GPU memory held by PyTorch before the class below is (re)defined and used.
torch.cuda.empty_cache()

class EmbeddingModel:
    """
    Sentence-embedding wrapper around a Hugging Face tokenizer/encoder pair.

    Each sentence is represented by the hidden state of the first token of the
    encoder's last layer, scaled to unit L2 norm.
    """

    def __init__(self, model_name: str, device: str = 'cuda'):
        """
        Load the tokenizer and the model, then move the model to ``device``.

        Args:
            model_name: identifier or path handed to ``from_pretrained``.
            device: torch device string on which the model and inputs are placed.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = device
        self.model.to(self.device)

    def get_embeddings(self, sentences: List[str], batch_size: int = 8, max_length: int = 256) -> np.ndarray:
        """
        Embed ``sentences`` in mini-batches.

        Args:
            sentences: texts to embed.
            batch_size: number of sentences encoded per forward pass (>= 1).
            max_length: token limit passed to the tokenizer; longer inputs are
                truncated.

        Returns:
            Array with one unit-norm row per sentence. For an empty input an
            array with zero rows is returned.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

        if len(sentences) == 0:
            # np.vstack([]) raises, so hand back an empty matrix instead. The
            # width is taken from the model config when it exposes one.
            config = getattr(self.model, 'config', None)
            hidden_size = int(getattr(config, 'hidden_size', 0) or 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        all_embeddings = []
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            inputs = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            inputs_on_device = {k: v.to(self.device) for k, v in inputs.items()}
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                outputs = self.model(**inputs_on_device, return_dict=True)
            # Use the first token's hidden state as the sentence vector.
            embeddings = outputs.last_hidden_state[:, 0]
            embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
            all_embeddings.append(embeddings.cpu().numpy())
        return np.vstack(all_embeddings)


In [43]:
# Vector store index class
class VectorStoreIndex:
    """
    In-memory vector index over the lines of a text file.

    Every non-blank line of the file is one document. All documents are
    embedded once at construction time; queries are ranked by cosine
    similarity against those stored vectors.
    """

    def __init__(self, doecment_path: str, embed_model: 'EmbeddingModel') -> None:
        """
        Read the documents and embed them.

        Args:
            doecment_path: path of a UTF-8 text file with one document per line
                (the misspelt parameter name is kept for keyword callers).
            embed_model: object whose ``get_embeddings(list_of_str)`` returns
                one vector per input string.
        """
        # The context manager closes the file handle deterministically.
        with open(doecment_path, 'r', encoding='utf-8') as source:
            stripped_lines = (line.strip() for line in source)
            # Blank lines hold nothing to retrieve, so they are not indexed.
            self.documents = [line for line in stripped_lines if line]

        self.embed_model = embed_model
        if self.documents:
            self.vectors = self.embed_model.get_embeddings(self.documents)
        else:
            # Nothing to embed; keep an empty matrix so the attribute exists.
            self.vectors = np.empty((0, 0), dtype=np.float32)

        print(f'Loading {len(self.documents)} documents for {doecment_path}.')

    def get_similarity(self, vector1: List[float], vector2: List[float]) -> float:
        """
        Cosine similarity of two vectors; 0 when either vector has zero norm.
        """
        dot_product = np.dot(vector1, vector2)
        magnitude = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        if not magnitude:
            return 0
        return dot_product / magnitude

    def query(self, question: str, k: int = 2) -> List[str]:
        """
        Return up to ``k`` documents most similar to ``question``.

        Args:
            question: query text, embedded with the same model as the documents.
            k: maximum number of documents to return.

        Returns:
            Documents ordered from most to least similar. The list is empty
            when ``k`` is not positive or the index holds no documents.
        """
        # A slice of [-0:] would select everything, so handle k <= 0 up front.
        if k <= 0 or not self.documents:
            return []

        question_vector = np.asarray(self.embed_model.get_embeddings([question])[0], dtype=float)
        vectors = np.asarray(self.vectors, dtype=float)

        # Cosine similarity of every stored vector against the question at once.
        dot_products = vectors @ question_vector
        magnitudes = np.linalg.norm(vectors, axis=1) * np.linalg.norm(question_vector)
        scores = np.divide(
            dot_products,
            magnitudes,
            out=np.zeros_like(dot_products),
            where=magnitudes != 0,  # zero-norm vectors score 0, as in get_similarity
        )

        best_first = np.argsort(scores)[::-1][:k]
        return [self.documents[i] for i in best_first]

    # Disabled experiment: look up each retrieved name with Google Serper.
    # GoogleSerperAPIWrapper reads its key from the SERPER_API_KEY environment
    # variable; set it outside the notebook and never hard-code it here. A key
    # was previously embedded in this file and should be treated as leaked.
    #
    # def web_search(self, search_list):
    #     search = GoogleSerperAPIWrapper()
    #     search_result = ''
    #     for prof_name in search_list:
    #         search_item = prof_name + "research interest"
    #         search_result += str(search.run(search_item)) + '\n'
    #     return search_result

In [48]:
# Instantiate the embedding wrapper; the string is passed to from_pretrained
# for both tokenizer and model, and the device defaults to 'cuda'.
print("> Create embedding model...")
embed_model = EmbeddingModel('AI-ModelScope/BCEmbeddingmodel')
# embed_model.get_embeddings(sentences)
# # init embedding model
# model = EmbeddingModel(model_name_or_path="AI-ModelScope/BCEmbeddingmodel")

> Create embedding model...


In [69]:
# Build the vector index: each line of the text file is read as one document
# and embedded with embed_model.
print("> Create index...")
doecment_path = './test.txt'
index = VectorStoreIndex(doecment_path, embed_model)

# Inspect the shape of the stored vectors (one row per document).
_vector = np.array(index.vectors)
print(_vector.shape)

> Create index...
Loading 10 documents for ./test.txt.
(10, 768)


In [7]:
import os
from langchain.utilities import GoogleSerperAPIWrapper
import pprint

In [70]:
# Retrieve the documents most similar to the question; query() uses k=2 by default.
question = 'Recommend university in security'
print('> Question:', question)

context = index.query(question)
print('> Context:', context)

# Web-search augmentation of the retrieved context is currently disabled.
#context_web = index.web_search(context)
#print('> Context_web:', context_web)

> Question: Recommend university in security
> Context: ['The University of Maryland ranks 15th overall in computer science, with an AI program ranked 18th and strong interdisciplinary research ranked 8th. The university’s systems program is ranked 9th, excelling in areas such as cybersecurity, cloud computing, and networking. Its theory program is ranked 16th, showcasing its strengths in algorithms and computational complexity. Located near Washington, D.C., the University of Maryland offers students access to government agencies, research labs, and top tech companies in the region.', 'Cornell University ranks 7th overall in computer science, excelling in theory (ranked 6th) and AI (ranked 7th). The university’s interdisciplinary program is ranked 12th, with collaborations spanning fields like computational biology, social sciences, and economics. In systems, Cornell is ranked 19th, with strengths in computer architecture, distributed systems, and databases. Located in Ithaca, New Yor

In [71]:
# Large language model class
class LLM:
    """
    Wrapper for the Yuan2.0 LLM: loads the tokenizer and model, and prints answers.
    """

    def __init__(self, model_path: str) -> None:
        """
        Load tokenizer and model from ``model_path`` and move the model to the GPU.
        """
        print("Creat tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            add_eos_token=False,
            add_bos_token=False,
            eos_token='<eod>',
        )
        # Register the extra tokens as special tokens of the tokenizer.
        extra_special_tokens = [
            '<sep>', '<pad>', '<mask>', '<predict>',
            '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>',
            '<commit_before>', '<commit_msg>', '<commit_after>',
            '<jupyter_start>', '<jupyter_text>', '<jupyter_code>',
            '<jupyter_output>', '<empty_output>',
        ]
        self.tokenizer.add_tokens(extra_special_tokens, special_tokens=True)

        print("Creat model...")
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        self.model = loaded_model.cuda()

        print(f'Loading Yuan2.0 model from {model_path}.')

    def generate(self, question: str, context: List):
        """
        Print the model's answer to ``question``.

        When ``context`` is non-empty it is embedded in the prompt together
        with the question; otherwise the bare question is the prompt. The text
        following the last ``<sep>`` of the decoded output is printed; nothing
        is returned.
        """
        # The prompt always ends with the <sep> marker.
        prompt = (
            f'背景：{context}\n 问题：{question}\n 我现在正在申请计算机领域的博士，以上背景是关于学校和教授的学术信息，请根据以上信息回答我的问题'
            if context
            else question
        ) + "<sep>"

        encoded = self.tokenizer(prompt, return_tensors="pt")
        input_ids = encoded["input_ids"].cuda()
        # Sampling is disabled; max_length caps the total sequence length.
        generated = self.model.generate(input_ids, do_sample=False, max_length=2048)
        decoded_text = self.tokenizer.decode(generated[0])

        answer = decoded_text.split("<sep>")[-1]
        print(answer)


In [72]:
# Load the Yuan2.0 model, then compare its answer without and with the retrieved context.
print("> Create Yuan2.0 LLM...")
model_path = './IEITYuan/Yuan2-2B-Mars-hf'
#model_path = './IEITYuan/Yuan2-2B-July-hf'
llm = LLM(model_path)
print('> Without RAG:')
# An empty context makes generate() use the bare question as the prompt.
llm.generate(question, [])
# llm.generate(question, [],'')

print('> With RAG:')
# llm.generate(question, context)
llm.generate(question, context)#, context_web)

> Create Yuan2.0 LLM...
Creat tokenizer...
Creat model...
Loading Yuan2.0 model from ./IEITYuan/Yuan2-2B-Mars-hf.
> Without RAG:
 1. 联合国安全理事会第1267(1999)号决议<eod>
> With RAG:
 根据以上信息，我建议你考虑以下几个方面来选择合适的学校：
1. 学校的计算机科学排名：根据学校的计算机科学排名，选择一个在该领域内排名较高的学校。你可以参考学校的整体排名、学科排名和师资力量等指标。
2. 学校的教授团队：了解学校的教授团队，特别是那些在计算机领域有丰富经验和专业知识的教授。他们的教学和研究水平对你的学习和职业发展都非常重要。
3. 学校的科研实力：考虑学校的科研实力，包括是否有科研项目资助、实验室设备和资源等。这些因素将直接影响到你在计算机领域的研究能力和发展潜力。
4. 学校的就业和实习机会：了解学校的就业和实习机会，尤其是与计算机科学相关的实习和就业机会。一个好的学校可以提供更多的实践机会和职业发展空间。
综上所述，选择一个合适的学校需要综合考虑学校的计算机科学排名、教授团队、科研实力和就业机会等因素。建议你仔细研究学校的官方网站和相关排名，与学校的招生办公室或教授进行交流，以便做出更准确和明智的选择。<eod>
