# 安装依赖部分

In [None]:
!pip install langchain
!pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents. 
!pip install openai
!pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python
!pip install chromadb # the AI-native open-source embedding database
!pip install Cython # Cython is an optimising static compiler for both the Python programming language
!pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation
!pip install unstructured[local-inference]
!CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms.
!pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis
!pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python.
!pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. 
!pip install tiktoken
!apt-get install poppler-utils
!sudo apt-get install tesseract-ocr
!sudo apt-get install tesseract-ocr-math
!pip install habanero
!pip install PyPDF2
!pip install bibtexparser
!pip install pymupdf
!pip install kor

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cd /content/drive/MyDrive/Literature_Review_tool

# 提取文本部分

In [2]:
def load_pdf(path,openai_api_key,chunk_size,chunk_overlap):
  from langchain.document_loaders import PyMuPDFLoader,DirectoryLoader,UnstructuredPDFLoader
  from detectron2.config import get_cfg
  from PyPDF2 import PdfReader

  cfg = get_cfg()    
  cfg.MODEL.DEVICE = 'gpu'

  import os

  file_names = os.listdir(path)
  pdf_file_names = [path + '/'+file_name for file_name in file_names if file_name.endswith('.pdf')]

  docs =[]

  import re

  for pdf in pdf_file_names:
    source = extract_doi(pdf)

    if source != 'None':
      doc = PyMuPDFLoader(pdf).load()
      for element in doc:
        element.metadata = source
        element.page_content = re.sub('\n+',' ',element.page_content.strip())
        docs.append(element)
  
    else:
      doc = PyMuPDFLoader(pdf).load()
      print(f"{pdf} is not identified! Using other strategy!!")
      source = extract_doi_llm(doc,openai_api_key)
      if source != 'None':
        for element in doc:
          element.metadata = source
      for element in doc:
        element.page_content = re.sub('\n+',' ',element.page_content.strip())
        docs.append(element)
  
  
  
  from langchain.text_splitter import RecursiveCharacterTextSplitter

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap = chunk_overlap)

  split_docs = text_splitter.split_documents(docs)

  return split_docs
    


In [3]:
def get_info(path):
    from PyPDF2 import PdfReader
    with open(path, 'rb') as f:
        pdf = PdfReader(f)
        info = pdf.metadata
        return info
    
def extract_doi(path):

    source = 0
    info = get_info(path)
    if '/doi' in info:
        doi = info['/doi']
    elif '/Subject' in info:
        Subject = info['/Subject']
        if 'doi:' in Subject:
            Subject = Subject.split('doi:')
            doi = Subject[1]
        else:
          source = 'None'
    elif '/WPS-ARTICLEDOI' in info:
        doi = info['/WPS-ARTICLEDOI']
    else:
        source = 'None'
    
    if source != 'None':
        import habanero
        import time
        citation = habanero.cn.content_negotiation(ids = doi,format='bibentry')
        time.sleep(5)
        import bibtexparser
        citation = bibtexparser.loads(citation)
        citation = citation.entries[0]
        source = {'author':citation['author'],
              'year':citation['year'],
              'title':citation['title'],
              'journal':citation['journal'],
              }
    
    return source

In [4]:
def extract_doi_llm(doc,openai_api_key):

  import re 

  doc[0].page_content = re.sub('\n+',' ',doc[0].page_content.strip())

  from langchain.text_splitter import RecursiveCharacterTextSplitter
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,chunk_overlap = 50)
  split_docs = text_splitter.split_documents(doc)
  abstract = split_docs[0]
  doi = extract_chain(abstract,openai_api_key)

  if doi != 'None':
    import habanero
    import time
    citation = habanero.cn.content_negotiation(ids = doi,format='bibentry')
    time.sleep(5)
    import bibtexparser
    citation = bibtexparser.loads(citation)
    citation = citation.entries[0]
    source = {'author':citation['author'],
            'year':citation['year'],
            'title':citation['title'],
            'journal':citation['journal'],
            }
    return source
  else:
    source = 'None'
    return source



In [5]:
def extract_chain(abstract,openai_api_key):
  from kor.extraction import create_extraction_chain
  from kor.nodes import Object, Text, Number
  from langchain.chat_models import ChatOpenAI

  llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key = openai_api_key,
    temperature=0,
    )
  schema = Object(
      id="doi",
      description="doi is a digital identifier.It typically starts with 10. followed by a numeric prefix, such as 10.1000/182.",
      attributes=[
          Text(
              id="doi",
              description='doi is a digital identifier. It typically starts with "10." followed by a numeric prefix, such as 10.1000/182.',
              examples=[
                  ('American Economic Journal: Economic Policy 2015, 7(4): 223–242  http://dx.doi.org/10.1257/pol.20130367 223 Water Pollution Progress at Borders: The','http://dx.doi.org/10.1257/pol.20130367'),
                  ('Environment and Development Economics (2020), 1–17 doi:10.1017/S1355770X2000025X EDE RESEARCH ARTICLE Political incentives, Party Congress, and pollution cycle: empirical evidence from China Zhihua Tian,1 and Yanfang Tian2* 1School of Economics, Zhejiang University of Technology, Hangzhou','10.1017/S1355770X2000025X')
                  ],
               many=True
               )
          ],
          many=False
          )
  chain = create_extraction_chain(llm, schema, encoder_or_encoder_class='json')
  output = chain.predict_and_parse(text=abstract.page_content)
  if 'doi' not in output['data']:
    print(f"LLM strategy failed!!{abstract.metadata['source']} Please manually add it!!")
    source = 'None'
    
    return source
    
  else:
    doi = output['data']['doi']['doi'][0]
    if 'doi=' in doi:
      doi = doi.split('doi=')[1]
    return doi


# 生成数据库部分

In [16]:
def generate_or_load_vectorstore(split_docs,device,model_name,generate,persist_directory,collection_name):
    
    from langchain.vectorstores import Chroma
    from langchain.embeddings import HuggingFaceEmbeddings

    
    model_kwargs = {'device':device}
    model_name = model_name
    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
    
    persist_directory = persist_directory
    collection_name = collection_name
    
    if generate == True:
        vectorstore = Chroma.from_documents(split_docs,embeddings,collection_name=collection_name,persist_directory = persist_directory)
        vectorstore.persist()
        
    else:
        vectordb = Chroma(collection_name=collection_name, persist_directory=persist_directory, embedding_function=embeddings)
        
    return vectordb

# Chain生成并匹配

In [18]:
def get_chain_output(query,vectordb,k,openai_api_key):
    
    docs = vectordb.similarity_search(query,5,include_metadata=True)
    
    from langchain.chat_models import ChatOpenAI
    
    llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0, model_name="gpt-3.5-turbo")
    
    from langchain.prompts import PromptTemplate,ChatPromptTemplate,HumanMessagePromptTemplate
    from langchain.llms import OpenAI
    
    from langchain.output_parsers import PydanticOutputParser
    from pydantic import BaseModel, Field, validator
    from typing import List,Union,Optional
    
    class Sentence(BaseModel):
        sentence: List[str] = Field(description="The sentence in the given document which is the most similar to the query provided")
        source: List[str] = Field(description="The meta source of the paper")
        score: List[float] = Field(description = "The similarity score between the sentence selected and the query provided")
            
    parser = PydanticOutputParser(pydantic_object=Sentence)
    
    question_template = """
    Given the document and query, find three sentences in the document that are most similar in meaning to the query. 
    Return the sentences, the meta source of the sentences and the cosine similarity scores. 
    If no similar sentences is found, return the sentence with highest cosine siliarity scores.
    {query}
    ===========
    {context}
    ===========
    {format_instructions}
    
    """

    from langchain.chains.question_answering import load_qa_chain
    from langchain import LLMChain

    PROMPT = PromptTemplate(template = question_template,
                            input_variables=['query','context'],
                            partial_variables = {"format_instructions":parser.get_format_instructions()})
    
    llm_chain = LLMChain(llm=llm,prompt = PROMPT)
    
    output = llm_chain({"query":query,"context":docs})
    
    return output,docs
    

In [20]:
def run_text_match(output,query,docs):
    
    import re
    text = re.sub("\n+","",output['text'])
    
    import json
    json_obj = json.loads(text)
    
    
    if "properties" in json_obj:
        print('No result was found, Using embedding searching strategy!!!')
        split_docs = split_for_embedding(docs)
        similar_sentence = search_cosine_similarity(query,split_docs,embeddings)
    
        for i,element in enumerate(similar_sentence):
            print(f'The {i} sentence')
            print(f"Sentence:{element['sentences']}")
            print(f"Source:{element['source']}")
            print(f"Score:{element['score']}")
            print("========")
            print("========")
    else:
        for i in range(3):
            print(f'The {i} sentence')
            print(f"Sentence:{json_obj['sentence'][i]}")
            print(f"Source:{json_obj['source'][i]}")
            print(f"Score:{json_obj['score'][i]}")
            print("========")
            print("========")

def split_for_embedding(docs): ##输入docs(list),输出split_for embedding(list)
    for_embedding = []
    for content in docs:
        new_content = content.page_content.replace('et al.','et al。')
        new_content = new_content.split('.')
        
        meta_data = content.metadata['source']
        
        for split_content in new_content:
            split_content = split_content.replace('。','.')
            
            if len(split_content) < 30:
                continue
            else:
                for_embedding.append({"content":split_content,"source":meta_data})
                
    return for_embedding

def search_cosine_similarity(query,split_docs,embeddings):  ##query-str,split_docs-list,embeddings-embeddings()
    split_docs_content = [content['content'] for content in split_docs]
    embed_docs = embeddings.embed_documents(split_docs_content)
    embed_query= embeddings.embed_query(query)
    
    from openai.embeddings_utils import cosine_similarity
    
    cos_index = []
    for embed_doc in embed_docs:
        cos_index.append(cosine_similarity(embed_doc,embed_query))
    
    #这边是根据大小建立索引
    idx = sorted(range(len(cos_index)),key=lambda k:cos_index[k]) #根据cos_index的大小进行排序
    final_similar_list = []
    for index in idx[-3:]:
        unit = {}
        unit['sentences']=split_docs_content[index]
        unit['source']=split_docs[index]['source']
        unit['score']=cos_index[index]
        final_similar_list.append(unit)
    
    return final_similar_list
        

# 主程序

In [8]:
path = "/content/drive/MyDrive/Literature_Review_tool/literature"
openai_api_key = "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"

import torch 

device = 'cuda' if torch.cuda.is_available() else 'cpu'

split_docs = load_pdf(path=path,openai_api_key=openai_api_key,chunk_size =1500, chunk_overlap=100)

ModuleNotFoundError: No module named 'detectron2'

In [21]:
vectordb = generate_or_load_vectorstore(split_docs=True,device=device,model_name = "sentence-transformers/all-mpnet-base-v2",generate=False,persist_directory = 'huggingface_index',collection_name='paper_index')

query = "However, the China's cadre evaluation system is different from the voting system in the Western countries"

output,docs = get_chain_output(query,vectordb,k=5,openai_api_key=openai_api_key)

final_list = run_text_match(output,query,docs)

print(final_list)






Using embedded DuckDB with persistence: data will be stored in: huggingface_index


The 0 sentence
Sentence:The Central Organi- zation Department emerged as the key human resource manager of the CCP, although senior-level promotions were ultimately decided by members of the Politburo (Nathan and Gilley 2002).
Source:American Political Science Review
Score:0.707
The 1 sentence
Sentence:After reform began in 1978, education creden- tials came to play a prominent role in the advancement of lower level cadres, and older cadres were strongly encouraged to retire (Cui 2003; Landry 2008; Manion 1993; Walder, Li, and Treiman 2000).
Source:American Political Science Review
Score:0.707
The 2 sentence
Sentence:Finally, when a centralized ﬁscal system was ﬁrst implemented in the mid-1990s, the CCP used the cadre evaluation system to ensure that provincial lead- ers cooperated with central tax ofﬁcials in maximizing revenue for the central government.
Source:American Political Science Review
Score:0.707
None


In [None]:
'huggingface_index',collection_name='paper_index'
The political business cycle is caused by opportunism