# 从指定的PDF文件中提取所有页面的文本内容，并将其作为一个字符串返回



In [None]:
! pip install PyPDF2



In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).

    Returns:
        A single string holding the text of all pages in page order, with
        no separator inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly and join once instead of indexing by
        # position and concatenating repeatedly. The join runs inside the
        # ``with`` block because the reader still needs the open file.
        # ``or ''`` keeps a page that yields no text from breaking the join.
        return ''.join(page.extract_text() or '' for page in reader.pages)

# NOTE(review): the path argument is an empty placeholder; supply the real
# PDF path before running, since opening "" raises FileNotFoundError.
pdf_text = extract_text_from_pdf("")


# 生成PDF文件文本的嵌入向量

In [None]:
!pip install openai==0.28



In [None]:
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding a secret in the notebook; when the variable is unset this
# falls back to the original empty string.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def get_embeddings(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` using the
    "text-embedding-ada-002" model and returns the ``embedding`` attribute
    of the first entry in the response's ``data``.
    """
    result = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_item = result.data[0]
    return first_item.embedding

# Embed the whole extracted PDF text as a single vector.
# NOTE(review): presumably a long PDF can exceed the embedding model's input
# limit — confirm, and consider splitting the text into chunks if so.
embedding = get_embeddings(pdf_text)

# 将PDF文本生成的嵌入向量存储到Pinecone索引中。

In [None]:
! pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [None]:
from pinecone import Pinecone

import os  # imported here so this cell does not depend on an earlier cell

# Read the API key from the PINECONE_API_KEY environment variable instead of
# hard-coding a secret in the notebook; when the variable is unset this
# falls back to the original empty string.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))

# Uncomment to delete the "minghao" index.
#pc.delete_index("minghao")

# 初始化一个index（只需运行一次；索引创建成功后无需重复运行此单元）

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Create the "minghao" serverless index only when it does not exist yet, so
# this cell can be re-run without the create call failing on a name that is
# already taken.
if "minghao" not in pc.list_indexes().names():
    pc.create_index(
        name="minghao",
        # NOTE(review): presumably the vector length produced by the
        # embedding model used in this notebook — confirm.
        dimension=1536,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle used by the later cells to upsert into and query the index.
index = pc.Index("minghao")

# 可以运行

In [None]:
pdf_id = "pdf_001"

# Metadata stored next to the vector; "content" keeps the full PDF text so
# it can be read back from a query result.
cv_metadata = {
    "type": "pdf_document",
    "title": "minghao_cv?",
    "content": pdf_text,
}

# One record: id, embedding values and the metadata above.
cv_record = {"id": pdf_id, "values": embedding, "metadata": cv_metadata}

# Write the record into the "cv" namespace of the index.
index.upsert(vectors=[cv_record], namespace="cv")

print(embedding)

[-0.0044843037612736225, -0.007799377199262381, 0.02373605966567993, -0.0412001758813858, -0.004845886956900358, 0.018167007714509964, -0.019951267167925835, 0.0033319697249680758, -0.02032974548637867, -0.009536326862871647, 0.01582854613661766, 0.013321120291948318, 0.010259492322802544, 0.005900222342461348, 0.0044538904912769794, -0.01704508624970913, 0.005055402405560017, -0.011084036901593208, 0.011617963202297688, -0.011881547048687935, -0.018613072112202644, 0.0025614940095692873, -0.02034326270222664, 9.22965700738132e-05, -0.021722009405493736, 0.00407541124150157, 0.03606367111206055, -0.028142640367150307, -0.018464382737874985, -0.03736131638288498, 0.027710093185305595, -0.017747975885868073, -0.01914023980498314, -0.0158555805683136, 0.01024597603827715, -0.017991283908486366, 0.006407114211469889, -0.013882080093026161, 0.019234858453273773, -8.321475615957752e-05, 0.012368163093924522, 0.016693640500307083, -0.0010991106973960996, -0.018559003248810768, 0.0100905289873

# 查询相关信息

In [None]:
# NOTE(review): "abot" looks like a typo for "about"; left unchanged because
# editing the string changes the text that is embedded and sent to the model.
user_question = "tell me abot minghao's education history"
question_embedding = get_embeddings(user_question)

# Look up the stored vectors closest to the question in the "cv" namespace.
query_response = index.query(
    vector=question_embedding,
    top_k=3,
    namespace="cv",
    include_metadata=True
)

# Inspect the results
matches = query_response['matches']
for match in matches:
    print(f"ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Metadata: {match['metadata']}")

# Only the first match is used as context. When the query returns no
# matches, fall back to the same default used for a missing "content" key
# instead of raising IndexError.
if matches:
    retrieved_text = matches[0]['metadata'].get('content', 'No content available')
else:
    retrieved_text = 'No content available'


# Pass the question and the retrieved text to GPT for an answer

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # use a chat model
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"User question: {user_question}\n\nRelevant text: {retrieved_text}\n\nAnswer the user's question based on the relevant text."}
    ],
    max_tokens=200
)

# Print GPT's answer
print(response['choices'][0]['message']['content'])


ID: pdf_001
Score: 0.797455907
Metadata: {'content': "  MINGHAO SUN Mail: sun989minghao@gmail.com Phone: +1 2092980792 LinkedIn: https://www.linkedin.com/in/minghao-sun-653778276/ Estimated Graduation Time: December 2025 TECH STACK Programming Languages: Python; Java; C; C++; JavaScript; SQL; HTML; CSS. Web Development:            React; Bootstrap; Axios; Node.js; MongoDB; JSON; Socket.IO; AWS EC2, Flask; Git; Postman; Django. Machine Learning:           LLM; Fine-tuning; LangChain; RAG; Deeplake; Pinecone; Hugging Face. EDUCATION Bachelor of Science –  Major: Computer Science  Name: University of Tianjin Renai College. (Tianjin, China) Main courses: Data Structure, operating system, Computer Network Master of Science –  Major: Computer Science Name: University of the Pacific. (Stockton, CA, USA) Main courses: Database Management, Machine Learning, Web Development, Advanced Algorithms WORK EXPERIENCE Int’ I Data Engineering and Science Association (California, USA) Intern | 5/2024 – 9/