In [1]:
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import urllib.request
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone as PineconeStore
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

  from tqdm.autonotebook import tqdm


In [3]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = openai_api_key

pinecone_api_key = os.getenv('PINECONE_API_KEY')
os.environ["PINECONE_API_KEY"] = pinecone_api_key

In [5]:
df = pd.read_csv('/Users/mira/VScode/chatbot/data/my_profile.csv',header=None,names=['text'])

# OpenAIEmbeddings インスタンスを作成
embeddings = OpenAIEmbeddings()

# 'text_column' の各セルをベクトル化してリストに保存
df['vectorized'] = df['text'].apply(lambda x: embeddings.embed_query(x))

# ベクトルリストをデータフレームに展開（各要素を別々のカラムに）
vectorized_df = pd.DataFrame(df['vectorized'].tolist(), index=df.index)

# 元のDataFrameとベクトル化したDataFrameを結合
output_df = pd.concat([df.drop(columns=['vectorized']), vectorized_df], axis=1)

# 結果を表示
output_df

  warn_deprecated(


Unnamed: 0,text,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,私は学びたいと思う教授がいたからです。,-0.008547,-0.001751,0.00381,-0.031845,0.014005,0.019269,0.020698,0.012228,0.000963,...,0.00735,-0.001139,0.033364,-0.026413,-0.015704,0.012878,-0.012061,-0.00223,0.011443,-0.014597
1,その教授は塩尻（斎藤）亜希先生といい、現在の僕の指導教員です。,-0.014455,-0.009331,-0.00994,-0.020282,0.003113,0.022077,-0.008179,0.005791,-0.008158,...,0.008627,-0.024891,0.001095,-0.020001,-0.000791,0.00637,0.005248,-0.011441,0.009431,-0.006986
2,生まれも育ちも岩手県盛岡市です！,0.007824,-0.008179,-4e-05,-0.020353,-0.026127,0.015673,-0.006001,-0.017809,-0.00926,...,0.017057,0.012398,0.016992,-0.028157,0.018481,0.03219,0.003109,0.008219,-0.000964,-0.03633
3,コンピュータサイエンスを学び、Javaなどによるシステムの開発から、機械学習と幅広く学んでいます。,-0.006216,-0.013467,0.017514,-0.021086,-0.006454,0.008352,-0.003182,0.001099,-0.001257,...,0.005914,0.004523,0.022333,-0.02755,-0.027755,-0.01718,0.007459,-0.024749,-0.010466,-0.000903
4,株式会社Zigexnにて機械学習エンジニアによる長期インターンシップ、そして業務委託として株...,-0.004156,-0.023303,0.019663,-0.029693,-0.015873,0.037547,-0.012322,0.009845,-0.005898,...,0.018267,-0.001415,0.034181,-0.024028,0.002306,-0.001014,-0.028462,-0.004895,0.005641,-0.006684
5,はい！コンスタントに参加しています。,-0.017303,-0.016646,0.000321,-0.014678,-0.001487,0.014182,-0.000385,0.000955,-0.000227,...,0.002441,0.004249,0.033802,-0.033909,-0.012214,0.010566,-0.001378,-0.005541,0.005444,-0.034605
6,主にテーブルデータのコンペに参加しています！画像や音声にも挑戦したいのですが計算リソース不足...,-0.024112,-0.018857,0.001609,-0.034786,0.01103,0.028601,-0.0179,-0.01519,-0.010578,...,0.017078,0.014246,0.009271,-0.045926,-0.004016,-0.013007,0.00233,0.007588,-0.013411,-0.028081
7,"AWS CLF,SAA",0.001802,-0.011057,-0.000103,-0.018666,-0.019302,0.029674,-0.04242,0.010231,-0.00698,...,-0.007885,-0.015925,0.006451,-0.022015,-0.038689,0.019557,0.009121,-0.005211,-0.004409,0.00715
8,以前の長期インターン先である株式会社SiNCEにてAWS関連のプロジェクトがありました。,0.00311,-0.040472,0.007335,-0.014333,-0.013313,0.026383,-0.021803,0.007423,0.02133,...,-0.012941,-0.015414,0.023073,-0.019588,-0.003975,0.021357,-0.0159,-0.00745,-0.00514,0.007896
9,その際に、AWS及びクラウドの知識が不足していたため、自己研鑽の一環として取得しました。,0.007659,-0.023229,0.017671,-0.026819,-0.010238,0.017498,-0.00241,-0.018336,0.012439,...,-0.026926,-0.001855,0.024373,-0.038986,-0.013303,0.027763,-0.028854,0.005049,0.004421,0.016501


In [6]:
pc = Pinecone(api_key=pinecone_api_key)
pinecone_index = pc.Index("sample-db")
pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'sample-db-rdebc4f.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'sample-db',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [7]:
original_texts = output_df.iloc[:, 0]
vectorized_data_only = output_df.iloc[:, 1:]
for i in range(len(vectorized_data_only)):
    pinecone_index.upsert(
        vectors = [
            {
                'id': str(i+1),
                'values': vectorized_data_only.T[i],
                'metadata': {"text": original_texts[i], "memo": "memo_"+str(i+1)}
            }
        ]
    )

In [18]:
env="aws"
index_name = "sample-db"


embeddings = OpenAIEmbeddings()

#vectorstore = PineconeStore.from_documents(index_name, embeddings,"text")
vectorstore = PineconeStore.from_documents(index_name, embeddings)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

TypeError: VectorStore.from_documents() takes 3 positional arguments but 4 were given

In [None]:
retrieval_qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo"),
        chain_type="refine",
        retriever=retriever,
        return_source_documents=True
)

result = retrieval_qa.invoke({"query": "趣味はなんですか？"})
display("Answer:", result['result'])

In [None]:
source_docs_excerpts = [
        "Source Document: " + doc.page_content[:200] + "..." for doc in result['source_documents']
]
print(source_docs_excerpts)