In [1]:
%pip install -r requirements.txt -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import urllib.request
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone as PineconeStore
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQA  

  from tqdm.autonotebook import tqdm


In [3]:
# Load variables from a local .env file into the process environment.
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# os.environ only accepts str values, so writing a missing (None) key back
# would fail with an opaque TypeError. Report the missing variable names instead.
_missing_keys = [
    name
    for name, value in (
        ('OPENAI_API_KEY', openai_api_key),
        ('PINECONE_API_KEY', pinecone_api_key),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# Re-export the keys so libraries that read them from the environment see them.
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["PINECONE_API_KEY"] = pinecone_api_key

In [4]:
import streamlit as st

# Load environment variables (the API keys) from the .env file.
load_dotenv()
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

def load_data(filepath):
    """Read a header-less CSV file and label its columns.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A DataFrame whose columns are named 'text' and 'Category'.
    """
    column_names = ['text', 'Category']
    frame = pd.read_csv(filepath, names=column_names, header=None)
    return frame

def vectorize_text(df, embeddings):
    """Attach an embedding to every row of `df`.

    Each value of the 'text' column is passed to `embeddings.embed_query`
    and the result is stored in a new 'vectorized' column.

    Args:
        df: DataFrame containing a 'text' column. It is modified in place.
        embeddings: Object exposing an `embed_query(text)` method.

    Returns:
        The same `df` object, now carrying the 'vectorized' column.
    """
    def _embed(text):
        return embeddings.embed_query(text)

    df['vectorized'] = df['text'].apply(_embed)
    return df

def expand_vectors(df):
    """Spread the 'vectorized' column into one column per vector component.

    Args:
        df: DataFrame with a 'vectorized' column of list-like values.

    Returns:
        A new DataFrame made of the remaining columns of `df` followed by the
        expanded vector components, aligned on the index of `df`.
    """
    component_rows = df['vectorized'].tolist()
    components = pd.DataFrame(component_rows, index=df.index)
    remainder = df.drop(columns=['vectorized'])
    return pd.concat([remainder, components], axis=1)

def upload_to_pinecone(index, output_df, batch_size=100):
    """Upsert every row of `output_df` into a Pinecone index.

    The first column is taken as the text, the second as the category, and
    all remaining columns as the vector components. Rows are addressed by
    position, so the function works regardless of the DataFrame's index
    labels. Record ids are the 1-based row positions rendered as strings.

    Args:
        index: Object exposing an `upsert(vectors=...)` method.
        output_df: DataFrame laid out as (text, category, v0, v1, ...).
        batch_size: Maximum number of records sent per `upsert` call.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # .tolist() hands back plain Python values in positional order, which
    # avoids label-based lookups such as `series[i]` that break when the
    # index is not 0..n-1.
    texts = output_df.iloc[:, 0].tolist()
    categories = output_df.iloc[:, 1].tolist()
    vectors = output_df.iloc[:, 2:].values.tolist()

    records = [
        {
            'id': str(position + 1),
            'values': vector,
            'metadata': {"text": text, "Category": category},
        }
        for position, (text, category, vector)
        in enumerate(zip(texts, categories, vectors))
    ]

    # Send the records in chunks instead of issuing one request per row.
    # An empty frame yields no records and therefore no calls.
    for start in range(0, len(records), batch_size):
        index.upsert(vectors=records[start:start + batch_size])

def initialize_vector_store(index_name, embeddings, pinecone_api_key):
    """Create a PineconeVectorStore for an existing index.

    Args:
        index_name: Name of the Pinecone index, forwarded as `index_name`.
        embeddings: Embedding object, forwarded as `embedding`.
        pinecone_api_key: API key, forwarded as `pinecone_api_key`.

    Returns:
        The constructed PineconeVectorStore instance.
    """
    store_options = {
        'index_name': index_name,
        'embedding': embeddings,
        'pinecone_api_key': pinecone_api_key,
    }
    return PineconeVectorStore(**store_options)

def setup_qa_system(vectorstore):
    """Build the retrieval question-answering chain.

    Args:
        vectorstore: Object exposing `as_retriever()`; its retriever is
            handed to the chain.

    Returns:
        A RetrievalQA chain created with chain_type "stuff" and a
        'gpt-3.5-turbo' chat model at temperature 0.0.
    """
    chat_model = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.0)
    doc_retriever = vectorstore.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=doc_retriever,
    )
    return qa_chain

def main(filepath='/Users/ito_itsuki/Documents/python_env/chatbot/data/my_profile.csv',
         index_name='sample-db'):
    """Render the Streamlit chatbot page and answer the entered question.

    When the user has typed a question, the profile CSV is loaded, embedded
    and uploaded to the Pinecone index, a retrieval QA chain is built on top
    of that index, and the chain's answer is written to the page.

    Args:
        filepath: CSV file with the source texts; defaults to the original
            hard-coded location.
        index_name: Name of the Pinecone index to upload to and query.
    """
    st.title("Chatbot Interface with Pinecone and OpenAI")

    query = st.text_input("質問を入力してください:")
    if not query:
        # No question entered yet, so there is nothing to answer.
        return

    df = load_data(filepath)

    # Embed every text and spread the vectors into numeric columns.
    embeddings = OpenAIEmbeddings()
    vectorized_df = expand_vectors(vectorize_text(df, embeddings))

    # Upload to Pinecone. Only the `Pinecone` client class is imported in this
    # file (there is no `pinecone` module name in scope), so the index handle
    # is obtained from a client instance rather than module-level init()/Index().
    pinecone_client = Pinecone(api_key=pinecone_api_key)
    pinecone_index = pinecone_client.Index(index_name)
    upload_to_pinecone(pinecone_index, vectorized_df)

    # Build the question-answering chain over the same index.
    vectorstore = initialize_vector_store(index_name, embeddings, pinecone_api_key)
    qa = setup_qa_system(vectorstore)

    # Ask the chain; `invoke` replaces the deprecated direct call `qa({...})`.
    result = qa.invoke({"query": query})

    st.write("Answer:", result['result'])

# Script entry point: build the Streamlit page when this file is executed directly.
# NOTE(review): the captured cell output below warns that session state does not
# function when running without `streamlit run`, so this code is meant to be
# launched with `streamlit run <script>` rather than from the notebook kernel.
if __name__ == "__main__":
    main()


2024-08-20 18:08:54.468 
  command:

    streamlit run /Users/ito_itsuki/Library/Python/3.9/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-08-20 18:08:54.469 Session state does not function when running a script without `streamlit run`
