In [28]:
import os
import numpy as np
import openai
from supabase import create_client, Client
import requests
from readability import Document
from bs4 import BeautifulSoup

In [29]:
supabase_url = os.environ['NEXT_PUBLIC_SUPABASE_URL']
supabase_anon_key = os.environ['NEXT_PUBLIC_SUPABASE_ANON_KEY']
openai_api_key = os.environ['OPENAI_API_KEY']

In [30]:
openai.api_key = openai_api_key

In [31]:
supabase: Client = create_client(supabase_url, supabase_anon_key)

In [39]:
urls = [
  'https://devlog.mescius.jp/nextjs-quickstart/',
]

In [44]:
max_size = 500

# 文書処理関数
def process_documents(urls):
    documents = []

    for url in urls:
        # 記事の内容を取得
        response = requests.get(url)
        doc = Document(response.text)
        readable_article = doc.summary()
        # BeautifulSoupを使ってHTMLタグを削除
        soup = BeautifulSoup(readable_article, 'html.parser')
        article_text = soup.get_text()

        if not article_text:
            raise ValueError(f"内容の取得に失敗しました: {url}")

        start = 0
        while start < len(article_text):
            # 文章を分割
            end = start + max_size
            chunk = article_text[start:end]
            documents.append({'url': url, 'body': chunk})
            start = end

        # 古いデータを削除
        supabase.table('documents').delete().eq('url', url).execute()

    for document in documents:
        # 改行を空白に変換
        input_text = document['body'].replace('\n', ' ')

        # Embedding
        # 文章をベクトルに変換
        embedding_response = openai.Embedding.create(input=input_text, model="text-embedding-ada-002")

        # ベクトル取得
        embedding = embedding_response['data'][0]['embedding']

        # テーブル作成
        supabase.table('documents').insert({
            'content': input_text,
            'embedding': embedding,
            'url': document['url']
        }).execute()

    print('Document processing and insertion complete.')

In [45]:
if __name__ == '__main__':
    process_documents(urls)

2023-11-05 00:35:14,552:INFO - HTTP Request: DELETE https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents?url=eq.https%3A//devlog.mescius.jp/nextjs-quickstart/ "HTTP/1.1 200 OK"
2023-11-05 00:35:15,018:INFO - HTTP Request: POST https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-11-05 00:35:15,390:INFO - HTTP Request: POST https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-11-05 00:35:15,716:INFO - HTTP Request: POST https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-11-05 00:35:16,155:INFO - HTTP Request: POST https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-11-05 00:35:16,497:INFO - HTTP Request: POST https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-11-05 00:35:16,846:INFO - HTTP Request: POST https://ogvafqdckdxorurwynoi.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-11-05 00:35:17,220:I

Document processing and insertion complete.
