In [12]:
%pip install requests bs4 openai python-dotenv asyncio aiohttp

Note: you may need to restart the kernel to use updated packages.


### URL の validation と HTML のダウンロード

In [165]:
import re
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup

In [166]:
# 1. URLからWebページを取得し、要約文とEmbeddingベクトルを生成して保存
## 1.1 URLからWebページを取得
### 1.1.1 URLのバリデーション
def validate_url(url: str) -> bool:
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

### 1.1.2 Webページのダウンロード
def download_webpage(url: str) -> str:
    if not validate_url(url):
        raise ValueError("Invalid URL")

    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to download the webpage: {response.status_code}")

    return response.text

In [192]:
url = "https://openai.com/pricing"

if validate_url(url):
    webpage_content = download_webpage(url)
    print("Webpage content downloaded successfully.")
    display(webpage_content)
else:
    print("Invalid URL.")

Webpage content downloaded successfully.


'<!DOCTYPE html>\n<html lang="en-US">\n<head><meta charset="utf-8">\n<title>Pricing</title>\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<link rel="preconnect" href="https://github.githubassets.com/" crossorigin="">\n<link rel="preconnect" href="https://fonts.googleapis.com/" crossorigin="">\n<meta name="description" content="Simple and flexible. Only pay for what you use.">\n<meta property="og:title" content="Pricing">\n<meta property="og:description" content="Simple and flexible. Only pay for what you use.">\n<meta property="og:image" content="https://openaicom.imgix.net/7ef54590-0045-4fb6-9f03-9643e08f0d94/stangel-2022-0423.jpg?auto=compress%2Cformat&fit=min&fm=jpg&q=80&rect=0%2C540%2C3840%2C2160">\n<meta property="og:image:alt" content="Aerial shot of two people sitting in black armchairs around a round red table, one holding a smartphone while sitting cross-legged, another sitting cross-legged">\n<meta name="twitter:card" content="summary_large_image">\n<

### HTML パース

In [193]:
### 1.1.3 HTMLタグの削除と本文の抽出
def extract_main_content(html_content: str) -> str:
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Remove script and style elements to clean up the content
    for script in soup(["script", "style"]):
        script.decompose()

    # Append a space to each tag, which helps in maintaining word separation when tags are removed
    for tag in soup.find_all():
        tag.append(" ")
    
    # Extract text from the parsed HTML and remove unnecessary whitespaces and line breaks
    extracted_text = ' '.join(line.strip() for line in
                              soup.get_text().splitlines() if line.strip())
    
    # Remove continuous spaces and replace them with a single space
    cleaned_text = re.sub(r'\s+', ' ', extracted_text)
    
    return cleaned_text

In [194]:
if validate_url(url):
    webpage_content = download_webpage(url)
    print("Webpage content downloaded successfully.")
    main_content = extract_main_content(webpage_content)
    print("Main content extracted:")
    display(main_content)
else:
    print("Invalid URL.")

Webpage content downloaded successfully.
Main content extracted:


'Pricing Close Search Submit Skip to main content Site Navigation Research Overview Index Product Overview ChatGPT GPT-4 DALLÂ·E 2 Customer stories Safety standards Pricing Developers Overview Documentation API reference Examples Safety Company About Blog Careers Charter Security Search Menu Mobile Navigation Close Site Navigation Research Product Developers Safety Company Search Submit Pricing Simple and flexible. Only pay for what you use. Quick links Contact sales Learn more Language models Multiple models, each with different capabilities and price points. Prices are per 1,000 tokens. You can think of tokens as pieces of words, where 1,000 tokens is about 750 words. This paragraph is 35 tokens. GPT-4 With broad general knowledge and domain expertise, GPT-4 can follow complex instructions in natural language and solve difficult problems with accuracy. Learn more Model Prompt Completion 8K context $0.03 Â / 1K tokens $0.06 Â / 1K tokens 32K context $0.06 Â / 1K tokens $0.12 Â / 1K to

### チャンク分割

In [195]:
def split_text(text: str, max_length: int = 1000, overlap: int = 50) -> list[str]:
    """
    Splits the given text into an array of subtexts with a length of up to max_length
    characters. Overlaps the subtexts by overlap characters.

    :param text: The text to be split.
    :type text: str
    :param max_length: The maximum length of each subtext. Default is 800.
    :type max_length: int
    :param overlap: The overlap between each subtext. Default is 50.
    :type overlap: int
    :return: An array of the split subtexts.
    :rtype: list[str]
    """
    result = []
    i = 0
    while i < len(text):
        end = i + max_length
        if end >= len(text):
            result.append(text[i:])
            break

        # Find the last space character within the overlap range
        last_space_index = text.rfind(' ', end - overlap, end)

        if last_space_index != -1:
            end = last_space_index + 1
        else:
            # If no space found in the overlap range, prioritize splitting at spaces
            space_index = text.rfind(' ', 0, end)
            if space_index != -1:
                end = space_index + 1

        result.append(text[i:end])
        i = end

    return result

In [196]:
chunks = split_text(main_content)

for chunk in chunks:
    display(chunk)

'Pricing Close Search Submit Skip to main content Site Navigation Research Overview Index Product Overview ChatGPT GPT-4 DALLÂ·E 2 Customer stories Safety standards Pricing Developers Overview Documentation API reference Examples Safety Company About Blog Careers Charter Security Search Menu Mobile Navigation Close Site Navigation Research Product Developers Safety Company Search Submit Pricing Simple and flexible. Only pay for what you use. Quick links Contact sales Learn more Language models Multiple models, each with different capabilities and price points. Prices are per 1,000 tokens. You can think of tokens as pieces of words, where 1,000 tokens is about 750 words. This paragraph is 35 tokens. GPT-4 With broad general knowledge and domain expertise, GPT-4 can follow complex instructions in natural language and solve difficult problems with accuracy. Learn more Model Prompt Completion 8K context $0.03 Â / 1K tokens $0.06 Â / 1K tokens 32K context $0.06 Â / 1K tokens $0.12 Â / 1K '

'tokens 8K context 32K context Chat ChatGPT models are optimized for dialogue. The performance of gpt-3.5-turbo is on par with Instruct Davinci. Learn more about ChatGPT Model Usage gpt-3.5-turbo $0.002 / 1K tokens gpt-3.5-turbo InstructGPT Instruct models are optimized to follow single-turn instructions. Ada is the fastest model, while Davinci is the most powerful. Learn more Ada Fastest $0.0004 / 1K tokens Babbage $0.0005 / 1K tokens Curie $0.0020 / 1K tokens Davinci Most powerful $0.0200 / 1K tokens Fine-tuning models Create your own custom models by fine-tuning our base models with your training data. Once you fine-tune a model, youâ\x80\x99ll be billed only for the tokens you use in requests to that model. Learn more about fine-tuning Model Training Usage Ada $0.0004 Â / 1K tokens $0.0016 Â / 1K tokens Babbage $0.0006 Â / 1K tokens $0.0024 Â / 1K tokens Curie $0.0030 Â / 1K tokens $0.0120 Â / 1K tokens Davinci $0.0300 Â / 1K tokens $0.1200 Â / 1K tokens Ada Babbage Curie Davinci '

'Embedding models Build advanced search, clustering, topic modeling, and classification functionality with our embeddings offering. Learn more about embeddings Model Usage Ada $0.0004 Â / 1K tokens Ada Other models Image models Build DALLÂ·E directly into your apps to generate and edit novel images and art. Our image models offer three tiers of resolution for flexibility. Learn more Resolution Price 1024Ã\x971024 $0.020 Â / image 512Ã\x97512 $0.018 Â / image 256Ã\x97256 $0.016 Â / image 1024Ã\x971024 512Ã\x97512 256Ã\x97256 Audio models Whisper can transcribe speech into text and translate many languages into English. Learn more about Whisper Model Usage Whisper $0.006 Â / minute (rounded to the nearest second) Whisper Usage quotas When you sign up, youâ\x80\x99ll be granted an initial spend limit, or quota, and weâ\x80\x99ll increase that limit over time as you build a track record with your application. If you need more tokens, you can always request a quota increase. Request quota i

'Start for free Start experimenting withÂ $5 in free credit that can be used during your first 3 months. Pay as you go To keep things simple and flexible, pay only for the resources you use. Choose your model Use the right model for the job. We offer a spectrum of capabilities and price points. Built with OpenAI View all customer stories Morgan Stanley Morgan Stanley wealth management deploys GPT-4 to organize its vast knowledge base. Stripe Stripe leverages GPT-4 to streamline user experience and combat fraud. Research Overview Index Product Overview GPT-4 DALLÂ·E 2 Customer stories Safety standards Pricing Safety Overview Company About Blog Careers Charter Security OpenAI Â© 2015â\x80\x8aâ\x80\x93â\x80\x8a2023 Terms & policies Privacy policy Social Twitter YouTube GitHub SoundCloud LinkedIn Back to top'

In [197]:
import os
from dotenv import load_dotenv
import asyncio
import nest_asyncio
import aiohttp
import json
from typing import NamedTuple

nest_asyncio.apply()

load_dotenv()

True

In [198]:
from typing import NamedTuple

class SummarizationResult(NamedTuple):
    num_tokens: int
    num_chars: int
    summary: str
    original_text: str

async def summarize_text_async(text: str, model: str = "babbage") -> SummarizationResult:
    """
    Summarize the given text asynchronously using the specified GPT-3 model.

    Args:
        text (str): The text to be summarized.
        model (str, optional): The name of the GPT-3 model to use. Default is "davinci".

    Returns:
        SummarizationResult: A named tuple containing the following values:
            - int: The number of tokens in the original text.
            - int: The number of characters in the original text.
            - str: The summarized text.
            - str: The original text.
    """
    url = f"https://api.openai.com/v1/engines/{model}/completions"
    api_key = os.environ["OPENAI_API_KEY"]
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "prompt": f"""
This is a summarization application. Given a text, it provides a summary of the text. 

###
Text to summerize:
```
{text}
```

Summery:""",
        "temperature": 0.8,
        "max_tokens": 30,
        "top_p": 1,
        "frequency_penalty": 1,
        "presence_penalty": 1
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=json.dumps(data)) as response:
            result = await response.json()
            try:
                num_tokens = result["usage"]["total_tokens"]
            except:
                print(f"Error in API response: {result}")
                return SummarizationResult(0, 0, "", text)
            try:
                summary = result["choices"][0]["text"].strip()
                return SummarizationResult(num_tokens, len(text), summary, text)
            except:
                print(f"Error in API response: {result}")
                return SummarizationResult(num_tokens, 0, "", text)

async def summarize_chunks(chunks: list[str], model: str = "babbage") -> list[SummarizationResult]:
    """
    Summarize a list of text chunks asynchronously using GPT-3.

    Args:
        chunks (list[str]): A list of text chunks to be summarized.

    Returns:
        list: A list of named tuples containing the following values for each input chunk:
            - int: The number of tokens in the original text.
            - int: The number of characters in the original text.
            - str: The summarized text.
            - str: The original text.
    """
    summaries = await asyncio.gather(*(summarize_text_async(chunk, model) for chunk in chunks))
    return summaries

In [199]:
results = await summarize_chunks(chunks, "babbage")
display(results)

total_tokens = sum(result.num_tokens for result in results)
display(total_tokens)

for result in results:
    display(result.summary)


[SummarizationResult(num_tokens=282, num_chars=997, summary='2019-01-16T10:00:01Z (GMT) Â _summarized_%3F \\.\\', original_text='Pricing Close Search Submit Skip to main content Site Navigation Research Overview Index Product Overview ChatGPT GPT-4 DALLÂ·E 2 Customer stories Safety standards Pricing Developers Overview Documentation API reference Examples Safety Company About Blog Careers Charter Security Search Menu Mobile Navigation Close Site Navigation Research Product Developers Safety Company Search Submit Pricing Simple and flexible. Only pay for what you use. Quick links Contact sales Learn more Language models Multiple models, each with different capabilities and price points. Prices are per 1,000 tokens. You can think of tokens as pieces of words, where 1,000 tokens is about 750 words. This paragraph is 35 tokens. GPT-4 With broad general knowledge and domain expertise, GPT-4 can follow complex instructions in natural language and solve difficult problems with accuracy. Learn

1240

'2019-01-16T10:00:01Z (GMT) Â _summarized_%3F \\.\\'

'the text of the summerized text. The function is provided as a wrapper over tokens, which renders it idempotent and thus can'

'A text summarizer. It provides a summary of the current text and describes it in one sentence or more, as per the input.'

'To minimize your costs, you can start with $5 free credit per month to use the service. You will be permitted an unlimited number of'

### 考察

- モデル
    - davinci, curie は高すぎる
        - 使ったら破産まっしぐら
        - ユーザーに払わせたら今度はユーザーが破産する
    - ada, babbabge は十分安いが弱すぎる
        - そのままでは使い物にならない
        - Fine Tuning 必須
    - ada が davinci の 1/50

- text -> chunk -> summery という枠組み
    - これはある程度うまく動きそう
        - テキスト長は 1000 くらいで良い感触
            - チューニングの余地はあると思うがとりあえずこれで十分
    - GPT-3.5 (Chat GPT Web UI) ならうまく動く


- Fine Tuning
    - 自分がチューニングしたモデルを他人に使わせることはできない
        - 従量課金でユーザーに払わせるのは？
            - API Rate Limit 問題
                - 7 ~ 10 日かかるらしいが拡張可能
                - Ada
                    - 毎分 350 k * 200 tokens
                    - おおよそ 30 リクエストを毎分捌ける

- 