### LlamaIndexの前準備

In [None]:
# パッケージのインストール
!pip install llama-index==0.10.39
!pip install llama-index-llms-gemini
!pip install llama-index-embeddings-huggingface

In [1]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
from tqdm.notebook import tqdm

load_dotenv()
GOOGLE_API_KEY=os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [2]:
import logging
import sys

# ログレベルの設定
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

In [3]:
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# LLMの準備
Settings.llm = Gemini(
    model_name="models/gemini-1.5-flash",
    safety_settings={
        "HARM_CATEGORY_HARASSMENT": "BLOCK_NONE",
        "HARM_CATEGORY_HATE_SPEECH": "BLOCK_NONE",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT" : "BLOCK_NONE",
        "HARM_CATEGORY_DANGEROUS_CONTENT" : "BLOCK_NONE"
    }
)

# 埋め込みモデルの準備
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-m3"
)

INFO:datasets:PyTorch version 2.5.1 available.
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-m3/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-m3/resolve/main/config_sentence_transformers.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-m3/resolve/main/README.md HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-m3/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-m3/resolve/main/sentence_bert_config.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-m3/resolve/main/config.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface

### Webページへの質問応答

In [4]:
# パッケージのインストール
!pip install llama-index-readers-web

Collecting llama-index-readers-web
  Downloading llama_index_readers_web-0.2.4-py3-none-any.whl.metadata (1.2 kB)
Collecting chromedriver-autoinstaller<0.7.0,>=0.6.3 (from llama-index-readers-web)
  Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl.metadata (2.1 kB)
Collecting html2text<2025.0.0,>=2024.2.26 (from llama-index-readers-web)
  Downloading html2text-2024.2.26.tar.gz (56 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting newspaper3k<0.3.0,>=0.2.8 (from llama-index-readers-web)
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting playwright<2.0,>=1.30 (from llama-index-readers-web)
  Downloading playwright-1.48.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (3.5 kB)
Collecting selenium<5.0.0,>=4.17.2 (from llama-index-readers-web)
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting spider-client<0.0.28,>=0.0.27 (from llama-index-readers-web)
  Downloading spider-client-0.0.27.tar.gz (

In [11]:
from llama_index.readers.web import BeautifulSoupWebReader

# データローダーの準備
reader = BeautifulSoupWebReader()

# ドキュメントの読み込み
documents = reader.load_data(urls=["https://deepmind.google/about/"])

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): deepmind.google:443
DEBUG:urllib3.connectionpool:https://deepmind.google:443 "GET /about/ HTTP/11" 200 None


In [15]:
print(documents)

[Document(id_='https://deepmind.google/about/', embedding=None, metadata={'URL': 'https://deepmind.google/about/'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='\n\n\n\n\nAbout - Google DeepMind\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Jump to Content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGoogle\n\n\n\n\nDeepMind\n\n\n\n\n\n\nSearch...\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\nClose\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGoogle\n\n\n\n\nDeepMind\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          About\n          \n          \n\n\n\n\n\n\n\n\n\n\n          Learn about Google DeepMind\n          \n          \n        \n\n — Our mission is to build AI responsibly to benefit humanity\n          \n\n\n\n\n\n\n\n          Responsibility & Safety\n          \n          \n        \n\n — We want AI to benefit the world, so we must be thoughtful about how it’s built and used\n          \n\n\n\n\n\n\n

In [12]:
from llama_index.core import VectorStoreIndex

# インデックスとクエリエンジンの準備
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: About - Google DeepMind























...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: Publications
          
          
        

 —...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: Episode 1
Unreasonably Effective AI with Demis ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: Imagen 3
Our highest quality text-to-image mode...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# 質問応答
response = query_engine.query("Google DeepMindの歴史について教えてください")
print(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.core.indices.utils:> Top 2 nodes:
> [Node 75f1a3dc-d660-4524-976a-afe560edbc2e] [Similarity score:             0.662953] About - Google DeepMind






























      Jump to Content
    















G...
> [Node 3fe0dc64-8069-493e-afa3-5f4f799010fc] [Similarity score:             0.657381] Episode 1
Unreasonably Effective AI with Demis Hassabis



              Watch on YouTube
       ...
Google DeepMindは、AIの分野で大きな進歩を遂げてきました。AlphaGoで囲碁の世界チャンピオンを破ったことで有名になり、その後もStarCraft IIでプロゲーマーを破るAlphaStar、タンパク質の3次元構造を予測するAlphaFoldなど、数々の画期的な成果を上げてきました。また、Google Assistantの音声合成モデルWaveNetや、AIによるプログラム作成システムAlphaCode、高速なソートアルゴリズムを発見したAlphaDevなど、様々な分野で革新的な技術を生み出しています。



### Youtube動画への質問応答

In [16]:
# パッケージのインストール
!pip install llama-hub-youtube-transcript
!pip install llama-index-readers-youtube-transcript

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-hub-youtube-transcript
  Downloading llama-hub-youtube-transcript-0.0.1.tar.gz (3.4 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting requests-html (from llama-hub-youtube-transcript)
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting pyquery (from requests-html->llama-hub-youtube-transcript)
  Downloading pyquery-2.0.1-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests-html->llama-hub-youtube-transcript)
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting parse (from requests-html->llama-hub-youtube-transcript)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting bs4 (from requests-html->llama-hub-youtube-transcript)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting w3lib (from requests-html->llama-hub-yo

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-index-readers-youtube-transcript
  Downloading llama_index_readers_youtube_transcript-0.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting youtube-transcript-api>=0.5.0 (from llama-index-readers-youtube-transcript)
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading llama_index_readers_youtube_transcript-0.2.0-py3-none-any.whl (3.6 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api, llama-index-readers-youtube-transcript
Successfully installed llama-index-readers-youtube-transcript-0.2.0 youtube-transcript-api-0.6.2
[0m

In [17]:
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader

# データローダーの準備
reader = YoutubeTranscriptReader()

# ドキュメントの読み込み
documents = reader.load_data(
    ytlinks=["https://www.youtube.com/watch?v=jV1vkHv4zq8"]
)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.youtube.com:443
DEBUG:urllib3.connectionpool:https://www.youtube.com:443 "GET /watch?v=jV1vkHv4zq8 HTTP/11" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.youtube.com:443
DEBUG:urllib3.connectionpool:https://www.youtube.com:443 "GET /api/timedtext?v=jV1vkHv4zq8&ei=EEswZ9_xE8efvcAPv6S42Ac&caps=asr&opi=112496729&exp=xbt&xoaf=5&hl=en&ip=0.0.0.0&ipbits=0&expire=1731243392&sparams=ip,ipbits,expire,v,ei,caps,opi,exp,xoaf&signature=CA06CB9F6A4E423F64AD2D707E98860586A01E96.506C6CDF4F5606815B2F963A8311DE12BD3DE2B7&key=yt8&lang=en HTTP/11" 200 None


In [18]:
from llama_index.core import VectorStoreIndex

# インデックスとクエリエンジンの準備
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: [soft music begins] [Sundar Pichai
speaking] Yo...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: [Lila Ibrahim speaking]
Safety and responsibili...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# 質問応答
response = query_engine.query("この動画で伝えたいことはなんですか？")
print(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.core.indices.utils:> Top 2 nodes:
> [Node 4327458b-2507-43ae-8755-5d059bc6ccf6] [Similarity score:             0.507344] [Lila Ibrahim speaking]
Safety and responsibility
has to be built-in from the beginning.
And at G...
> [Node f24cee32-f489-4711-838d-fde746410ff2] [Similarity score:             0.483356] [soft music begins] [Sundar Pichai
speaking] You know, one of the reasons
we got interested in AI...
Googleが開発した新しいAIモデル「Gemini」について説明しています。 
Geminiは、テキストだけでなく、コード、音声、画像、動画など、さまざまな種類の情報を理解し、処理できる画期的なモデルです。 
Googleは、Geminiが世界中のあらゆる人に役立つAIになることを目指しています。 
また、Geminiの開発にあたっては、安全面と倫理面にも十分に配慮していることを強調しています。 

