In [1]:
# Install a headless Java 8 JDK; JAVA_HOME is pointed at this install before PySpark is imported.
# -qq plus the redirect to /dev/null keep the cell output empty on success.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
!pip install -q pyspark sentence-transformers chromadb datasets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m135.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m103.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m144.8 MB/s[0m eta [36m0

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, size, lit
from pyspark.sql.types import ArrayType, StringType
from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer

In [4]:
# Pipeline knobs: number of articles to ingest, and the character window /
# overlap used when slicing article text into chunks.
NUM_ARTICLES_TO_PROCESS = 1000
CHUNK_SIZE = 500
OVERLAP = 50

# Local Spark session using every available core, with a 4 GB driver heap.
spark = (
    SparkSession.builder
    .appName("WikiETL")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Open the dataset in streaming mode so records are iterated lazily
# (the result is an IterableDataset, not a materialised table).
dataset = load_dataset(
    path="omarkamali/wikipedia-monthly",
    name="latest.en",
    split="train",
    streaming=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/1416 [00:00<?, ?it/s]

In [23]:
# Display the streaming dataset's repr (feature names and shard count) as a quick schema check.
dataset

IterableDataset({
    features: ['id', 'url', 'title', 'text'],
    num_shards: 1416
})

In [27]:
# Pull a single record from the stream to sanity-check the schema and content.
first_article = next(iter(dataset))

# Read the fields defensively: a record without 'text' must reach the
# "Error no text found" branch below instead of raising KeyError in the preview.
article_text = first_article.get('text') or ""

print(f"Data Columns: {first_article.keys()}")

print(f"\n Document Title: {first_article.get('title')}")
print(f"Document Link: {first_article.get('url')}")
print("-" * 30)
# The slice below is by characters, not words, so the label says so.
print("Document Preview 500 characters")
print(article_text[:500])
print("-" * 30)

# Check that the data is valid: non-empty text, and no obvious HTML / wiki
# template markup in the first 100 characters (only that prefix is scanned).
if article_text:
    print("\nVerified")
    text_head = article_text[:100]
    if "<div" not in text_head and "{{" not in text_head:
        print("Formating Pass")
    else:
        print("Warning, there might be some html tags")
else:
    print("\nError no text found")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 1ad29848-866f-4a08-bb3e-af71da0e9eb0)')' thrown while requesting GET https://huggingface.co/datasets/omarkamali/wikipedia-monthly/resolve/0b4526c8ceba7353701711134375593aedcab43f/20251001/en/train/train_part_000.parquet
Retrying in 1s [Retry 1/5].


Data Columns: dict_keys(['id', 'url', 'title', 'text'])

 Document Title: Anarchism
Document Link: https://en.wikipedia.org/wiki/Anarchism
------------------------------
Document Preview 500 words
Anarchism is a political philosophy and movement that seeks to abolish all institutions that perpetuate authority, coercion, or hierarchy, primarily targeting the state and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. A historically left-wing movement, anarchism is usually described as the libertarian wing of the socialist movement (libertarian socialism).

Although traces of anarchist ideas are found all throughout hi
------------------------------

Verified
Formating Pass


In [8]:
# Collect streamed articles whose text is longer than 500 characters until the
# configured quota is reached, then load them into a Spark DataFrame.
data_buffer = []
# Use the config-cell constant as the single source of truth; previously a
# separate hard-coded 1000 here meant changing NUM_ARTICLES_TO_PROCESS did nothing.
target_count = NUM_ARTICLES_TO_PROCESS

for doc in dataset:
    article_text = doc.get('text')
    if article_text and len(article_text) > 500:
        data_buffer.append((doc['id'], doc['title'], article_text))

    # Checked after every record (kept or not) so the stream is abandoned as
    # soon as enough articles have been buffered.
    if len(data_buffer) >= target_count:
        break

# Kept for backward compatibility with cells that read `count`.
count = len(data_buffer)

df = spark.createDataFrame(data_buffer, ["id", "title", "text"])

def chunk_text(text, chunk_size=None, overlap=None, min_chunk_len=100):
    """Split ``text`` into overlapping fixed-width character windows.

    Windows start every ``chunk_size - overlap`` characters and are at most
    ``chunk_size`` characters long. Windows of ``min_chunk_len`` characters or
    fewer (typically the tail of the text) are dropped.

    Args:
        text: String to split. ``None`` or an empty string yields ``[]``.
        chunk_size: Window width in characters. Defaults to the notebook-level
            ``CHUNK_SIZE`` when omitted.
        overlap: Characters shared by consecutive windows. Defaults to the
            notebook-level ``OVERLAP`` when omitted.
        min_chunk_len: A window is kept only if it is strictly longer than this.

    Returns:
        List of chunk strings, in order of appearance in ``text``.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``; the
            stride would be zero or negative and no progress could be made.
    """
    if not text:
        return []

    # Resolve the globals lazily so callers (and tests) can pass explicit values.
    window = CHUNK_SIZE if chunk_size is None else chunk_size
    shared = OVERLAP if overlap is None else overlap

    stride = window - shared
    if stride <= 0:
        # A negative stride would make range() empty and silently drop the
        # whole document; fail loudly instead.
        raise ValueError(
            f"overlap ({shared}) must be smaller than chunk_size ({window})"
        )

    pieces = (text[start:start + window] for start in range(0, len(text), stride))
    return [piece for piece in pieces if len(piece) > min_chunk_len]

# Wrap the chunker as a Spark UDF that returns an array of strings.
chunk_udf = udf(f=chunk_text, returnType=ArrayType(StringType()))

In [9]:
# Chunk every article, then explode the array so each chunk becomes its own row
# carrying the parent article's id and title.
processed_df = (
    df.withColumn("chunks", chunk_udf(col("text")))
      .select("id", "title", explode("chunks").alias("chunk_text"))
)

In [14]:
# Persist the chunk table as Parquet, replacing whatever is already at the path.
output_path = "/content/wiki_processed.parquet"
processed_df.write.parquet(output_path, mode="overwrite")

In [15]:
# Preview three chunk rows (cell text clipped to 50 characters), then shut the
# Spark session down.
processed_df.show(n=3, truncate=50)
spark.stop()

+---+---------+--------------------------------------------------+
| id|    title|                                        chunk_text|
+---+---------+--------------------------------------------------+
| 12|Anarchism|Anarchism is a political philosophy and movemen...|
| 12|Anarchism|ces of anarchist ideas are found all throughout...|
| 12|Anarchism|is Commune, the Russian Civil War and the Spani...|
+---+---------+--------------------------------------------------+
only showing top 3 rows



In [19]:
# Re-read the written Parquet with pandas and report how many chunk rows and
# how many distinct article titles it contains.
df_check = pd.read_parquet("/content/wiki_processed.parquet")
total_chuncks = df_check.shape[0]
num_title = len(pd.unique(df_check['title']))
print(f"Total Chunk: {total_chuncks}")
print(f"Total Topics: {num_title}")

Total Chunk: 58965
Total Topics: 1000


In [16]:
# Load the sentence encoder. device=None lets sentence-transformers choose the
# device itself (GPU when one is usable, otherwise CPU) instead of crashing on
# runtimes without CUDA, which a hard-coded 'cuda' would do.
model = SentenceTransformer('all-MiniLM-L6-v2', device=None)

# On-disk Chroma store for the chunk embeddings.
chroma_db_path = "/content/chroma_db_wiki"
client = chromadb.PersistentClient(path=chroma_db_path)

# Start from an empty collection so re-running this cell does not mix runs.
try:
    client.delete_collection(name="wiki_knowledge")
except Exception:
    # Best-effort: on a first run there is nothing to delete.
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    # NOTE(review): the exact "collection not found" exception class depends on
    # the installed chromadb version — narrow this once the version is pinned.
    pass

collection = client.create_collection(name="wiki_knowledge")

# Chunk table produced by the Spark stage.
df_chunks = pd.read_parquet("/content/wiki_processed.parquet")
total_records = len(df_chunks)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# Embed the chunks in batches and add them to the Chroma collection.
BATCH_SIZE = 256

for batch_start in range(0, total_records, BATCH_SIZE):
    batch = df_chunks.iloc[batch_start:batch_start + BATCH_SIZE]

    documents = batch['chunk_text'].tolist()

    # One metadata dict per chunk: the parent article's id and title.
    metadatas = batch[['id', 'title']].to_dict(orient='records')

    # Record id = "<article id>_<DataFrame index label>". Zipping the column
    # with the index yields the same strings as iterrows() without building a
    # Series object for every row.
    ids = [
        f"{article_id}_{row_label}"
        for article_id, row_label in zip(batch['id'], batch.index)
    ]

    embeddings = model.encode(documents, convert_to_tensor=False).tolist()

    collection.add(
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )

    # Report whenever this batch crosses a multiple of 1000 records, and always
    # after the final batch so the log ends at total_records.
    processed = min(batch_start + BATCH_SIZE, total_records)
    crossed_thousand = (batch_start + BATCH_SIZE) // 1000 > batch_start // 1000
    if crossed_thousand or processed == total_records:
        print(f"Progress: {processed} / {total_records}")


# Smoke-test retrieval with a sample question.
test_query = "What is the definition of machine learning?"

# Embed the query with the same SentenceTransformer that produced the stored
# vectors. Passing query_texts instead would make Chroma embed the query with
# the collection's own default embedding function (a separate model download),
# which is not guaranteed to match the vectors added above.
query_embedding = model.encode([test_query], convert_to_tensor=False).tolist()
results = collection.query(
    query_embeddings=query_embedding,
    n_results=2
)

# Results are nested per query; index 0 is the single query issued above.
hits = zip(results['documents'][0], results['metadatas'][0])
for rank, (doc, meta) in enumerate(hits, start=1):
    print(f"結果 {rank} (來自: {meta['title']}):")
    print(f"內容: {doc}\n")

Progress: 1024 / 58965
Progress: 2048 / 58965
Progress: 3072 / 58965
Progress: 4096 / 58965
Progress: 5120 / 58965
Progress: 6144 / 58965
Progress: 7168 / 58965
Progress: 8192 / 58965
Progress: 9216 / 58965
Progress: 10240 / 58965
Progress: 11008 / 58965
Progress: 12032 / 58965
Progress: 13056 / 58965
Progress: 14080 / 58965
Progress: 15104 / 58965
Progress: 16128 / 58965
Progress: 17152 / 58965
Progress: 18176 / 58965
Progress: 19200 / 58965
Progress: 20224 / 58965
Progress: 21248 / 58965
Progress: 22016 / 58965
Progress: 23040 / 58965
Progress: 24064 / 58965
Progress: 25088 / 58965
Progress: 26112 / 58965
Progress: 27136 / 58965
Progress: 28160 / 58965
Progress: 29184 / 58965
Progress: 30208 / 58965
Progress: 31232 / 58965
Progress: 32000 / 58965
Progress: 33024 / 58965
Progress: 34048 / 58965
Progress: 35072 / 58965
Progress: 36096 / 58965
Progress: 37120 / 58965
Progress: 38144 / 58965
Progress: 39168 / 58965
Progress: 40192 / 58965
Progress: 41216 / 58965
Progress: 42240 / 58965
P

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:07<00:00, 10.8MiB/s]


結果 1 (來自: Artificial intelligence):
內容: ectly if no one knows how exactly it works. There have been many cases where a machine learning program passed rigorous tests, but nevertheless learned something different than what the programmers intended. For example, a system that could identify skin diseases better than medical professionals was found to actually have a strong tendency to classify images with a ruler as "cancerous", because pictures of malignancies typically include a ruler to show the scale. Another machine learning system

結果 2 (來自: Artificial intelligence):
內容: acting agents and is used in AI programs that make decisions that involve other agents.Game theory and multi-agent decision theory: .

 Learning 
Machine learning is the study of programs that can improve their performance on a given task automatically.Learning: , , ,  It has been a part of AI from the beginning.
upright=1.4|thumb|In supervised learning, the training data is labelled with the expected answers, whil

In [26]:
import shutil
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Folder to archive, and the archive's base path (make_archive appends the extension)
source_folder = "/content/chroma_db_wiki"
output_filename = "/content/drive/MyDrive/wiki_vector_db_backup"

# 3. Build the archive (this creates a .zip file at the top level of MyDrive)
print("正在壓縮並上傳至 Google Drive，請稍候...")
shutil.make_archive(base_name=output_filename, format='zip', root_dir=source_folder)

print("✅ 備份完成！請到 Google Drive 確認是否有 'wiki_vector_db_backup.zip'")

Mounted at /content/drive
正在壓縮並上傳至 Google Drive，請稍候...
✅ 備份完成！請到 Google Drive 確認是否有 'wiki_vector_db_backup.zip'
