In [None]:
pip install langchain

In [33]:
import json
import os

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration: chunks of at most 512 units with a 50-unit overlap,
# where size is measured by len() (characters for str input). No `separators`
# argument is given, so the splitter's own defaults are used.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# Load the dataset. The loop below treats it as a mapping whose values are
# None, a list of text fragments, or a single text value.
with open(r"/content/data.json", 'r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

chunks = []

for source_key, raw_value in dataset.items():
    # Entries without any content contribute no chunks.
    if raw_value is None:
        continue

    # A list of fragments is flattened into one space-separated string;
    # anything else is handed to the splitter as-is.
    text = " ".join(raw_value) if isinstance(raw_value, list) else raw_value

    # Every chunk produced from this entry is tagged with the key it came from.
    chunks.extend(
        splitter.create_documents([text], metadatas=[{'source': source_key}])
    )

print(len(chunks))

2729


In [34]:
# Sanity check: display the first chunk to eyeball its text and `source` metadata.
chunks[0]

Document(metadata={'source': 'https://en.wikipedia.org/wiki/Pittsburgh'}, page_content='Second-most populous city in Pennsylvania, U.S.\nThis article is about the city in Pennsylvania. For the region, see\nGreater Pittsburgh\n. For other uses, see\nPittsburgh (disambiguation)\n.\nNot to be confused with\nPittsburg\n.\nCity in Pennsylvania, United States\nPittsburgh\nDionde:gâ\n(\nSeneca\n)\nCity\nDowntown Pittsburgh\nDuquesne Incline\nPhipps Conservatory and Botanical Gardens\nLawrenceville\nPNC Park\nCathedral of Learning\nCarnegie Museums of Pittsburgh\nFlag\nSeal\nCoat of arms\nNickname(s):')

In [35]:
# Display the second chunk. In the recorded output its text starts with the tail
# of chunks[0] ("Flag / Seal / Coat of arms / Nickname(s):"), which is consistent
# with the configured chunk overlap.
chunks[1]

Document(metadata={'source': 'https://en.wikipedia.org/wiki/Pittsburgh'}, page_content='Flag\nSeal\nCoat of arms\nNickname(s):\nCity of Bridges, Steel City,\nCity of Champions, The \'Burgh, The Paris of Appalachia\nMotto:\nBenigno Numine\n("With the benevolent deity")\nInteractive map of Pittsburgh\nPittsburgh\nShow map of Pennsylvania\nPittsburgh\nShow map of the United States\nCoordinates:\n40°26′23″N\n79°58′35″W\n\ufeff / \ufeff\n40.43972°N 79.97639°W\n\ufeff /\n40.43972; -79.97639\nCountry\nUnited States\nState\nPennsylvania\nCounty\nAllegheny\nFounded\nNovember\xa027, 1758\n; 266 years ago\n(\n1758-11-27\n)\n(fort)\nMunicipal incorporation')

In [52]:
# Spot-check a chunk further into the list (index is specific to this dataset).
# NOTE(review): in the recorded output this chunk's page_content is identical to
# chunks[0] even though its `source` is a different URL — the input data may
# contain duplicated page text under several keys; verify upstream.
chunks[222]

Document(metadata={'source': 'https://en.wikipedia.org/wiki/History_of_Pittsburgh'}, page_content='Second-most populous city in Pennsylvania, U.S.\nThis article is about the city in Pennsylvania. For the region, see\nGreater Pittsburgh\n. For other uses, see\nPittsburgh (disambiguation)\n.\nNot to be confused with\nPittsburg\n.\nCity in Pennsylvania, United States\nPittsburgh\nDionde:gâ\n(\nSeneca\n)\nCity\nDowntown Pittsburgh\nDuquesne Incline\nPhipps Conservatory and Botanical Gardens\nLawrenceville\nPNC Park\nCathedral of Learning\nCarnegie Museums of Pittsburgh\nFlag\nSeal\nCoat of arms\nNickname(s):')

In [55]:
# Flatten each chunk into a plain dict (text + metadata) so it can be written as JSON.
data_to_save = [
    {"page_content": chunk.page_content, "metadata": chunk.metadata}
    for chunk in chunks
]

output_file_path = 'chunks.json'
# Serialise into a sibling temp file and rename it into place afterwards.
# Opening the destination directly with 'w' truncates it immediately, so a
# failure during json.dump would leave a corrupt/empty chunks.json behind
# (and destroy any previous good copy).
tmp_file_path = output_file_path + '.tmp'

try:
    with open(tmp_file_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data_to_save, f, indent=2, ensure_ascii=False)

    # Swap the finished file into place in a single step.
    os.replace(tmp_file_path, output_file_path)

    print(f"\n✅ Successfully saved {len(data_to_save)} chunks to '{output_file_path}'")

except Exception as e:
    # Best-effort cleanup of the partial temp file; the original error is
    # what gets reported, so a failed cleanup is deliberately ignored.
    try:
        os.remove(tmp_file_path)
    except OSError:
        pass
    print(f"\n[!] Error saving JSON file: {e}")


✅ Successfully saved 2729 chunks to 'chunks.json'
