### Ingestion & Parsing of JSON

In [1]:
import json
import os

# Make sure the folder that will hold the sample JSON files exists;
# exist_ok=True turns a re-run of this cell into a no-op instead of an error.
os.makedirs(name="data/json_docs", exist_ok=True)

In [3]:
# In-memory sample record: one company holding a list of employees and a list
# of projects.
# NOTE: the "remote" flag is stored as the strings "true"/"false", not as
# Python booleans.
sample_data = dict(
    company="TechNova Solutions",
    employees=[
        dict(
            id=101,
            name="Alice Johnson",
            role="Software Engineer",
            skills=["Python", "JavaScript", "SQL"],
            remote="true",
        ),
        dict(
            id=102,
            name="Rajesh Kumar",
            role="Data Analyst",
            skills=["Excel", "R", "Tableau"],
            remote="false",
        ),
        dict(
            id=103,
            name="Maria Lopez",
            role="Project Manager",
            skills=["Agile", "Scrum", "Communication"],
            remote="true",
        ),
    ],
    projects=[
        dict(
            project_id="P-001",
            name="AI Chatbot",
            status="In Progress",
            budget_usd=50000,
        ),
        dict(
            project_id="P-002",
            name="E-commerce Dashboard",
            status="Completed",
            budget_usd=75000,
        ),
    ],
)

# Bare last expression: the notebook renders the structure as the cell output.
sample_data


{'company': 'TechNova Solutions',
 'employees': [{'id': 101,
   'name': 'Alice Johnson',
   'role': 'Software Engineer',
   'skills': ['Python', 'JavaScript', 'SQL'],
   'remote': 'true'},
  {'id': 102,
   'name': 'Rajesh Kumar',
   'role': 'Data Analyst',
   'skills': ['Excel', 'R', 'Tableau'],
   'remote': 'false'},
  {'id': 103,
   'name': 'Maria Lopez',
   'role': 'Project Manager',
   'skills': ['Agile', 'Scrum', 'Communication'],
   'remote': 'true'}],
 'projects': [{'project_id': 'P-001',
   'name': 'AI Chatbot',
   'status': 'In Progress',
   'budget_usd': 50000},
  {'project_id': 'P-002',
   'name': 'E-commerce Dashboard',
   'status': 'Completed',
   'budget_usd': 75000}]}

In [5]:
# Persist the sample record as pretty-printed (2-space indented) JSON.
# The encoding is stated explicitly so the file is written as UTF-8 on every
# platform instead of depending on the locale's default text encoding.
with open("data/json_docs/sample.json", "w", encoding="utf-8") as file:
    json.dump(sample_data, file, indent=2)

In [16]:
#Save JSON Lines format

# Three sample event records; they share timestamp/event/user_id and two of
# them carry one extra field ("page" or "amount").
jsonl_data=[
    {"timestamp":"2024-01-01", "event":"user_login", "user_id": 123},
    {"timestamp":"2024-01-01", "event":"page_view", "user_id": 123, "page": "/home"},
    {"timestamp":"2024-01-01", "event":"purchase", "user_id": 123, "amount": 90},
]

# Ensure the target folder exists so this cell can be re-run on its own;
# exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs("data/json_docs", exist_ok=True)

# JSON Lines layout: one compact JSON object per line, each terminated by "\n".
# The encoding is explicit so the file is UTF-8 regardless of the platform's
# locale default.
with open("data/json_docs/events.jsonl", "w", encoding="utf-8") as file:
    file.writelines(json.dumps(item) + "\n" for item in jsonl_data)

In [3]:
# Custom JSON Processing

from langchain_community.document_loaders import JSONLoader
import json

# Point a JSONLoader at the sample file. The jq expression '.employees[]'
# selects each element of the top-level "employees" array, so every employee
# becomes its own document.
employee_loader = JSONLoader(
    'data/json_docs/sample.json',
    '.employees[]',
    text_content=False,  # per the original note: get full JSON objects rather than plain text
)

# Run the loader, then report how many documents came back and preview the
# first one (at most the first 200 characters of its page_content).
employee_docs = employee_loader.load()
print(f"Loaded {len(employee_docs)} employee documents")
print(f"First employee: {employee_docs[0].page_content[:200]}")

Loaded 3 employee documents
First employee: {"id": 101, "name": "Alice Johnson", "role": "Software Engineer", "skills": ["Python", "JavaScript", "SQL"], "remote": "true"}
