# JSON Parsing and Processing

In [12]:
!pip install jq
import json
import os
os.makedirs("data/json_files", exist_ok=True)

Collecting jq
  Downloading jq-1.10.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.0 kB)
Downloading jq-1.10.0-cp313-cp313-macosx_11_0_arm64.whl (425 kB)
Installing collected packages: jq
Successfully installed jq-1.10.0


In [3]:
# Sample company record used by the cells below: a company name, a list of
# employees (flat fields plus nested "skills" and "projects" lists), and a
# mapping of department name -> department details.
json_data = {
    "company": "TechCorp",
    "employees": [
        {
            "id": 1, "name": "Alex Thompson", "role": "Frontend Developer",
            "skills": ["Vue.js", "TypeScript", "CSS"],
            "projects": [
                {"name": "RAG System", "status": "In Progress"},
                {"name": "Data Pipeline", "status": "Completed"},
            ],
        },
        {
            "id": 2, "name": "Maria Rodriguez", "role": "ML Engineer",
            "skills": ["TensorFlow", "Deep Learning", "Docker"],
            "projects": [
                {"name": "ML Model", "status": "In Progress"},
                {"name": "Analytics Dashboard", "status": "Planning"},
            ],
        },
    ],
    "departments": {
        "engineering": {"head": "David Chen", "budget": 1000000, "team_size": 25},
        "data_science": {"head": "Emily Watson", "budget": 750000, "team_size": 15},
    },
}

In [4]:
# Echo the in-memory dict so its nesting can be inspected before it is written to disk.
json_data

{'company': 'TechCorp',
 'employees': [{'id': 1,
   'name': 'Alex Thompson',
   'role': 'Frontend Developer',
   'skills': ['Vue.js', 'TypeScript', 'CSS'],
   'projects': [{'name': 'RAG System', 'status': 'In Progress'},
    {'name': 'Data Pipeline', 'status': 'Completed'}]},
  {'id': 2,
   'name': 'Maria Rodriguez',
   'role': 'ML Engineer',
   'skills': ['TensorFlow', 'Deep Learning', 'Docker'],
   'projects': [{'name': 'ML Model', 'status': 'In Progress'},
    {'name': 'Analytics Dashboard', 'status': 'Planning'}]}],
 'departments': {'engineering': {'head': 'David Chen',
   'budget': 1000000,
   'team_size': 25},
  'data_science': {'head': 'Emily Watson', 'budget': 750000, 'team_size': 15}}}

In [None]:
# Persist the sample record as pretty-printed JSON (2-space indent).
# The explicit UTF-8 encoding keeps the file independent of the platform's default locale.
with open('data/json_files/company_data.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=2)

In [9]:
# Sample event log: one flat dict per event; fields beyond timestamp/event/user_id
# vary by event type ("page" for page views, "amount" for purchases).
jsonl_data = [
    {"timestamp": "2024-01-01", "event": "user_login", "user_id": 123},
    {"timestamp": "2024-01-01", "event": "page_view", "user_id": 123, "page": "/home"},
    {"timestamp": "2024-01-01", "event": "purchase", "user_id": 123, "amount": 99.99}
]

# JSON Lines layout: each record is serialized on its own newline-terminated line.
# The explicit UTF-8 encoding keeps the file independent of the platform's default locale.
with open('data/json_files/events.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item) + '\n' for item in jsonl_data)


## JSON Processing Strategies

In [16]:
from langchain_community.document_loaders import JSONLoader
import json
import jq

# Method 1: JSONLoader driven by a jq schema.
# The jq query ".employees[]" selects each element of the top-level "employees"
# array; text_content=False keeps every match as a full JSON object.
employee_loader = JSONLoader(
    jq_schema=".employees[]",
    text_content=False,
    file_path="data/json_files/company_data.json",
)
employee_docs = employee_loader.load()

# Report how many documents were produced and preview the first one (first 500 chars).
print(f"Loader {len(employee_docs)}")
print(f"First emp. {employee_docs[0].page_content[:500]}")

Loader 2
First emp. {"id": 1, "name": "Alex Thompson", "role": "Frontend Developer", "skills": ["Vue.js", "TypeScript", "CSS"], "projects": [{"name": "RAG System", "status": "In Progress"}, {"name": "Data Pipeline", "status": "Completed"}]}


## Custom JSON Loader

In [None]:
from typing import List
from langchain_core.documents import Document
import json

print("Custom Json Loader")

def process_json_custom(file_path: str) -> List[Document]:
    """Turn each employee record in a company JSON file into a readable Document.

    Args:
        file_path: Path to a JSON file whose top-level object may contain an
            "employees" list. A missing or null "employees" entry yields an
            empty result.

    Returns:
        One Document per employee. ``page_content`` is a plain-text profile
        (name, role, skills, projects); ``metadata`` records the source path,
        a fixed ``data_type`` of 'employee_profile', and the employee's id,
        name and role.

    Raises:
        KeyError: If an employee lacks "id", "name" or "role", or a project
            lacks "name" or "status".
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []
    # Method 2: hand-built, flattened text profile per employee.
    for emp in data.get('employees') or []:
        # "skills" and "projects" are both optional lists; a missing or null
        # value is treated as empty so the two fields behave consistently.
        skills = ', '.join(str(skill) for skill in (emp.get('skills') or []))
        lines = [
            "Employee Profile:",
            f"Name: {emp['name']}",
            f"Role: {emp['role']}",
            f"Skills: {skills}",
            "",
            "Projects:",
        ]
        lines.extend(
            f" - {proj['name']} (Status: {proj['status']})"
            for proj in (emp.get('projects') or [])
        )

        doc = Document(
            # Joining once avoids repeated string concatenation inside the loop.
            page_content="\n".join(lines),
            metadata={
                'source': file_path,
                'data_type': 'employee_profile',
                'employee_id': emp['id'],
                'employee_name': emp['name'],
                'role': emp['role']
            }
        )

        documents.append(doc)

    return documents
    

Custom Json Loader


In [None]:
# Build one Document per employee from the saved company file.
# NOTE(review): despite its name, json_file holds the list returned by
# process_json_custom, not a file handle or path.
json_file = process_json_custom('data/json_files/company_data.json')
# Printing a Document shows its page_content followed by its metadata.
print(json_file[0])

page_content='Employee Profile:
Name: Alex Thompson
Role: Frontend Developer
Skills: Vue.js, TypeScript, CSS

Projects:
 - RAG System (Status: In Progress)
 - Data Pipeline (Status: Completed)' metadata={'source': 'data/json_files/company_data.json', 'data_type': 'employee_profile', 'employee_id': 1, 'employee_name': 'Alex Thompson', 'role': 'Frontend Developer'}
