## JSON parsing and processing

### JSON file creation

In [1]:
import json
import os
# Make sure the output directory for the sample files exists (no error if it is already there).
os.makedirs(name='data/json_files', exist_ok=True)

In [2]:
# Sample nested company record used throughout this notebook:
#   - "employees": list of employee dicts, each carrying a nested "projects" list
#   - "departments": mapping of department name -> summary dict
json_data = {
    "Company": "TechCorp",
    "employees": [
        {
            "employee_id": "E001",
            "name": "Amit Sharma",
            "designation": "Software Engineer",
            "skills": ["Python", "JavaScript", "React"],
            "email": "amit.sharma@company.com",
            "projects": [
                {"project_id": "P101", "project_name": "RAG System", "status": "In Progress"},
                {"project_id": "P102", "project_name": "Data Pipeline", "status": "Completed"},
            ],
        },
        {
            "employee_id": "E002",
            "name": "Neha Verma",
            "designation": "Data Analyst",
            "skills": ["Python", "SQL", "Machine Learning"],
            "email": "neha.verma@company.com",
            "projects": [
                {"project_id": "P201", "project_name": "Sales Forecasting", "status": "In Progress"},
                {"project_id": "P202", "project_name": "Analytics Dashboard", "status": "Planning"},
            ],
        },
    ],
    "departments": {
        "engineering": {"head": "Mike Johnson", "budget": 1000000, "team_size": 25},
        "data_science": {"head": "Sara Williams", "budget": 750000, "team_size": 15},
    },
}


In [3]:
# Bare last expression: the notebook renders the dict's repr as this cell's output.
json_data

{'Company': 'TechCorp',
 'employees': [{'employee_id': 'E001',
   'name': 'Amit Sharma',
   'designation': 'Software Engineer',
   'skills': ['Python', 'JavaScript', 'React'],
   'email': 'amit.sharma@company.com',
   'projects': [{'project_id': 'P101',
     'project_name': 'RAG System',
     'status': 'In Progress'},
    {'project_id': 'P102',
     'project_name': 'Data Pipeline',
     'status': 'Completed'}]},
  {'employee_id': 'E002',
   'name': 'Neha Verma',
   'designation': 'Data Analyst',
   'skills': ['Python', 'SQL', 'Machine Learning'],
   'email': 'neha.verma@company.com',
   'projects': [{'project_id': 'P201',
     'project_name': 'Sales Forecasting',
     'status': 'In Progress'},
    {'project_id': 'P202',
     'project_name': 'Analytics Dashboard',
     'status': 'Planning'}]}],
 'departments': {'engineering': {'head': 'Mike Johnson',
   'budget': 1000000,
   'team_size': 25},
  'data_science': {'head': 'Sara Williams',
   'budget': 750000,
   'team_size': 15}}}

In [4]:
# Persist the company record as pretty-printed JSON (2-space indentation).
with open('data/json_files/company_data.json', 'w') as json_file:
    json_file.write(json.dumps(json_data, indent=2))

In [6]:
# Sample event records to be stored in JSON Lines format (one JSON object per line).
json1_data = [
    {
        "timestamp": "2025-12-12",
        "event": "user_login",
        "user_id": "E001",
    },
    {
        "timestamp": "2025-12-12",
        "event": "page_view",
        "user_id": "E001",
        "page": "/home",
    },
    {
        "timestamp": "2025-12-12",
        "event": "purchase",
        "user_id": "E001",
        "amount": 99.99,
    },
]

# Each record is serialized on its own newline-terminated line.
with open('data/json_files/events.jsonl', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in json1_data)

### JSON Processing strategies

In [11]:
from langchain_community.document_loaders import JSONLoader
import json
from typing import List, Any, Dict
from langchain_core.documents import Document

In [10]:
# Method 1: JSONLoader with jq_schema
print("Method 1: JSONLoader - Extract specific fields.")

# The jq expression '.employees[]' selects every element of the top-level
# "employees" array, so each employee becomes its own document.
# text_content=False is passed because each selected element is a dict,
# not a plain string.
employee_loader = JSONLoader(
    file_path='data/json_files/company_data.json',
    jq_schema='.employees[]',
    text_content=False,
)
employee_docs = employee_loader.load()

# Report how many documents were produced, show the first one's content,
# then print the full list of Document objects.
print(f"Loaded {len(employee_docs)} employee documents.\n")
print(f"First employee document content:\n{employee_docs[0].page_content}\n")
print(employee_docs)

Method 1: JSONLoader - Extract specific fields.
Loaded 2 employee documents.

First employee document content:
{"employee_id": "E001", "name": "Amit Sharma", "designation": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "email": "amit.sharma@company.com", "projects": [{"project_id": "P101", "project_name": "RAG System", "status": "In Progress"}, {"project_id": "P102", "project_name": "Data Pipeline", "status": "Completed"}]}

[Document(metadata={'source': '/Users/neeladnatarajan/DSProjects/LLMOps/hw/RAGUdemy/0-DataIngestParsing/data/json_files/company_data.json', 'seq_num': 1}, page_content='{"employee_id": "E001", "name": "Amit Sharma", "designation": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "email": "amit.sharma@company.com", "projects": [{"project_id": "P101", "project_name": "RAG System", "status": "In Progress"}, {"project_id": "P102", "project_name": "Data Pipeline", "status": "Completed"}]}'), Document(metadata={'source': '/Users/neela

In [12]:
# Method 2: Custom parsing and processing (plain json module, no JSONLoader involved)
print("Method 2: JSONLoader - Custom parsing and processing.")

def smart_json_processor(file_path: str) -> List[Document]:
    """Flatten each employee record of a JSON file into a readable Document.

    The file is expected to hold a JSON object with an optional "employees"
    list. Every employee becomes one Document whose text lists the ID, name,
    designation, email, skills and projects, and whose metadata carries the
    identifying fields for later filtering.

    Args:
        file_path: Path to the JSON file to process. It is also stored as the
            'source' entry of every document's metadata.

    Returns:
        One Document per employee, in file order. Empty if the file has no
        "employees" key.

    Raises:
        KeyError: If an employee lacks 'employee_id', 'name', 'designation'
            or 'email', or a project lacks 'project_name' or 'status'.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which can mis-decode non-ASCII JSON text.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []

    # Strategy 1: Create documents for each employee with full context
    for emp in data.get('employees', []):
        # 'skills' and 'projects' are treated as optional: a missing key or a
        # JSON null both fall back to an empty list.
        skills = emp.get('skills') or []
        projects = emp.get('projects') or []

        lines = [
            f"Employee ID: {emp['employee_id']}",
            f"Name: {emp['name']}",
            f"Designation: {emp['designation']}",
            f"Email: {emp['email']}",
            f"Skills: {', '.join(skills)}",
            "Projects:",
        ]
        lines.extend(
            f"  - {proj['project_name']} (Status: {proj['status']})"
            for proj in projects
        )
        # Every line, including the last, is newline-terminated.
        content = "\n".join(lines) + "\n"

        documents.append(
            Document(
                page_content=content,
                metadata={
                    'source': file_path,
                    'data_type': 'employee_profile',
                    'employee_id': emp['employee_id'],
                    'employee_name': emp['name'],
                    'role': emp['designation'],
                },
            )
        )
    return documents

Method 2: JSONLoader - Custom parsing and processing.


In [13]:
# Run the custom processor on the company file and report how many documents it produced.
smart_json_docs = smart_json_processor('data/json_files/company_data.json')
print(f"Processed {len(smart_json_docs)} smart documents from JSON.\n")

# Show each document's text (numbered from 1) followed by its metadata as "key: value" lines.
for doc_number, smart_doc in enumerate(smart_json_docs, start=1):
    print(f"\nDocument {doc_number} content:\n{smart_doc.page_content}\n")
    for meta_key, meta_value in smart_doc.metadata.items():
        print(f"{meta_key}: {meta_value}")

Processed 2 smart documents from JSON.


Document 1 content:
Employee ID: E001
Name: Amit Sharma
Designation: Software Engineer
Email: amit.sharma@company.com
Skills: Python, JavaScript, React
Projects:
  - RAG System (Status: In Progress)
  - Data Pipeline (Status: Completed)


source: data/json_files/company_data.json
data_type: employee_profile
employee_id: E001
employee_name: Amit Sharma
role: Software Engineer

Document 2 content:
Employee ID: E002
Name: Neha Verma
Designation: Data Analyst
Email: neha.verma@company.com
Skills: Python, SQL, Machine Learning
Projects:
  - Sales Forecasting (Status: In Progress)
  - Analytics Dashboard (Status: Planning)


source: data/json_files/company_data.json
data_type: employee_profile
employee_id: E002
employee_name: Neha Verma
role: Data Analyst


In [18]:
# Homework: process the events JSONL file similarly, producing one Document per event log entry.
file_path = 'data/json_files/events.jsonl'

# JSON Lines holds one JSON object per line. Iterate the file lazily, decode it
# explicitly as UTF-8 (not the locale default), and skip blank lines, which
# json.loads would otherwise reject with a JSONDecodeError.
with open(file_path, 'r', encoding='utf-8') as f:
    event_data = [json.loads(line) for line in f if line.strip()]

# Fields that only some event types carry, paired with their display label.
optional_fields = (('page', 'Page'), ('amount', 'Amount'))

event_documents = []
for event in event_data:
    # Fields present on every event come first ...
    lines = [
        f"Timestamp: {event['timestamp']}",
        f"Event: {event['event']}",
        f"User ID: {event['user_id']}",
    ]
    # ... followed by whichever optional fields this event has.
    lines.extend(
        f"{label}: {event[field]}"
        for field, label in optional_fields
        if field in event
    )
    # Every line, including the last, is newline-terminated.
    content = "\n".join(lines) + "\n"

    event_documents.append(
        Document(
            page_content=content,
            metadata={
                'source': file_path,
                'data_type': 'event_log',
                'event_type': event['event'],
                'user_id': event['user_id'],
            },
        )
    )

print(f"\nProcessed {len(event_documents)} event log documents from JSONL.\n")

# Show each document's text (numbered from 1) followed by its metadata as "key: value" lines.
for doc_number, event_doc in enumerate(event_documents, start=1):
    print(f"\nDocument {doc_number} content:\n{event_doc.page_content}\n")
    for key, value in event_doc.metadata.items():
        print(f"{key}: {value}")


Processed 3 event log documents from JSONL.


Document 1 content:
Timestamp: 2025-12-12
Event: user_login
User ID: E001


source: data/json_files/events.jsonl
data_type: event_log
event_type: user_login
user_id: E001

Document 2 content:
Timestamp: 2025-12-12
Event: page_view
User ID: E001
Page: /home


source: data/json_files/events.jsonl
data_type: event_log
event_type: page_view
user_id: E001

Document 3 content:
Timestamp: 2025-12-12
Event: purchase
User ID: E001
Amount: 99.99


source: data/json_files/events.jsonl
data_type: event_log
event_type: purchase
user_id: E001
