# Data Parsing
First of all we will understand how to parse data.
For example: You have csv data, and want to use it in your agents systems, but how ?



In [13]:
# Import necessary libraries

import pandas as pd
import langchain
import os
import sys



In [14]:
os.makedirs("data/strcutures_files", exist_ok=True) # Create folder

In [15]:
# Create simple Data

data = {
    "Product" : ['Laptop', 'Mouse', 'Keyboard'],
    "Category" : ['Electronics', 'Accessories','Accessories'],
    "Price" : [1000, 30, 80],
    'Stock' : [50, 200, 150],
    'Description' : [
        'Good Laptop with GPU',
        'Wireless mouth',
        'Wireless keyboard with BGR'
    ]
}


df = pd.DataFrame(data) # Making a frame of data
df.to_csv('data/strcutures_files/products.csv', index = False) # Create csv file from frame


In [16]:
# Save as Excel with multiple sheets
with pd.ExcelWriter('data/strcutures_files/inventory.xlsx') as writer:
    df.to_excel(writer, sheet_name='Products', index = False)


    # Add another sheet

    data_sum = {
        'Category' : ['Electronics', 'Accessories'],
        'Total_Items' : [1, 2],
        'Total_Value' : [1000, 110]
    }


    pd.DataFrame(data_sum).to_excel(writer, sheet_name='Summary', index=False)

## CSV Processing

In [17]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredCSVLoader

### Row-Based CSV

In [18]:
# Method 1: CSVLoader - Each row start to be a document

print("CSVLoader - Row-based Documents")
csv_loader = CSVLoader(
    file_path = 'data/strcutures_files/products.csv',
    encoding= 'utf-8',
    csv_args={
        'delimiter' : ',',
        'quotechar' : '"',
    }
)

csv_docs = csv_loader.load()
print(len(csv_docs))
print("First doc")
print(csv_docs[0].page_content)
print(csv_docs[0].metadata)


CSVLoader - Row-based Documents
3
First doc
Product: Laptop
Category: Electronics
Price: 1000
Stock: 50
Description: Good Laptop with GPU
{'source': 'data/strcutures_files/products.csv', 'row': 0}


### Custom CSV Loader


In [19]:
# Method 2 - custom csv processing, we can make our own parser
from typing import List
from langchain_core.documents import Document
print("Customization")

def process_csv(file_path : str)  -> List[Document]:
    """Process csv with custom parser"""
    documents = []
    df = pd.read_csv(file_path)
    for idx, row in df.iterrows():

        content = f"""Product Info:
        Name : {row['Product']}
        Category : {row['Category']}
        Price: ${row['Price']}
        Stock: {row['Stock']} units
        Description: {row['Description']}"""


        # Create doc with metadata

        doc = Document(page_content=content,
                       metadata = {
                           'source' : file_path,
                           'row_index' : idx,
                           'product_name' : row['Product'],
                           'category' : row['Category'],
                           'price' : row['Price'],
                           'data_type' : 'product_info'
                       })
        
        documents.append(doc)
    return documents
        





Customization


In [20]:
documents = process_csv('data/strcutures_files/products.csv')
print(documents[0])


page_content='Product Info:
        Name : Laptop
        Category : Electronics
        Price: $1000
        Stock: 50 units
        Description: Good Laptop with GPU' metadata={'source': 'data/strcutures_files/products.csv', 'row_index': 0, 'product_name': 'Laptop', 'category': 'Electronics', 'price': 1000, 'data_type': 'product_info'}


So row-based very fast and simple, but it losses a lot of table context.
Custom loader, can creates summaries, have rich metadata, and very good for Q&A(question and answering)


### Excel Processing

In [21]:
print("Pandas-based Excel Processing")
def process_excel(file_path : str) -> List[Document]:
    """Process Excell with sheet awreness"""

    documents = []

    excel_file = pd.ExcelFile(file_path)

    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)


        sheet_content = f"Sheet: {sheet_name}\n"
        sheet_content += f"Columns: {', '.join(df.columns)}\n"
        sheet_content += f"Rows: {len(df)}\n\n"
        sheet_content += df.to_string(index=False)


        doc = Document(page_content=sheet_content,
                    metedata = {
                        'source' : file_path,
                        'sheet_name' : sheet_name,
                        'num_rows' : len(df),
                        'num_columns' : len(df.columns),
                        'data_type' : 'excel_sheet'
                    })
        documents.append(doc)
    return documents


    



Pandas-based Excel Processing


In [22]:
excel_docs = process_excel('data/strcutures_files/inventory.xlsx')
print(f"Good, {len(excel_docs)} sheets")

print(excel_docs[0])
print(excel_docs[1])

Good, 2 sheets
page_content='Sheet: Products
Columns: Product, Category, Price, Stock, Description
Rows: 3

 Product    Category  Price  Stock                Description
  Laptop Electronics   1000     50       Good Laptop with GPU
   Mouse Accessories     30    200             Wireless mouth
Keyboard Accessories     80    150 Wireless keyboard with BGR'
page_content='Sheet: Summary
Columns: Category, Total_Items, Total_Value
Rows: 2

   Category  Total_Items  Total_Value
Electronics            1         1000
Accessories            2          110'


### Custom Excel Processing

In [23]:
!pip install unstructured
!pip install msoffcrypto-tool



In [24]:

from langchain_community.document_loaders import UnstructuredExcelLoader

print("Unstructrured Excel Loader")

try:
    excel_loader = UnstructuredExcelLoader(
        'data/strcutures_files/inventory.xlsx',
        mode = 'elements'
    )
    unstructured_docs = excel_loader.load()
except Exception as e:
    print(f"Error {e}")

Unstructrured Excel Loader


In [25]:
unstructured_docs

[Document(metadata={'source': 'data/strcutures_files/inventory.xlsx', 'file_directory': 'data/strcutures_files', 'filename': 'inventory.xlsx', 'last_modified': '2025-10-02T15:53:56', 'page_name': 'Products', 'page_number': 1, 'text_as_html': '<table><tr><td>Product</td><td>Category</td><td>Price</td><td>Stock</td><td>Description</td></tr><tr><td>Laptop</td><td>Electronics</td><td>1000</td><td>50</td><td>Good Laptop with GPU</td></tr><tr><td>Mouse</td><td>Accessories</td><td>30</td><td>200</td><td>Wireless mouth</td></tr><tr><td>Keyboard</td><td>Accessories</td><td>80</td><td>150</td><td>Wireless keyboard with BGR</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table', 'element_id': 'de0905c71a202a0d991a0bcceed97d14'}, page_content='Product Category Price Stock Description Laptop Electronics 1000 50 Good Laptop with GPU Mouse Accessories 30 200 Wireless mouth Keyboard Accessories 80 150 Wireless keyb