## CSV and Excel Parsing  - Structured Data

## Loading CSV and Excel samples

In [2]:
import pandas as pd
import os

In [3]:
os.makedirs('data/structured_files', exist_ok=True)


In [4]:
# Create sample data frame
data = {
    'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
    'Price': [999.99, 29.99, 79.99, 299.99, 89.99],
    'Stock': [50, 200, 150, 75, 100],
    'Description': [
        'A high-performance laptop suitable for all your computing needs.',
        'A wireless mouse with ergonomic design.',
        'A mechanical keyboard with customizable backlight.',
        'A 24-inch full HD monitor with vibrant colors.',
        '1080p webcam with noise cancellation.'
    ]
}
df = pd.DataFrame(data)
df.to_csv('data/structured_files/products.csv', index=False)


In [7]:
# Save as Excel with multiple sheets
with pd.ExcelWriter('data/structured_files/inventory.xlsx') as writer:
    df.to_excel(writer, sheet_name='Products', index=False)

    # Add another sheet
    summary_data = {
        'Category': ['Electronics', 'Accessories'],
        'Total_Items': [3, 2],
        'Total_Value': [1389.97, 109.98]
    }
    pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

## CSV Processing

In [12]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from typing import List, Any, Dict
from langchain_core.documents import Document

In [11]:
# Method 1: CSV Loader
print("Loading CSV using CSVLoader...")
csv_loader = CSVLoader('data/structured_files/products.csv', encoding='utf-8', csv_args={'delimiter': ',', 'quotechar': '"'})
csv_docs = csv_loader.load()
print(f"Loaded {len(csv_docs)} documents from CSV.\n")
print(f"First document content:\n{csv_docs[0].page_content}\n")
print(f"First document metadata:\n{csv_docs[0].metadata}\n")
print("--------------------------------------------------\n")
print(csv_docs)


Loading CSV using CSVLoader...
Loaded 5 documents from CSV.

First document content:
Product: Laptop
Category: Electronics
Price: 999.99
Stock: 50
Description: A high-performance laptop suitable for all your computing needs.

First document metadata:
{'source': 'data/structured_files/products.csv', 'row': 0}

--------------------------------------------------

[Document(metadata={'source': 'data/structured_files/products.csv', 'row': 0}, page_content='Product: Laptop\nCategory: Electronics\nPrice: 999.99\nStock: 50\nDescription: A high-performance laptop suitable for all your computing needs.'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 1}, page_content='Product: Mouse\nCategory: Accessories\nPrice: 29.99\nStock: 200\nDescription: A wireless mouse with ergonomic design.'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 2}, page_content='Product: Keyboard\nCategory: Accessories\nPrice: 79.99\nStock: 150\nDescription: A mechanica

In [13]:
# Method 2: CSV Loader and processing with custom metadata
print("\n Custom CSV processing ...\n")
def smart_csv_processor(file_path: str) -> List[Document]:
    """ Process CSV with intelligent document creation. """

    df = pd.read_csv(file_path)
    documents = []


    # Strategy 1: One document per row with structured content 
    for idx, row in df.iterrows():
        # Create structured content
        content = f"""Product Information:
        Name: {row['Product']}
        Category: {row['Category']}
        Price: ${row['Price']}
        Stock: {row['Stock']}
        Description: {row['Description']}"""

        # Create document with rich metadata
        doc = Document(
            page_content=content,
            metadata={
                'source': file_path,
                'row_index': idx,
                'product_name': row['Product'],
                'category': row['Category'],
                'price': row['Price'],
                'data_type': 'product_info'
            }
        )
        documents.append(doc)
    return documents


 Custom CSV processing ...



In [17]:
smart_csv_docs = smart_csv_processor('data/structured_files/products.csv')
print(f"Processed {len(smart_csv_docs)} smart documents from CSV.\n")
for i in range(len(smart_csv_docs)):
    print(f"\nDocument {i+1} content:\n{smart_csv_docs[i].page_content}\n")
    for key, value in smart_csv_docs[i].metadata.items():
        print(f"{key}: {value}")

Processed 5 smart documents from CSV.


Document 1 content:
Product Information:
        Name: Laptop
        Category: Electronics
        Price: $999.99
        Stock: 50
        Description: A high-performance laptop suitable for all your computing needs.

 source: data/structured_files/products.csv
 row_index: 0
 product_name: Laptop
 category: Electronics
 price: 999.99
 data_type: product_info

Document 2 content:
Product Information:
        Name: Mouse
        Category: Accessories
        Price: $29.99
        Stock: 200
        Description: A wireless mouse with ergonomic design.

 source: data/structured_files/products.csv
 row_index: 1
 product_name: Mouse
 category: Accessories
 price: 29.99
 data_type: product_info

Document 3 content:
Product Information:
        Name: Keyboard
        Category: Accessories
        Price: $79.99
        Stock: 150
        Description: A mechanical keyboard with customizable backlight.

 source: data/structured_files/products.csv
 row_ind

In [18]:
print("üìä CSV processing strategies:\n")
print("1. Row-based (CSVLoader):")
print(" ‚úÖ Simple one-row-per-document.")
print(" ‚úÖ Good for record lookups.")
print(" ‚ùå Loses table context.\n")

print("2. Smart Row-based (custom):")
print(" ‚úÖ Preserves relationships.")
print(" ‚úÖ Creates summaries.")
print(" ‚úÖ Rich metadata for better retrieval.")
print(" ‚úÖ Better for Q&A.")

üìä CSV processing strategies:

1. Row-based (CSVLoader):
 ‚úÖ Simple one-row-per-document.
 ‚úÖ Good for record lookups.
 ‚ùå Loses table context.

2. Smart Row-based (custom):
 ‚úÖ Preserves relationships.
 ‚úÖ Creates summaries.
 ‚úÖ Rich metadata for better retrieval.
 ‚úÖ Better for Q&A.


## Excel Processing

In [21]:
# Method 1: Using pandas for full control
print("\n Pandas based Excel processing ... \n")
def process_excel_with_pandas(file_path: str) -> List[Document]:
    """ Process Excel with sheet awareness. """
    
    documents = []

    # Read all excel sheets
    excel_file = pd.ExcelFile(file_path)

    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)

        # Create document for each sheet
        sheet_content = f"Sheet: {sheet_name}\n"
        # sheet_content += f"Columns: {', '.join(df.columns)}\n"
        # sheet_content += f"Rows: {len(df)}\n\n"
        sheet_content += df.to_string(index=False)

        doc = Document(
            page_content=sheet_content,
            metadata={
                'source': file_path,
                'sheet_name': sheet_name,
                'num_rows': len(df),
                'num_columns': len(df.columns),
                'data_type': 'excel_sheet'
            }
        )
        documents.append(doc)
    return documents


 Pandas based Excel processing ... 



In [23]:
excel_docs = process_excel_with_pandas('data/structured_files/inventory.xlsx')
print(f"Processed {len(excel_docs)} documents from Excel using pandas.\n")
print(excel_docs)
print("\n‚úÖ Excel processed with pandas for full control over sheets and data.\n")
for doc in excel_docs:
    print(f"Document for sheet: {doc.metadata['sheet_name']}")
    print(f"Content:\n{doc.page_content}\n")
    print(f"Metadata: {doc.metadata}\n")
    print("--------------------------------------------------\n")

Processed 2 documents from Excel using pandas.

[Document(metadata={'source': 'data/structured_files/inventory.xlsx', 'sheet_name': 'Products', 'num_rows': 5, 'num_columns': 5, 'data_type': 'excel_sheet'}, page_content='Sheet: Products\n Product    Category  Price  Stock                                                      Description\n  Laptop Electronics 999.99     50 A high-performance laptop suitable for all your computing needs.\n   Mouse Accessories  29.99    200                          A wireless mouse with ergonomic design.\nKeyboard Accessories  79.99    150               A mechanical keyboard with customizable backlight.\n Monitor Electronics 299.99     75                   A 24-inch full HD monitor with vibrant colors.\n  Webcam Electronics  89.99    100                            1080p webcam with noise cancellation.'), Document(metadata={'source': 'data/structured_files/inventory.xlsx', 'sheet_name': 'Summary', 'num_rows': 2, 'num_columns': 3, 'data_type': 'excel_sheet'},

In [24]:
# Method 2: UnstructuredExcelLoader
print("\n Method 2: UnstructuredExcelLoader\n")
try:
    excel_loader = UnstructuredExcelLoader('data/structured_files/inventory.xlsx', mode='elements')
    print("‚úÖ Handles Complex Excel features.")
    print("‚úÖ Preserves formatting info.")
    print("‚ùå Requires unstructured library setup.")
    excel_docs = excel_loader.load()
    print(f"\nProcessed {len(excel_docs)} using UnstructuredExcelLoader.")
    for doc in excel_docs:
        print(f"\nDocument sheet name {doc.metadata['sheet_name']}")
        print(f"Content:\n{doc.page_content}\n")
        print(f"Metadata: {doc.metadata}\n")
        print("--------------------------------------------------\n")
except Exception as e:
    print(f"Error initializing UnstructuredExcelLoader: {e}")


 Method 2: UnstructuredExcelLoader

‚úÖ Handles Complex Excel features.
‚úÖ Preserves formatting info.
‚ùå Requires unstructured library setup.
Error initializing UnstructuredExcelLoader: No module named 'msoffcrypto'
