#CSV and Excel file processing


In [10]:
import pandas as pd
import os

In [11]:
os.makedirs("data/Structured_files", exist_ok=True)

In [12]:
data = {
    'Products': ['Laptop', 'Smartphone', 'Headphones', 'Chair', 'Table'],
    'Category': ['Electronics', 'Electronics', 'Accessories', 'Furniture', 'Furniture'],
    'Price': [60000, 25000, 2000, 3500, 7000],
    'Stock': [10, 25, 50, 15, 20],
    'Description': [
        'High performance laptop with 16GB RAM',
        'Latest smartphone with AMOLED display',
        'Noise-cancelling wireless headphones',
        'Ergonomic office chair',
        'Wooden dining table with 4 seats'
    ]
}

df = pd.DataFrame(data)
df.to_csv(r"data\Structured_files\products.csv", index=False)



In [14]:
with pd.ExcelWriter(r"data\Structured_files\inventory.xlsx") as writer:
    df.to_excel(writer, sheet_name="Products", index=False)
    
    summary_data = {
        'Category':['Electronics','Accessories'],
        'Total_Items' :[3,2],
        'Total_Value':[85000,22000]
    }
    
    pd.DataFrame(summary_data).to_excel(writer,sheet_name="Summary", index=False)

####CSV Processing


In [15]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import  UnstructuredCSVLoader


In [18]:
csv_loader = CSVLoader(
    file_path=r"data\Structured_files\Products.csv",
    encoding="utf-8",
    csv_args={
        "delimiter":",",
        "quotechar":"\"",
    }
)

csv_docs = csv_loader.load()

for i, doc in enumerate(csv_docs):
    print(f"document {i} content:{doc.page_content}")
    print(f"document {i} metadata:{doc.metadata}")






document 0 content:Products: Laptop
Category: Electronics
Price: 60000
Stock: 10
Description: High performance laptop with 16GB RAM
document 0 metadata:{'source': 'data\\Structured_files\\Products.csv', 'row': 0}
document 1 content:Products: Smartphone
Category: Electronics
Price: 25000
Stock: 25
Description: Latest smartphone with AMOLED display
document 1 metadata:{'source': 'data\\Structured_files\\Products.csv', 'row': 1}
document 2 content:Products: Headphones
Category: Accessories
Price: 2000
Stock: 50
Description: Noise-cancelling wireless headphones
document 2 metadata:{'source': 'data\\Structured_files\\Products.csv', 'row': 2}
document 3 content:Products: Chair
Category: Furniture
Price: 3500
Stock: 15
Description: Ergonomic office chair
document 3 metadata:{'source': 'data\\Structured_files\\Products.csv', 'row': 3}
document 4 content:Products: Table
Category: Furniture
Price: 7000
Stock: 20
Description: Wooden dining table with 4 seats
document 4 metadata:{'source': 'data\\

In [27]:
##processing the pdf file intelligently

from typing import List
from langchain_core.documents import Document


def process_pdf_file(file_path: str) -> List[Document]:
    
    df = pd.read_csv(file_path)
    
    documents = []
    for idx, row in df.iterrows():
        
        content = f"""
        Product Information:
        name: {row["Products"]}
        description: {row["Description"]}
        price: {row["Price"]}
        category: {row["Category"]}
        stock_quantity: {row["Stock"]}
        """
        
        doc = Document (
            
            page_content=content,
            metadata={
                "source": file_path,
                "name": row["Products"],
                "description": row["Description"],
                "price": row["Price"],
                "category": row["Category"],
                "stock_quantity": row["Stock"],
                "row_index": idx,
            }

        )
        
        documents.append(doc)
        return documents


In [28]:
procced_docs = process_pdf_file("data/Structured_files/products.csv")
print(procced_docs[0])

page_content='
        Product Information:
        name: Laptop
        description: High performance laptop with 16GB RAM
        price: 60000
        category: Electronics
        stock_quantity: 10
        ' metadata={'source': 'data/Structured_files/products.csv', 'name': 'Laptop', 'description': 'High performance laptop with 16GB RAM', 'price': 60000, 'category': 'Electronics', 'stock_quantity': 10, 'row_index': 0}


In [None]:
##processing the Excel file

def process_excel_file(file_path: str) -> List[Document]:
    excel_file = pd.ExcelFile(file_path)
    documents = []
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        sheet_content = f"sheet name : {sheet_name}\n"
        sheet_content += f"Columns : {', '.join(df.columns)}\n"
        sheet_content += f"Rows : {len(df)}\n\n"
        sheet_content += df.to_string(index=False)
        
        
        doc = Document(
            page_content=sheet_content,
            metadata={
                "source": file_path,
                "sheet_name": sheet_name,
                "num_rows": len(df),
                "num_columns": len(df.columns)
            }
        )
        
        documents.append(doc)
        return documents

In [34]:
print(process_excel_file(r"data\Structured_files\inventory.xlsx")[0].page_content)


sheet name : Products
Columns : Products, Category, Price, Stock, Description
Rows : 5

  Products    Category  Price  Stock                           Description
    Laptop Electronics  60000     10 High performance laptop with 16GB RAM
Smartphone Electronics  25000     25 Latest smartphone with AMOLED display
Headphones Accessories   2000     50  Noise-cancelling wireless headphones
     Chair   Furniture   3500     15                Ergonomic office chair
     Table   Furniture   7000     20      Wooden dining table with 4 seats


In [40]:
    #processing the Excel file using unstructuredExcelLoader
from langchain.document_loaders import UnstructuredExcelLoader

loader = UnstructuredExcelLoader('data/Structured_files/inventory.xlsx' , mode='elements')

docs = loader.load()
print(docs[0].page_content)




Products Category Price Stock Description Laptop Electronics 60000 10 High performance laptop with 16GB RAM Smartphone Electronics 25000 25 Latest smartphone with AMOLED display Headphones Accessories 2000 50 Noise-cancelling wireless headphones Chair Furniture 3500 15 Ergonomic office chair Table Furniture 7000 20 Wooden dining table with 4 seats
