# Load and Cleaning Data

In [2]:
import sys
sys.path.append('../')

In [3]:
DATA_DIR = '../data'
REVIEWS_FILE = 'olist_order_reviews_dataset.csv'
ORDERS_FILE = 'olist_orders_dataset.csv'
CUSTOMERS_FILE = 'olist_customers_dataset.csv'
PROCESSED_DATA_FILE = 'processed_customer_data.parquet'

In [None]:
import pandas as pd
import os

try:
    reviews_df = pd.read_csv(os.path.join(DATA_DIR, REVIEWS_FILE))
    orders_df = pd.read_csv(os.path.join(DATA_DIR, ORDERS_FILE))
    customers_df = pd.read_csv(os.path.join(DATA_DIR, CUSTOMERS_FILE))
    
    print("Files loaded successfully!")
except FileNotFoundError as e:
    print(f"Error: {e}. Check the CSV files are in the 'data/' directory.")

if 'reviews_df' in locals():
    print("\n--- Reviews Data ---")
    print(reviews_df.info())
    print(reviews_df.head())

if 'orders_df' in locals():
    print("\n--- Orders Data ---")
    
    orders_customers_df = pd.merge(orders_df, customers_df, on='customer_id')
    print(orders_customers_df.info())
    print(orders_customers_df.head())

Files loaded successfully!

--- Reviews Data ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None
                          review_id                          order_id  \
0  7bc2406110b926393aa56f80a40eba40  73fc7af87114b39712e6da79b0a377eb   
1  80e641a11e56f04c1ad469d5645fdfde  a548910a1c6147796b98fdf73dbeba33   
2  228ce5500dc1d8e020d8d1322874b6f0  f9e4b658b201a9f2ecdecbb34bed034b   
3  e64fb393e7b32834

In [None]:
# --- 1. Merge the datasets ---
orders_customers_df = pd.merge(orders_df, customers_df, on='customer_id')

# Merge the result with reviews to link orders to their reviews
merged_df = pd.merge(orders_customers_df, reviews_df, on='order_id')

print("Data merged successfully. Shape of merged data:", merged_df.shape)

Data merged successfully. Shape of merged data: (99224, 18)


## Handle missing data and data types
- For this project, we only care about reviews with actual comments.

In [None]:

merged_df.dropna(subset=['review_comment_message'], inplace=True)

# Convert timestamps to datetime
merged_df['order_purchase_timestamp'] = pd.to_datetime(merged_df['order_purchase_timestamp'])

print("Filtered for reviews with comments. New shape:", merged_df.shape)

Filtered for reviews with comments. New shape: (40977, 18)


## Define the transformation function
- Convert structured data into a text sentence.

In [None]:

def create_document_for_order(row):
    """
    Creates a single text document for a given order (a row in the dataframe).
    """
    # Convert structured data to natural language
    purchase_date = row['order_purchase_timestamp'].strftime('%B %d, %Y')
    customer_location = f"{row['customer_city']}, {row['customer_state']}"
    
    order_summary = (
        f"Customer {row['customer_unique_id']} from {customer_location} "
        f"placed an order on {purchase_date}. "
        f"The order status is '{row['order_status']}'. "
    )
    
    review_summary = (
        f"The customer left a review with a score of {row['review_score']} out of 5. "
        f"The review comment is: '{row['review_comment_message']}'"
    )
    
    # Combine everything into one document
    full_document = order_summary + review_summary
    return full_document

## Apply the function and create the final dataset

In [None]:
print("\nGenerating text documents for each order...")
merged_df['text_document'] = merged_df.apply(create_document_for_order, axis=1)


Generating text documents for each order...


## Create final Dataframe with required columns

In [None]:
final_df = merged_df[['customer_unique_id', 'order_id', 'text_document']]

## Save the final dataset as a parquet file

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

print("\n--- Example of a generated document ---")
print(final_df.iloc[0]['text_document'])

table = pa.Table.from_pandas(final_df)
pq.write_table(table, os.path.join(DATA_DIR, PROCESSED_DATA_FILE))

print(f"\nProcessed data saved to '{PROCESSED_DATA_FILE}'.")


--- Example of a generated document ---
Customer 7c396fd4830fd04220f754e42b4e5bff from sao paulo, SP placed an order on October 02, 2017. The order status is 'delivered'. The customer left a review with a score of 4 out of 5. The review comment is: 'Não testei o produto ainda, mas ele veio correto e em boas condições. Apenas a caixa que veio bem amassada e danificada, o que ficará chato, pois se trata de um presente.'

Processed data saved to 'processed_customer_data.parquet'.


# Text Chunking & Embedding

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

DATA_DIR = '../data'
PROCESSED_DATA_FILE = os.path.join(DATA_DIR, 'processed_customer_data.parquet')
EMBEDDINGS_FILE = os.path.join(DATA_DIR, 'customer_embeddings.pkl')
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'


df = pq.read_pandas(PROCESSED_DATA_FILE).to_pandas()
print(f"Loaded {len(df)} documents to be chunked and embedded.")


text_splitter = RecursiveCharacterTextSplitter(

    chunk_size=512,
    chunk_overlap=50
)


print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cpu')
print("Model loaded successfully.")

In [None]:
import pandas as pd
import os
import pickle
import pyarrow.parquet as pq


# Chunking
documents_to_embed = []
print("\nStarting chunking and embedding process...")

for index, row in df.iterrows():
    # Split the document into chunks
    chunks = text_splitter.split_text(row['text_document'])
    
    # Generate an embedding for each chunk
    chunk_embeddings = embedding_model.encode(chunks)
    
    # Store the results
    for i, chunk in enumerate(chunks):
        documents_to_embed.append({
            'customer_unique_id': row['customer_unique_id'],
            'order_id': row['order_id'],
            'chunk_text': chunk,
            'embedding': chunk_embeddings[i]
        })
    
    if (index + 1) % 1000 == 0:
        print(f"Processed {index + 1}/{len(df)} documents...")

print(f"\nGenerated a total of {len(documents_to_embed)} chunks.")

# --- 5. Inspect and Save ---
print("\n--- Example of a single processed document ---")
example = documents_to_embed[0]
print("Customer ID:", example['customer_unique_id'])
print("Chunk Text:", example['chunk_text'])
print("Embedding Shape:", example['embedding'].shape) # Will be (768,) for this model

# Save the final list of chunks and embeddings to a file
with open(EMBEDDINGS_FILE, 'wb') as f:
    pickle.dump(documents_to_embed, f)
    
print(f"\nChunks and embeddings saved to '{EMBEDDINGS_FILE}'.")

Loaded 40977 documents to be chunked and embedded.
Loading embedding model: paraphrase-multilingual-mpnet-base-v2...


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 0ccde8bf-7871-46bc-8e50-8eb14fa9f085)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Model loaded successfully.

Starting chunking and embedding process...
Processed 5000/40977 documents...
Processed 9000/40977 documents...
Processed 20000/40977 documents...
Processed 21000/40977 documents...
Processed 22000/40977 documents...
Processed 25000/40977 documents...
Processed 26000/40977 documents...
Processed 27000/40977 documents...
Processed 28000/40977 documents...
Processed 31000/40977 documents...
Processed 32000/40977 documents...
Processed 41000/40977 documents...
Processed 46000/40977 documents...
Processed 50000/40977 documents...
Processed 60000/40977 documents...
Processed 61000/40977 documents...
Processed 62000/40977 documents...
Processed 64000/40977 documents...
Processed 66000/40977 documents...
Processed 68000/40977 documents...
Processed 69000/40977 documents...
Processed 70000/40977 documents...
Processed 71000/40977 documents...
Processed 72000/40977 documents...
Processed 75000/40977 documents...
Processed 76000/40977 documents...
Processed 78000/40977

In [1]:
import torch

print(torch.__version__)
my_tensor = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32, device="cpu")
print(my_tensor)
torch.cuda.is_available()

2.8.0+cu126
tensor([[1., 2., 3.],
        [4., 5., 6.]])


True

## Setup DB and store chunked data
- store chunked data

In [None]:
import chromadb
import os
import pickle
import uuid 
import time

DATA_DIR = '../data'
DB_DIR = '../db' 
EMBEDDINGS_FILE = os.path.join(DATA_DIR, 'customer_embeddings.pkl')
COLLECTION_NAME = 'customer_reviews'

print(f"Loading embeddings from '{EMBEDDINGS_FILE}'...")
with open(EMBEDDINGS_FILE, 'rb') as f:
    docs_to_embed = pickle.load(f)
print(f"Loaded {len(docs_to_embed)} document chunks.")

client = chromadb.PersistentClient(path=DB_DIR)

collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"ChromaDB collection '{COLLECTION_NAME}' is ready.")

ids = []
documents = []
metadata = []

for doc in docs_to_embed:
    # Each document needs a unique ID.
    ids.append(str(uuid.uuid4())) 
    documents.append(doc['chunk_text'])
    metadata.append({
        'customer_id': doc['customer_unique_id'],
        'order_id': doc['order_id']
    })
    time.sleep(0.1)
    
    
batch_size = 5000
print(f"\nAdding documents to the collection in batches of {batch_size}...")

for i in range(0, len(documents), batch_size):
    collection.add(
        ids=ids[i:i + batch_size],
        documents=documents[i:i + batch_size],
        embeddings=[e.tolist() for e in [d['embedding'] for d in docs_to_embed[i:i + batch_size]]],
        metadatas=metadatas[i:i + batch_size]
    )
    print(f"Added batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
    time.sleep(0.1)
    

print("\nAll documents have been added to the ChromaDB collection.")
print(f"Total documents in collection: {collection.count()}")



Loading embeddings from '../data\customer_embeddings.pkl'...
Loaded 40977 document chunks.
ChromaDB collection 'customer_reviews' is ready.

Adding documents to the collection in batches of 5000...


NameError: name 'metadatas' is not defined

## Test Query

In [None]:
print("\n--- Running a test query ---")
query_text = "qual foi a reclamação do cliente?" # Portuguese for "what was the customer's complaint?"


results = collection.query(
    query_texts=[query_text],
    n_results=3
)

print("Query Results:")
for i, doc in enumerate(results['documents'][0]):
    print(f"\nResult {i+1}:")
    print("Text:", doc)
    print("Metadata:", results['metadatas'][0][i])
    print("Distance:", results['distances'][0][i])

# RAG Chain

In [None]:
import chromadb
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


load_dotenv()
print("API Key loaded.")


DB_DIR = '../db'
COLLECTION_NAME = 'customer_reviews'
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'


embedding_function = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

vectorstore = Chroma(
    persist_directory=DB_DIR,
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_function
)
print(f"Connected to ChromaDB collection '{COLLECTION_NAME}'.")


retriever = vectorstore.as_retriever(search_kwargs={"k": 5}) # Retrieve top 5 most relevant chunks
print("Retriever created.")


## Templating Prompt

In [1]:

# This template structures how we'll present the retrieved documents and the user's question to the LLM.
template = """
You are an expert assistant for a Brazilian e-commerce company.
Answer the user's question based ONLY on the following context.
If the context doesn't contain the answer, say you don't have enough information.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = ChatPromptTemplate.from_template(template)


NameError: name 'ChatPromptTemplate' is not defined

## Create RAG Chain

In [None]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo")


rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain created successfully.")


## Test RAG Chain

In [None]:
print("\n--- Testing the RAG chain ---")

test_customer_id = '861eff4711a542e4b93843c6dd7febb0'    # This is real id from our dataset

question = f"What was the review score and comment for the customer {test_customer_id}?"

response = rag_chain.invoke(question)

print(f"\nQuestion: {question}")
print(f"\nAnswer: {response}")