<a href="https://colab.research.google.com/github/MiskirB/B5W6-Intelligent-Complaint-Analysis/blob/main/02_chunking_embedding_indexing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers faiss-cpu


In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Ensure the project's vector_store folder exists on Drive
# (exist_ok=True makes this a no-op when it is already there).
os.makedirs(
    "/content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/vector_store",
    exist_ok=True,
)


In [None]:
import pandas as pd

# Complaints CSV stored in the project's Drive folder.
data_path = "/content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/data/filtered_complaints.csv"
df = pd.read_csv(data_path)

# Report (rows, columns), then preview the product and narrative columns.
print(df.shape)
df.loc[:, ['Product', 'cleaned_narrative']].head()


In [None]:
def chunk_text(text, chunk_size=300, chunk_overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - chunk_overlap`` characters apart,
    so each window repeats the last ``chunk_overlap`` characters of the one
    before it. The final window may be shorter than ``chunk_size``.

    Args:
        text: String to split. ``None`` and float NaN (e.g. an empty CSV cell
            loaded by pandas) are treated as "no text".
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list for empty or missing text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            outside ``[0, chunk_size)``. Without this check a non-positive
            step would make the window never advance past the end of the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
            f"with chunk_size={chunk_size}"
        )

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    step = chunk_size - chunk_overlap
    # NOTE: a window is started at every multiple of `step` below len(text),
    # so the last chunk can consist solely of characters already covered by
    # the previous one. This matches the original behaviour.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Chunk every narrative and emit one record per chunk. Each record keeps the
# DataFrame index of its source row (as "complaint_id") and that row's product.
chunk_data = [
    {
        "complaint_id": row_label,
        "product": complaint["Product"],
        "text": piece,
    }
    for row_label, complaint in df.iterrows()
    for piece in chunk_text(complaint['cleaned_narrative'])
]

# Build the frame once from the accumulated records.
chunk_df = pd.DataFrame(chunk_data)
print("✅ Total Chunks:", len(chunk_df))
chunk_df.head()


In [None]:
# List the current contents of the vector_store folder on Drive.
!ls /content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/vector_store/


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
from tqdm.auto import tqdm
import os

# --- Output locations on Google Drive ---------------------------------------
index_path = "/content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/vector_store/faiss_index.index"
metadata_path = "/content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/vector_store/metadata.pkl"
os.makedirs("/content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/vector_store", exist_ok=True)

# --- Embed every chunk -------------------------------------------------------
# NOTE(review): the original comment says the GPU is used automatically;
# that depends on the runtime having one attached - confirm in Colab settings.
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = chunk_df['text'].tolist()
embeddings = model.encode(
    texts,
    batch_size=64,  # tunable
    show_progress_bar=True,
    convert_to_numpy=True,
)

# --- Build a flat L2 FAISS index over the embeddings -------------------------
# The index dimensionality is taken from the second axis of the embedding array.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print("✅ FAISS index built. Total vectors:", index.ntotal)

# --- Metadata: one dict per chunk, in the same row order as `texts` ----------
metadata = chunk_df.to_dict(orient='records')

# --- Persist the index and the metadata --------------------------------------
faiss.write_index(index, index_path)
print("✅ FAISS index saved to:", index_path)

with open(metadata_path, "wb") as metadata_file:
    pickle.dump(metadata, metadata_file)
print("✅ Metadata saved to:", metadata_path)


In [None]:
# Long listing with human-readable sizes of the vector_store folder on Drive.
!ls -lh /content/drive/MyDrive/B5W6-Intelligent-Complaint-Analysis/vector_store/
