In [None]:
# ============================================
# ToS ANALYZER - VECTOR DATABASE BUILDER
# IAT 360 Final Project - Hugging Face Edition
# ============================================

# 1. INSTALL DEPENDENCIES
# The earlier pins (langchain==0.1.0, langchain-community==0.0.13) cannot be
# resolved together with any langchain-text-splitters release, so pip aborted
# with ResolutionImpossible. This script only imports langchain_core,
# langchain_community and langchain_text_splitters, so those are requested
# unpinned and pip is left to pick a mutually compatible set.
import subprocess
import sys

print("üì¶ Installing packages...\n")

required_packages = [
    "sentence-transformers",
    "faiss-cpu",
    "datasets",
    "langchain-core",
    "langchain-community",
    "langchain-text-splitters",
]

# Run pip through the kernel's own interpreter so the packages land in the
# environment this notebook imports from, and inspect the exit code so a
# failed install stops the run instead of being reported as a success.
pip_run = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *required_packages],
    capture_output=True,
    text=True,
)
if pip_run.returncode != 0:
    print(pip_run.stderr)
    raise RuntimeError("pip install failed; see the pip output above")

print("‚úÖ Installation complete!\n")

# 2. IMPORTS (FIXED)
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter  # FIXED IMPORT
from langchain_core.documents import Document  # FIXED IMPORT
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import json
from tqdm import tqdm

# 3. LOAD DATA FROM HUGGING FACE
step_rule = "=" * 60
print(step_rule)
print("STEP 1: LOADING DATASET FROM HUGGING FACE")
print(step_rule)

print("\nüì• Downloading OPP-115 dataset...")
dataset = load_dataset("alzoubi36/opp_115")

# Report how many examples each split contains.
print("\n‚úÖ Dataset loaded successfully!")
for split_label, split_key in (
    ("Train", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"   - {split_label} set: {len(dataset[split_key])} examples")

# Show the first training example and the fields it exposes.
sample_rule = "-" * 60
print("\nüìã Sample document:")
print(sample_rule)
sample = dataset['train'][0]
text_preview = sample['text'][:300]
print(f"Text preview: {text_preview}...")
field_names = list(sample.keys())
print(f"\nAvailable fields: {field_names}")
print(sample_rule)

# 4. CONVERT TO LANGCHAIN DOCUMENTS
print("\n" + "=" * 60)
print("STEP 2: PREPARING DOCUMENTS")
print("=" * 60)

# All three splits are indexed together; each Document records which split
# and which row it came from.
all_splits = ['train', 'validation', 'test']
documents = []

for split in all_splits:
    print(f"\nProcessing {split} set...")
    documents.extend(
        Document(
            page_content=row['text'],
            metadata={
                'source': split,
                'index': row_number,
                'segment_id': f"{split}_{row_number}",
            },
        )
        for row_number, row in enumerate(tqdm(dataset[split]))
    )

print(f"\n‚úÖ Total documents prepared: {len(documents)}")

# Dump a truncated view of the first ten documents for manual inspection.
sample_docs = []
for doc in documents[:10]:
    sample_docs.append({'text': doc.page_content[:200], 'metadata': doc.metadata})

with open('sample_documents.json', 'w') as sample_file:
    json.dump(sample_docs, sample_file, indent=2)

print("üíæ Saved sample_documents.json for inspection")

# 5. CHUNK THE DOCUMENTS
print("\n" + "=" * 60)
print("STEP 3: CHUNKING DOCUMENTS FOR RETRIEVAL")
print("=" * 60)

print("\n‚úÇÔ∏è Splitting text into searchable chunks...")

# Splitter configuration: up to 800 characters per chunk with a 100-character
# overlap, trying paragraph, line, sentence and word separators in that order.
splitter_settings = {
    "chunk_size": 800,
    "chunk_overlap": 100,
    "separators": ["\n\n", "\n", ". ", " "],
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunks = text_splitter.split_documents(documents)

chunk_count = len(chunks)
total_characters = sum(len(chunk.page_content) for chunk in chunks)

print(f"\n‚úÖ Created {chunk_count} searchable chunks")
print(f"   Average chunk size: ~{total_characters // chunk_count} characters")

# 6. CREATE EMBEDDINGS MODEL
print("\n" + "=" * 60)
print("STEP 4: LOADING EMBEDDING MODEL FROM HUGGING FACE")
print("=" * 60)

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

print(f"\nü§ó Loading: {embedding_model_name}")
for model_fact in (
    "   - Model size: 80MB",
    "   - Embedding dimensions: 384",
    "   - License: Apache 2.0 (free for all uses)",
):
    print(model_fact)

# The model is requested on CPU, with normalize_embeddings enabled at
# encode time.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

print("\n‚úÖ Embedding model loaded successfully!")

# 7. BUILD VECTOR DATABASE
build_rule = "=" * 60
print("\n" + build_rule)
print("STEP 5: BUILDING FAISS VECTOR DATABASE")
print(build_rule)

print("\nüî® Creating embeddings for all chunks...")
print("   (This may take 2-3 minutes for 3,000+ chunks)")

# Embed every chunk and load the vectors into a FAISS-backed store.
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)

vector_total = vectorstore.index.ntotal
print("\n‚úÖ Vector database created successfully!")
print(f"   Total vectors: {vector_total}")

# 8. TEST THE DATABASE
retrieval_rule = "=" * 60
print("\n" + retrieval_rule)
print("STEP 6: TESTING RETRIEVAL")
print(retrieval_rule)

# Smoke-test questions run against the freshly built index.
test_queries = [
    "Can the company sell my personal data to third parties?",
    "Do they use my content to train AI models?",
    "Can I delete all my data from their servers?",
    "Am I forced into arbitration instead of court?",
]

print("\nüß™ Running test queries...\n")

for query in test_queries:
    print(f"Query: '{query}'")
    results = vectorstore.similarity_search(query, k=2)
    hit_count = len(results)
    print(f"‚úÖ Found {hit_count} relevant chunks")
    # Only the best match is previewed, truncated to 150 characters.
    best_match = results[0]
    print(f"   Top result preview: {best_match.page_content[:150]}...")
    print()

# 9. SAVE THE DATABASE
# Local import: only this step needs the current date.
from datetime import datetime, timezone

print("=" * 60)
print("STEP 7: SAVING DATABASE")
print("=" * 60)

# Single source of truth for the output folder, so the save call, the status
# message and the metadata path cannot drift apart.
index_dir = "faiss_index_tos_hf"

print("\nüíæ Saving FAISS database to disk...")
vectorstore.save_local(index_dir)

print(f"‚úÖ Database saved to: {index_dir}/")

# 10. CREATE METADATA FILE
# Values that can be read from live objects are read from them rather than
# hard-coded, so the file describes the index that was actually built.
metadata = {
    "dataset": "alzoubi36/opp_115",
    "total_documents": len(documents),
    "total_chunks": len(chunks),
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    # Dimension of the stored vectors, taken from the FAISS index itself.
    "embedding_dimension": int(vectorstore.index.d),
    # Keep these two in sync with the RecursiveCharacterTextSplitter settings
    # used in the chunking step.
    "chunk_size": 800,
    "chunk_overlap": 100,
    "vector_database": "FAISS",
    # Year-month of this run (UTC), in the same "YYYY-MM" format as before;
    # the previous fixed "2024-12" value went stale on every later rebuild.
    "date_created": datetime.now(timezone.utc).strftime("%Y-%m"),
    "license": "Apache 2.0",
    "use_case": "Terms of Service Analysis - IAT 360"
}

with open(f"{index_dir}/metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)

print("‚úÖ Metadata saved")

# 11. ZIP FOR DOWNLOAD
print("\nüì¶ Creating zip file for download...")
!zip -r -q faiss_index_tos_hf.zip faiss_index_tos_hf

print("\n" + "=" * 60)
print("‚úÖ SUCCESS! DATABASE READY FOR DEPLOYMENT")
print("=" * 60)

print("""
üìÅ FILES CREATED:
   1. faiss_index_tos_hf/          (the vector database folder)
   2. faiss_index_tos_hf.zip       (download this for your Streamlit app)
   3. sample_documents.json        (for inspection)

üéØ NEXT STEPS:
   1. Download 'faiss_index_tos_hf.zip' from the Files panel (left sidebar)
   2. Unzip it on your computer
   3. Upload the 'faiss_index_tos_hf' folder to your GitHub repo
   4. Deploy your Streamlit app!

üí° TIP: The vector database is now ready to use with either:
   - Option A: 100% Free (Hugging Face models only)
   - Option B: Hybrid (HF embeddings + OpenAI GPT-3.5)
""")

# Display file sizes
print("\nüìä File sizes:")
!du -sh faiss_index_tos_hf
!du -sh faiss_index_tos_hf.zip

üì¶ Installing packages...

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Cannot install langchain-community==0.0.13, langchain-text-splitters==0.2.0, langchain-text-splitters==0.2.1, langchain-text-splitters==0.2.2, langchain-text-splitters==0.2.4, langchain-text-splitters==0.3.0, langchain-text-splitters==0.3.1, langchain-text-splitters==0.3.10, langchain-text-splitters==0.3.11, langchain-text-splitters==0.3.2, langchain-text-splitters==0.3.3, langchain-text-splitters==0.3.4, langchain-text-splitters==0.3.5, langchain-text-splitters==0.3.6, langchain-text-splitters==0.3.7, langchain-text-splitters==0.3.8, langchain-text-splitters==0.3.9, langchain-text-splitters==1.0.0 and langchain==0.1.0 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit ht

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



‚úÖ Dataset loaded successfully!
   - Train set: 2185 examples
   - Validation set: 550 examples
   - Test set: 697 examples

üìã Sample document:
------------------------------------------------------------
Text preview:  ""Contact Us"" Link If you contact us through the ""Contact Us"" link on this site, we ask you for information such as your first name, e-mail address, and other information, so we can respond to your questions and comments. You may choose to provide additional information as well. ...

Available fields: ['text', 'label']
------------------------------------------------------------

STEP 2: PREPARING DOCUMENTS

Processing train set...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2185/2185 [00:00<00:00, 34835.62it/s]



Processing validation set...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 550/550 [00:00<00:00, 21670.90it/s]



Processing test set...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 697/697 [00:00<00:00, 27230.16it/s]


‚úÖ Total documents prepared: 3432
üíæ Saved sample_documents.json for inspection

STEP 3: CHUNKING DOCUMENTS FOR RETRIEVAL

‚úÇÔ∏è Splitting text into searchable chunks...

‚úÖ Created 3929 searchable chunks
   Average chunk size: ~401 characters

STEP 4: LOADING EMBEDDING MODEL FROM HUGGING FACE

ü§ó Loading: sentence-transformers/all-MiniLM-L6-v2
   - Model size: 80MB
   - Embedding dimensions: 384
   - License: Apache 2.0 (free for all uses)



  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


‚úÖ Embedding model loaded successfully!

STEP 5: BUILDING FAISS VECTOR DATABASE

üî® Creating embeddings for all chunks...
   (This may take 2-3 minutes for 3,000+ chunks)

‚úÖ Vector database created successfully!
   Total vectors: 3929

STEP 6: TESTING RETRIEVAL

üß™ Running test queries...

Query: 'Can the company sell my personal data to third parties?'
‚úÖ Found 2 relevant chunks
   Top result preview: Note, that we will not share your Personally Identifiable Information with third parties for their marketing purposes without obtaining your prior con...

Query: 'Do they use my content to train AI models?'
‚úÖ Found 2 relevant chunks
   Top result preview: Our automated systems analyze your content (including emails) to provide you personally relevant product features, such as customized search results, ...

Query: 'Can I delete all my data from their servers?'
‚úÖ Found 2 relevant chunks
   Top result preview: Remember that even after you cancel your account, copies of some in