In [1]:
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Configure
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import requests
import json

In [2]:
# Connectivity smoke test: open a short-lived connection, query the server,
# and always release the connection afterwards.
client = None

try:
    # Connect inside the try block so that an unreachable server is reported by
    # the handler below instead of escaping as an unhandled traceback.
    client = weaviate.connect_to_local(skip_init_checks=True)

    # get_meta() queries the server; a successful response means it is reachable
    meta = client.get_meta()
    print("✅ Weaviate is ready!")
    print(f"Weaviate version: {meta['version']}")

    # List (not create) the existing collections to confirm the schema can be read
    collections = client.collections.list_all()
    print(f"Current collections: {len(collections)}")

except Exception as e:
    print(f"❌ Connection error: {e}")
finally:
    # client is still None when the connection attempt itself failed
    if client is not None:
        client.close()

✅ Weaviate is ready!
Weaviate version: 1.23.7
Current collections: 1


In [3]:
# Load the cleaned IATE data
try:
    iate_terminology = pd.read_csv('../data/processed/iate_terminology_clean.csv')
    print(f"Loaded {len(iate_terminology)} term pairs")
except FileNotFoundError as e:
    # Re-raise with a hint, keeping the original error as the explicit cause
    raise FileNotFoundError("CSV file not found. Did you run the cleaning step?") from e
except Exception as e:
    raise RuntimeError(f"Failed to load CSV: {e}") from e

if iate_terminology.empty:
    raise ValueError("Loaded CSV is empty. Check data preprocessing.")

# Fail fast if a column read by the embedding and insert cells is absent,
# rather than discovering it after the (slow) embedding step.
required_columns = [
    'pt_term',
    'en_term',
    'subject_field',
    'entry_id',
    'pt_reliability',
    'en_reliability',
]
missing_columns = [col for col in required_columns if col not in iate_terminology.columns]
if missing_columns:
    raise ValueError(f"Loaded CSV is missing required columns: {missing_columns}")

# Initialize embedding model
print("Loading embedding model (this may take a moment)...")
try:
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    print("✅ Model loaded!")
except Exception as e:
    raise RuntimeError(f"Failed to load SentenceTransformer model: {e}") from e

# Connect to Weaviate; this connection is reused by the following cells
client = weaviate.connect_to_local(skip_init_checks=True)
print("✅ Connected to Weaviate")

Loaded 14475 term pairs
Loading embedding model (this may take a moment)...
✅ Model loaded!
✅ Connected to Weaviate


In [4]:
# Define the terminology collection schema
collection_name = "TerminologyEntry"

# Drop any previous copy so this cell can be re-run from a clean slate.
# Checking for existence explicitly (instead of a bare `except: pass`) means the
# "Deleted" message is only printed when something was actually removed, and a
# genuine failure to delete is surfaced rather than silently ignored.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)
    print(f"Deleted existing collection: {collection_name}")

# Every property is stored as TEXT; the order below is the schema order.
text_fields = [
    "pt_term",
    "en_term",
    "subject_field",
    "entry_id",
    "pt_reliability",
    "en_reliability",
]

# NOTE(review): no vectorizer config is passed, so the server default applies —
# confirm that is intended given vectors are supplied explicitly at insert time.
collection = client.collections.create(
    name=collection_name,
    properties=[
        wvc.config.Property(name=field, data_type=wvc.config.DataType.TEXT)
        for field in text_fields
    ],
)

print(f"✅ Created collection: {collection_name}")

Deleted existing collection: TerminologyEntry
✅ Created collection: TerminologyEntry


In [5]:
# Encode every Portuguese term into a vector with the sentence-transformer model.
# The resulting array is in the same order as the rows of iate_terminology.
print("Generating embeddings for Portuguese terms...")

pt_terms = iate_terminology['pt_term'].tolist()
embeddings = model.encode(
    pt_terms,
    show_progress_bar=False,
)

embedding_count = len(embeddings)
embedding_dim = embeddings.shape[1]
print(f"✅ Generated {embedding_count} embeddings, dimension: {embedding_dim}")

Generating embeddings for Portuguese terms...
✅ Generated 14475 embeddings, dimension: 384


In [6]:
# Insert every term pair, together with its precomputed vector, one object at a time
print("Loading data into Weaviate using REST API...")

# Reconnect to Weaviate
client.close()
client = weaviate.connect_to_local(skip_init_checks=True)
collection = client.collections.get("TerminologyEntry")

# Rows and vectors are paired purely by position, so their counts must agree.
if len(embeddings) != len(iate_terminology):
    raise ValueError(
        f"Row/embedding count mismatch: {len(iate_terminology)} rows "
        f"vs {len(embeddings)} embeddings"
    )

# Insert objects one by one
total_inserted = 0
failed_inserts = 0

# `i` is the DataFrame index *label*; `position` is the 0-based row position.
# The embeddings array must be indexed by position — using the label is only
# correct while the frame happens to carry a default RangeIndex.
for position, (i, row) in enumerate(iate_terminology.iterrows()):
    try:
        collection.data.insert(
            properties={
                "pt_term": row['pt_term'],
                "en_term": row['en_term'],
                # Missing subject fields are stored as an empty string
                "subject_field": row['subject_field'] if pd.notna(row['subject_field']) else "",
                "entry_id": str(row['entry_id']),
                "pt_reliability": str(row['pt_reliability']),
                "en_reliability": str(row['en_reliability']),
            },
            vector=embeddings[position].tolist()
        )
    except Exception as e:
        # Best effort: report the failing row and keep going
        failed_inserts += 1
        print(f"Error inserting object {i}: {e}")
    else:
        total_inserted += 1

        if total_inserted % 1000 == 0:
            print(f"Inserted {total_inserted}/{len(iate_terminology)} objects")

print(f"✅ Loaded {total_inserted} term pairs into Weaviate")
if failed_inserts:
    # Make partial loads visible instead of leaving them buried in the log above
    print(f"⚠️ {failed_inserts} rows could not be inserted")

# Verify the data
total_objects = collection.aggregate.over_all(total_count=True)
print(f"Collection now contains: {total_objects.total_count} objects")

Loading data into Weaviate using REST API...
Inserted 1000/14475 objects
Inserted 2000/14475 objects
Inserted 3000/14475 objects
Inserted 4000/14475 objects
Inserted 5000/14475 objects
Inserted 6000/14475 objects
Inserted 7000/14475 objects
Inserted 8000/14475 objects
Inserted 9000/14475 objects
Inserted 10000/14475 objects
Inserted 11000/14475 objects
Inserted 12000/14475 objects
Inserted 13000/14475 objects
Inserted 14000/14475 objects
✅ Loaded 14475 term pairs into Weaviate
Collection now contains: 14475 objects


In [7]:
# Test semantic search functionality using REST API
print("=== Testing Semantic Search (REST API) ===")

def test_search_rest(query, n_results=5, url="http://localhost:8080/v1/graphql", timeout=30):
    """Run a nearVector search over TerminologyEntry and print the hits.

    The query text is embedded with the notebook-level `model`, sent to the
    GraphQL endpoint via HTTP POST, and each returned object is printed with
    a similarity score computed as ``1 - distance``.

    Args:
        query: Text to search for.
        n_results: Maximum number of objects to request (formatted with ``%d``).
        url: GraphQL endpoint to POST to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Results and any request/response errors are printed.
    """
    print(f"\nQuery: '{query}'")
    print("-" * 50)

    # Generate query embedding
    query_embedding = model.encode([query])

    graphql_query = {
        "query": """
        {
            Get {
                TerminologyEntry(
                    nearVector: {vector: %s}
                    limit: %d
                ) {
                    pt_term
                    en_term
                    subject_field
                    _additional {
                        distance
                    }
                }
            }
        }
        """ % (json.dumps(query_embedding[0].tolist()), n_results)
    }

    try:
        # A timeout keeps the cell from hanging forever on an unresponsive server
        response = requests.post(url, json=graphql_query, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return

    if not response.ok:
        print(f"Error in response: HTTP {response.status_code}: {response.text}")
        return

    try:
        result = response.json()
    except ValueError as e:
        # Body was not valid JSON
        print(f"Error: {e}")
        return

    # A GraphQL-level failure can still carry a "data" key whose class entry is
    # null, so check for reported errors and a missing result list explicitly
    # rather than trying to iterate over None.
    objects = None
    if isinstance(result, dict) and not result.get('errors'):
        data = result.get('data') or {}
        objects = (data.get('Get') or {}).get('TerminologyEntry')

    if objects is None:
        print(f"Error in response: {result}")
        return

    for rank, obj in enumerate(objects, start=1):
        distance = obj['_additional']['distance']
        similarity = 1 - distance

        print(f"{rank}. Similarity: {similarity:.4f}")
        print(f"   PT: {obj['pt_term']}")
        print(f"   EN: {obj['en_term']}")
        print(f"   Domain: {obj['subject_field']}")
        print()

# Portuguese sample queries used to spot-check the search results by eye
test_queries = [
    "comércio internacional",
    "produto agrícola",
    "transporte marítimo",
    "acordo comercial",
]

# Run each query in order; the search function prints its own output
for query in test_queries:
    test_search_rest(query)

=== Testing Semantic Search (REST API) ===

Query: 'comércio internacional'
--------------------------------------------------
1. Similarity: 0.9642
   PT: comércio mundial
   EN: world trade
   Domain: preparation for market;land transport;life sciences;TRANSPORT

2. Similarity: 0.9637
   PT: termos do comércio internacional
   EN: International Rules for the Interpretation of Trade Terms
   Domain: international trade;commercial transaction;LAW;FINANCE

3. Similarity: 0.9637
   PT: termos do comércio internacional
   EN: International Commercial Terms
   Domain: international trade;commercial transaction;LAW;FINANCE

4. Similarity: 0.9637
   PT: termos do comércio internacional
   EN: ICC rules for the use of domestic and international trade terms
   Domain: international trade;commercial transaction;LAW;FINANCE

5. Similarity: 0.9637
   PT: termos do comércio internacional
   EN: Incoterms
   Domain: international trade;commercial transaction;LAW;FINANCE


Query: 'produto agrícola'
