In [None]:
!pip3 install rdflib meilisearch pyld

In [None]:
# =============================================================================
# IMPORTS
# =============================================================================
import json
import os
import traceback
import uuid
from datetime import datetime, timezone, timedelta

import requests
from pyld import jsonld

# Visible confirmation that the import cell ran to completion.
print("‚úÖ Imports loaded")

In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================
# Service endpoints. These are not secrets, so they stay in the notebook.
FUSEKI_URL = "https://fuseki.app.quality-link.eu"
DATASET_NAME = "pipeline-data"

# Credentials are read from the environment so they are never saved with the notebook.
# The fallbacks are the values that used to be hard-coded here, so an unconfigured
# environment behaves exactly as before.
FUSEKI_USERNAME = os.environ.get("FUSEKI_USERNAME", "admin")
FUSEKI_PASSWORD = os.environ.get("FUSEKI_PASSWORD", "")

MEILISEARCH_URL = "https://search.knowledgeinnovation.eu"
MEILISEARCH_API_KEY = os.environ.get("MEILISEARCH_API_KEY", "")
INDEX_NAME = "education-entities"

# HTTP basic auth is only sent when both parts are present; otherwise requests run anonymously.
auth = (FUSEKI_USERNAME, FUSEKI_PASSWORD) if FUSEKI_USERNAME and FUSEKI_PASSWORD else None

# SPARQL query endpoint of the configured dataset.
query_url = f"{FUSEKI_URL}/{DATASET_NAME}/sparql"

print("‚úÖ Configuration set")
print(f"   Fuseki URL: {FUSEKI_URL}")
print(f"   Dataset: {DATASET_NAME}")
print(f"   Meilisearch Index: {INDEX_NAME}")

In [None]:
# =============================================================================
# STEP 1: SET TARGET DATE
# =============================================================================

In [None]:
# Today's date in UTC, rendered as YYYY-MM-DD for the xsd:date filter of the course query.
utc_today = datetime.now(tz=timezone.utc).date()
target_date = utc_today.isoformat()

print(f"‚úÖ Target date set to {target_date}")

In [None]:
# =============================================================================
# STEP 2: QUERY JENA FOR COURSES BY DATE
# =============================================================================

In [None]:
# SPARQL SELECT for every ql:LearningOpportunitySpecification whose ql:ingestedDate
# equals `target_date` (interpolated as an xsd:date literal). ql:ingestedAt is
# required, dcterms:title is optional, rows are ordered by ql:ingestedAt.
# NOTE(review): LIMIT 100 caps the result silently; a day with more than 100
# ingested courses would be processed only partially. Confirm this is intended.
query_courses_by_date = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX ql: <http://data.quality-link.eu/ontology/v1#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?learningOpportunity ?ingestedDate ?ingestedAt ?title
WHERE {{
  ?learningOpportunity rdf:type ql:LearningOpportunitySpecification .
  ?learningOpportunity ql:ingestedDate ?ingestedDate .
  ?learningOpportunity ql:ingestedAt ?ingestedAt .
  OPTIONAL {{ ?learningOpportunity dcterms:title ?title }}
  
  FILTER (?ingestedDate = "{target_date}"^^xsd:date)
}}
ORDER BY ?ingestedAt
LIMIT 100
"""

In [None]:
# Run the date-filtered SELECT against Fuseki.
# `results` is reset before the request so that no failure mode (network error,
# HTTP error, or a 200 response with an unexpected body) can leave it undefined
# or holding the bindings of an earlier run of this cell.
results = []

try:
    response = requests.get(
        query_url,
        params={'query': query_courses_by_date, 'format': 'application/sparql-results+json'},
        auth=auth,
        timeout=30
    )
    response.raise_for_status()

    results = response.json()['results']['bindings']
    print("‚úÖ Query successful!")
    print(f"üìä Found {len(results)} courses ingested on {target_date}")

except requests.RequestException as e:
    print(f"‚ùå Query failed: {e}")
    # Not every RequestException carries a response (e.g. connection errors).
    failed_response = getattr(e, 'response', None)
    if failed_response is not None:
        print(f"   Response: {failed_response.text[:500]}")
except (KeyError, ValueError) as e:
    # The server answered, but not with a SPARQL JSON result document.
    print(f"‚ùå Query failed: unexpected response format ({e!r})")

In [None]:
# Human-readable listing of the query bindings, or troubleshooting hints when there are none.
if not results:
    print(f"\n‚ö†Ô∏è  No courses found for {target_date}")
    print("   This could mean:")
    print("   - No courses were ingested on this date")
    print("   - The ql:ingestedDate field hasn't been added yet")
    print("   - Check your date format (should be YYYY-MM-DD)")
else:
    print("\nüìã Courses found:")
    for position, binding in enumerate(results, start=1):
        # Read every field before printing so a malformed binding fails before any output.
        opportunity_uri = binding['learningOpportunity']['value']
        date_value = binding['ingestedDate']['value']
        timestamp_value = binding['ingestedAt']['value']
        # dcterms:title is OPTIONAL in the query, so the binding may be absent.
        shown_title = binding.get('title', {}).get('value', 'No title')

        print(f"\n   {position}. {opportunity_uri}")
        print(f"      üìÖ Date: {date_value}")
        print(f"      üïê Time: {timestamp_value}")
        print(f"      üìù Title: {shown_title[:80]}...")

In [None]:
# =============================================================================
# STEP 3: PROCESS EACH COURSE - GET FULL DATA
# =============================================================================

In [None]:
def _count_jsonld_nodes(payload):
    """Return the number of node objects carried by a parsed JSON-LD payload.

    A dict with an ``@graph`` list counts the entries of that list, a bare list
    counts its own entries, and any other dict is treated as a single node.
    """
    if isinstance(payload, list):
        return len(payload)
    if isinstance(payload, dict):
        graph = payload.get('@graph')
        if isinstance(graph, list):
            return len(graph)
        return 1
    return 0


# One entry per course whose graph could be retrieved: {'uri', 'title', 'raw_data'}.
all_documents = []

for idx, result in enumerate(results, 1):
    course_uri = result['learningOpportunity']['value']
    course_title = result.get('title', {}).get('value', 'No title')
    
    print(f"\n[{idx}/{len(results)}] Processing: {course_uri}")
    print(f"   Title: {course_title[:60]}...")
    
    # The property path (<>|!<>)* matches any chain of predicates of any length, so the
    # CONSTRUCT returns every triple of every resource reachable from the course node.
    query_full_data = f"""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
    CONSTRUCT {{
      ?s ?p ?o .
    }}
    WHERE {{
      <{course_uri}> (<>|!<>)* ?s .
      ?s ?p ?o .
    }}
    """
    
    try:
        print("   üîΩ Fetching complete course data...")
        response = requests.get(
            query_url,
            params={'query': query_full_data, 'format': 'application/ld+json'},
            auth=auth,
            timeout=30
        )
        response.raise_for_status()
        
        raw_jsonld = response.json()
        # Count graph nodes; len() of the payload dict would only count its top-level keys.
        print(f"   ‚úÖ Retrieved raw JSON-LD data ({_count_jsonld_nodes(raw_jsonld)} objects)")
        
        all_documents.append({
            'uri': course_uri,
            'title': course_title,
            'raw_data': raw_jsonld
        })
        
    except requests.RequestException as e:
        print(f"   ‚ùå Failed to retrieve data: {e}")
        continue
    except ValueError as e:
        # The body was not valid JSON; skip this course instead of aborting the whole loop.
        print(f"   ‚ùå Failed to retrieve data: {e}")
        continue

print(f"\n‚úÖ Successfully retrieved data for {len(all_documents)} courses")


In [None]:
# Peek at the first course's raw graph. Evaluates to None (no cell output) when nothing
# was retrieved, so a full run does not stop here on days without ingested courses.
all_documents[0]["raw_data"]["@graph"] if all_documents else None

In [None]:
# Show the first retrieved record (uri, title, raw_data); None when the list is empty.
all_documents[0] if all_documents else None

In [None]:
# Pretty-print the first node of the first course's graph as a sample of the raw data.
first_raw = all_documents[0]['raw_data'] if all_documents else None

if first_raw:
    rule = "=" * 60
    print("\n" + rule)
    print("üìÑ EXAMPLE RAW JSON-LD (First Object)")
    print(rule)
    example = json.dumps(first_raw["@graph"][0], indent=2)
    print(example)

In [None]:
# =============================================================================
# STEP 4: LOAD FRAME CONFIGURATION
# =============================================================================

In [None]:
# Load the JSON-LD frame that shapes each course graph into one nested document.
# The encoding is pinned because JSON files are UTF-8, while open() otherwise falls
# back to the platform default encoding.
with open("frame.json", "r", encoding="utf-8") as f:
    frame_config = json.load(f)

print("‚úÖ Frame configuration loaded from frame.json")

In [None]:
# =============================================================================
# STEPS 5 & 6: FRAME EACH COURSE AS JSON-LD AND CLEAN UP THE RESULT
# =============================================================================

In [None]:
# Framed documents, one per course, each with a deterministic Meilisearch `id`.
meilisearch_documents = []
total_courses = len(all_documents)

for position, course in enumerate(all_documents, start=1):
    course_uri, course_title, raw_jsonld = course['uri'], course['title'], course['raw_data']

    print(f"\n[{position}/{total_courses}] Processing: {course_title[:50]}...")
    print(f"   URI: {course_uri}")

    try:
        print("   üîÑ Applying JSON-LD framing...")
        framed_json = jsonld.frame(raw_jsonld, frame_config)
        print("   ‚úÖ Framing successful")

        # The context is dropped from the document that goes to the search index.
        if '@context' in framed_json:
            framed_json.pop('@context')
            print("   ‚úÖ Removed @context")

        # uuid5 over the course URI: the same course always maps to the same document id.
        framed_json['id'] = str(uuid.uuid5(uuid.NAMESPACE_URL, course_uri))
        print(f"   ‚úÖ Added Meilisearch ID: {framed_json['id']}")

        # Report which of the expected top-level fields the framed document carries.
        has_title = 'dcterms:title' in framed_json
        has_type = any(type_key in framed_json for type_key in ('type', '@type'))
        has_ingested = 'ql:ingestedDate' in framed_json
        print("   üìã Verification:")
        print(f"      - Has title: {has_title}")
        print(f"      - Has type: {has_type}")
        print(f"      - Has ingestedDate: {has_ingested}")

    except Exception as exc:
        # Best effort: report the failure and move on to the next course.
        print(f"   ‚ùå Processing failed: {exc}")
        import traceback
        traceback.print_exc()
    else:
        meilisearch_documents.append(framed_json)

In [None]:
def normalize_field_names(obj, parent_key=None):
    """Recursively flatten a framed JSON-LD structure into plain JSON.

    Rules applied at every level:

    * ``None`` stays ``None``.
    * A dict containing ``"@value"`` is replaced by that value.
    * In any other dict, keys starting with ``"@"`` are dropped, ``":"`` in the
      remaining keys becomes ``"_"`` (``"dcterms:title"`` -> ``"dcterms_title"``),
      and entries whose normalized value is ``None`` are omitted. A dict left
      without entries becomes ``None``.
    * Lists are normalized item by item, ``None`` items are removed and
      duplicates are dropped while keeping first-seen order. An empty result
      becomes ``None``.
    * A list of strings under an ``ingestedAt`` / ``ingestedDate`` key, with or
      without a prefix such as ``ql_``, collapses to its lexicographically
      greatest string.
    * Any other value is returned unchanged.

    Args:
        obj: The value to normalize (dict, list, scalar or ``None``).
        parent_key: Normalized key under which ``obj`` was found; ``None`` at
            the top level. Only used for the ``ingestedAt`` / ``ingestedDate`` rule.

    Returns:
        The normalized value, or ``None`` when nothing remains.
    """
    if obj is None:
        return None

    if isinstance(obj, dict):
        if "@value" in obj:
            return obj["@value"]

        normalized = {}
        for key, value in obj.items():
            # JSON-LD keywords (@id, @type, @language, ...) are not carried over.
            if key.startswith("@"):
                continue

            new_key = key.replace(":", "_")
            cleaned_value = normalize_field_names(value, parent_key=new_key)

            if cleaned_value is not None:
                normalized[new_key] = cleaned_value

        return normalized if normalized else None

    if isinstance(obj, list):
        normalized_list = [normalize_field_names(item, parent_key=parent_key) for item in obj]
        normalized_list = [item for item in normalized_list if item is not None]

        if not normalized_list:
            return None

        # Containers are unhashable, so they are tracked by their canonical JSON dump.
        # That set is kept separate from the scalar set: otherwise a string that happens
        # to equal a container's dump would be mistaken for a duplicate of it.
        seen_scalars = set()
        seen_containers = set()
        unique_list = []
        for item in normalized_list:
            if isinstance(item, (dict, list)):
                marker, bucket = json.dumps(item, sort_keys=True), seen_containers
            else:
                marker, bucket = item, seen_scalars
            if marker not in bucket:
                bucket.add(marker)
                unique_list.append(item)

        # Keys reach this point already normalized, so "ql:ingestedDate" arrives as
        # "ql_ingestedDate"; compare on the part after the last "_" to cover both forms.
        local_name = parent_key.rsplit("_", 1)[-1] if isinstance(parent_key, str) else None
        if local_name in ("ingestedAt", "ingestedDate"):
            if all(isinstance(x, str) for x in unique_list):
                return sorted(unique_list)[-1]

        return unique_list

    return obj

In [None]:
# Documents that will be uploaded, in the same order as `meilisearch_documents`.
final_documents = []

for idx, doc in enumerate(meilisearch_documents, 1):
    print(f"\n[{idx}/{len(meilisearch_documents)}] Normalizing document...")
    
    try:
        # Field-name normalization is currently switched off: the framed document is
        # passed through unchanged, so its keys keep the "prefix:name" form.
        # normalized = normalize_field_names(doc)
        normalized = doc
        
        # Look up both key spellings so the report is right whether or not normalization
        # is active. str() guards the slice: a title may be a dict or list rather than text.
        title = normalized.get('dcterms_title', normalized.get('dcterms:title', 'N/A'))
        has_outcomes = any(k in normalized for k in ('elm_learningOutcome', 'elm:learningOutcome'))
        has_ingested = any(k in normalized for k in ('ingestedDate', 'ql_ingestedDate', 'ql:ingestedDate'))
        
        print("   ‚úÖ Normalized successfully")
        print(f"      ID: {normalized.get('id')}")
        print(f"      Title: {str(title)[:50]}")
        print(f"      Has learning outcomes: {has_outcomes}")
        print(f"      Has ingestedDate: {has_ingested}")
        
        print(f"      All fields: {list(normalized.keys())[:10]}...")
        
        final_documents.append(normalized)
        
    except Exception as e:
        # Best effort: report the failure and continue with the next document.
        print(f"   ‚ùå Normalization failed: {e}")
        traceback.print_exc()
        continue

print(f"\n{'='*60}")
print(f"‚úÖ Normalized {len(final_documents)} documents")
print(f"{'='*60}")

In [None]:
# Show the first document that would be uploaded; None (no output) when there is none.
final_documents[0] if final_documents else None

In [None]:
# =============================================================================
# STEP 7: UPLOAD TO MEILISEARCH
# =============================================================================

In [None]:
# Safety switch: set to False to preview what would be sent without contacting Meilisearch.
ACTUALLY_UPLOAD = True  

upload_url = f"{MEILISEARCH_URL}/indexes/{INDEX_NAME}/documents"

if not ACTUALLY_UPLOAD:
    # The preview reports on `final_documents`, the same list the upload branch sends.
    print("‚ö†Ô∏è  UPLOAD DISABLED (Preview Mode)")
    print("   Set ACTUALLY_UPLOAD = True to enable uploading")
    print(f"\n   Would upload {len(final_documents)} documents to:")
    print(f"   URL: {upload_url}")
    print(f"   Total size: {sum(len(json.dumps(doc)) for doc in final_documents)} bytes")
else:
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {MEILISEARCH_API_KEY}"
    }
    
    uploaded_count = 0
    failed_count = 0
    
    for idx, doc in enumerate(final_documents, 1):
        # Accept both key spellings (normalized and "prefix:name"); str() keeps the
        # slice below safe when the title is a dict or list rather than plain text.
        title = str(doc.get('dcterms_title', doc.get('dcterms:title', 'No title')))
        print(f"\n[{idx}/{len(final_documents)}] Uploading: {title[:50]}...")
        
        try:
            # Same 30 s timeout as the Fuseki calls, so an unresponsive server cannot hang the run.
            response = requests.post(upload_url, headers=headers, json=doc, timeout=30)
            response.raise_for_status()
            
            # The response is read for a 'taskUid'; success here means the POST returned
            # without an HTTP error, not that the indexing task itself has finished.
            task_info = response.json()
            task_uid = task_info.get('taskUid')
            print(f"   ‚úÖ Uploaded successfully (Task UID: {task_uid})")
            uploaded_count += 1
            
        except requests.RequestException as e:
            print(f"   ‚ùå Upload failed: {e}")
            failed_count += 1
            continue
    
    print(f"\n{'='*60}")
    print("üìä UPLOAD SUMMARY")
    print(f"{'='*60}")
    print(f"‚úÖ Successfully uploaded: {uploaded_count}")
    print(f"‚ùå Failed uploads:        {failed_count}")
    print(f"üìà Total documents:       {len(final_documents)}")