In [None]:
!pip3 install rdflib meilisearch pyld

In [5]:
# =============================================================================
# IMPORTS
# =============================================================================
import requests
import json
from datetime import datetime, timezone, timedelta
from pyld import jsonld
import uuid

print("‚úÖ Imports loaded")

‚úÖ Imports loaded


In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================
FUSEKI_URL = "https://fuseki.app.quality-link.eu"
DATASET_NAME = "pipeline-data"
FUSEKI_USERNAME = "admin"
FUSEKI_PASSWORD = "" 

MEILISEARCH_URL = "https://lwowo04cs888sswsswoc4kwo.serverfarm.knowledgeinnovation.eu"  
MEILISEARCH_API_KEY = ""  
INDEX_NAME = "education-entities-tmp"

auth = (FUSEKI_USERNAME, FUSEKI_PASSWORD) if FUSEKI_USERNAME and FUSEKI_PASSWORD else None

query_url = f"{FUSEKI_URL}/{DATASET_NAME}/sparql"

print(f"‚úÖ Configuration set")
print(f"   Fuseki URL: {FUSEKI_URL}")
print(f"   Dataset: {DATASET_NAME}")
print(f"   Meilisearch Index: {INDEX_NAME}")

‚úÖ Configuration set
   Fuseki URL: https://fuseki.app.quality-link.eu
   Dataset: pipeline-data
   Meilisearch Index: education-entities-tmp


In [7]:
# =============================================================================
# STEP 1: SET TARGET DATE
# =============================================================================

In [8]:
target_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

print(f"‚úÖ Target date set to {target_date}")

‚úÖ Target date set to 2025-12-10


In [9]:
# =============================================================================
# STEP 2: QUERY JENA FOR COURSES BY DATE
# =============================================================================

In [None]:
query_courses_by_date = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX ql: <http://data.quality-link.eu/ontology/v1#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?learningOpportunity ?ingestedDate ?ingestedAt ?title
WHERE {{
  ?learningOpportunity rdf:type ql:LearningOpportunitySpecification .
  ?learningOpportunity ql:ingestedDate ?ingestedDate .
  ?learningOpportunity ql:ingestedAt ?ingestedAt .
  OPTIONAL {{ ?learningOpportunity dcterms:title ?title }}
  
  FILTER (?ingestedDate = "{target_date}"^^xsd:date)
}}
ORDER BY ?ingestedAt
LIMIT 100
"""

In [11]:
try:
    response = requests.get(
        query_url,
        params={'query': query_courses_by_date, 'format': 'application/sparql-results+json'},
        auth=auth,
        timeout=30
    )
    response.raise_for_status()
    
    results = response.json()['results']['bindings']
    print(f"‚úÖ Query successful!")
    print(f"üìä Found {len(results)} courses ingested on {target_date}")
    
except requests.RequestException as e:
    print(f"‚ùå Query failed: {e}")
    if hasattr(e, 'response') and e.response is not None:
        print(f"   Response: {e.response.text[:500]}")
    results = []

‚úÖ Query successful!
üìä Found 8 courses ingested on 2025-12-10


In [12]:
if results:
    print("\nüìã Courses found:")
    for idx, result in enumerate(results, 1):
        uri = result['learningOpportunity']['value']
        ingested_date = result['ingestedDate']['value']
        ingested_at = result['ingestedAt']['value']
        title = result.get('title', {}).get('value', 'No title')
        print(f"\n   {idx}. {uri}")
        print(f"      üìÖ Date: {ingested_date}")
        print(f"      üïê Time: {ingested_at}")
        print(f"      üìù Title: {title[:80]}...")
else:
    print(f"\n‚ö†Ô∏è  No courses found for {target_date}")
    print("   This could mean:")
    print("   - No courses were ingested on this date")
    print("   - The ql:ingestedDate field hasn't been added yet")
    print("   - Check your date format (should be YYYY-MM-DD)")


üìã Courses found:

   1. http://course.are.cool/123
      üìÖ Date: 2025-12-10
      üïê Time: 2025-12-09T20:14:05.689242+00:00
      üìù Title: Computer Science - Linked Open Data...

   2. urn:schac:courseId:uni-lj.si:123
      üìÖ Date: 2025-12-10
      üïê Time: 2025-12-09T20:14:05.689242+00:00
      üìù Title: Computer Science - Linked Open Data...

   3. http://course.are.cool/123
      üìÖ Date: 2025-12-10
      üïê Time: 2025-12-09T20:14:06.971915+00:00
      üìù Title: Computer Science - Linked Open Data...

   4. urn:schac:courseId:uni-lj.si:123
      üìÖ Date: 2025-12-10
      üïê Time: 2025-12-09T20:14:06.971915+00:00
      üìù Title: Computer Science - Linked Open Data...

   5. http://course.are.cool/123
      üìÖ Date: 2025-12-10
      üïê Time: 2025-12-10T08:38:39.529511+00:00
      üìù Title: Computer Science - Linked Open Data...

   6. urn:schac:courseId:uni-lj.si:123
      üìÖ Date: 2025-12-10
      üïê Time: 2025-12-10T08:38:39.529511+00:00
    

In [13]:
# =============================================================================
# STEP 3: PROCESS EACH COURSE - GET FULL DATA
# =============================================================================

In [14]:
all_documents = []

for idx, result in enumerate(results, 1):
    course_uri = result['learningOpportunity']['value']
    course_title = result.get('title', {}).get('value', 'No title')
    
    print(f"\n[{idx}/{len(results)}] Processing: {course_uri}")
    print(f"   Title: {course_title[:60]}...")
    
    query_full_data = f"""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
    CONSTRUCT {{
      ?s ?p ?o .
    }}
    WHERE {{
      <{course_uri}> (<>|!<>)* ?s .
      ?s ?p ?o .
    }}
    """
    
    try:
        print(f"   üîΩ Fetching complete course data...")
        response = requests.get(
            query_url,
            params={'query': query_full_data, 'format': 'application/ld+json'},
            auth=auth,
            timeout=30
        )
        response.raise_for_status()
        
        raw_jsonld = response.json()
        print(f"   ‚úÖ Retrieved raw JSON-LD data ({len(raw_jsonld)} objects)")
        
        all_documents.append({
            'uri': course_uri,
            'title': course_title,
            'raw_data': raw_jsonld
        })
        
    except requests.RequestException as e:
        print(f"   ‚ùå Failed to retrieve data: {e}")
        continue

print(f"\n‚úÖ Successfully retrieved data for {len(all_documents)} courses")



[1/8] Processing: http://course.are.cool/123
   Title: Computer Science - Linked Open Data...
   üîΩ Fetching complete course data...
   ‚úÖ Retrieved raw JSON-LD data (2 objects)

[2/8] Processing: urn:schac:courseId:uni-lj.si:123
   Title: Computer Science - Linked Open Data...
   üîΩ Fetching complete course data...
   ‚úÖ Retrieved raw JSON-LD data (2 objects)

[3/8] Processing: http://course.are.cool/123
   Title: Computer Science - Linked Open Data...
   üîΩ Fetching complete course data...
   ‚úÖ Retrieved raw JSON-LD data (2 objects)

[4/8] Processing: urn:schac:courseId:uni-lj.si:123
   Title: Computer Science - Linked Open Data...
   üîΩ Fetching complete course data...
   ‚úÖ Retrieved raw JSON-LD data (2 objects)

[5/8] Processing: http://course.are.cool/123
   Title: Computer Science - Linked Open Data...
   üîΩ Fetching complete course data...
   ‚úÖ Retrieved raw JSON-LD data (2 objects)

[6/8] Processing: urn:schac:courseId:uni-lj.si:123
   Title: Computer Science

In [15]:
all_documents[0]["raw_data"]["@graph"]

[{'@id': '_:b0',
  'skos:notation': 'AT0005',
  'elm:schemeId': {'@id': 'ql:OrgReg'},
  '@type': 'ql:OrgRegIdentifier'},
 {'@id': '_:b1',
  'skos:notation': 'AT0005',
  'elm:schemeId': {'@id': 'ql:OrgReg'},
  '@type': 'ql:OrgRegIdentifier'},
 {'@id': '_:b2',
  'skos:notation': 'AT0005',
  'elm:schemeId': {'@id': 'ql:OrgReg'},
  '@type': 'ql:OrgRegIdentifier'},
 {'@id': '_:b3',
  'skos:notation': 'AT0005',
  'elm:schemeId': {'@id': 'ql:OrgReg'},
  '@type': 'ql:OrgRegIdentifier'},
 {'@id': '_:b4',
  'skos:notation': 'AT0005',
  'elm:schemeId': {'@id': 'ql:OrgReg'},
  '@type': 'ql:OrgRegIdentifier'},
 {'@id': '_:b5',
  'skos:notation': 'AT0005',
  'elm:schemeId': {'@id': 'ql:OrgReg'},
  '@type': 'ql:OrgRegIdentifier'},
 {'@id': 'http://data.quality-link.eu/examples/provider/AT0005',
  'adms:identifier': [{'@id': '_:b0'},
   {'@id': '_:b1'},
   {'@id': '_:b2'},
   {'@id': '_:b3'},
   {'@id': '_:b4'},
   {'@id': '_:b5'}],
  '@type': 'ql:HigherEducationInstitution'},
 {'@id': '_:b6',
  'dct:

In [16]:
all_documents[0]

{'uri': 'http://course.are.cool/123',
 'title': 'Computer Science - Linked Open Data',
 'raw_data': {'@graph': [{'@id': '_:b0',
    'skos:notation': 'AT0005',
    'elm:schemeId': {'@id': 'ql:OrgReg'},
    '@type': 'ql:OrgRegIdentifier'},
   {'@id': '_:b1',
    'skos:notation': 'AT0005',
    'elm:schemeId': {'@id': 'ql:OrgReg'},
    '@type': 'ql:OrgRegIdentifier'},
   {'@id': '_:b2',
    'skos:notation': 'AT0005',
    'elm:schemeId': {'@id': 'ql:OrgReg'},
    '@type': 'ql:OrgRegIdentifier'},
   {'@id': '_:b3',
    'skos:notation': 'AT0005',
    'elm:schemeId': {'@id': 'ql:OrgReg'},
    '@type': 'ql:OrgRegIdentifier'},
   {'@id': '_:b4',
    'skos:notation': 'AT0005',
    'elm:schemeId': {'@id': 'ql:OrgReg'},
    '@type': 'ql:OrgRegIdentifier'},
   {'@id': '_:b5',
    'skos:notation': 'AT0005',
    'elm:schemeId': {'@id': 'ql:OrgReg'},
    '@type': 'ql:OrgRegIdentifier'},
   {'@id': 'http://data.quality-link.eu/examples/provider/AT0005',
    'adms:identifier': [{'@id': '_:b0'},
     {'@i

In [19]:
if all_documents and all_documents[0]['raw_data']:
    print("\n" + "="*60)
    print("üìÑ EXAMPLE RAW JSON-LD (First Object)")
    print("="*60)
    example = json.dumps(all_documents[0]['raw_data']["@graph"][0], indent=2)
    print(example)


üìÑ EXAMPLE RAW JSON-LD (First Object)
{
  "@id": "_:b0",
  "skos:notation": "AT0005",
  "elm:schemeId": {
    "@id": "ql:OrgReg"
  },
  "@type": "ql:OrgRegIdentifier"
}


In [None]:
# =============================================================================
# STEP 4: LOAD FRAME CONFIGURATION
# =============================================================================

In [18]:
with open("frame.json", "r") as f:
    frame_config = json.load(f)

print("‚úÖ Frame configuration loaded from frame.json")

‚úÖ Frame configuration loaded from frame.json


In [None]:
# =============================================================================
# STEP 5 & 6: APPLY JSON-LD FRAMING AND CLEAN UP TO ALL COURSES
# =============================================================================

In [None]:
meilisearch_documents = []

for idx, doc in enumerate(all_documents, 1):
    course_uri = doc['uri']
    course_title = doc['title']
    raw_jsonld = doc['raw_data']
    
    print(f"\n[{idx}/{len(all_documents)}] Processing: {course_title[:50]}...")
    print(f"   URI: {course_uri}")
    
    try:
        print(f"   üîÑ Applying JSON-LD framing...")
        framed_json = jsonld.frame(raw_jsonld, frame_config)
        print(f"   ‚úÖ Framing successful")
        
        if '@context' in framed_json:
            del framed_json['@context']
            print(f"   ‚úÖ Removed @context")
        
        framed_json['id'] = str(uuid.uuid5(uuid.NAMESPACE_URL, course_uri))
        print(f"   ‚úÖ Added Meilisearch ID: {framed_json['id']}")
        
        has_title = 'dcterms:title' in framed_json
        has_type = 'type' in framed_json or '@type' in framed_json
        has_ingested = 'ql:ingestedDate' in framed_json
        print(f"   üìã Verification:")
        print(f"      - Has title: {has_title}")
        print(f"      - Has type: {has_type}")
        print(f"      - Has ingestedDate: {has_ingested}")
        
        meilisearch_documents.append(framed_json)
        
    except Exception as e:
        print(f"   ‚ùå Processing failed: {e}")
        import traceback
        traceback.print_exc()
        continue


[1/8] Processing: Computer Science - Linked Open Data...
   URI: http://course.are.cool/123
   üîÑ Applying JSON-LD framing...
   ‚úÖ Framing successful
   ‚úÖ Removed @context
   ‚úÖ Added Meilisearch ID: 23d2b003-a3f0-5c58-8229-63eb7b946006
   üìã Verification:
      - Has title: True
      - Has type: True
      - Has ingestedDate: False

[2/8] Processing: Computer Science - Linked Open Data...
   URI: urn:schac:courseId:uni-lj.si:123
   üîÑ Applying JSON-LD framing...
   ‚úÖ Framing successful
   ‚úÖ Removed @context
   ‚úÖ Added Meilisearch ID: 2d0df7f0-a0a6-55f1-9959-d8e22295eef0
   üìã Verification:
      - Has title: True
      - Has type: True
      - Has ingestedDate: False

[3/8] Processing: Computer Science - Linked Open Data...
   URI: http://course.are.cool/123
   üîÑ Applying JSON-LD framing...
   ‚úÖ Framing successful
   ‚úÖ Removed @context
   ‚úÖ Added Meilisearch ID: 23d2b003-a3f0-5c58-8229-63eb7b946006
   üìã Verification:
      - Has title: True
      - Has

In [None]:
def normalize_field_names(obj, parent_key=None):

    if obj is None:
        return None
    
    if isinstance(obj, dict):
        if "@value" in obj:
            return obj["@value"]
        
        normalized = {}
        for key, value in obj.items():
            if key.startswith("@"):
                continue
            
            new_key = key.replace(":", "_")
            
            cleaned_value = normalize_field_names(value, parent_key=new_key)
            
            if cleaned_value is not None:
                normalized[new_key] = cleaned_value
        
        return normalized if normalized else None
    
    elif isinstance(obj, list):
        normalized_list = [normalize_field_names(item, parent_key=parent_key) for item in obj]
        
        normalized_list = [item for item in normalized_list if item is not None]
        
        if not normalized_list:
            return None
        
        seen = set()
        unique_list = []
        for item in normalized_list:
            if isinstance(item, dict):
                item_str = json.dumps(item, sort_keys=True)
                if item_str not in seen:
                    seen.add(item_str)
                    unique_list.append(item)
            elif item not in seen:
                seen.add(item)
                unique_list.append(item)
        
        if parent_key in ["ingestedAt", "ingestedDate"]:
            if unique_list and all(isinstance(x, str) for x in unique_list):
                return sorted(unique_list)[-1]
        
        return unique_list
    
    else:
        return obj

In [None]:
final_documents = []

for idx, doc in enumerate(meilisearch_documents, 1):
    print(f"\n[{idx}/{len(meilisearch_documents)}] Normalizing document...")
    
    try:
        normalized = normalize_field_names(doc)
        
        print(f"   ‚úÖ Normalized successfully")
        print(f"      ID: {normalized.get('id')}")
        print(f"      Title: {normalized.get('dcterms_title', 'N/A')[:50]}")
        print(f"      Has learning outcomes: {'elm_learningOutcome' in normalized}")
        print(f"      Has ingestedDate: {'ingestedDate' in normalized or 'ql_ingestedDate' in normalized}")
        
        print(f"      All fields: {list(normalized.keys())[:10]}...")
        
        final_documents.append(normalized)
        
    except Exception as e:
        print(f"   ‚ùå Normalization failed: {e}")
        import traceback
        traceback.print_exc()
        continue

print(f"\n{'='*60}")
print(f"‚úÖ Normalized {len(final_documents)} documents")
print(f"{'='*60}")


[1/8] Normalizing document...
   ‚úÖ Normalized successfully
      ID: 23d2b003-a3f0-5c58-8229-63eb7b946006
      Title: Computer Science - Linked Open Data
      Has learning outcomes: True
      Has ingestedDate: True
      All fields: ['id', 'type', 'elm_ISCEDFCode', 'elm_learningOutcome', 'ingestedAt', 'ingestedDate', 'ql_isActive', 'version', 'dcterms_description', 'dcterms_language']...

[2/8] Normalizing document...
   ‚úÖ Normalized successfully
      ID: 2d0df7f0-a0a6-55f1-9959-d8e22295eef0
      Title: Computer Science - Linked Open Data
      Has learning outcomes: True
      Has ingestedDate: True
      All fields: ['id', 'type', 'elm_ISCEDFCode', 'elm_learningOutcome', 'ingestedAt', 'ingestedDate', 'ql_isActive', 'version', 'dcterms_description', 'dcterms_language']...

[3/8] Normalizing document...
   ‚úÖ Normalized successfully
      ID: 23d2b003-a3f0-5c58-8229-63eb7b946006
      Title: Computer Science - Linked Open Data
      Has learning outcomes: True
      Has inge

In [37]:
final_documents[0]

{'id': '23d2b003-a3f0-5c58-8229-63eb7b946006',
 'type': 'LearningOpportunitySpecification',
 'elm_ISCEDFCode': 'http://data.europa.eu/snb/isced-f/0611',
 'elm_learningOutcome': [{'type': 'elm:LearningOutcome',
   'dcterms_title': 'The learner is able to explain different serialisation formats for RDF data.'},
  {'type': 'elm:LearningOutcome',
   'dcterms_title': 'The learner is able to define and explain basic RDF concepts.'}],
 'ingestedAt': '2025-12-10T08:38:41.599280+00:00',
 'ingestedDate': '2025-12-10',
 'ql_isActive': 'true',
 'version': '1.0',
 'dcterms_description': 'This course in computer science focuses on linked open data and concepts such as RDF.',
 'dcterms_language': 'http://publications.europa.eu/resource/authority/language/HBS',
 'dcterms_publisher': {'id': 'http://data.quality-link.eu/examples/provider/AT0005',
  'type': 'HigherEducationInstitution',
  'adms_identifier': [{'type': 'OrgRegIdentifier',
    'elm_schemeId': 'ql:OrgReg',
    'skos_notation': 'AT0005'}]},
 

In [None]:
# =============================================================================
# STEP 7: UPLOAD TO MEILISEARCH
# =============================================================================

In [41]:
ACTUALLY_UPLOAD = True  

if not ACTUALLY_UPLOAD:
    print("‚ö†Ô∏è  UPLOAD DISABLED (Preview Mode)")
    print("   Set ACTUALLY_UPLOAD = True to enable uploading")
    print(f"\n   Would upload {len(meilisearch_documents)} documents to:")
    print(f"   URL: {MEILISEARCH_URL}/indexes/{INDEX_NAME}/documents")
    print(f"   Total size: {sum(len(json.dumps(doc)) for doc in meilisearch_documents)} bytes")
else:
    upload_url = f"{MEILISEARCH_URL}/indexes/{INDEX_NAME}/documents"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {MEILISEARCH_API_KEY}"
    }
    
    uploaded_count = 0
    failed_count = 0
    
    for idx, doc in enumerate(final_documents, 1):
        title = doc.get('dcterms_title', 'No title')
        print(f"\n[{idx}/{len(final_documents)}] Uploading: {title[:50]}...")
        
        try:
            response = requests.post(upload_url, headers=headers, json=doc)
            response.raise_for_status()
            
            task_info = response.json()
            task_uid = task_info.get('taskUid')
            print(f"   ‚úÖ Uploaded successfully (Task UID: {task_uid})")
            uploaded_count += 1
            
        except requests.RequestException as e:
            print(f"   ‚ùå Upload failed: {e}")
            failed_count += 1
            continue
    
    print(f"\n{'='*60}")
    print("üìä UPLOAD SUMMARY")
    print(f"{'='*60}")
    print(f"‚úÖ Successfully uploaded: {uploaded_count}")
    print(f"‚ùå Failed uploads:        {failed_count}")
    print(f"üìà Total documents:       {len(final_documents)}")


[1/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 134)

[2/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 135)

[3/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 136)

[4/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 137)

[5/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 138)

[6/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 139)

[7/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 140)

[8/8] Uploading: Computer Science - Linked Open Data...
   ‚úÖ Uploaded successfully (Task UID: 141)

üìä UPLOAD SUMMARY
‚úÖ Successfully uploaded: 8
‚ùå Failed uploads:        0
üìà Total documents:       8
