# MIAO Automatic Modeling: Stress Detection Dataset

This notebook demonstrates automatic generation of MIAO-compliant RDF annotations from:
1. A text dataset with binary stress labels (0=no stress, 1=stress)
2. Machine learning experiment results (model performance metrics)

The notebook creates a complete RDF knowledge graph following the MIAO ontology structure.

## 1. Setup and Dependencies

In [1]:
# Install required packages (uncomment if needed)
# !pip install rdflib pandas numpy scikit-learn

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from rdflib import Graph, Namespace, Literal, URIRef, RDF, RDFS, XSD
from rdflib.namespace import DCTERMS, PROV
import hashlib
import json

## 2. Define Namespaces

In [3]:
# Define MIAO and related namespaces
MIAO = Namespace("https://w3id.org/miao#")
MLS = Namespace("http://www.w3.org/ns/mls#")
EX = Namespace("https://w3id.org/miao/experiment#")

# Create RDF graph
g = Graph()
g.bind("miao", MIAO)
g.bind("mls", MLS)
g.bind("ex", EX)
g.bind("dcterms", DCTERMS)
g.bind("prov", PROV)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)

## 3. Load Input Data

### 3.1 Load Stress Detection Dataset

Expected format:
- CSV file with columns: `text`, `label`
- `label`: 0 (no stress) or 1 (stress)

In [4]:
# Load dataset (replace with your actual file path)
# For demonstration, we'll create a sample dataset

# Option 1: Load from CSV
# df_dataset = pd.read_csv('stress_dataset.csv')

# Option 2: Create sample data for demonstration
df_dataset = pd.DataFrame({
    'text': [
        "I'm feeling overwhelmed with work deadlines and family responsibilities",
        "Had a great day at the beach, feeling relaxed and happy",
        "Can't sleep at night, constant worry about finances",
        "Enjoyed a peaceful morning walk in the park",
        "My anxiety is through the roof with upcoming exams"
    ],
    'label': [1, 0, 1, 0, 1]  # 1=stress, 0=no stress
})

print(f"Loaded {len(df_dataset)} samples")
print(f"Stress distribution: {df_dataset['label'].value_counts().to_dict()}")
df_dataset.head()

Loaded 5 samples
Stress distribution: {1: 3, 0: 2}


Unnamed: 0,text,label
0,I'm feeling overwhelmed with work deadlines an...,1
1,"Had a great day at the beach, feeling relaxed ...",0
2,"Can't sleep at night, constant worry about fin...",1
3,Enjoyed a peaceful morning walk in the park,0
4,My anxiety is through the roof with upcoming e...,1


### 3.2 Load ML Experiment Results

Expected format:
- DataFrame with columns: `sample_id`, `model_name`, `predicted_label`, `confidence`, `true_label`
- Additional metadata: `experiment_date`, `hyperparameters`, etc.

In [5]:
# Option 1: Load from CSV
# df_results = pd.read_csv('experiment_results.csv')

# Option 2: Create sample results for demonstration
df_results = pd.DataFrame({
    'sample_id': [0, 1, 2, 3, 4],
    'model_name': ['BERT_Stress_Classifier', 'BERT_Stress_Classifier', 
                   'BERT_Stress_Classifier', 'BERT_Stress_Classifier', 
                   'BERT_Stress_Classifier'],
    'predicted_label': [1, 0, 1, 0, 1],
    'confidence': [0.92, 0.87, 0.78, 0.95, 0.84],
    'true_label': [1, 0, 1, 0, 1]
})

# Model performance metrics (aggregate)
model_metrics = {
    'model_name': 'BERT_Stress_Classifier',
    'implementation': 'PyTorch',
    'version': '1.0',
    'accuracy': 0.85,
    'precision': 0.83,
    'recall': 0.88,
    'f1_score': 0.85,
    'training_date': '2025-11-15',
    'hyperparameters': {
        'learning_rate': 2e-5,
        'batch_size': 32,
        'epochs': 10
    }
}

print(f"Loaded {len(df_results)} predictions")
print(f"Accuracy: {(df_results['predicted_label'] == df_results['true_label']).mean():.2f}")
df_results.head()

Loaded 5 predictions
Accuracy: 1.00


Unnamed: 0,sample_id,model_name,predicted_label,confidence,true_label
0,0,BERT_Stress_Classifier,1,0.92,1
1,1,BERT_Stress_Classifier,0,0.87,0
2,2,BERT_Stress_Classifier,1,0.78,1
3,3,BERT_Stress_Classifier,0,0.95,0
4,4,BERT_Stress_Classifier,1,0.84,1


## 4. Create MIAO Schema and Categories

In [6]:
def create_stress_schema(graph):
    """
    Create binary stress classification schema in MIAO format.
    """
    # Define schema
    schema_uri = EX.StressSchema_Binary
    graph.add((schema_uri, RDF.type, MIAO.MentalIllnessesSchema))
    graph.add((schema_uri, DCTERMS.title, 
               Literal("Binary Stress Classification Schema", lang="en")))
    graph.add((schema_uri, DCTERMS.description, 
               Literal("Binary schema for computational stress detection from text", lang="en")))
    graph.add((schema_uri, DCTERMS.created, 
               Literal(datetime.now().strftime("%Y-%m-%d"), datatype=XSD.date)))
    
    # Define categories
    # Category 0: No Stress
    no_stress_uri = EX.Stress_Negative
    graph.add((no_stress_uri, RDF.type, MIAO.MentalIllnessCategory))
    graph.add((no_stress_uri, DCTERMS.title, Literal("No Stress", lang="en")))
    graph.add((no_stress_uri, DCTERMS.description, 
               Literal("No psychological stress indicators detected", lang="en")))
    graph.add((no_stress_uri, MIAO.isMentalIllnessCategoryOf, schema_uri))
    graph.add((schema_uri, MIAO.hasMentalIllnessCategory, no_stress_uri))
    
    # Category 1: Stress
    stress_uri = EX.Stress_Positive
    graph.add((stress_uri, RDF.type, MIAO.MentalIllnessCategory))
    graph.add((stress_uri, DCTERMS.title, Literal("Stress", lang="en")))
    graph.add((stress_uri, DCTERMS.description, 
               Literal("Psychological stress indicators detected", lang="en")))
    graph.add((stress_uri, MIAO.isMentalIllnessCategoryOf, schema_uri))
    graph.add((schema_uri, MIAO.hasMentalIllnessCategory, stress_uri))
    
    return schema_uri, {0: no_stress_uri, 1: stress_uri}

schema_uri, category_map = create_stress_schema(g)
print(f"Created schema: {schema_uri}")
print(f"Categories: {category_map}")

Created schema: https://w3id.org/miao/experiment#StressSchema_Binary
Categories: {0: rdflib.term.URIRef('https://w3id.org/miao/experiment#Stress_Negative'), 1: rdflib.term.URIRef('https://w3id.org/miao/experiment#Stress_Positive')}


## 5. Model Dataset as MIAO Dataset

In [7]:
def create_dataset_metadata(graph, df, dataset_name="StressTextDataset"):
    """
    Create dataset metadata in MLS format.
    """
    dataset_uri = EX[dataset_name]
    graph.add((dataset_uri, RDF.type, MLS.Dataset))
    graph.add((dataset_uri, RDFS.label, 
               Literal(f"{dataset_name} - Text samples for stress detection", lang="en")))
    graph.add((dataset_uri, DCTERMS.description, 
               Literal(f"Dataset containing {len(df)} text samples with binary stress annotations", lang="en")))
    graph.add((dataset_uri, DCTERMS.extent, 
               Literal(f"{len(df)} samples", lang="en")))
    graph.add((dataset_uri, DCTERMS.format, Literal("text/plain")))
    graph.add((dataset_uri, DCTERMS.created, 
               Literal(datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), datatype=XSD.dateTime)))
    
    # Add statistics
    stress_count = (df['label'] == 1).sum()
    no_stress_count = (df['label'] == 0).sum()
    graph.add((dataset_uri, EX.stressCount, Literal(int(stress_count), datatype=XSD.integer)))
    graph.add((dataset_uri, EX.noStressCount, Literal(int(no_stress_count), datatype=XSD.integer)))
    
    return dataset_uri

dataset_uri = create_dataset_metadata(g, df_dataset)
print(f"Created dataset: {dataset_uri}")

Created dataset: https://w3id.org/miao/experiment#StressTextDataset


## 6. Model ML Pipeline and Model

In [8]:
def create_ml_implementation(graph, model_metrics):
    """
    Create ML implementation and software metadata.
    """
    # Software (PyTorch)
    software_uri = EX.PyTorch
    graph.add((software_uri, RDF.type, MLS.Software))
    graph.add((software_uri, RDFS.label, Literal("PyTorch", lang="en")))
    graph.add((software_uri, DCTERMS.description, 
               Literal("Open-source machine learning framework", lang="en")))
    
    # Implementation
    impl_name = model_metrics['model_name'].replace(' ', '_')
    impl_uri = EX[f"{impl_name}_Implementation"]
    graph.add((impl_uri, RDF.type, MLS.Implementation))
    graph.add((impl_uri, RDFS.label, 
               Literal(f"{model_metrics['model_name']} Implementation", lang="en")))
    graph.add((impl_uri, DCTERMS.description, 
               Literal("BERT-based classifier fine-tuned for stress detection", lang="en")))
    graph.add((software_uri, MLS.hasPart, impl_uri))
    
    # Hyperparameters
    for param_name, param_value in model_metrics['hyperparameters'].items():
        param_uri = EX[f"{impl_name}_{param_name}"]
        graph.add((param_uri, RDF.type, MLS.HyperParameter))
        graph.add((param_uri, RDFS.label, Literal(param_name.replace('_', ' ').title(), lang="en")))
        
        # Determine datatype
        if isinstance(param_value, float):
            graph.add((param_uri, MLS.hasValue, Literal(param_value, datatype=XSD.float)))
        elif isinstance(param_value, int):
            graph.add((param_uri, MLS.hasValue, Literal(param_value, datatype=XSD.integer)))
        else:
            graph.add((param_uri, MLS.hasValue, Literal(str(param_value))))
        
        graph.add((impl_uri, MLS.hasHyperParameter, param_uri))
    
    return impl_uri

def create_trained_model(graph, model_metrics, impl_uri):
    """
    Create trained model instance.
    """
    model_name = model_metrics['model_name'].replace(' ', '_')
    model_uri = EX[f"{model_name}_v{model_metrics['version']}"]
    graph.add((model_uri, RDF.type, MLS.Model))
    graph.add((model_uri, RDFS.label, 
               Literal(f"{model_metrics['model_name']} v{model_metrics['version']}", lang="en")))
    graph.add((model_uri, DCTERMS.created, 
               Literal(model_metrics['training_date'], datatype=XSD.date)))
    graph.add((model_uri, DCTERMS.description, 
               Literal("Trained BERT model for binary stress classification", lang="en")))
    
    return model_uri

impl_uri = create_ml_implementation(g, model_metrics)
model_uri = create_trained_model(g, model_metrics, impl_uri)
print(f"Created implementation: {impl_uri}")
print(f"Created model: {model_uri}")

Created implementation: https://w3id.org/miao/experiment#BERT_Stress_Classifier_Implementation
Created model: https://w3id.org/miao/experiment#BERT_Stress_Classifier_v1.0


## 7. Model Detection Run and Evaluation

In [9]:
def create_detection_run(graph, schema_uri, dataset_uri, impl_uri, model_uri, model_metrics):
    """
    Create automatic detection run (experiment execution).
    """
    run_id = hashlib.md5(f"{model_metrics['model_name']}_{datetime.now()}".encode()).hexdigest()[:8]
    run_uri = EX[f"Run_{run_id}"]
    
    graph.add((run_uri, RDF.type, MIAO.AutomaticMentalIllnessesDetection))
    graph.add((run_uri, RDF.type, MLS.Run))
    graph.add((run_uri, RDFS.label, 
               Literal(f"Stress detection run {run_id}", lang="en")))
    graph.add((run_uri, DCTERMS.description, 
               Literal("Automatic stress detection from text using BERT classifier", lang="en")))
    graph.add((run_uri, DCTERMS.created, 
               Literal(datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), datatype=XSD.dateTime)))
    
    # Connect to components
    graph.add((run_uri, MIAO.hasInputData, dataset_uri))
    graph.add((run_uri, MIAO.usedMentalIllnessesSchema, schema_uri))
    graph.add((run_uri, MLS.executes, impl_uri))
    
    # Create evaluation measures
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for metric in metrics:
        measure_uri = EX[metric]
        graph.add((measure_uri, RDF.type, MLS.EvaluationMeasure))
        graph.add((measure_uri, RDFS.label, Literal(metric.replace('_', ' ').title(), lang="en")))
        
        eval_uri = EX[f"{run_id}_{metric}_evaluation"]
        graph.add((eval_uri, RDF.type, MLS.ModelEvaluation))
        graph.add((eval_uri, MLS.specifiedBy, measure_uri))
        graph.add((eval_uri, MLS.hasValue, Literal(model_metrics[metric], datatype=XSD.float)))
        graph.add((run_uri, MLS.hasOutput, eval_uri))
    
    graph.add((run_uri, MLS.hasOutput, model_uri))
    
    return run_uri, run_id

run_uri, run_id = create_detection_run(g, schema_uri, dataset_uri, impl_uri, model_uri, model_metrics)
print(f"Created detection run: {run_uri}")

Created detection run: https://w3id.org/miao/experiment#Run_4b34daf2


## 8. Model Individual Predictions

In [10]:
def create_predictions(graph, df_results, df_dataset, run_uri, run_id, category_map):
    """
    Create individual mental illness predictions for each sample.
    """
    # Create mental illness set
    illness_set_uri = EX[f"StressSet_{run_id}"]
    graph.add((illness_set_uri, RDF.type, MIAO.MentalIllnessesSet))
    graph.add((illness_set_uri, RDFS.label, 
               Literal(f"Stress detection results from run {run_id}", lang="en")))
    graph.add((illness_set_uri, PROV.wasGeneratedBy, run_uri))
    graph.add((run_uri, PROV.generated, illness_set_uri))
    
    # Define Stress subclass
    stress_class = EX.Stress
    graph.add((stress_class, RDFS.subClassOf, MIAO.MentalIllness))
    graph.add((stress_class, RDFS.label, Literal("Stress", lang="en")))
    
    # Create individual predictions
    for idx, row in df_results.iterrows():
        sample_id = row['sample_id']
        predicted_label = row['predicted_label']
        confidence = row['confidence']
        
        # Create illness instance
        illness_uri = EX[f"Stress_{run_id}_sample_{sample_id}"]
        graph.add((illness_uri, RDF.type, stress_class))
        graph.add((illness_uri, MIAO.belongsToMentalIllnessesSet, illness_set_uri))
        graph.add((illness_set_uri, MIAO.hasMentalIllness, illness_uri))
        
        # Add category reference
        category_uri = category_map[predicted_label]
        graph.add((illness_uri, MIAO.referredToMentalIllnessCategory, category_uri))
        
        # Add confidence
        graph.add((illness_uri, MIAO.hasMentalIllnessDetectionConfidence, 
                   Literal(float(confidence), datatype=XSD.decimal)))
        
        # Add sample reference
        sample_ref = f"text_sample_{sample_id}"
        graph.add((illness_uri, MIAO.refersToSample, Literal(sample_ref, datatype=XSD.string)))
        
        # Add label and description
        label_text = "Stress" if predicted_label == 1 else "No Stress"
        graph.add((illness_uri, RDFS.label, 
                   Literal(f"{label_text} prediction for sample {sample_id}", lang="en")))
        
        # Optional: add text snippet (first 100 chars)
        if sample_id < len(df_dataset):
            text_snippet = df_dataset.iloc[sample_id]['text'][:100]
            graph.add((illness_uri, DCTERMS.description, 
                       Literal(f"Prediction for text: '{text_snippet}...'", lang="en")))
    
    return illness_set_uri

illness_set_uri = create_predictions(g, df_results, df_dataset, run_uri, run_id, category_map)
print(f"Created {len(df_results)} predictions in set: {illness_set_uri}")

Created 5 predictions in set: https://w3id.org/miao/experiment#StressSet_4b34daf2


## 9. Export RDF Graph

In [11]:
# Print statistics
print("\n" + "="*50)
print("RDF GRAPH STATISTICS")
print("="*50)
print(f"Total triples: {len(g)}")
print(f"\nTriples by type:")

# Count by type
type_counts = {}
for s, p, o in g.triples((None, RDF.type, None)):
    obj_str = str(o).split('#')[-1].split('/')[-1]
    type_counts[obj_str] = type_counts.get(obj_str, 0) + 1

for type_name, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"  {type_name}: {count}")


RDF GRAPH STATISTICS
Total triples: 124

Triples by type:
  Stress: 5
  EvaluationMeasure: 4
  ModelEvaluation: 4
  HyperParameter: 3
  MentalIllnessCategory: 2
  MentalIllnessesSchema: 1
  Dataset: 1
  Software: 1
  Implementation: 1
  Model: 1
  AutomaticMentalIllnessesDetection: 1
  Run: 1
  MentalIllnessesSet: 1


In [12]:
# Export to Turtle format
output_file = "stress_detection_experiment.ttl"
g.serialize(destination=output_file, format="turtle")
print(f"\nRDF graph exported to: {output_file}")
print(f"File size: {len(g.serialize(format='turtle'))} bytes")


RDF graph exported to: stress_detection_experiment.ttl
File size: 6747 bytes


## 10. Sample SPARQL Queries

In [13]:
# Query 1: Get all predictions with confidence > 0.8
query1 = """
PREFIX miao: <https://w3id.org/miao#>
PREFIX ex: <https://w3id.org/miao/experiment#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?illness ?label ?category ?confidence ?sample
WHERE {
  ?illness a ex:Stress ;
           rdfs:label ?label ;
           miao:referredToMentalIllnessCategory ?category ;
           miao:hasMentalIllnessDetectionConfidence ?confidence ;
           miao:refersToSample ?sample .
  FILTER(?confidence > 0.8)
}
ORDER BY DESC(?confidence)
"""

print("Query 1: High-confidence predictions (> 0.8)")
print("="*50)
results = g.query(query1)
for row in results:
    print(f"Sample: {row.sample}, Confidence: {row.confidence}, Category: {str(row.category).split('#')[-1]}")
print(f"\nTotal: {len(results)} predictions")

Query 1: High-confidence predictions (> 0.8)
Sample: text_sample_3, Confidence: 0.95, Category: Stress_Negative
Sample: text_sample_0, Confidence: 0.92, Category: Stress_Positive
Sample: text_sample_1, Confidence: 0.87, Category: Stress_Negative
Sample: text_sample_4, Confidence: 0.84, Category: Stress_Positive

Total: 4 predictions


In [14]:
# Query 2: Get model performance metrics
query2 = """
PREFIX mls: <http://www.w3.org/ns/mls#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?metric ?value
WHERE {
  ?evaluation a mls:ModelEvaluation ;
              mls:specifiedBy ?measure ;
              mls:hasValue ?value .
  ?measure rdfs:label ?metric .
}
ORDER BY ?metric
"""

print("\nQuery 2: Model Performance Metrics")
print("="*50)
results = g.query(query2)
for row in results:
    print(f"{row.metric}: {float(row.value):.3f}")


Query 2: Model Performance Metrics
Accuracy: 0.850
F1 Score: 0.850
Precision: 0.830
Recall: 0.880


In [15]:
# Query 3: Count predictions by category
query3 = """
PREFIX miao: <https://w3id.org/miao#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT ?categoryTitle (COUNT(?illness) as ?count)
WHERE {
  ?illness miao:referredToMentalIllnessCategory ?category .
  ?category dcterms:title ?categoryTitle .
}
GROUP BY ?categoryTitle
ORDER BY DESC(?count)
"""

print("\nQuery 3: Prediction Distribution")
print("="*50)
results = g.query(query3)
for row in results:
    print(f"{row.categoryTitle}: {row.count} predictions")


Query 3: Prediction Distribution
Stress: <built-in method count of ResultRow object at 0x745ff7dd6bb0> predictions
No Stress: <built-in method count of ResultRow object at 0x745ff7dd6750> predictions


## 11. Validation Report

In [16]:
# Generate validation report
print("\n" + "="*70)
print("MIAO MODELING VALIDATION REPORT")
print("="*70)

# Check required components
checks = [
    ("Mental Illness Schema", len(list(g.triples((None, RDF.type, MIAO.MentalIllnessesSchema)))), 1),
    ("Mental Illness Categories", len(list(g.triples((None, RDF.type, MIAO.MentalIllnessCategory)))), 2),
    ("Dataset", len(list(g.triples((None, RDF.type, MLS.Dataset)))), 1),
    ("ML Implementation", len(list(g.triples((None, RDF.type, MLS.Implementation)))), 1),
    ("Trained Model", len(list(g.triples((None, RDF.type, MLS.Model)))), 1),
    ("Detection Run", len(list(g.triples((None, RDF.type, MIAO.AutomaticMentalIllnessesDetection)))), 1),
    ("Mental Illness Set", len(list(g.triples((None, RDF.type, MIAO.MentalIllnessesSet)))), 1),
    ("Individual Predictions", len(list(g.triples((None, MIAO.hasMentalIllnessDetectionConfidence, None)))), len(df_results)),
    ("Model Evaluations", len(list(g.triples((None, RDF.type, MLS.ModelEvaluation)))), 4),
]

all_passed = True
for component, actual, expected in checks:
    status = "PASS" if actual >= expected else "FAIL"
    if status == "FAIL":
        all_passed = False
    print(f"[{status}] {component}: {actual}/{expected}")

print("\n" + "="*70)
if all_passed:
    print("VALIDATION PASSED: All MIAO components correctly modeled")
else:
    print("VALIDATION FAILED: Some components are missing or incomplete")
print("="*70)


MIAO MODELING VALIDATION REPORT
[PASS] Mental Illness Schema: 1/1
[PASS] Mental Illness Categories: 2/2
[PASS] Dataset: 1/1
[PASS] ML Implementation: 1/1
[PASS] Trained Model: 1/1
[PASS] Detection Run: 1/1
[PASS] Mental Illness Set: 1/1
[PASS] Individual Predictions: 5/5
[PASS] Model Evaluations: 4/4

VALIDATION PASSED: All MIAO components correctly modeled
