In [1]:
!pip install transformers datasets scikit-learn pandas

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none-any.w

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd

# 1. Load your structured data (CSV with 'input' and 'output' columns)
DATA_CSV = "your_data.csv"  # Replace with your file path
df = pd.read_csv(DATA_CSV)

# Fail fast with a readable message: the tokenization step indexes exactly these
# two columns, and a missing one would otherwise surface as a KeyError inside
# `Dataset.map`.
missing_columns = {"input", "output"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_CSV} is missing required column(s): {sorted(missing_columns)}"
    )

dataset = Dataset.from_pandas(df)

# 2. Choose a model (T5 is good for text-to-text tasks)
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 3. Tokenize the dataset
def preprocess(example):
    """Tokenize source and target text and attach loss-ready labels.

    Args:
        example: A mapping with "input" and "output" entries. With
            ``Dataset.map(..., batched=True)`` each entry is a list of strings;
            a single (non-batched) example with plain strings is also accepted.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 128 tokens)
        with an added "labels" entry holding the target token ids.

    Padding positions in the labels are replaced with -100, the index that
    Hugging Face seq2seq models ignore when computing the loss. Leaving the
    pad token id in place would train the model to predict padding for most
    of every 128-token target.
    """
    inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(example["output"], padding="max_length", truncation=True, max_length=128)

    pad_id = tokenizer.pad_token_id

    def mask_padding(ids):
        # Keep real tokens, blank out padding so it does not contribute to the loss.
        return [token if token != pad_id else -100 for token in ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Non-batched call: a single flat list of token ids.
        inputs["labels"] = mask_padding(label_ids)
    else:
        # Batched call: one list of token ids per example.
        inputs["labels"] = [mask_padding(ids) for ids in label_ids]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# 4. Split into train/test
# A fixed seed keeps the 90/10 split identical across kernel restarts, so
# evaluation numbers from different runs are comparable.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# 5. Define training arguments
# NOTE: the keyword is `eval_strategy`; the older `evaluation_strategy` name was
# deprecated and is rejected by recent transformers releases (this notebook
# installs 4.52.4), raising a TypeError before training starts.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
)

# 6. Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# 7. Save the model
# Model and tokenizer go to the same directory so it can be reloaded with
# `from_pretrained("./trained_model")` for both.
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


FileNotFoundError: [Errno 2] No such file or directory: 'your_data.csv'

In [1]:
def generate_output(input_text):
    """Generate the model's text output for one input string.

    Relies on the notebook-level `tokenizer` and `model` objects, so the cell
    that loads (or trains) them must have been run in the current kernel first;
    on a fresh kernel this raises NameError.

    Args:
        input_text: The source text to feed to the seq2seq model.

    Returns:
        The decoded first generated sequence (at most 128 tokens), with special
        tokens removed.
    """
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    # After training, the model may live on an accelerator (CUDA / Apple MPS)
    # while the tokenizer always returns CPU tensors; generation fails on a
    # device mismatch, so move the inputs to wherever the model is.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generate_output("Product: Apple, Price: 1.2, Quantity: 10"))


NameError: name 'tokenizer' is not defined

In [None]:
<file:///Users/piotr/repos/Iliad/data_access_api/examples/Observations_Features/Jellyfish_in_Israeli_Mediterranean_coast11940737.291666666711232Rhopilema_nomadica> a sosa:Observation,
        geojson:Feature ;
    dct:date "13/07/2011" ;
    dct:time "07:00" ;
    geojson:geometry [ a geojson:Point ;
            geojson:coordinates ( 3.487848e+01 3.244416e+01 ) ] ;
    ns1:strandedJellyfish 0 ;
    iliad:sampleSizeValue "Oct-30" .

<file:///Users/piotr/repos/Iliad/data_access_api/examples/Observations_Features/Jellyfish_in_Israeli_Mediterranean_coast12040737.29166666670-10Rhizostoma_pulmo> a sosa:Observation,
        geojson:Feature ;
    dct:date "13/07/2011" ;
    dct:time "07:00" ;
    geojson:geometry [ a geojson:Point ;
            geojson:coordinates ( 3.479095e+01 3.235545e+01 ) ] ;
    ns1:strandedJellyfish 1 ;
    iliad:sampleSizeValue "0-10" .

In [2]:

import re

# Load the RDF-like data
INPUT_PATH = "/Users/piotr/Temp/Jellyfish_trends.txt"
with open(INPUT_PATH, "r", encoding="utf-8") as file:
    data = file.read()

# Split into individual observations (each one starts with a <file:///...> subject)
observations = data.split("<file:///")[1:]

# Every pattern is anchored on its predicate name. Unanchored patterns pick up
# characters from the predicate itself: the "1" in "ns1", the "e"s in
# "geojson:coordinates", or the whole predicate text in front of the sample size.
_FLOAT = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
# Trailing run of letters/underscores only, so numeric id fragments glued in
# front of the name (e.g. "...0-10Rhizostoma_pulmo") are not part of the species.
# NOTE(review): assumes species names contain no digits — confirm against the data.
SPECIES_RE = re.compile(r"([^\W\d_][^\W\d]*)$")
DATE_RE = re.compile(r'dct:date\s+"([^"]*)"')
TIME_RE = re.compile(r'dct:time\s+"([^"]*)"')
COORDS_RE = re.compile(rf"geojson:coordinates\s*\(\s*({_FLOAT})\s+({_FLOAT})\s*\)")
STRANDED_RE = re.compile(r"ns1:strandedJellyfish\s+(\d+)")
# Accept a quoted literal or a bare token.
SAMPLE_SIZE_RE = re.compile(r'iliad:sampleSizeValue\s+(?:"([^"]*)"|(\S+))')


def _capture(pattern, text, default="Unknown"):
    """Return the first matched capture group of `pattern` in `text`, else `default`."""
    match = pattern.search(text)
    if match is None:
        return default
    return next((group for group in match.groups() if group is not None), default)


def parse_observation(obs):
    """Extract the fields of one observation block.

    Args:
        obs: Text of a single observation, starting right after "<file:///".

    Returns:
        A dict with keys date, time, longitude, latitude, species, stranded and
        sample_size. Any field that cannot be found is the string "Unknown".
    """
    uri = obs.split(">", 1)[0].strip()
    longitude = latitude = "Unknown"
    coords_match = COORDS_RE.search(obs)
    if coords_match is not None:
        # The first coordinate is treated as longitude, the second as latitude.
        longitude, latitude = coords_match.groups()
    return {
        "date": _capture(DATE_RE, obs),
        "time": _capture(TIME_RE, obs),
        "longitude": longitude,
        "latitude": latitude,
        "species": _capture(SPECIES_RE, uri),
        "stranded": _capture(STRANDED_RE, obs),
        "sample_size": _capture(SAMPLE_SIZE_RE, obs),
    }


def build_query(fields):
    """Format one SPARQL SELECT query from the dict returned by `parse_observation`."""
    date = fields["date"]
    time = fields["time"]
    stranded = fields["stranded"]
    sample_size = fields["sample_size"]
    species = fields["species"]
    if "Unknown" in (fields["longitude"], fields["latitude"]):
        coords = "Unknown"
    else:
        # SPARQL collections are whitespace-separated: "( a b )", not "(a, b)".
        coords = f"( {fields['longitude']} {fields['latitude']} )"

    # NOTE(review): the query declares no PREFIXes and selects variables that the
    # WHERE clause never binds (only ?obs and ?species are bound) — confirm that
    # this is the intended query shape for the target endpoint.
    query = f"""
    SELECT ?date ?time ?coordinates ?species ?stranded ?sampleSize
    WHERE {{
        ?obs a sosa:Observation ;
             dct:date "{date}" ;
             dct:time "{time}" ;
             geojson:geometry [ geojson:coordinates {coords} ] ;
             ns1:strandedJellyfish {stranded} ;
             iliad:sampleSizeValue "{sample_size}" .
        BIND("{species}" AS ?species)
    }}
    """
    return query.strip()


sparql_queries = [build_query(parse_observation(obs)) for obs in observations]

# Save to file
with open("sparql_queries.txt", "w", encoding="utf-8") as out:
    out.write("\n\n".join(sparql_queries))

print(f"Generated {len(sparql_queries)} SPARQL queries.")


Generated 2000 SPARQL queries.


In [5]:
import re
import csv

# Input and output file paths
input_file = "/Users/piotr/Temp/Jellyfish_trends.txt"
output_file = "/Users/piotr/Temp/jellyfish_observations.csv"

# Every pattern is anchored on its predicate name. Unanchored patterns pick up
# characters from the predicate itself: the "1" in "ns1" (stranded count was
# always "1"), the "e"s in "geojson:coordinates" (coordinates were always
# "Unknown"), or the predicate text in front of the sample size value.
_NUMBER = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
_SPECIES = re.compile(r"([^\W\d_][^\W\d]*)$")  # trailing letters/underscores only
_DATE = re.compile(r'dct:date\s+"([^"]*)"')
_TIME = re.compile(r'dct:time\s+"([^"]*)"')
_COORDS = re.compile(rf"geojson:coordinates\s*\(\s*({_NUMBER})\s+({_NUMBER})\s*\)")
_STRANDED = re.compile(r"ns1:strandedJellyfish\s+(\d+)")
_SAMPLE_SIZE = re.compile(r'iliad:sampleSizeValue\s+(?:"([^"]*)"|(\S+))')


def _first_group(pattern, text, default="Unknown"):
    """Return the first matched capture group of `pattern` in `text`, else `default`."""
    found = pattern.search(text)
    if found is None:
        return default
    return next((group for group in found.groups() if group is not None), default)


def observation_to_row(obs):
    """Convert one observation block into a CSV row.

    Args:
        obs: Text of a single observation, starting right after "<file:///".

    Returns:
        A list in header order: date, time, latitude, longitude, species,
        strandedJellyfish, sampleSizeValue. Fields that cannot be found are
        the string "Unknown".
    """
    uri = obs.split(">", 1)[0].strip()
    # NOTE(review): assumes the species name is the trailing digit-free part of
    # the subject URI — confirm against the data.
    species = _first_group(_SPECIES, uri)

    latitude = longitude = "Unknown"
    coords = _COORDS.search(obs)
    if coords is not None:
        # The first coordinate is treated as longitude, the second as latitude.
        longitude, latitude = coords.groups()

    return [
        _first_group(_DATE, obs),
        _first_group(_TIME, obs),
        latitude,
        longitude,
        species,
        _first_group(_STRANDED, obs),
        _first_group(_SAMPLE_SIZE, obs),
    ]


# Read the RDF-like data
with open(input_file, "r", encoding="utf-8") as file:
    data = file.read()

# Split into individual observations (each one starts with a <file:///...> subject)
observations = data.split("<file:///")[1:]

# Prepare CSV output
header = ["date", "time", "latitude", "longitude", "species", "strandedJellyfish", "sampleSizeValue"]
rows = [observation_to_row(obs) for obs in observations]

# Write to CSV
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

print(f"Extracted {len(rows)} observations to {output_file}")


Extracted 2000 observations to /Users/piotr/Temp/jellyfish_observations.csv
