In [1]:
import requests
import csv
from io import StringIO

# Example file: AboutPage_propsToTypes.csv
GITHUB_RAW_URL = "https://raw.githubusercontent.com/wbsg-uni-mannheim/wdc-sotab/main/data/PropsToTypes/AboutPage_propsToTypes.csv"

response = requests.get(GITHUB_RAW_URL)
response.raise_for_status()  # Raise an error for bad status

csvfile = StringIO(response.text)
reader = csv.DictReader(csvfile)

# Get first 10 properties
for i, row in enumerate(reader):
    if i >= 10:
        break
    print(f"{i+1}. Property: {row['property']} | Expected Types: {row['expected_types']}")


1. Property: AboutPage.breadcrumb | Expected Types: ['BreadcrumbList', 'Text']
2. Property: AboutPage.lastReviewed | Expected Types: Date
3. Property: AboutPage.mainContentOfPage | Expected Types: WebPageElement
4. Property: AboutPage.primaryImageOfPage | Expected Types: ImageObject
5. Property: AboutPage.relatedLink | Expected Types: URL
6. Property: AboutPage.reviewedBy | Expected Types: ['Organization', 'Person']
7. Property: AboutPage.significantLink | Expected Types: URL
8. Property: AboutPage.speakable | Expected Types: ['SpeakableSpecification', 'URL']
9. Property: AboutPage.specialty | Expected Types: Specialty
10. Property: AboutPage.about | Expected Types: Thing


In [2]:
import requests
import csv
import pandas as pd
import ast

# 1. Choose the file and base link
type_name = "AboutPage"
github_csv_url = f"https://raw.githubusercontent.com/wbsg-uni-mannheim/wdc-sotab/main/data/PropsToTypes/{type_name}_propsToTypes.csv"
schema_link = f"https://schema.org/{type_name}"

# 2. Download and parse the CSV
response = requests.get(github_csv_url)
response.raise_for_status()
rows = list(csv.DictReader(response.text.splitlines()))

# 3. Process and explode rows with multiple expected types
output_rows = []
for row in rows:
    prop = row["property"]
    expected_types = row["expected_types"]
    try:
        # Handle expected_types as a list (if present)
        types = ast.literal_eval(expected_types) if expected_types.startswith("[") else [expected_types]
    except:
        types = [expected_types]
    for t in types:
        t = t.strip().strip("'").strip('"')
        output_rows.append({
            "schema_link": schema_link,
            "property": prop,
            "expected_type": t
        })

# 4. Save to Excel
df = pd.DataFrame(output_rows)
excel_file = f"{type_name}_properties.xlsx"
df.to_excel(excel_file, index=False)

print(f"Excel file created: {excel_file}")


Excel file created: AboutPage_properties.xlsx


In [4]:
import requests
import pandas as pd
import ast
from io import StringIO

# GitHub API to list folder contents
api_url = "https://api.github.com/repos/wbsg-uni-mannheim/wdc-sotab/contents/data/PropsToTypes"
raw_prefix = "https://raw.githubusercontent.com/wbsg-uni-mannheim/wdc-sotab/main/data/PropsToTypes/"

headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(api_url, headers=headers)
files_json = response.json()

# Get only CSV files
csv_files = [f['name'] for f in files_json if f['name'].endswith('.csv')]
print("CSV files found:", csv_files[:3], "...")

all_rows = []

# For testing, just process first 3 CSVs
for fname in csv_files[:3]:
    type_name = fname.replace("_propsToTypes.csv", "")
    schema_link = f"https://schema.org/{type_name}"
    raw_url = raw_prefix + fname
    try:
        r = requests.get(raw_url, headers=headers)
        r.raise_for_status()
        df = pd.read_csv(StringIO(r.text))
    except Exception as e:
        print(f"Failed to process {fname}: {e}")
        continue
    for _, row in df.iterrows():
        prop = row["property"]
        expected_types = str(row["expected_types"])
        try:
            types = ast.literal_eval(expected_types) if expected_types.strip().startswith("[") else [expected_types]
        except Exception:
            types = [expected_types]
        for t in types:
            t = str(t).strip().strip("'").strip('"')
            all_rows.append({
                "schema_link": f'=HYPERLINK("{schema_link}", "{type_name}")',
                "property": prop,
                "expected_type": t
            })

if all_rows:
    df_out = pd.DataFrame(all_rows)
    df_out.to_excel("First3PropsToTypes.xlsx", index=False)
    print("Excel file created: First3PropsToTypes.xlsx")
else:
    print("No data found!")


CSV files found: ['AboutPage_propsToTypes.csv', 'Action_propsToTypes.csv', 'AdministrativeArea_propsToTypes.csv'] ...
Excel file created: First3PropsToTypes.xlsx


In [5]:
import requests
import pandas as pd
import ast
from io import StringIO

# 1. Get all CSV file names from GitHub API
api_url = "https://api.github.com/repos/wbsg-uni-mannheim/wdc-sotab/contents/data/PropsToTypes"
raw_prefix = "https://raw.githubusercontent.com/wbsg-uni-mannheim/wdc-sotab/main/data/PropsToTypes/"

headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(api_url, headers=headers)
files_json = response.json()

csv_files = [f['name'] for f in files_json if f['name'].endswith('.csv')]
print(f"Total CSV files found: {len(csv_files)}")

all_rows = []

# 2. Process ALL CSV files
for fname in csv_files:
    type_name = fname.replace("_propsToTypes.csv", "")
    schema_link = f"https://schema.org/{type_name}"
    raw_url = raw_prefix + fname
    try:
        r = requests.get(raw_url, headers=headers)
        r.raise_for_status()
        df = pd.read_csv(StringIO(r.text))
    except Exception as e:
        print(f"Failed to process {fname}: {e}")
        continue
    for _, row in df.iterrows():
        prop = row["property"]
        expected_types = str(row["expected_types"])
        # Explode lists like ['Person','Organization'] into separate lines
        try:
            types = ast.literal_eval(expected_types) if expected_types.strip().startswith("[") else [expected_types]
        except Exception:
            types = [expected_types]
        for t in types:
            t = str(t).strip().strip("'").strip('"')
            all_rows.append({
                "schema_link": f'=HYPERLINK("{schema_link}", "{type_name}")',
                "property": prop,
                "expected_type": t
            })

# 3. Write to Excel
if all_rows:
    df_out = pd.DataFrame(all_rows)
    excel_file = "AllPropsToTypes.xlsx"
    df_out.to_excel(excel_file, index=False)
    print(f"Excel file created: {excel_file}")
else:
    print("No data found!")


Total CSV files found: 116
Excel file created: AllPropsToTypes.xlsx


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_direct_properties(type_name):
    url = f"https://schema.org/{type_name}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    tables = soup.find_all('table')
    results = []

    for table in tables:
        # Find the section header just before the table
        prev = table.find_previous(['h4', 'h3', 'h2', 'h5'])
        if prev and prev.text.strip() == f"Properties from {type_name}":
            # Go through rows
            for row in table.find_all('tr'):
                cells = row.find_all(['th', 'td'])
                if len(cells) == 2:  # Skip header
                    prop = cells[0].get_text(strip=True)
                    exptype = cells[1].get_text(strip=True)
                    results.append({'type': type_name, 'property': prop, 'expected_type': exptype})
    return results

# Example for AboutPage
props = get_direct_properties("AboutPage")
df = pd.DataFrame(props)
print(df)
df.to_excel("AboutPage_direct_properties.xlsx", index=False)


Excel file created: AllPropsToTypes_noThingProps.xlsx


Going all the way 

In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_direct_properties(type_name):
    url = f"https://schema.org/{type_name}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    results = []
    table = soup.find("table", class_="definition-table")
    if not table:
        return results
    rows = table.find_all("tr")
    in_correct_section = False
    for row in rows:
        # Section header row
        if "supertype" in row.get("class", []):
            th = row.find("th", class_="supertype-name")
            if th and th.find("a") and th.find("a").get_text(strip=True) == type_name:
                in_correct_section = True
            else:
                in_correct_section = False
            continue
        # Data row (must have property and type and description)
        if in_correct_section and row.find("th", class_="prop-nam"):
            prop = row.find("th", class_="prop-nam").get_text(strip=True)
            type_cell = row.find("td", class_="prop-ect")
            # Type can have multiple <a> or text, sometimes separated by <br/>
            if type_cell:
                # This reliably gets the human-readable, joined text (with or, |, etc)
                types_text = type_cell.get_text(separator="|", strip=True)
                types = [t.strip() for t in types_text.replace("or", "|").split("|") if t.strip()]
            else:
                types = [""]
            for exptype in types:
                results.append({
                    "type": type_name,
                    "property": prop,
                    "expected_type": exptype
                })
    return results

# Test for WebPage and AboutPage
for t in ["AboutPage", "WebPage"]:
    props = get_direct_properties(t)
    df = pd.DataFrame(props)
    print(f"Direct properties for {t}:")
    print(df)
    df.to_excel(f"{t}_direct_properties.xlsx", index=False)


Direct properties for AboutPage:
Empty DataFrame
Columns: []
Index: []
Direct properties for WebPage:
       type            property           expected_type
0   WebPage          breadcrumb          BreadcrumbList
1   WebPage          breadcrumb                    Text
2   WebPage        lastReviewed                    Date
3   WebPage   mainContentOfPage          WebPageElement
4   WebPage  primaryImageOfPage             ImageObject
5   WebPage         relatedLink                     URL
6   WebPage          reviewedBy            Organization
7   WebPage          reviewedBy                  Person
8   WebPage     significantLink                     URL
9   WebPage           speakable  SpeakableSpecification
10  WebPage           speakable                     URL
11  WebPage           specialty               Specialty


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_direct_properties(type_name):
    url = f"https://schema.org/{type_name}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    results = []
    table = soup.find("table", class_="definition-table")
    if not table:
        return results
    rows = table.find_all("tr")
    in_correct_section = False
    for row in rows:
        # Section header row
        if "supertype" in row.get("class", []):
            th = row.find("th", class_="supertype-name")
            if th and th.find("a") and th.find("a").get_text(strip=True) == type_name:
                in_correct_section = True
            else:
                in_correct_section = False
            continue
        # Data row (must have property and type)
        if in_correct_section and row.find("th", class_="prop-nam"):
            prop_cell = row.find("th", class_="prop-nam")
            # The property name and link
            prop = prop_cell.get_text(strip=True)
            prop_link_tag = prop_cell.find("a")
            if prop_link_tag and prop_link_tag.get("href"):
                prop_url = "https://schema.org" + prop_link_tag.get("href")
            else:
                prop_url = f"https://schema.org/{prop}"
            type_cell = row.find("td", class_="prop-ect")
            if type_cell:
                types_text = type_cell.get_text(separator="|", strip=True)
                types = [t.strip() for t in types_text.replace("or", "|").split("|") if t.strip()]
            else:
                types = [""]
            for exptype in types:
                results.append({
                    "property_link": f'=HYPERLINK("{prop_url}", "{prop}")',
                    "type_property": f"{type_name}.{prop}",
                    "expected_type": exptype
                })
    return results

# 1. Get all types from GitHub
api_url = "https://api.github.com/repos/wbsg-uni-mannheim/wdc-sotab/contents/data/PropsToTypes"
headers = {'User-Agent': 'Mozilla/5.0'}
files_json = requests.get(api_url, headers=headers).json()
type_names = sorted({f['name'].replace("_propsToTypes.csv", "") for f in files_json if f['name'].endswith('.csv')})

# 2. Loop over all types, scrape direct properties
all_results = []
for i, t in enumerate(type_names):
    print(f"[{i+1}/{len(type_names)}] {t}...", end="")
    props = get_direct_properties(t)
    if props:
        all_results.extend(props)
        print(f" {len(props)} direct properties found.")
    else:
        print(" No direct properties.")
    time.sleep(0.15)  # Be nice to schema.org

# 3. Save all results in one Excel
df = pd.DataFrame(all_results)
excel_file = "AllSchemaOrgDirectProperties_withLinks.xlsx"
df.to_excel(excel_file, index=False)
print(f"All done! Output in {excel_file}")


[1/116] AboutPage... No direct properties.
[2/116] Action... 22 direct properties found.
[3/116] AdministrativeArea... No direct properties.
[4/116] AggregateRating... 3 direct properties found.
[5/116] Airport... 2 direct properties found.
[6/116] Audience... 2 direct properties found.
[7/116] AudioObject... 4 direct properties found.
[8/116] Book... 7 direct properties found.
[9/116] BookFormatType... No direct properties.
[10/116] Boolean... No direct properties.
[11/116] Brand... 5 direct properties found.
[12/116] CategoryCode... 4 direct properties found.
[13/116] City... No direct properties.
[14/116] Clip... 17 direct properties found.
[15/116] CollegeOrUniversity... No direct properties.
[16/116] Comment... 7 direct properties found.
[17/116] ContactPoint... 14 direct properties found.
[18/116] Continent... No direct properties.
[19/116] Country... No direct properties.
[20/116] CreativeWork... 184 direct properties found.
[21/116] CreativeWorkSeason... 16 direct properties fo

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def extract_expected_types(type_cell):
    types = [a.get_text(strip=True) for a in type_cell.find_all("a")]
    last_text = type_cell.get_text(" ", strip=True)
    if types:
        last_type = types[-1]
        if last_type in last_text:
            after_last = last_text.split(last_type, 1)[-1]
            # Get plain text types after the last linked type
            extras = [s.strip() for s in after_last.split("or") if s.strip()]
            for extra in extras:
                if extra and extra not in types:
                    types.append(extra)
    else:
        types = [s.strip() for s in last_text.split("or") if s.strip()]
    return types

def get_direct_properties(type_name):
    url = f"https://schema.org/{type_name}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    results = []
    table = soup.find("table", class_="definition-table")
    if not table:
        return results
    rows = table.find_all("tr")
    in_correct_section = False
    for row in rows:
        if "supertype" in row.get("class", []):
            th = row.find("th", class_="supertype-name")
            if th and th.find("a") and th.find("a").get_text(strip=True) == type_name:
                in_correct_section = True
            else:
                in_correct_section = False
            continue
        if in_correct_section and row.find("th", class_="prop-nam"):
            prop_cell = row.find("th", class_="prop-nam")
            prop = prop_cell.get_text(strip=True)
            prop_link_tag = prop_cell.find("a")
            if prop_link_tag and prop_link_tag.get("href"):
                prop_url = "https://schema.org" + prop_link_tag.get("href")
            else:
                prop_url = f"https://schema.org/{prop}"
            type_cell = row.find("td", class_="prop-ect")
            if type_cell:
                types = extract_expected_types(type_cell)
            else:
                types = [""]
            for exptype in types:
                results.append({
                    "property_link": f'=HYPERLINK("{prop_url}", "{prop}")',
                    "type_property": f"{type_name}.{prop}",
                    "expected_type": exptype
                })
    return results

# 1. Get all types from GitHub
api_url = "https://api.github.com/repos/wbsg-uni-mannheim/wdc-sotab/contents/data/PropsToTypes"
headers = {'User-Agent': 'Mozilla/5.0'}
files_json = requests.get(api_url, headers=headers).json()
type_names = sorted({f['name'].replace("_propsToTypes.csv", "") for f in files_json if f['name'].endswith('.csv')})

# 2. Loop over all types, scrape direct properties
all_results = []
for i, t in enumerate(type_names):
    print(f"[{i+1}/{len(type_names)}] {t}...", end="")
    props = get_direct_properties(t)
    if props:
        all_results.extend(props)
        print(f" {len(props)} direct properties found.")
    else:
        print(" No direct properties.")
    time.sleep(0.15)  # Be nice to schema.org

# 3. Save all results in one Excel
df = pd.DataFrame(all_results)
excel_file = "AllSchemaOrgDirectProperties_withLinks.xlsx"
df.to_excel(excel_file, index=False)
print(f"All done! Output in {excel_file}")


[1/116] AboutPage... No direct properties.
[2/116] Action... 22 direct properties found.
[3/116] AdministrativeArea... No direct properties.
[4/116] AggregateRating... 3 direct properties found.
[5/116] Airport... 2 direct properties found.
[6/116] Audience... 2 direct properties found.
[7/116] AudioObject... 4 direct properties found.
[8/116] Book... 6 direct properties found.
[9/116] BookFormatType... No direct properties.
[10/116] Boolean... No direct properties.
[11/116] Brand... 5 direct properties found.
[12/116] CategoryCode... 3 direct properties found.
[13/116] City... No direct properties.
[14/116] Clip... 14 direct properties found.
[15/116] CollegeOrUniversity... No direct properties.
[16/116] Comment... 5 direct properties found.
[17/116] ContactPoint... 14 direct properties found.
[18/116] Continent... No direct properties.
[19/116] Country... No direct properties.
[20/116] CreativeWork... 169 direct properties found.
[21/116] CreativeWorkSeason... 14 direct properties fo

PermissionError: [Errno 13] Permission denied: 'AllSchemaOrgDirectProperties_withLinks.xlsx'

Frequencies

In [5]:
import pandas as pd

# Load your Excel file
df = pd.read_excel("AllSchemaOrgDirectProperties_withLinks.xlsx")

# Frequency of schema.org types (first column)
type_counts = df['property_link'].value_counts().reset_index()
type_counts.columns = ['property_link', 'count']

# Frequency of expected types (third column)
expected_counts = df['expected_type'].value_counts().reset_index()
expected_counts.columns = ['expected_type', 'count']

# Display top 10 of each (optional)
print("Top 10 Schema.org types by property count:")
print(type_counts.head(10))

print("\nTop 10 expected types by count:")
print(expected_counts.head(10))

# Save results to Excel
with pd.ExcelWriter("PropsToTypes_Frequency.xlsx") as writer:
    type_counts.to_excel(writer, sheet_name='Type_Frequency', index=False)
    expected_counts.to_excel(writer, sheet_name='ExpectedType_Frequency', index=False)

print("Frequency distributions saved to PropsToTypes_Frequency.xlsx")


Top 10 Schema.org types by property count:
    property_link  count
0      areaServed     20
1  valueReference     16
2        location     16
3        keywords     15
4        category     15
5    validThrough     14
6     itemOffered     14
7       validFrom     12
8           value     12
9           actor     10

Top 10 expected types by count:
       expected_type  count
0               Text    311
1                URL     85
2             Person     76
3       Organization     59
4               Date     39
5              Place     39
6           DateTime     38
7  QuantitativeValue     36
8        DefinedTerm     33
9             Number     32
Frequency distributions saved to PropsToTypes_Frequency.xlsx
