In [None]:
# @title Phase 0: Environment Setup and Library Installation
# @markdown Mount Google Drive to persist data and outputs.
from google.colab import drive
drive.mount('/content/drive')

# @markdown Install all required libraries quietly.
# @markdown We include `datasets` to easily load the data from Hugging Face.
!pip install -q pandas pyarrow torch torchvision torchaudio transformers spacy requests tqdm ftfy SPARQLWrapper datasets

# @markdown Download the SpaCy model for candidate generation.
!python -m spacy download en_core_web_lg

# @markdown Create the project directory structure in your Google Drive.
import os

# --- IMPORTANT ---
# You can change this path to your preferred location in Google Drive
PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
# -----------------

os.makedirs(os.path.join(PROJECT_ROOT, "data"), exist_ok=True)
os.makedirs(os.path.join(PROJECT_ROOT, "notebooks"), exist_ok=True)
os.makedirs(os.path.join(PROJECT_ROOT, "output"), exist_ok=True)
os.makedirs(os.path.join(PROJECT_ROOT, "src"), exist_ok=True)

print(f"Project directory created at: {PROJECT_ROOT}")
print("Setup complete. You can now proceed with the next phases.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting en-core-web-lg==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Project directory created at: /content/drive/MyDrive/ontology_project
Setup complete. You can now proceed with the next phases.


In [None]:
!unzip "/content/drive/MyDrive/ontology_project/data/cornell/cornell_movie_dialogs_corpus.zip" -d "/content/drive/MyDrive/ontology_project/data/cornell/"

Archive:  /content/drive/MyDrive/ontology_project/data/cornell/cornell_movie_dialogs_corpus.zip
   creating: /content/drive/MyDrive/ontology_project/data/cornell/cornell movie-dialogs corpus/
  inflating: /content/drive/MyDrive/ontology_project/data/cornell/cornell movie-dialogs corpus/.DS_Store  
   creating: /content/drive/MyDrive/ontology_project/data/cornell/__MACOSX/
   creating: /content/drive/MyDrive/ontology_project/data/cornell/__MACOSX/cornell movie-dialogs corpus/
  inflating: /content/drive/MyDrive/ontology_project/data/cornell/__MACOSX/cornell movie-dialogs corpus/._.DS_Store  
  inflating: /content/drive/MyDrive/ontology_project/data/cornell/cornell movie-dialogs corpus/chameleons.pdf  
  inflating: /content/drive/MyDrive/ontology_project/data/cornell/__MACOSX/cornell movie-dialogs corpus/._chameleons.pdf  
  inflating: /content/drive/MyDrive/ontology_project/data/cornell/cornell movie-dialogs corpus/movie_characters_metadata.txt  
  inflating: /content/drive/MyDrive/onto

In [None]:
# @title Debug: List the contents of the DailyDialog data directory
# Recursively list the DailyDialog data folder to check which archives and
# extracted files are currently present on Drive before running the loaders.
!ls -R /content/drive/MyDrive/ontology_project/data/daily_dialog/

/content/drive/MyDrive/ontology_project/data/daily_dialog/:
ijcnlp_dailydialog  ijcnlp_dailydialog.zip

/content/drive/MyDrive/ontology_project/data/daily_dialog/ijcnlp_dailydialog:
dialogues_act.txt      dialogues_text.txt   readme.txt	train.zip
dialogues_emotion.txt  dialogues_topic.txt  test.zip	validation.zip


In [None]:
# @title Cell 0.1: Project Setup and Dependencies
# @markdown Run this cell once per session to set up the environment.

# --- 1. Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- 2. Define Project Root ---
# All file paths will be relative to this root directory.
import os
PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"

# --- 3. Install All Required Packages ---
print("Installing required Python packages...")
!pip install -q pandas pyarrow torch transformers spacy requests tqdm ftfy SPARQLWrapper

# --- 4. Download SpaCy Language Model ---
print("\nDownloading SpaCy 'en_core_web_lg' model...")
!python -m spacy download en_core_web_lg

# --- 5. Create Project Directory Structure ---
print("\nCreating project directories...")
os.makedirs(os.path.join(PROJECT_ROOT, "data"), exist_ok=True)
os.makedirs(os.path.join(PROJECT_ROOT, "output"), exist_ok=True)
# Create subdirectories for manually downloaded data
os.makedirs(os.path.join(PROJECT_ROOT, "data/daily_dialog"), exist_ok=True)
os.makedirs(os.path.join(PROJECT_ROOT, "data/cornell"), exist_ok=True)
os.makedirs(os.path.join(PROJECT_ROOT, "data/lmsys"), exist_ok=True)


print("\nSetup complete. You may now proceed with the next cells.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Installing required Python packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.4/565.4 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25h
Downloading SpaCy 'en_core_web_lg' model...
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload depen

In [None]:
# @title Phase 1 (Final Version): Path and ValueError Fix
import pandas as pd
import re
import os
import glob
from google.colab import drive

# --- Setup and Mounting ---
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
UNIFIED_CORPUS_PATH = os.path.join(PROJECT_ROOT, "data/unified_corpus.parquet")

# --- Unzip Datasets (Run this once) ---
DAILYDIALOG_ZIP_PATH = os.path.join(PROJECT_ROOT, "data/daily_dialog/ijcnlp_dailydialog.zip")
DAILYDIALOG_EXTRACT_PATH = os.path.join(PROJECT_ROOT, "data/daily_dialog/")
if os.path.exists(DAILYDIALOG_ZIP_PATH):
    print("Unzipping main DailyDialog zip...")
    !unzip -q -o "{DAILYDIALOG_ZIP_PATH}" -d "{DAILYDIALOG_EXTRACT_PATH}"
    NESTED_TRAIN_ZIP_PATH = os.path.join(PROJECT_ROOT, "data/daily_dialog/ijcnlp_dailydialog/train.zip")
    NESTED_EXTRACT_PATH = os.path.join(PROJECT_ROOT, "data/daily_dialog/ijcnlp_dailydialog/")
    if os.path.exists(NESTED_TRAIN_ZIP_PATH):
        print("Unzipping nested train.zip...")
        !unzip -q -o "{NESTED_TRAIN_ZIP_PATH}" -d "{NESTED_EXTRACT_PATH}"

# --- Parsers for Manually Loaded Data ---

def clean_text(text):
    """Collapse every run of whitespace into a single space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

def load_dailydialog():
    """Load the DailyDialog training dialogues from the extracted text file.

    Each line of ``dialogues_train.txt`` is one dialogue whose utterances are
    separated by the ``__eou__`` marker; the marker is removed and the line is
    kept as a single document.

    Returns:
        pandas.DataFrame with columns ``text``, ``source_dataset``
        (always ``'daily_dialog'``) and ``doc_id`` (``dd_<row index>``).

    Raises:
        FileNotFoundError: if the training text file does not exist.
    """
    dialogues_path = os.path.join(PROJECT_ROOT, "data/daily_dialog/ijcnlp_dailydialog/train/dialogues_train.txt")
    if not os.path.exists(dialogues_path):
        raise FileNotFoundError(f"DailyDialog file not found at: {dialogues_path}")
    print("Loading and parsing DailyDialog dataset from text file...")
    # Context manager guarantees the handle is closed (the bare open() leaked it).
    with open(dialogues_path, 'r', encoding='utf-8') as dialogues_file:
        dialogues = [line.strip().replace('__eou__', '').strip() for line in dialogues_file]
    df = pd.DataFrame({'text': dialogues, 'source_dataset': 'daily_dialog'})
    df['doc_id'] = [f"dd_{i}" for i in range(len(df))]
    return df

def load_cornell():
    """Load the Cornell Movie Dialogs corpus as one document per conversation.

    ``movie_lines.txt`` is read into a ``line id -> utterance`` lookup (only
    records with exactly five ``' +++$+++ '``-separated fields are used), then
    each record of ``movie_conversations.txt`` is resolved to its utterances,
    which are joined with single spaces. Conversations that resolve to no text
    are dropped.

    Returns:
        pandas.DataFrame with columns ``text``, ``source_dataset``
        (always ``'cornell_movie_dialogs'``) and ``doc_id`` (``cornell_<row index>``).

    Raises:
        FileNotFoundError: if either corpus file is missing.
    """
    cornell_path = os.path.join(PROJECT_ROOT, "data/cornell/cornell movie-dialogs corpus")
    lines_path = os.path.join(cornell_path, 'movie_lines.txt')
    convos_path = os.path.join(cornell_path, 'movie_conversations.txt')
    # Check both inputs up front so a missing conversations file fails before any parsing work.
    if not (os.path.exists(lines_path) and os.path.exists(convos_path)):
        raise FileNotFoundError(f"Cornell dataset files not found at: {cornell_path}")
    print("Loading and parsing Cornell Movie Dialogs dataset...")

    field_separator = ' +++$+++ '

    # Context managers close both handles (the bare open() calls leaked them).
    lines_dict = {}
    with open(lines_path, 'r', encoding='iso-8859-1') as lines_file:
        for line in lines_file:
            parts = line.strip().split(field_separator)
            if len(parts) == 5:
                lines_dict[parts[0]] = parts[4]

    dialogues = []
    with open(convos_path, 'r', encoding='iso-8859-1') as convos_file:
        for line in convos_file:
            parts = line.strip().split(field_separator)
            if len(parts) != 4:
                continue
            # Last field looks like "['L194', 'L195']"; strip the list syntax to get the ids.
            line_ids = parts[3].replace("[", "").replace("]", "").replace("'", "").split(", ")
            full_dialogue = " ".join(lines_dict[lid] for lid in line_ids if lid in lines_dict)
            if full_dialogue:
                dialogues.append(full_dialogue)

    df = pd.DataFrame({'text': dialogues, 'source_dataset': 'cornell_movie_dialogs'})
    df['doc_id'] = [f"cornell_{i}" for i in range(len(df))]
    return df

def load_lmsys():
    """Load LMSYS-Chat-1M conversations from local Parquet shards, one shard at a time.

    Every conversation whose first turn comes from the user is flattened into a
    single document by joining the non-empty ``content`` of all turns with spaces.

    Returns:
        pandas.DataFrame with columns ``text``, ``source_dataset``
        (always ``'lmsys_chat'``) and ``doc_id`` (``lmsys_<row index>``).

    Raises:
        FileNotFoundError: if the shard directory is missing or holds no
            ``*.parquet`` files.
    """
    lmsys_dir_path = os.path.join(PROJECT_ROOT, "data/lmsys/data/")
    if not os.path.isdir(lmsys_dir_path):
        raise FileNotFoundError(f"LMSYS data directory not found at: {lmsys_dir_path}")

    # Sorted so that shard order, and therefore doc_id assignment, is stable across runs.
    parquet_files = sorted(glob.glob(os.path.join(lmsys_dir_path, "*.parquet")))
    if not parquet_files:
        raise FileNotFoundError(f"No Parquet files found in: {lmsys_dir_path}")

    print(f"Found {len(parquet_files)} LMSYS Parquet shards. Processing one by one...")

    # LMSYS-Chat-1M labels turns 'user' / 'assistant'. Matching only 'human'
    # discarded every conversation (the run reported "0 from LMSYS"); 'human' is
    # still accepted for data that uses that convention.
    opening_roles = {'user', 'human'}

    all_conversations = []
    for i, file_path in enumerate(parquet_files):
        print(f"  > Processing shard {i+1}/{len(parquet_files)}: {os.path.basename(file_path)}...")
        # Only the conversation column is needed; skipping the rest lowers peak memory.
        df_shard = pd.read_parquet(file_path, columns=['conversation'])

        for conv_list in df_shard['conversation']:
            # Explicit length check: the cell may be an array, whose truth value is ambiguous.
            if conv_list is None or len(conv_list) == 0:
                continue
            if conv_list[0]['role'] not in opening_roles:
                continue
            full_convo = " ".join(turn['content'] for turn in conv_list if turn['content'])
            # Skip conversations with no text at all, as load_cornell() does.
            if full_convo:
                all_conversations.append(full_convo)

    print("Finished processing all shards.")
    df = pd.DataFrame({'text': all_conversations, 'source_dataset': 'lmsys_chat'})
    df['doc_id'] = [f"lmsys_{i}" for i in range(len(df))]
    return df

# --- Subtask 2.2: Unify and Persist Data ---

print("\nStarting data unification process...")
df_daily = load_dailydialog()
df_cornell = load_cornell()
df_lmsys = load_lmsys()
print(f"Loaded {len(df_daily)} from DailyDialog, {len(df_cornell)} from Cornell, {len(df_lmsys)} from LMSYS.")

# An empty source silently skews the "balanced" corpus, so report it explicitly.
for source_name, source_df in (("DailyDialog", df_daily), ("Cornell", df_cornell), ("LMSYS", df_lmsys)):
    if source_df.empty:
        print(f"WARNING: {source_name} contributed 0 documents; the unified corpus will not contain it.")

# Balancing: cap DailyDialog and Cornell at the size of the smaller of the two,
# and LMSYS at twice that size.
min_samples = min(len(df_daily), len(df_cornell))
lmsys_sample_size = min(len(df_lmsys), min_samples * 2)
df_lmsys_sampled = df_lmsys.sample(n=lmsys_sample_size, random_state=42, replace=False)
df_cornell_sampled = df_cornell.sample(n=min_samples, random_state=42, replace=False)
# DailyDialog was previously never down-sampled, so the corpus was unbalanced
# whenever Cornell was the smaller dataset. It is left untouched when it is
# already the smaller one, so that case produces the same corpus as before.
if len(df_daily) > min_samples:
    df_daily_sampled = df_daily.sample(n=min_samples, random_state=42, replace=False)
else:
    df_daily_sampled = df_daily

# Unification
df_unified = pd.concat([df_daily_sampled, df_cornell_sampled, df_lmsys_sampled], ignore_index=True)
df_unified['text'] = df_unified['text'].apply(clean_text)
# Shuffle with a fixed seed so the sources are interleaved reproducibly.
df_unified = df_unified.sample(frac=1, random_state=42).reset_index(drop=True)

df_unified.to_parquet(UNIFIED_CORPUS_PATH)

print(f"\nUnified and balanced corpus created with {len(df_unified)} documents.")
print(f"Saved to: {UNIFIED_CORPUS_PATH}")
df_unified.head()

Unzipping main DailyDialog zip...
Unzipping nested train.zip...

Starting data unification process...
Loading and parsing DailyDialog dataset from text file...
Loading and parsing Cornell Movie Dialogs dataset...
Found 6 LMSYS Parquet shards. Processing one by one...
  > Processing shard 1/6: train-00000-of-00006-4feeb3f83346a0e9.parquet...
  > Processing shard 2/6: train-00001-of-00006-4030672591c2f478.parquet...
  > Processing shard 3/6: train-00002-of-00006-1779b7cec9462180.parquet...
  > Processing shard 4/6: train-00003-of-00006-2fa862bfed56af1f.parquet...
  > Processing shard 5/6: train-00004-of-00006-18f4bdd50c103e71.parquet...
  > Processing shard 6/6: train-00005-of-00006-fe1acc5d10a9f0e2.parquet...
Finished processing all shards.
Loaded 11118 from DailyDialog, 83079 from Cornell, 0 from LMSYS.

Unified and balanced corpus created with 22236 documents.
Saved to: /content/drive/MyDrive/ontology_project/data/unified_corpus.parquet


Unnamed: 0,text,source_dataset,doc_id
0,Could I have a refund on this ? I'm afraid you...,daily_dialog,dd_9960
1,One of my students told me she was very depres...,daily_dialog,dd_3472
2,Which aisle has the produce ? Aisle A is where...,daily_dialog,dd_6842
3,"Alison , would you like to have some more ? No...",daily_dialog,dd_259
4,"Well, good to see you, Miles. Jack. See you.",cornell_movie_dialogs,cornell_25340


In [None]:
# @title Phase 2 (Corrected): Candidate Mention Generation with SpaCy
import spacy
from tqdm.auto import tqdm
import pandas as pd
import os

# Ensure the model is downloaded in the current session.
# This is necessary if the runtime has been restarted.
#!python -m spacy download en_core_web_lg
#already done in cell 0.1

# Define paths
PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
UNIFIED_CORPUS_PATH = os.path.join(PROJECT_ROOT, "data/unified_corpus.parquet")
CANDIDATES_PATH = os.path.join(PROJECT_ROOT, "data/candidate_mentions.parquet")

# Load the SpaCy model.
nlp = spacy.load("en_core_web_lg")

def generate_candidates(texts: list[str], doc_ids: list[str]):
    """Extract noun chunks as candidate mentions using batched ``nlp.pipe``.

    Args:
        texts: Documents to process.
        doc_ids: Identifier for each document, aligned with ``texts``.

    Returns:
        pandas.DataFrame with one row per noun chunk and the columns
        ``doc_id``, ``mention``, ``start_char``, ``end_char`` and ``context``
        (the text of the sentence containing the chunk). The columns are
        present even when no chunk is found.

    Raises:
        ValueError: if ``texts`` and ``doc_ids`` differ in length.
    """
    if len(texts) != len(doc_ids):
        raise ValueError(
            f"texts and doc_ids must have the same length (got {len(texts)} and {len(doc_ids)})."
        )

    candidate_columns = ["doc_id", "mention", "start_char", "end_char", "context"]
    candidates = []

    # The 'parser' component is REQUIRED for .noun_chunks, so only 'ner' is
    # disabled for speed.
    docs = nlp.pipe(texts, disable=["ner"])

    for doc_id, doc in tqdm(zip(doc_ids, docs), total=len(texts), desc="Generating Candidates"):
        for chunk in doc.noun_chunks:
            candidates.append({
                "doc_id": doc_id,
                "mention": chunk.text,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
                "context": chunk.sent.text # E-BERT needs the sentence context
            })

    # Passing the columns explicitly keeps the schema stable for an empty result,
    # so downstream cells can still select these columns.
    return pd.DataFrame(candidates, columns=candidate_columns)

# --- Usage ---
print("Loading unified corpus...")
df_unified = pd.read_parquet(UNIFIED_CORPUS_PATH)

# Number of documents to process. The default keeps the demonstration fast;
# set MAX_DOCS = None to process the full corpus.
MAX_DOCS = 5000

if MAX_DOCS is None:
    print("Generating candidate mentions from the full corpus...")
    df_subset = df_unified
else:
    print("Generating candidate mentions from a subset of the corpus...")
    df_subset = df_unified.head(MAX_DOCS)
df_candidates = generate_candidates(df_subset['text'].tolist(), df_subset['doc_id'].tolist())

# Save the candidates to Parquet
df_candidates.to_parquet(CANDIDATES_PATH)

print(f"\nGenerated {len(df_candidates)} candidate mentions.")
print(f"Saved to: {CANDIDATES_PATH}")
df_candidates.head()

Loading unified corpus...
Generating candidate mentions from a subset of the corpus...


Generating Candidates:   0%|          | 0/5000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# @title Phase 3: Simulated Entity Linking
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import os

# --- This cell assumes previous phases have been run ---

PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
CANDIDATES_PATH = os.path.join(PROJECT_ROOT, "data/candidate_mentions.parquet")
LINKED_ENTITIES_PATH = os.path.join(PROJECT_ROOT, "output/linked_entities.parquet")

# --- Model Loading ---
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval() # Set model to evaluation mode
print(f"Model '{MODEL_NAME}' loaded on {device}.")


# --- Batch Inference Script (with Simulation) ---
def link_entities_batch_simulated(df_candidates, batch_size=32, seed=None):
    """Process candidates in batches and return SIMULATED entity links.

    The encoder is run on every batch so the batching/tokenisation path is
    exercised, but its output is not used: Wikidata IDs and confidences are
    drawn at random as a stand-in for a real entity-linking head.

    Args:
        df_candidates: DataFrame with ``doc_id``, ``mention`` and ``context`` columns.
        batch_size: Number of candidates per forward pass.
        seed: Optional seed for the random generator. Pass an integer to make
            the simulated links reproducible; ``None`` (default) draws fresh
            randomness on every call.

    Returns:
        pandas.DataFrame with columns ``doc_id``, ``mention``, ``wikidata_id``
        (``Q1`` .. ``Q1999``) and ``confidence`` (Beta(5, 2) samples). The
        columns are present even when ``df_candidates`` is empty.
    """
    link_columns = ["doc_id", "mention", "wikidata_id", "confidence"]
    rng = np.random.default_rng(seed)
    results = []

    for i in tqdm(range(0, len(df_candidates), batch_size), desc="Linking Entities (Simulated)"):
        batch_df = df_candidates.iloc[i:i+batch_size]

        # Format inputs as required by the model: [CLS] context [SEP] mention [SEP]
        inputs = tokenizer(
            text=batch_df['context'].tolist(),
            text_pair=batch_df['mention'].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(device)

        # Run the model. In a real EL model the outputs would be turned into
        # Wikidata IDs; here they are intentionally discarded.
        with torch.no_grad():
            model(**inputs)

        # --- SIMULATION LOGIC ---
        num_items = len(batch_df)
        # integers() excludes the upper bound, so IDs range over Q1..Q1999.
        simulated_qids = [f"Q{qid}" for qid in rng.integers(1, 2000, size=num_items)]
        simulated_scores = rng.beta(a=5, b=2, size=num_items)

        # Column-wise zip instead of iterrows(): same rows, no per-row Series construction.
        for doc_id, mention, qid, score in zip(
            batch_df['doc_id'].tolist(),
            batch_df['mention'].tolist(),
            simulated_qids,
            simulated_scores.tolist(),
        ):
            results.append({
                "doc_id": doc_id,
                "mention": mention,
                "wikidata_id": qid,
                "confidence": score
            })

    # Explicit columns keep the schema stable for an empty input, so the
    # downstream confidence filter does not raise KeyError.
    return pd.DataFrame(results, columns=link_columns)

# --- Filter Low-Confidence Links ---
# Reads the candidate mentions written by the candidate-generation phase.
print("Loading candidate mentions...")
df_candidates = pd.read_parquet(CANDIDATES_PATH)

print("Running batch linking process...")
# Using the full set of candidates generated in the previous step
# NOTE: the links are simulated (random IDs and scores), so everything derived
# from them downstream is placeholder data.
df_linked_entities = link_entities_batch_simulated(df_candidates)

# Keep only links whose confidence is strictly above the threshold; .copy()
# makes the filtered frame independent of df_linked_entities.
CONFIDENCE_THRESHOLD = 0.7
df_confident_links = df_linked_entities[df_linked_entities['confidence'] > CONFIDENCE_THRESHOLD].copy()

# Persist only the confident links.
df_confident_links.to_parquet(LINKED_ENTITIES_PATH)

print(f"\nFinished linking. Found {len(df_linked_entities)} total links.")
print(f"Filtered to {len(df_confident_links)} confident links (confidence > {CONFIDENCE_THRESHOLD}).")
print(f"Saved to: {LINKED_ENTITIES_PATH}")
df_confident_links.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Model 'bert-base-cased' loaded on cuda.
Loading candidate mentions...
Running batch linking process...


Linking Entities (Simulated):   0%|          | 0/3160 [00:00<?, ?it/s]


Finished linking. Found 101110 total links.
Filtered to 58769 confident links (confidence > 0.7).
Saved to: /content/drive/MyDrive/ontology_project/output/linked_entities.parquet


Unnamed: 0,doc_id,mention,wikidata_id,confidence
0,dd_9960,I,Q989,0.766884
2,dd_9960,this,Q1226,0.829274
3,dd_9960,I,Q303,0.793955
5,dd_9960,I,Q1686,0.735059
7,dd_9960,something,Q348,0.810509


In [None]:
# @title Phase 4 (Corrected): Aggregation, Enrichment, and Ontology Building

import json
import time
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from tqdm.auto import tqdm
import os

# Define paths
PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
LINKED_ENTITIES_PATH = os.path.join(PROJECT_ROOT, "output/linked_entities.parquet")
CACHE_FILE = os.path.join(PROJECT_ROOT, "output/wikidata_cache.json")
ONTOLOGY_PATH = os.path.join(PROJECT_ROOT, "output/final_ontology.json")

# --- Subtask 5.1: Group and Aggregate by Wikidata ID ---
# Reads the confident links saved to LINKED_ENTITIES_PATH.
print("Loading confident entity links...")
df_links = pd.read_parquet(LINKED_ENTITIES_PATH)

# One row per Wikidata ID: number of linked mentions and their mean confidence,
# ordered from most to least frequent.
df_aggregated = df_links.groupby('wikidata_id').agg(
    occurrences=('wikidata_id', 'count'),
    avg_confidence=('confidence', 'mean')
).reset_index().sort_values(by="occurrences", ascending=False)

print("Aggregated entity mentions:")
print(df_aggregated.head())


# --- Subtask 5.2: Wikidata Enrichment via SPARQL (with Caching) ---
def get_wikidata_details_cached(wikidata_ids: list[str]):
    """Fetch label, description and ``instance of`` types for Wikidata IDs, with a file cache.

    IDs already present in ``CACHE_FILE`` are served from the cache. The rest
    are queried from the Wikidata SPARQL endpoint in chunks of 100, pausing one
    second between chunks. The cache is rewritten after every successful chunk
    so an interrupted run keeps its progress.

    Args:
        wikidata_ids: Entity IDs such as ``"Q42"``. Values that are not of the
            form ``Q<digits>`` are never sent to the endpoint.

    Returns:
        dict mapping every requested ID to ``{'label', 'description', 'types'}``,
        or to ``{}`` when nothing could be retrieved for it.
    """
    try:
        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
            cache = json.load(f)
    except (IOError, json.JSONDecodeError):
        # Missing or corrupt cache: start from scratch.
        cache = {}

    def _is_valid_qid(qid):
        """True for strings of the form Q<ASCII digits>."""
        return (
            isinstance(qid, str)
            and qid.startswith("Q")
            and qid[1:].isascii()
            and qid[1:].isdigit()
        )

    def _save_cache():
        """Write the whole cache back to CACHE_FILE."""
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache, f, indent=2)

    # De-duplicate (order preserved) so each ID is requested at most once, and
    # keep malformed IDs out of the query text: they are interpolated verbatim
    # and would otherwise break the query for their entire chunk.
    ids_to_fetch = []
    invalid_ids = []
    seen = set()
    for qid in wikidata_ids:
        if qid in cache or qid in seen:
            continue
        seen.add(qid)
        if _is_valid_qid(qid):
            ids_to_fetch.append(qid)
        else:
            invalid_ids.append(qid)

    if invalid_ids:
        print(f"Skipping {len(invalid_ids)} malformed Wikidata IDs (expected the form 'Q123').")

    if ids_to_fetch:
        print(f"Cache miss for {len(ids_to_fetch)} IDs. Querying Wikidata...")
        # Set a user-agent to be polite to the Wikidata API
        sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent="MyAwesomeOntologyProject/1.0 (myemail@example.com)")
        sparql.setReturnFormat(JSON)

        chunk_size = 100
        for i in tqdm(range(0, len(ids_to_fetch), chunk_size), desc="Querying Wikidata"):
            id_chunk = ids_to_fetch[i:i+chunk_size]
            query = f"""
            SELECT ?item ?itemLabel ?itemDescription (GROUP_CONCAT(DISTINCT ?type; separator=",") AS ?types)
            WHERE {{
              VALUES ?item {{wd:{' wd:'.join(id_chunk)}}}
              OPTIONAL {{ ?item wdt:P31 ?type. }}
              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
            }}
            GROUP BY ?item ?itemLabel ?itemDescription
            """
            sparql.setQuery(query)

            try:
                results = sparql.query().convert()
                for result in results["results"]["bindings"]:
                    # Entity URIs end in the QID, e.g. .../entity/Q42.
                    qid = result['item']['value'].split('/')[-1]
                    cache[qid] = {
                        'label': result.get('itemLabel', {}).get('value', 'N/A'),
                        'description': result.get('itemDescription', {}).get('value', 'N/A'),
                        'types': [t.split('/')[-1] for t in result.get('types', {}).get('value', '').split(',') if t]
                    }
                # Persist after each chunk so an interruption does not lose fetched data.
                _save_cache()
            except Exception as e:
                # Best effort: report the failed chunk and continue with the next one.
                print(f"Error querying Wikidata for chunk {i}: {e}")
            time.sleep(1) # Be polite to the API

    return {qid: cache.get(qid, {}) for qid in wikidata_ids}


# --- Subtask 5.3: Map Wikidata Types to Ontology Types ---
# Wikidata "instance of" (P31) class -> ontology type name.
# Q7889 is Wikidata's "video game" class (films are Q11424), so it must not be
# labelled as a movie.
WIKIDATA_TYPE_MAP = {
    'Q5': 'Person', 'Q43229': 'Organization', 'Q6256': 'Country', 'Q515': 'City',
    'Q4830453': 'Business', 'Q7889': 'Video Game', 'Q11424': 'Film', 'Q47461344': 'Written Work'
}

def map_entity_type(types_list):
    """Return the ontology type for the first recognised Wikidata class.

    Args:
        types_list: Iterable of Wikidata class IDs (e.g. ``['Q5']``). ``None``
            or an empty value is treated as "no types".

    Returns:
        The mapped type name for the first ID found in ``WIKIDATA_TYPE_MAP``,
        or ``"Other"`` when none is recognised.
    """
    # Entities without cached details can arrive with no type list at all.
    if types_list is None:
        return "Other"
    for qid_type in types_list:
        if qid_type in WIKIDATA_TYPE_MAP:
            return WIKIDATA_TYPE_MAP[qid_type]
    return "Other"

# --- Run Enrichment and Mapping ---
# df_aggregated has one row per Wikidata ID, so this list is already unique.
unique_ids = df_aggregated['wikidata_id'].tolist()
enriched_data = get_wikidata_details_cached(unique_ids)

# Copy the fetched details onto the aggregated frame; IDs without details fall
# back to 'N/A' for label/description and an empty type list.
df_aggregated['label'] = df_aggregated['wikidata_id'].apply(lambda qid: enriched_data.get(qid, {}).get('label', 'N/A'))
df_aggregated['description'] = df_aggregated['wikidata_id'].apply(lambda qid: enriched_data.get(qid, {}).get('description', 'N/A'))
df_aggregated['types'] = df_aggregated['wikidata_id'].apply(lambda qid: enriched_data.get(qid, {}).get('types', []))
# Reduce the list of Wikidata classes to a single ontology type per entity.
df_aggregated['entity_type'] = df_aggregated['types'].apply(map_entity_type)

print("\nEnriched and typed data:")
print(df_aggregated.head())


# --- Task 6: Final Ontology JSON Construction ---
def build_ontology_json(df_enriched):
    """Turn the enriched entity table into the nested ontology dictionary.

    The result has a root node ("Domain") with one child per ``entity_type``
    value; each of those holds one leaf per row of that type, carrying the
    Wikidata ID, label, description and occurrence/confidence metadata.
    """
    def _entity_leaf(record):
        """Leaf node for a single entity row."""
        return {
            "id": record['wikidata_id'], "name": record['label'], "description": record['description'],
            "metadata": {
                "occurrences": int(record['occurrences']),
                "avg_confidence": round(float(record['avg_confidence']), 4)
            },
            "children": []
        }

    concept_nodes = []
    for type_name, members in df_enriched.groupby('entity_type'):
        concept_nodes.append({
            "id": f"concept_{type_name.lower()}", "name": type_name,
            "description": f"Entities of type {type_name}",
            "children": [_entity_leaf(record) for record in members.to_dict('records')]
        })

    return {"ontology": {"id": "root", "name": "Domain", "children": concept_nodes}}

# --- Generate and Save the Final Ontology ---
# Build the nested ontology from the enriched, typed entity table.
final_ontology = build_ontology_json(df_aggregated)

# Persist the ontology; the visualisation cells read it back from this file.
with open(ONTOLOGY_PATH, 'w') as f:
    json.dump(final_ontology, f, indent=4)

print(f"\nFinal ontology constructed and saved to: {ONTOLOGY_PATH}")
print("\nOntology Snippet:")
# Only the first 1000 characters are printed to keep the cell output short.
print(json.dumps(final_ontology, indent=2)[:1000] + "...")

Loading confident entity links...
Aggregated entity mentions:
     wikidata_id  occurrences  avg_confidence
1540        Q586           49        0.811664
1859        Q873           47        0.819328
1254        Q328           47        0.827757
759        Q1682           47        0.818606
1251        Q325           46        0.833782

Enriched and typed data:
     wikidata_id  occurrences  avg_confidence              label  \
1540        Q586           49        0.811664               Bonn   
1859        Q873           47        0.819328       Meryl Streep   
1254        Q328           47        0.827757  English Wikipedia   
759        Q1682           47        0.818606              Havel   
1251        Q325           46        0.833782   digital dark age   

                                            description  \
1540  city in Germany and capital of former West Ger...   
1859                                   American actress   
1254              English-language edition of Wiki

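### In order to count the number of entities
- run the cell below (partial Phase 0 setup)
- then run the Phase 4 cell (it defines `df_aggregated` and `final_ontology`)
- then run the final cell ("Count Leaf Node Entities")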

In [None]:
# @title Phase 0: Environment Setup (Partial)
# @markdown This cell re-mounts drive and defines the project path variables.
from google.colab import drive
import os

if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
print(f"Project root is set to: {PROJECT_ROOT}")

Project root is set to: /content/drive/MyDrive/ontology_project


In [None]:
# @title Final Cell: Count Leaf Node Entities

# Prerequisite: the Phase 4 cell must have been run in this session, because
# this cell reads two of its variables:
# - df_aggregated: the final DataFrame before JSON conversion.
# - final_ontology: the dictionary representing the ontology.

print("--- Method 1: Counting from the final DataFrame ---")
# One row per unique entity, so the row count is the expected number of leaves.
count_from_df = len(df_aggregated)
print(f"The number of unique entities in the final DataFrame is: {count_from_df}")


print("\n--- Method 2: Counting by traversing the final JSON object ---")
# Root children are the "type" nodes (e.g., Person, Organization); the children
# of each type node are the entity leaves, so summing their lengths counts leaves.
type_nodes = final_ontology.get('ontology', {}).get('children', [])
leaf_node_count = sum(len(type_node.get('children', [])) for type_node in type_nodes)

print(f"The total number of leaf-node entities in the ontology tree is: {leaf_node_count}")


# --- Verification ---
# Both counts must agree if the JSON construction kept every entity.
print("\n--- Verification ---")
counts_match = count_from_df == leaf_node_count
if not counts_match:
    print("⚠️ Warning: Counts do not match. Please review the ontology construction logic.")
else:
    print("✅ Success: The DataFrame count and the JSON tree count match.")

--- Method 1: Counting from the final DataFrame ---
The number of unique entities in the final DataFrame is: 1999

--- Method 2: Counting by traversing the final JSON object ---
The total number of leaf-node entities in the ontology tree is: 1999

--- Verification ---
✅ Success: The DataFrame count and the JSON tree count match.


In [None]:
!pip install anytree


Collecting anytree
  Downloading anytree-2.13.0-py3-none-any.whl.metadata (8.0 kB)
Downloading anytree-2.13.0-py3-none-any.whl (45 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anytree
Successfully installed anytree-2.13.0


In [None]:
# @title Visualize the Full Ontology Tree
import json
from anytree import Node, RenderTree
import os

# --- Requires the ontology JSON written by the Phase 4 cell ---

PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
ONTOLOGY_PATH = os.path.join(PROJECT_ROOT, "output/final_ontology.json")

# 1. Read the saved ontology
try:
    with open(ONTOLOGY_PATH, 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"ERROR: The ontology file was not found at {ONTOLOGY_PATH}")
    print("Please ensure you have successfully run the Phase 4 cell.")
    # Re-raise so the cell stops here instead of failing later on missing data
    raise

# 2. Mirror the JSON hierarchy as an anytree structure: root -> types -> entities
root_data = data.get('ontology', {})
root_node = Node(f"{root_data.get('name', 'Root')} (id: {root_data.get('id')})")
type_nodes_data = root_data.get('children', [])

for type_data in type_nodes_data:
    # One branch per entity type (e.g., Person, Organization)
    type_node = Node(f"{type_data.get('name', 'N/A')} (id: {type_data.get('id')})", parent=root_node)

    # Each child of a type is an entity leaf
    for entity_data in type_data.get('children', []):
        entity_metadata = entity_data.get('metadata', {})
        leaf_label = (
            f"{entity_data.get('name', 'N/A')} "
            f"(id: {entity_data.get('id', 'N/A')}, occurrences: {entity_metadata.get('occurrences', 0)})"
        )
        Node(leaf_label, parent=type_node)

# 3. Print the rendered tree, one node per line
print("--- Full Ontology Tree Visualization ---")
print("\n".join(f"{pre}{node.name}" for pre, _, node in RenderTree(root_node)))

--- Full Ontology Tree Visualization ---
Domain (id: root)
├── Business (id: concept_business)
│   ├── Mercedes-Benz Mobility Services (id: Q1902, occurrences: 40)
│   ├── Q1418 (id: Q1418, occurrences: 35)
│   ├── Google (id: Q95, occurrences: 33)
│   ├── TogliattiAzot (id: Q1343, occurrences: 32)
│   ├── Juventus FC (id: Q1422, occurrences: 29)
│   ├── Meta (id: Q380, occurrences: 27)
│   └── Intel Corporation (id: Q248, occurrences: 25)
├── City (id: concept_city)
│   ├── Bonn (id: Q586, occurrences: 49)
│   ├── Jerusalem (id: Q1218, occurrences: 41)
│   ├── Celje (id: Q1012, occurrences: 41)
│   ├── Havana (id: Q1563, occurrences: 39)
│   ├── Caracas (id: Q1533, occurrences: 39)
│   ├── Wąchock (id: Q439, occurrences: 37)
│   ├── Ho Chi Minh City (id: Q1854, occurrences: 37)
│   ├── Kochi (id: Q1800, occurrences: 36)
│   ├── San Marino (id: Q1848, occurrences: 35)
│   ├── Los Angeles (id: Q65, occurrences: 35)
│   ├── Kuala Lumpur (id: Q1865, occurrences: 35)
│   ├── Abidjan (id: Q

In [None]:
# @title Visualize a Pruned (Top 5 per Category) Ontology Tree
import json
from anytree import Node, RenderTree
import os

# --- This cell also assumes Phase 4 has been run ---

PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
ONTOLOGY_PATH = os.path.join(PROJECT_ROOT, "output/final_ontology.json")
MAX_LEAVES_PER_BRANCH = 5 # <--- You can change this number

# How this cell prunes the tree:
# * `MAX_LEAVES_PER_BRANCH` controls how many entities are shown per category.
# * Slicing: `entity_nodes_data[:MAX_LEAVES_PER_BRANCH]` iterates only over the
#   first `MAX_LEAVES_PER_BRANCH` entities of each category (in file order).
# * Context in node name: each category node reports how many entities are shown
#   and how many it contains in total (e.g., "Person (Showing top 5 of 127)").
# * Ellipsis node: an "..." node is appended to a branch when it holds more
#   entities than were displayed, clearly indicating that the view is truncated.

# 1. Load the ontology from the JSON file
try:
    with open(ONTOLOGY_PATH, 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"ERROR: The ontology file was not found at {ONTOLOGY_PATH}")
    raise

# 2. Build the pruned tree
root_data = data.get('ontology', {})
root_node_pruned = Node(f"{root_data.get('name', 'Root')} (id: {root_data.get('id')})")

type_nodes_data = root_data.get('children', [])

for type_data in type_nodes_data:
    type_name = type_data.get('name', 'N/A')
    entity_nodes_data = type_data.get('children', [])
    total_entities_in_type = len(entity_nodes_data)

    # A category may hold fewer entities than the limit; report the number that
    # is actually displayed so the label never reads e.g. "top 5 of 4".
    shown_entities_in_type = min(MAX_LEAVES_PER_BRANCH, total_entities_in_type)

    # Add the shown/total counts to the type node's name for context
    type_node = Node(
        f"{type_name} (Showing top {shown_entities_in_type} of {total_entities_in_type})",
        parent=root_node_pruned
    )

    # Only the first few entities of the category become leaf nodes
    for entity_data in entity_nodes_data[:MAX_LEAVES_PER_BRANCH]:
        entity_name = entity_data.get('name', 'N/A')
        entity_id = entity_data.get('id', 'N/A')
        occurrences = entity_data.get('metadata', {}).get('occurrences', 0)

        Node(f"{entity_name} (id: {entity_id}, occurrences: {occurrences})", parent=type_node)

    # If there are more entities than we showed, add an ellipsis node
    if total_entities_in_type > MAX_LEAVES_PER_BRANCH:
        Node("...", parent=type_node)

# 3. Render and print the pruned tree
print(f"--- Pruned Ontology Tree (Top {MAX_LEAVES_PER_BRANCH} Entities per Category) ---")
for pre, _, node in RenderTree(root_node_pruned):
    print(f"{pre}{node.name}")

--- Pruned Ontology Tree (Top 5 Entities per Category) ---
Domain (id: root)
├── Business (Showing top 5 of 7)
│   ├── Mercedes-Benz Mobility Services (id: Q1902, occurrences: 40)
│   ├── Q1418 (id: Q1418, occurrences: 35)
│   ├── Google (id: Q95, occurrences: 33)
│   ├── TogliattiAzot (id: Q1343, occurrences: 32)
│   ├── Juventus FC (id: Q1422, occurrences: 29)
│   └── ...
├── City (Showing top 5 of 88)
│   ├── Bonn (id: Q586, occurrences: 49)
│   ├── Jerusalem (id: Q1218, occurrences: 41)
│   ├── Celje (id: Q1012, occurrences: 41)
│   ├── Havana (id: Q1563, occurrences: 39)
│   ├── Caracas (id: Q1533, occurrences: 39)
│   └── ...
├── Country (Showing top 5 of 174)
│   ├── Sri Lanka (id: Q854, occurrences: 43)
│   ├── Belarus (id: Q184, occurrences: 42)
│   ├── Turkmenistan (id: Q874, occurrences: 42)
│   ├── The Bahamas (id: Q778, occurrences: 41)
│   ├── Antigua and Barbuda (id: Q781, occurrences: 41)
│   └── ...
├── Film (Showing top 5 of 4)
│   ├── Swept Away (id: Q1365, occurrenc

'**Explanation of Changes:**\n*   **`MAX_LEAVES_PER_BRANCH`**: A variable is introduced to easily control how many entities you want to see per category.\n*   **Slicing**: The loop `for entity_data in entity_nodes_data[:MAX_LEAVES_PER_BRANCH]:` now uses list slicing to only iterate over the first 5 items.\n*   **Context in Node Name**: The category node now tells you how many total entities it contains (e.g., "Person (Showing top 5 of 127)").\n*   **Ellipsis Node**: An "..." node is added at the end of each branch if there were more entities than were displayed, clearly indicating that the view is truncated.'

In [None]:
# @title Ontology Restructuring and Refinement

import json
import time
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from tqdm.auto import tqdm
import os
from anytree import Node, RenderTree

# --- 0. Setup and Configuration ---
# All paths used by this cell live under PROJECT_ROOT on the mounted Google Drive.
print("--- Step 0: Initializing ---")
PROJECT_ROOT = "/content/drive/MyDrive/ontology_project"
# Input: parquet file read with pd.read_parquet in Step 1 of this cell.
LINKED_ENTITIES_PATH = os.path.join(PROJECT_ROOT, "output/linked_entities.parquet")
# JSON cache that get_wikidata_details_cached reads and rewrites.
CACHE_FILE = os.path.join(PROJECT_ROOT, "output/wikidata_cache.json")
# Output: where the restructured ontology JSON is written in Step 4.
NEW_ONTOLOGY_PATH = os.path.join(PROJECT_ROOT, "output/final_ontology_restructured.json")
MAX_LEAVES_PER_BRANCH = 7 # How many entities to show per category in the visualization

# --- 1. Re-use Existing Functions (for self-containment) ---

def get_wikidata_details_cached(wikidata_ids: list[str]):
    """Look up label, description and `wdt:P31` types for Wikidata IDs.

    Details are served from the JSON cache at ``CACHE_FILE`` where possible.
    IDs missing from the cache are requested from the Wikidata SPARQL endpoint
    in chunks of 100 (pausing one second after each chunk), merged into the
    cache, and the cache file is then rewritten.

    Args:
        wikidata_ids: Wikidata item IDs to look up.

    Returns:
        A dict mapping every requested ID to its details (keys ``label``,
        ``description``, ``types``), or to an empty dict when no details are
        available for that ID.
    """
    # Start from the cache on disk; an unreadable or invalid cache file is
    # treated as an empty cache.
    try:
        with open(CACHE_FILE, 'r') as cache_handle:
            cache = json.load(cache_handle)
    except (IOError, json.JSONDecodeError):
        print("Warning: Wikidata cache not found. This run may be slow.")
        cache = {}

    ids_to_fetch = [qid for qid in wikidata_ids if qid not in cache]

    if ids_to_fetch:
        print(f"Cache miss for {len(ids_to_fetch)} IDs. Querying Wikidata...")
        sparql = SPARQLWrapper(
            "https://query.wikidata.org/sparql",
            agent="ColabOntologyBuilder/1.0 (https://colab.research.google.com/)",
        )
        chunk_size = 100
        chunk_starts = range(0, len(ids_to_fetch), chunk_size)
        for start in tqdm(chunk_starts, desc="Querying Wikidata"):
            id_chunk = ids_to_fetch[start:start + chunk_size]
            query = f"""
            SELECT ?item ?itemLabel ?itemDescription (GROUP_CONCAT(DISTINCT ?type; separator=",") AS ?types)
            WHERE {{
              VALUES ?item {{wd:{' wd:'.join(id_chunk)}}}
              OPTIONAL {{ ?item wdt:P31 ?type. }}
              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
            }} GROUP BY ?item ?itemLabel ?itemDescription
            """
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            try:
                results = sparql.query().convert()
                for binding in results["results"]["bindings"]:
                    # Entity and type values are URIs; the ID is the last path segment.
                    qid = binding['item']['value'].split('/')[-1]
                    cache[qid] = {
                        'label': binding.get('itemLabel', {}).get('value', 'N/A'),
                        'description': binding.get('itemDescription', {}).get('value', 'N/A'),
                        'types': [
                            type_uri.split('/')[-1]
                            for type_uri in binding.get('types', {}).get('value', '').split(',')
                            if type_uri
                        ],
                    }
            except Exception as e:
                # Best effort: a failed chunk is reported and skipped so the
                # remaining chunks are still processed.
                print(f"Error querying chunk: {e}")
            time.sleep(1)

        # Persist the merged cache once all chunks have been attempted.
        with open(CACHE_FILE, 'w') as cache_handle:
            json.dump(cache, cache_handle, indent=2)

    return {qid: cache.get(qid, {}) for qid in wikidata_ids}

# --- 2. Load and Aggregate the Original Data ---
print("\n--- Step 1: Loading and Aggregating Original Data ---")
try:
    df_links = pd.read_parquet(LINKED_ENTITIES_PATH)
except FileNotFoundError:
    print(f"ERROR: Could not find {LINKED_ENTITIES_PATH}")
    print("Please ensure you have successfully run Phase 3.")
    raise

# One row per Wikidata ID: how often it was linked and its mean confidence.
df_aggregated = (
    df_links
    .groupby('wikidata_id')
    .agg(
        occurrences=('wikidata_id', 'count'),
        avg_confidence=('confidence', 'mean'),
    )
    .reset_index()
)

print(f"Loaded and aggregated {len(df_aggregated)} unique entities.")

# --- 3. Enrich with Wikidata Types using the Cache ---
print("\n--- Step 2: Enriching Entities with Wikidata Types ---")
unique_ids = df_aggregated['wikidata_id'].tolist()
enriched_data = get_wikidata_details_cached(unique_ids)

# Resolve each row's details once, then pull the three fields out of them.
entity_details = df_aggregated['wikidata_id'].apply(lambda qid: enriched_data.get(qid, {}))
df_aggregated = df_aggregated.assign(
    label=entity_details.apply(lambda details: details.get('label', 'N/A')),
    description=entity_details.apply(lambda details: details.get('description', 'N/A')),
    types=entity_details.apply(lambda details: details.get('types', [])),
)

print("Enrichment complete.")

# --- 4. Define New, Restructured Mapping Logic ---
print("\n--- Step 3: Defining and Applying New Restructuring Rules ---")

# This map defines our new, merged categories.
# The keys are the final category names.
# The values are lists of Wikidata Q-IDs that should be mapped to that category.
RESTRUCTURED_TYPE_MAP = {
    "Locations": [
        'Q515',
        'Q6256',
        'Q3957',
        'Q1549591',
        'Q5119',
        'Q1286',
        'Q1653',
        'Q82794',
        'Q13221722',
        'Q15284',
        'Q486972',
        'Q23442',
        'Q98',
        'Q97',
    ],
    "Organizations": [
        'Q43229',
        'Q4830453',
        'Q66',
        'Q79913',
        'Q1377',
        'Q1003',
        'Q1065',
    ],
    "Person": ['Q5'],
    # Types listed here (Film, Written Work) are explicitly sent to "Other".
    "To_Other": ['Q11424', 'Q47461344'],
}


def map_restructured_entity_type(types_list):
    """Map an entity's Wikidata type IDs to one restructured category name.

    Precedence:
    1. An empty or missing type list maps to "Other".
    2. Any type listed under "To_Other" forces "Other".
    3. Otherwise the first category of RESTRUCTURED_TYPE_MAP (in definition
       order) that contains any of the entity's types wins.
    4. With no match at all the entity falls back to "Other".
    """
    fallback_category = "Other"
    if not types_list:
        return fallback_category

    # The explicit "send to Other" list overrides every main category.
    for type_id in types_list:
        if type_id in RESTRUCTURED_TYPE_MAP["To_Other"]:
            return fallback_category

    # Main categories are tried in the order the map defines them.
    for category_name in RESTRUCTURED_TYPE_MAP:
        if category_name == "To_Other":
            continue
        category_qids = RESTRUCTURED_TYPE_MAP[category_name]
        for type_id in types_list:
            if type_id in category_qids:
                return category_name

    return fallback_category

# Derive the restructured category of every entity from its Wikidata types.
df_aggregated = df_aggregated.assign(
    entity_type=df_aggregated['types'].apply(map_restructured_entity_type)
)
entity_type_counts = df_aggregated['entity_type'].value_counts()
print("New category mapping applied.")
print("\nValue counts for new categories:")
print(entity_type_counts)

# --- 5. Build and Save the New Ontology ---
print("\n--- Step 4: Building and Saving Restructured Ontology ---")

def build_ontology_json(df_enriched):
    """Turn the enriched entity table into the nested ontology dictionary.

    The result has a single root ("Domain") whose children are one node per
    `entity_type` value; each of those holds one leaf per row, carrying the
    row's Wikidata ID, label, description, occurrence count and average
    confidence (rounded to 4 decimals).

    Args:
        df_enriched: DataFrame with the columns `wikidata_id`, `label`,
            `description`, `occurrences`, `avg_confidence` and `entity_type`.

    Returns:
        A dict of the form {"ontology": {"id", "name", "children"}}.
    """
    # Rank rows by frequency first so every category lists its most frequent
    # entities at the top.
    ranked = df_enriched.sort_values(by="occurrences", ascending=False)

    type_nodes = []
    for type_name, members in ranked.groupby('entity_type'):
        member_columns = zip(
            members['wikidata_id'],
            members['label'],
            members['description'],
            members['occurrences'],
            members['avg_confidence'],
        )
        entity_nodes = [
            {
                "id": wikidata_id,
                "name": label,
                "description": description,
                "metadata": {
                    "occurrences": int(occurrences),
                    "avg_confidence": round(float(avg_confidence), 4),
                },
            }
            for wikidata_id, label, description, occurrences, avg_confidence in member_columns
        ]
        type_nodes.append({
            "id": f"concept_{type_name.lower()}",
            "name": type_name,
            "children": entity_nodes,
        })

    return {"ontology": {"id": "root", "name": "Domain", "children": type_nodes}}

# Build the nested ontology and persist it as indented JSON.
final_ontology_restructured = build_ontology_json(df_aggregated)

with open(NEW_ONTOLOGY_PATH, 'w') as ontology_file:
    ontology_file.write(json.dumps(final_ontology_restructured, indent=4))

print(f"New ontology saved to: {NEW_ONTOLOGY_PATH}")

# --- 6. Visualize the New, Pruned Tree ---
print(f"\n--- Pruned Restructured Tree (Top {MAX_LEAVES_PER_BRANCH} per Category) ---")
root_data = final_ontology_restructured.get('ontology', {})
root_node_pruned = Node(f"{root_data.get('name', 'Root')} (id: {root_data.get('id')})")
type_nodes_data = root_data.get('children', [])

for type_data in type_nodes_data:
    category_entities = type_data.get('children', [])
    total_entities = len(category_entities)
    # Never claim to show more entities than the category actually holds.
    shown_entities = min(MAX_LEAVES_PER_BRANCH, total_entities)
    type_node = Node(
        f"{type_data.get('name', 'N/A')} (Showing top {shown_entities} of {total_entities})",
        parent=root_node_pruned,
    )

    # Leaves: only the first MAX_LEAVES_PER_BRANCH entities of the category.
    for entity_data in category_entities[:MAX_LEAVES_PER_BRANCH]:
        entity_metadata = entity_data.get('metadata', {})
        Node(
            f"{entity_data.get('name', 'N/A')} (occurrences: {entity_metadata.get('occurrences', 0)})",
            parent=type_node,
        )

    # Mark the branch as truncated when entities were left out.
    if total_entities > MAX_LEAVES_PER_BRANCH:
        Node("...", parent=type_node)

for branch_prefix, _fill, tree_node in RenderTree(root_node_pruned):
    print(f"{branch_prefix}{tree_node.name}")

--- Step 0: Initializing ---

--- Step 1: Loading and Aggregating Original Data ---
Loaded and aggregated 1999 unique entities.

--- Step 2: Enriching Entities with Wikidata Types ---
Enrichment complete.

--- Step 3: Defining and Applying New Restructuring Rules ---
New category mapping applied.

Value counts for new categories:
entity_type
Other            1222
Person            394
Locations         371
Organizations      12
Name: count, dtype: int64

--- Step 4: Building and Saving Restructured Ontology ---
New ontology saved to: /content/drive/MyDrive/ontology_project/output/final_ontology_restructured.json

--- Pruned Restructured Tree (Top 7 per Category) ---
Domain (id: root)
├── Locations (Showing top 7 of 371)
│   ├── Bonn (occurrences: 49)
│   ├── Dresden (occurrences: 45)
│   ├── Padua (occurrences: 45)
│   ├── La Paz (occurrences: 44)
│   ├── Sri Lanka (occurrences: 43)
│   ├── Bordeaux (occurrences: 42)
│   ├── Cairo (occurrences: 42)
│   └── ...
├── Organizations (Showin