# 1. INIT - Import packages

In [1]:
import torch
import os
import sys
from pathlib import Path

# --- Start of the robust path modification ---
# Goal: find the nearest location (starting at this notebook / the working
# directory and moving upwards) that contains a 'Deps' entry, and make it
# importable so that `Deps.*` imports resolve.
try:
    # This variable is defined in VS Code notebooks
    current_file_path = Path(__vsc_ipynb_file__)
except NameError:
    # Fallback for other environments or .py scripts
    current_file_path = Path.cwd()

# Check the starting path itself first, then each of its ancestors in turn;
# the first candidate that contains 'Deps' wins.
project_root = next(
    (
        candidate
        for candidate in (current_file_path, *current_file_path.parents)
        if (candidate / 'Deps').exists()
    ),
    None,
)

if project_root is None:
    # No 'Deps' anywhere above the starting point. Do not let the search end
    # at the filesystem root (which would then be added to sys.path); stay in
    # the starting directory and say so, so a later failing import is easy to
    # trace back to this step.
    if current_file_path.is_dir():
        project_root = current_file_path
    else:
        project_root = current_file_path.parent
    print(
        f"WARNING: no 'Deps' directory found at or above {current_file_path}; "
        f"falling back to {project_root}",
        file=sys.stderr,
    )

# Add project root to the Python path if it's not already there
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project Root: {project_root}")
print(f"Current Working Directory: {os.getcwd()}")

import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
from Deps.CustomFuctions.classes import IMAGENET2012_CLASSES

# Expose only GPU 0 to this process, then report the environment.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print('Pytorch version :', torch.__version__)
print('CUDA version\t:', torch.version.cuda)
# get_device_name() raises when no CUDA device is usable; guard it so this
# cell still runs on a CPU-only machine.
if torch.cuda.is_available():
    print('GPU\t\t:', torch.cuda.get_device_name())
else:
    print('GPU\t\t:', 'not available (running on CPU)')

Project Root: /workspace
Current Working Directory: /workspace/A02_Semantic_Analysis
Pytorch version : 2.0.1+cu117
CUDA version	: 11.7
GPU		: NVIDIA A40


# 2. INIT - Downloading the wordnet corpora

In [4]:
# --- NLTK Path Configuration ---
# The WordNet data lives in the project's 'Deps' folder. That folder is used
# both as the download target and as an NLTK search path, so it is built once.
custom_nltk_data_path = os.path.join(project_root, "Deps")
nltk.download('wordnet', download_dir=custom_nltk_data_path)

# Register the folder with NLTK unless it is already listed.
# Using insert(0, ...) gives it the highest priority.
if custom_nltk_data_path not in nltk.data.path:
    print(f"Adding custom NLTK data path: {custom_nltk_data_path}")
    nltk.data.path.insert(0, custom_nltk_data_path)
else:
    print("Custom NLTK data path already configured.")

# Smoke test: look up one word to confirm the corpus can be loaded from the
# configured path.
try:
    from nltk.corpus import wordnet
    syns = wordnet.synsets("computer")
    print("\nSuccessfully imported and used wordnet!")
    print(f"Synsets for 'computer': {syns[:5]}")
except LookupError as e:
    # Show the actual error as well as the search path; the error text is what
    # tells the reader which resource could not be found.
    print(f"\nSomething went wrong: {e}")
    print("Here is NLTK's current search path:")
    print(nltk.data.path)

[nltk_data] Downloading package wordnet to /workspace/Deps...
[nltk_data]   Package wordnet is already up-to-date!


Adding custom NLTK data path: /workspace/Deps

Successfully imported and used wordnet!
Synsets for 'computer': [Synset('computer.n.01'), Synset('calculator.n.01')]


# 3. EXECUTIONS - wordnet analysis for hypercategories

In [6]:
# Class identifiers: the keys of IMAGENET2012_CLASSES, in their stored order.
class_labels = list(IMAGENET2012_CLASSES.keys())

# imagenet1k_hypercategory_v2
# Maps each hypercategory label to the WordNet synset name that defines it.
# The order of the entries is significant: the classification loop takes the
# first entry that matches.
top_categories = dict(
    mammal='mammal.n.01',
    others_animal='animal.n.01',
    instrumentality='instrumentality.n.03',
    others_artifact='artifact.n.01',
)

def get_ordered_hypernyms(synset):
    """Return the chain of hypernyms above ``synset``, nearest first.

    At every step only the first entry returned by ``hypernyms()`` is
    followed; other entries are ignored. The walk stops at the first node
    whose ``hypernyms()`` result is empty.

    Args:
        synset: An object exposing a ``hypernyms()`` method that returns a
            sequence of objects of the same kind.

    Returns:
        A list of the visited hypernyms, ordered from the immediate parent
        upwards. ``synset`` itself is not included; the list is empty when
        ``synset`` has no hypernyms.
    """
    chain = []
    parents = synset.hypernyms()
    while parents:
        node = parents[0]
        chain.append(node)
        parents = node.hypernyms()
    return chain

# Assign every class to a hypercategory based on its hypernym chain, print the
# number of classes per hypercategory, and save the mapping to disk.
class_categories = {}
all_synset_names = []

for synset_id in class_labels:
    # The first character of the id is dropped; the rest is parsed as the
    # integer offset of a noun synset.
    synset = wn.synset_from_pos_and_offset('n', int(synset_id[1:]))
    ordered_hypernyms = get_ordered_hypernyms(synset)
    ordered_hypernym_names = [hypernym.name() for hypernym in ordered_hypernyms]

    # The first entry of top_categories (in dict order) whose synset name
    # occurs in the chain wins; classes matching none get 'others_entity'.
    hypernym_name_set = set(ordered_hypernym_names)
    assigned_category = next(
        (
            category
            for category, top_synset_name in top_categories.items()
            if top_synset_name in hypernym_name_set
        ),
        'others_entity',
    )

    class_categories[synset_id] = assigned_category
    all_synset_names += ordered_hypernym_names

value_counts = Counter(class_categories.values())
sorted_value_counts = sorted(value_counts.items())
for value, count in sorted_value_counts:
    print(f"{value}: {count}")

# Create the output folder first so the save does not fail on a fresh checkout.
output_dir = os.path.join(project_root, "Results", "hypercategory")
os.makedirs(output_dir, exist_ok=True)
torch.save(class_categories, os.path.join(output_dir, "imagenet1k_hypercategory_v2.pt"))


instrumentality: 350
mammal: 218
others_animal: 180
others_artifact: 172
others_entity: 80
