In [1]:
import pandas as pd
import numpy as np
import os

In [3]:
# Relative path (resolved against the notebook's working directory) of the
# tab-separated IAB Content Taxonomy 3.0 file that is loaded below.
iab_tax_path = 'data/content/Content Taxonomy 3.0.tsv'

In [5]:
# Load the tab-separated taxonomy. The file has two header rows, so header=1
# takes the second one (0-based index 1) as the column names.
iab_tax = pd.read_csv(
    iab_tax_path,
    sep='\t',
    header=1,
)

# Preview the first rows as the cell output.
iab_tax.head()

Unnamed: 0,Unique ID,Parent,Name,Tier 1,Tier 2,Tier 3,Tier 4,Unnamed: 7
0,150,150,Attractions,Attractions,,,,
1,151,150,Amusement and Theme Parks,Attractions,Amusement and Theme Parks,,,
2,179,150,Bars & Restaurants,Attractions,Bars & Restaurants,,,
3,181,150,Casinos & Gambling,Attractions,Casinos & Gambling,,,
4,153,150,Historic Site and Landmark Tours,Attractions,Historic Site and Landmark Tours,,,


In [6]:
# Boolean mask selecting the columns whose label starts with 'Tier'.
tier_mask = iab_tax.columns.str.startswith('Tier')

# Replace missing tier values with '' so every cell in those columns can be
# joined as a string.
iab_tax.loc[:, tier_mask] = iab_tax.loc[:, tier_mask].fillna('')

# Build the full label of each row by joining its tier values with single
# spaces, then trim the leading/trailing whitespace left by the empty tiers.
tier_path = iab_tax.loc[:, tier_mask].apply(' '.join, axis=1)
iab_tax['combined'] = tier_path.str.strip()

In [8]:
# Fill the NaN values in Parent with 0. The .taxo export further down skips
# rows whose Parent is 0, so 0 acts as the "no parent" placeholder.
iab_tax['Parent'] = iab_tax['Parent'].fillna(0)

In [9]:
# Preview the first rows again, now including the 'combined' column.
iab_tax.head()

Unnamed: 0,Unique ID,Parent,Name,Tier 1,Tier 2,Tier 3,Tier 4,Unnamed: 7,combined
0,150,150,Attractions,Attractions,,,,,Attractions
1,151,150,Amusement and Theme Parks,Attractions,Amusement and Theme Parks,,,,Attractions Amusement and Theme Parks
2,179,150,Bars & Restaurants,Attractions,Bars & Restaurants,,,,Attractions Bars & Restaurants
3,181,150,Casinos & Gambling,Attractions,Casinos & Gambling,,,,Attractions Casinos & Gambling
4,153,150,Historic Site and Landmark Tours,Attractions,Historic Site and Landmark Tours,,,,Attractions Historic Site and Landmark Tours


In [10]:
# Print the column dtypes and non-null counts of the prepared frame.
iab_tax.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 703 entries, 0 to 702
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unique ID   703 non-null    object
 1   Parent      703 non-null    object
 2   Name        703 non-null    object
 3   Tier 1      703 non-null    object
 4   Tier 2      703 non-null    object
 5   Tier 3      703 non-null    object
 6   Tier 4      703 non-null    object
 7   Unnamed: 7  64 non-null     object
 8   combined    703 non-null    object
dtypes: object(9)
memory usage: 49.6+ KB


In [11]:
# Identifier of this taxonomy; used below both as the output sub-directory
# name and as the stem of the generated .terms / .taxo / .terms.embed files.
taxonomy_name = 'iab_content'

In [13]:
# Root directory for the generated dataset files. The trailing slash matters:
# the paths below are built by plain string concatenation.
dataset_path = '../../data/'

In [17]:
# Output directory for this taxonomy: <dataset_path><taxonomy_name>/
# (kept with a trailing slash because file names are appended directly to it).
taxonomy_data_path = dataset_path + taxonomy_name + '/'

# Generate .terms file

In [18]:
# Write the .terms file: one "<Unique ID>\t<combined label>" line per row.

# open() does not create missing directories, so make sure the target exists.
os.makedirs(taxonomy_data_path, exist_ok=True)

# Write as UTF-8 explicitly so labels with non-ASCII characters are encoded
# the same way on every platform instead of using the locale's default codec.
with open(f'{taxonomy_data_path}{taxonomy_name}.terms', 'w', encoding='utf-8') as f:
    # Only two columns are needed, so iterate them directly rather than
    # materialising a Series for every row with iterrows().
    for term_id, term_label in zip(iab_tax['Unique ID'], iab_tax['combined']):
        f.write(f"{term_id}\t{term_label}\n")

# Generate .taxo file

In [19]:
# Write the .taxo file: one "<Parent>\t<Unique ID>" edge line per row that
# has a real parent.

# open() does not create missing directories, so make sure the target exists.
os.makedirs(taxonomy_data_path, exist_ok=True)

# Explicit UTF-8 keeps the output independent of the platform's default codec.
with open(f'{taxonomy_data_path}{taxonomy_name}.taxo', 'w', encoding='utf-8') as f:
    for parent_id, child_id in zip(iab_tax['Parent'], iab_tax['Unique ID']):
        # 0 is the placeholder written into Parent for rows that had no value.
        if parent_id == 0:
            continue
        # Some rows list their own ID as Parent (the 'Attractions' row in the
        # preview has Unique ID 150 and Parent 150). Emitting those would add
        # a self-loop edge to the taxonomy, so they are skipped as well.
        # Compared as strings so a str/int dtype mismatch cannot hide a match.
        if str(parent_id) == str(child_id):
            continue
        f.write(f"{parent_id}\t{child_id}\n")

# Generate embeddings

In [14]:
import sentence_transformers
from sentence_transformers import SentenceTransformer, util

In [21]:
# Load the precomputed taxonomy embeddings from disk.
embed_dir = 'data/content/embeddings/'
iab_embed_file = embed_dir + 'content_tax_embeddings.npy'

# Fail here with a clear message when the file is missing. Otherwise
# `taxonomy_embeddings` would stay undefined and a later cell would fail
# with an unrelated NameError.
if not os.path.exists(iab_embed_file):
    raise FileNotFoundError(
        f"Taxonomy embeddings not found at '{iab_embed_file}'. "
        "Generate and save them before running the cells that export them."
    )

taxonomy_embeddings = np.load(iab_embed_file)
print("Found embeddings of shape: ")
print(taxonomy_embeddings.shape)

Found embeddings of shape: 
(703, 384)


In [22]:
# Taxonomy IDs in row order; they are paired positionally with the rows of
# `taxonomy_embeddings` when the .terms.embed file is written.
taxon_id = iab_tax['Unique ID'].values

In [23]:
# Write the .terms.embed file: a "<rows> <dims>" header line followed by one
# "<Unique ID> <v1> <v2> ..." line per taxonomy row.

# zip() below would silently drop the surplus items if the two sequences had
# different lengths, leaving a file that disagrees with its own header.
if len(taxon_id) != taxonomy_embeddings.shape[0]:
    raise ValueError(
        f"Number of taxonomy IDs ({len(taxon_id)}) does not match the number "
        f"of embedding rows ({taxonomy_embeddings.shape[0]})."
    )

# open() does not create missing directories, so make sure the target exists.
os.makedirs(taxonomy_data_path, exist_ok=True)

# Explicit UTF-8 keeps the output independent of the platform's default codec.
with open(f'{taxonomy_data_path}{taxonomy_name}.terms.embed', 'w', encoding='utf-8') as f:
    f.write(f"{taxonomy_embeddings.shape[0]} {taxonomy_embeddings.shape[1]}\n")
    # NOTE(review): pairing is purely positional; this assumes the embedding
    # rows were produced in the same row order as `iab_tax` — confirm.
    for tax_id, tax_embed in zip(taxon_id, taxonomy_embeddings):
        f.write(f"{tax_id} {' '.join(map(str, tax_embed))}\n")

In [24]:
import dgl

FileNotFoundError: Could not find module 'C:\Users\giochi99_Win\PycharmProjects\taxo_compl\.venv\lib\site-packages\dgl\dgl.dll'. Try using the full path with constructor syntax.