In [6]:
import os
import json
import pandas as pd

CSV_PATH = 'classes_GS.csv'
TABLES_FOLDER = 'tables'
# Column order of the exported CSV. Passing it explicitly keeps the header row
# in the file even when no column was collected, so the file stays readable
# with pd.read_csv.
OUTPUT_COLUMNS = ['table_id', 'column_name', 'dbpedia_class', 'dbpedia_url']

# The CSV is read without a header row; its three columns are named here.
df = pd.read_csv(CSV_PATH, header=None)
df.columns = ['table_file', 'dbpedia_class', 'dbpedia_url']
# The table id is the file name with the literal '.tar.gz' text removed.
df['table_id'] = df['table_file'].str.replace('.tar.gz', '', regex=False)
# table_id -> {'dbpedia_class': ..., 'dbpedia_url': ...}
mapping = df.set_index('table_id')[['dbpedia_class', 'dbpedia_url']].to_dict('index')

output = []

for table_id, meta in mapping.items():
    json_path = os.path.join(TABLES_FOLDER, f"{table_id}.json")
    if not os.path.isfile(json_path):
        print(f"Warning: {json_path} not found, skipping.")
        continue

    # Try UTF-8 first and retry as latin-1 when the bytes do not decode.
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            table_data = json.load(f)
    except UnicodeDecodeError:
        with open(json_path, 'r', encoding='latin-1') as f:
            table_data = json.load(f)

    # `or []` also covers a 'relation' key that is present but null.
    relation = table_data.get('relation', []) or []
    # The first element of each non-empty entry in 'relation' is taken as the
    # column name (presumably each entry is one table column -- verify against
    # the table files).
    for row in relation:
        if row:  # skip empty entries
            colname = row[0]
            output.append({
                'table_id': table_id,
                'column_name': colname,
                'dbpedia_class': meta['dbpedia_class'],
                'dbpedia_url': meta['dbpedia_url']
            })

out_df = pd.DataFrame(output, columns=OUTPUT_COLUMNS)
out_df.to_csv('t2dv2_table_columns_dbpedia.csv', index=False)
print("Done! Output written to t2dv2_table_columns_dbpedia.csv")


Done! Output written to t2dv2_table_columns_dbpedia.csv


In [4]:
import pandas as pd

def group_t2dv2_columns(input_path: str, output_path: str) -> pd.DataFrame:
    """
    Count the rows of an Excel sheet per ('index', 'name', 'area') combination.

    Parameters
    ----------
    input_path : str
        Excel file to read; it must provide the columns 'index', 'name'
        and 'area'.
    output_path : str
        Excel file the grouped table is written to (without the frame index).

    Returns
    -------
    pd.DataFrame
        One row per group, with the three key columns followed by a
        'ColumnCount' column holding the number of rows in that group.
    """
    group_keys = ['index', 'name', 'area']

    # Read the sheet and measure the size of every key combination.
    source_frame = pd.read_excel(input_path)
    rows_per_group = source_frame.groupby(group_keys).size()

    # Turn the grouped sizes back into a flat table with a named count column.
    grouped = rows_per_group.reset_index(name='ColumnCount')

    # Persist the summary before handing it back to the caller.
    grouped.to_excel(output_path, index=False)
    return grouped

if __name__ == "__main__":
    # Workbook to summarise and workbook to produce.
    input_file, output_file = "t2dv2Columns.xlsx", "Grouped_t2dv2Columns.xlsx"
    result = group_t2dv2_columns(input_file, output_file)
    # Print a heading followed by a preview of the first groups.
    print("Grouped result:", result.head(), sep="\n")


Grouped result:
                                 index                            name  \
0  10151359_0_8168779773862259178.json  10151359_0_8168779773862259178   
1  10579449_0_1681126353774891032.json  10579449_0_1681126353774891032   
2  10630177_0_4831842476649004753.json  10630177_0_4831842476649004753   
3  11278409_0_3742771475298785475.json  11278409_0_3742771475298785475   
4   1146722_1_7558140036342906956.json   1146722_1_7558140036342906956   

        area  ColumnCount  
0       Book            3  
1  Newspaper            3  
2   Building            7  
3    Company            5  
4   Mountain            7  


In [1]:
import pandas as pd
import numpy as np
import difflib
from gensim.models.fasttext import load_facebook_vectors

# Ontology terms: the spreadsheet's 'cleaned_label' and 'id' columns are both
# converted to strings; labels are additionally lower-cased.
dbpedia_terms_df = pd.read_excel('dbpedia.xlsx')
label_column = dbpedia_terms_df['cleaned_label'].astype(str)
id_column = dbpedia_terms_df['id'].astype(str)
dbpedia_labels = label_column.str.lower().tolist()
dbpedia_ids = id_column.tolist()
# Lower-cased label -> id; for a repeated label the last id wins.
dbpedia_label_to_id = {label: term_id for label, term_id in zip(dbpedia_labels, dbpedia_ids)}

# FastText vectors loaded from a Facebook-format binary (update to your FastText path)
fasttext_model = load_facebook_vectors('cc.en.300.bin')

def get_embedding(text):
    """
    Embed a text as the mean of the vectors of its whitespace-separated words.

    Words for which ``word in fasttext_model`` is false are ignored. When no
    word contributes a vector, a zero vector of length
    ``fasttext_model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in text.split():
        if token in fasttext_model:
            known_vectors.append(fasttext_model[token])

    if len(known_vectors) == 0:
        # Nothing to average: fall back to the all-zero vector.
        return np.zeros(fasttext_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every ontology label once up front: label -> embedding vector.
dbpedia_embeddings = dict(zip(dbpedia_labels, map(get_embedding, dbpedia_labels)))


In [2]:
def syntactic_match(header, labels_list, label_to_id, threshold=0.9):
    """
    Resolve a column header to an ontology id by string similarity.

    The header is stripped and lower-cased. An exact key in ``label_to_id``
    wins; otherwise the single closest entry of ``labels_list`` (difflib
    ratio of at least ``threshold``) is used. Returns the mapped id, or
    ``None`` when neither lookup succeeds.
    """
    key = header.strip().lower()
    if key not in label_to_id:
        # No exact hit: ask difflib for the single best approximate label.
        candidates = difflib.get_close_matches(key, labels_list, n=1, cutoff=threshold)
        key = candidates[0] if candidates else None
    return None if key is None else label_to_id[key]

def semantic_match(header, labels_list, label_to_id, embeddings_dict, similarity_threshold=0.85):
    """
    Find the ontology label whose embedding is most similar to a header.

    Parameters
    ----------
    header : str
        Column header. It is stripped and lower-cased before being embedded,
        matching the normalisation done by ``syntactic_match`` and the
        lower-cased ontology labels built in this file.
    labels_list : list
        Not read by this function; kept so the signature stays parallel to
        ``syntactic_match``.
    label_to_id : dict
        Maps each key of ``embeddings_dict`` to its ontology id.
    embeddings_dict : dict
        Maps label -> embedding vector.
    similarity_threshold : float
        Minimum cosine similarity for a match to be reported.

    Returns
    -------
    tuple
        ``(id, similarity)`` when the best similarity reaches the threshold,
        otherwise ``(None, similarity)``. The similarity is ``-1`` when
        ``embeddings_dict`` is empty.
    """
    header_emb = get_embedding(header.strip().lower())
    # The header norm does not change across labels, so compute it once.
    header_norm = np.linalg.norm(header_emb)
    best_sim = -1
    best_label = None
    for label, emb in embeddings_dict.items():
        # The small constant keeps the division defined for all-zero vectors.
        sim = np.dot(header_emb, emb) / (header_norm * np.linalg.norm(emb) + 1e-8)
        if sim > best_sim:
            best_sim = sim
            best_label = label
    if best_sim >= similarity_threshold:
        return label_to_id[best_label], best_sim
    return None, best_sim

df = pd.read_csv('t2dv2_table_columns_dbpedia.csv')

main_column = []

for table_id, group in df.groupby('table_id'):
    dbpedia_class = group['dbpedia_class'].iloc[0]
    # Convert CamelCase to space-separated for a human-readable label
    human_label = ''.join([' ' + c if c.isupper() else c for c in dbpedia_class]).strip().lower()
    # The non-null headers of this table, computed once for all three passes.
    column_names = group['column_name'].dropna().astype(str).tolist()
    best_match = None
    best_score = -1
    match_type = None

    # Pass 1: the first column with a syntactic match wins outright.
    for col in column_names:
        syn_match = syntactic_match(col, dbpedia_labels, dbpedia_label_to_id)
        if syn_match:
            best_match = col
            best_score = 1.0
            match_type = 'syntactic'
            break

    # Pass 2: otherwise keep the column with the highest semantic similarity.
    if best_match is None:
        for col in column_names:
            sem_match, sim = semantic_match(col, dbpedia_labels, dbpedia_label_to_id, dbpedia_embeddings, similarity_threshold=0.8)
            # semantic_match returns None as the id when the similarity is
            # below the threshold. Such columns must not be accepted, otherwise
            # the threshold has no effect and the fallback below never runs.
            if sem_match is not None and sim > best_score:
                best_match = col
                best_score = sim
                match_type = 'semantic'

    # Pass 3 (fallback): first column containing any word of the class label.
    if best_match is None:
        for col in column_names:
            if any(word in col.lower() for word in human_label.split()):
                best_match = col
                best_score = 0.5
                match_type = 'fallback_label'
                break

    # Tables without any match keep an empty column name and a score of -1.
    main_column.append({
        'table_id': table_id,
        'dbpedia_class': dbpedia_class,
        'dbpedia_url': group['dbpedia_url'].iloc[0],
        'main_column': best_match if best_match is not None else '',
        'match_score': best_score,
        'match_type': match_type if match_type else ''
    })

# Explicit columns keep the CSV header present even when there are no tables.
df_main = pd.DataFrame(
    main_column,
    columns=['table_id', 'dbpedia_class', 'dbpedia_url', 'main_column', 'match_score', 'match_type'],
)
df_main.to_csv('t2dv2_table_main_column_semantic.csv', index=False)
print("Done! Output written to t2dv2_table_main_column_semantic.csv")



Done! Output written to t2dv2_table_main_column_semantic.csv
