In [1]:
import pandas as pd
import numpy as np

from utils import (generate_prompt, 
                   generate_test_prompt, 
                    find_all_linear_names, 
                    predict_domain,
                    predict, 
                    evaluate, 
                    load_pretrained_model, 
                    generate_train_val_data,
                    initiate_trainer,
                    initiate_base_model)

import json

In [2]:
def correcting_semantic_type(semantic_type):
    """Return ``semantic_type`` with every comma swapped for a semicolon."""
    comma, semicolon = ',', ';'
    corrected = semantic_type.replace(comma, semicolon)
    return corrected

In [3]:
# Load the JSON mapping used by the prompt builders: its keys are offered to
# the model as candidate domains, and the value stored under a domain is the
# list of semantic types offered for that domain.
# NOTE(review): the prompt builders re-read this file themselves, so this
# module-level copy looks redundant within the visible cells — confirm.
with open('cta_types_domain_reduced_5domain.json', 'r') as file:
    cta_type_domain = json.load(file)

In [4]:
def generate_test_prompt_NextiaJD_predict_domain(table):
    """Build the prompt that asks the model for the domain of ``table``.

    The candidate domains are the keys of
    ``cta_types_domain_reduced_5domain.json``, which is read on every call.
    At most the first 30 rows of ``table`` are embedded, rendered through the
    string form of the underlying NumPy array.

    NOTE(review): NumPy abbreviates the string form of large arrays with
    ``...`` by default, so wide tables may not be embedded in full — confirm
    this is intended.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to classify (accessed through ``.iloc`` and ``.values``).

    Returns
    -------
    str
        The prompt with surrounding whitespace stripped, ending in ``Domain:``.
    """
    with open('cta_types_domain_reduced_5domain.json', 'r') as handle:
        domain_names = json.load(handle).keys()

    # The same comma-separated list appears twice in the prompt.
    domain_list = ", ".join(domain_names)
    sample_rows = table.iloc[:30,:].values

    prompt = f"""
            Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "Place".
            Task: Classify the table given to you with only one of the following domains that are separated with comma: {domain_list}.
            Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail. 3. Decide if describes a {domain_list}. 4. Answer only with the predicted domain. 
            Example 1: Table: [["Friends Pizza", 2525, Cash Visa MasterCard, 7:30 AM]]
            Domain: Restaurant
            Example 2: Table: [[Museum/Gallery, Vancouver; BC; V6J 2C7, Kitsilano]]
            Domain: Place
            Table: {sample_rows}
            Domain: """
    return prompt.strip()

In [5]:
def generate_test_prompt_NextiaJD_few_shot(data_point, domain):
    """Build the few-shot prompt that asks for the semantic type of one column.

    The candidate classes are the entries stored under ``domain`` in
    ``cta_types_domain_reduced_5domain.json``, which is read on every call.
    A ``domain`` that is not a key of that mapping raises ``KeyError``.

    Parameters
    ----------
    data_point : pandas.Series
        Column values to classify; rendered through ``data_point.values``.
    domain : str
        Key selecting which list of semantic types is offered to the model.

    Returns
    -------
    str
        The prompt with surrounding whitespace stripped, ending in ``label:``.
    """
    with open('cta_types_domain_reduced_5domain.json', 'r') as handle:
        types_by_domain = json.load(handle)

    class_list = ", ".join(types_by_domain[domain])
    column_values = data_point.values

    prompt = f"""
            Answer the question based on the task, instructions and examples below. If the question cannot be answered using the information provided answer with "I don't know".
            Task: Classify the text given to you with one of these classes that are separated with comma: {class_list}.
            Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail.
            Example 1: Column: [Kitsilano, Strathcona, Downtown, UBC, Downtown, Mount Pleasant]
            label: addressLocality
            Example 2: Column: ['www.memorybc.ca/museum-of-15th-field-artillery-regiment','www.221a.ca/', 'https://www.facebook.com/ACMEstudiobuilding','http://gallery.ahva.ubc.ca/about/','http://www.mozaicoflamenco.com/', 'http://www.anzaclub.org','www.artbeatus.com', 'http://www.artsfactorysociety.ca/']
            label: URL
            Column: {column_values}
            label: """
    return prompt.strip()

In [6]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity ``|set1 & set2| / |set1 | set2|``.

    Parameters
    ----------
    set1, set2 : set
        Collections to compare; both must provide ``intersection`` and
        ``union``.

    Returns
    -------
    float
        A value in ``[0, 1]``. Two empty sets yield ``0.0`` rather than
        raising ``ZeroDivisionError``: with no values on either side there is
        no evidence of overlap.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0.0
    return len(set1.intersection(set2)) / union_size

In [7]:
def generate_predictions(dataframe, model, tokenizer):
    """Predict a domain for ``dataframe`` and a semantic type for each column.

    The table-level domain is predicted first (and printed); it is then passed
    to the per-column prompt builder and to ``predict``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose columns are to be annotated. Only its first 30 rows are
        used when building the per-column prompts.
    model, tokenizer
        Forwarded unchanged to ``predict_domain`` and ``predict``.

    Returns
    -------
    tuple
        ``(predictions, X_test)``: a frame with the columns ``Column`` and
        ``Predicted``, and the one-column frame of prompts given to
        ``predict``.
    """
    domain_prompt = generate_test_prompt_NextiaJD_predict_domain(dataframe)
    domain = predict_domain(domain_prompt, model, tokenizer)
    print(domain)

    # One prompt per column, built from the first 30 rows only.
    first_rows = dataframe.iloc[:30]
    column_prompts = first_rows.apply(
        generate_test_prompt_NextiaJD_few_shot,
        axis=0,
        args=(domain,),
    )
    X_test = pd.DataFrame(column_prompts, columns=["prediction"])

    predicted_types = predict(X_test, model, tokenizer, domain)
    predictions = pd.DataFrame({
        "Column": dataframe.columns,
        "Predicted": predicted_types,
    })
    return predictions, X_test

In [8]:
def generate_possible_joinable_columns(predictions_left, predictions_right, filename_left, filename_right):
    """List every left/right column pair that shares a predicted semantic type.

    Parameters
    ----------
    predictions_left, predictions_right : pandas.DataFrame
        Prediction frames; the first column holds the column name and the
        second the predicted semantic type. The right frame must expose its
        names under the label ``Column``.
    filename_left, filename_right
        Values copied into every record to identify the two tables.

    Returns
    -------
    list of list
        Records ``[filename_left, left column, filename_right, right column,
        semantic type]``, ordered by left row and then by right row.
    """
    left_rows = [
        (predictions_left.iloc[pos, 0], predictions_left.iloc[pos, 1])
        for pos in range(predictions_left.shape[0])
    ]
    right_count = predictions_right.shape[0]

    return [
        [filename_left, left_name, filename_right,
         predictions_right.iloc[pos, :].Column, left_type]
        for left_name, left_type in left_rows
        for pos in range(right_count)
        if left_type == predictions_right.iloc[pos, 1]
    ]

In [9]:
def calculate_similarities(sameSTColumns, df_left, df_right):
    """Compute the Jaccard similarity for every candidate column pair.

    Parameters
    ----------
    sameSTColumns : pandas.DataFrame
        Candidate pairs; position 1 holds the left column name and position 3
        the right column name.
    df_left, df_right : pandas.DataFrame
        The two tables the candidate columns are taken from.

    Returns
    -------
    list of float
        One similarity per row of ``sameSTColumns``, in row order.

    Raises
    ------
    KeyError
        If a column is found in neither orientation of the two tables.
    """
    similarity = []
    for row in range(sameSTColumns.shape[0]):
        left_column = sameSTColumns.iloc[row, 1]
        right_column = sameSTColumns.iloc[row, 3]
        try:
            left_values = set(df_left[left_column].unique())
            right_values = set(df_right[right_column].unique())
        except KeyError:
            # Only a missing column justifies retrying with the tables
            # swapped. Any other error propagates rather than being hidden
            # behind a similarity computed on the wrong columns.
            left_values = set(df_right[left_column].unique())
            right_values = set(df_left[right_column].unique())
        similarity.append(jaccard_similarity(left_values, right_values))

    return similarity

In [10]:
# Ground-truth joinable column pairs.
df_joinable_columns = pd.read_csv('joinable_columns_75containment.csv')

# Every file named on either side of a ground-truth pair, deduplicated.
all_joinable_files = np.unique(
    np.concatenate(
        (df_joinable_columns['ds_name'], df_joinable_columns['ds_name_2']),
        axis=0,
    )
)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [11]:
# Per-dataset metadata; later cells look rows up by 'filename' and read the
# 'delimiter' column to parse each CSV.
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [12]:
# Load the model and tokenizer for the named checkpoint through the project
# helper from utils.
base_model_name = "meta-llama/Meta-Llama-3.1-8B"
model, tokenizer = initiate_base_model(base_model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
# NOTE(review): `modules` is not referenced by any other visible cell —
# confirm whether this call is still needed.
modules = find_all_linear_names(model)

In [14]:
# Predict semantic types for every table once and keep the result per file so
# the pairwise comparison can reuse it.
# NOTE(review): the saved output shows this loop aborting with a CUDA
# out-of-memory error on the second table, so the dictionary is incomplete in
# the recorded run.
ST_predictions_dict = {}
for i, filename in enumerate(all_joinable_files):
    print(f'Making predictions for table {filename}. {i+1} out of {len(all_joinable_files)}')

    # The delimiter of each CSV comes from the dataset metadata table.
    info = df_dsInformation[df_dsInformation['filename'] == filename]
    df = pd.read_csv(f'datasets/{filename}', delimiter=info['delimiter'].values[0])

    # `prediction` and `X_test` are rebound on each pass; after the loop they
    # refer to the table processed last.
    prediction, X_test = generate_predictions(df, model, tokenizer)
    ST_predictions_dict[filename] = prediction

Making predictions for table community-centres.csv. 1 out of 14
Place


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.53it/s]


Making predictions for table community-gardens-and-food-trees.csv. 2 out of 14


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.65 GiB. GPU 

In [None]:
# Display the prediction frames collected per file.
ST_predictions_dict

In [None]:
# Show the first prompt in X_test. The prediction loop rebinds X_test on every
# pass, so this belongs to the table processed last.
X_test.iloc[0,0]

In [None]:
# Predictions of the table processed last (rebound on every loop pass).
prediction

In [None]:
# Compare every unordered pair of tables: pair up the columns that share a
# predicted semantic type, score each pair with Jaccard similarity and keep
# those scoring at least 0.75.
# NOTE(review): the ground-truth file name mentions "75containment" while the
# filter below uses Jaccard similarity — confirm the two are meant to match.
all_predicted_joinable_columns = pd.DataFrame([])
similarity_calculations = 0   # column pairs actually scored
brute_force_calculations = 0  # column pairs an exhaustive comparison would score

for i in range(len(all_joinable_files)-1):
    left_info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]
    df_left = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=left_info['delimiter'].values[0])

    for j in range(i+1, len(all_joinable_files)):
        print(f'Calculating Similarities for tables {all_joinable_files[i]} and {all_joinable_files[j]}.')

        right_info = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[j]]
        df_right = pd.read_csv(f'datasets/{all_joinable_files[j]}', delimiter=right_info['delimiter'].values[0])

        # An exhaustive comparison scores every column pair of every table
        # pair, including table pairs with no semantic-type match, so the
        # baseline is counted before any early exit.
        brute_force_calculations += df_left.shape[1] * df_right.shape[1]

        predictions_left = ST_predictions_dict[all_joinable_files[i]]
        predictions_right = ST_predictions_dict[all_joinable_files[j]]

        predicted_joinable_columns = generate_possible_joinable_columns(predictions_left, predictions_right, all_joinable_files[i], all_joinable_files[j])

        # Test for "no candidates" explicitly instead of relying on the
        # ValueError raised when building a five-column frame from an empty
        # array, which could also hide unrelated ValueErrors.
        if not predicted_joinable_columns:
            print('No matches found, skipping to next column.')
            continue

        sameSTColumns = pd.DataFrame(np.array(predicted_joinable_columns), columns=['FilenameLeft', 'ColumnLeft',
                                                                                    'FilenameRight','ColumnRight',
                                                                                    'SemanticType'])

        similarity = calculate_similarities(sameSTColumns, df_left, df_right)
        sameSTColumns['JaccardSimilarity'] = similarity
        joinableColumns = sameSTColumns[sameSTColumns['JaccardSimilarity'] >= 0.75]

        similarity_calculations += sameSTColumns.shape[0]

        if(len(joinableColumns) > 0):
            print(f'Adding {joinableColumns.shape[0]} columns')

        if(len(all_predicted_joinable_columns) == 0):
            all_predicted_joinable_columns = joinableColumns
        else:
            all_predicted_joinable_columns = pd.concat((all_predicted_joinable_columns, joinableColumns), axis=0)
            print(f'New size {all_predicted_joinable_columns.shape[0]}')

In [None]:
# The two counters accumulated by the pairwise loop: scored candidate pairs
# and the brute-force column-pair count.
print(similarity_calculations, brute_force_calculations)

In [None]:
# Number of predicted joinable column pairs (rows) and stored fields (columns).
all_predicted_joinable_columns.shape

In [None]:
# for i in range(len(all_joinable_files)-1):   
#     for j in range(i+1, len(all_joinable_files)):
        

In [None]:
# Display the predicted joinable column pairs.
all_predicted_joinable_columns

In [None]:
# Display the ground-truth joinable column pairs for comparison.
df_joinable_columns

In [None]:
# Ground-truth rows whose 'ds_name' is cultural-spaces.csv; rows naming that
# file only in 'ds_name_2' are not selected by this filter.
df_joinable_columns[df_joinable_columns['ds_name'] == 'cultural-spaces.csv']

In [None]:
# Predicted pairs that involve cultural-spaces.csv on either side.
all_predicted_joinable_columns[(all_predicted_joinable_columns['FilenameRight'] == 'cultural-spaces.csv') | (all_predicted_joinable_columns['FilenameLeft'] == 'cultural-spaces.csv')]

In [None]:
# Encode each predicted pair as one ';'-joined key built from its first four
# fields (left file, left column, right file, right column).
all_predicted_joinable_columns_joins = np.array([
    ';'.join(all_predicted_joinable_columns.iloc[row, :4].values)
    for row in range(len(all_predicted_joinable_columns))
])
all_predicted_joinable_columns_joins

In [None]:
# Encode each ground-truth pair as a ';'-joined key in BOTH orientations, so a
# prediction matches whichever table it lists first.
# Both halves are sliced explicitly (:2 and 2:4): the mirrored key previously
# used the open-ended slice 2:, which would pull any columns beyond the fourth
# into the key.
df_joinable_columns_joins = []
for i in range(len(df_joinable_columns)):
    first_side = ';'.join(df_joinable_columns.iloc[i,:2].values)
    second_side = ';'.join(df_joinable_columns.iloc[i,2:4].values)
    df_joinable_columns_joins.append(first_side + ';' + second_side)
    df_joinable_columns_joins.append(second_side + ';' + first_side)
# np.unique removes duplicate keys.
df_joinable_columns_joins = np.unique(np.array(df_joinable_columns_joins))
df_joinable_columns_joins

In [None]:
# Number of predicted join keys.
all_predicted_joinable_columns_joins.shape

In [None]:
# Number of ground-truth join keys; each pair was added in both orientations
# before de-duplication.
df_joinable_columns_joins.shape

In [None]:
# Overlap between the predicted and ground-truth join keys.
# NOTE(review): the ground-truth keys include both orientations of each pair,
# which enlarges the union — interpret this score with that in mind.
jaccard_similarity(set(all_predicted_joinable_columns_joins), set(df_joinable_columns_joins))

In [None]:
def _as_unordered_pairs(frame):
    """Return the joins in the first four columns of ``frame`` as unordered pairs.

    Each row becomes a frozenset of ``(file, column)`` tuples, so a join
    compares equal regardless of which table is listed first.
    """
    return {
        frozenset(((row[0], row[1]), (row[2], row[3])))
        for row in frame.iloc[:, :4].values
    }


# Score predictions against the ground truth on orientation-independent
# pairs. The previous false-negative count subtracted false positives from the
# size of a ground-truth array that holds every join in both orientations, so
# recall and F1 were not computed from the number of missed joins.
predicted_pairs = _as_unordered_pairs(all_predicted_joinable_columns)
true_pairs = _as_unordered_pairs(df_joinable_columns)

tp = len(predicted_pairs & true_pairs)   # predicted and in the ground truth
fp = len(predicted_pairs - true_pairs)   # predicted but not in the ground truth
fn = len(true_pairs - predicted_pairs)   # in the ground truth but not predicted

# Each ratio falls back to 0.0 when its denominator is zero (no predictions,
# no ground truth, or no true positives) instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print(precision, recall, f1_score)

In [None]:
# Inspect the predicted semantic types for eo4.csv.
ST_predictions_dict['eo4.csv']

In [None]:
# Inspect the predicted semantic types for eo_pr.csv.
ST_predictions_dict['eo_pr.csv']

In [None]:
# Inspect the predicted semantic types for cultural-spaces.csv.
ST_predictions_dict['cultural-spaces.csv']